diff --git a/.gitignore b/.gitignore
index 57d84228cfd037325716b5faa56c17f7424fe713..90324058600bee46af56e49028977971848a80de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,7 +24,7 @@ Pods
 Podfile.lock
 *.pbxproj
 *.xcworkspacedata
-/tensorflow/lite/downloads/**
+/tensorflow/lite/tools/make/downloads/**
 /tensorflow/lite/gen/**
 /tensorflow/lite/examples/ios/simple/data/*.txt
 /tensorflow/lite/examples/ios/simple/data/*.tflite
diff --git a/WORKSPACE b/WORKSPACE
index 17961829a605c2d1f2d2ba86a7c30c47618c139b..0c7bc085b512b084b9470abe17326d7c119aa327 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -14,6 +14,33 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
+http_archive(
+    name = "base_images_docker",
+    sha256 = "e2b1b7254270bb7605e814a9dbf6d1e4ae04a11136ff1714fbfdabe3f87f7cf9",
+    strip_prefix = "base-images-docker-12801524f867e657fbb5d1a74f31618aff181ac6",
+    urls = ["https://github.com/GoogleCloudPlatform/base-images-docker/archive/12801524f867e657fbb5d1a74f31618aff181ac6.tar.gz"],
+)
+
+http_archive(
+    name = "bazel_toolchains",
+    sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb",
+    strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b",
+    urls = [
+        "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz",
+    ],
+)
+
+http_archive(
+    name = "io_bazel_rules_docker",
+    sha256 = "29d109605e0d6f9c892584f07275b8c9260803bf0c6fcb7de2623b2bedc910bd",
+    strip_prefix = "rules_docker-0.5.1",
+    urls = ["https://github.com/bazelbuild/rules_docker/archive/v0.5.1.tar.gz"],
+)
+
+load("//third_party/toolchains/preconfig/generate:workspace.bzl", "remote_config_workspace")
+
+remote_config_workspace()
+
 # We must check the bazel version before trying to parse any other BUILD
 # files, in case the parsing of those build files depends on the bazel
 # version we require here.
@@ -79,3 +106,4 @@ new_http_archive(
         "http://download.tensorflow.org/models/speech_commands_v0.01.zip",
     ],
 )
+
diff --git a/configure.py b/configure.py
index 2eeeceb3399c79775ce62b9569e940e469141a17..234561d94a46f57c4de5ca487360e2d5a3dfdb2f 100644
--- a/configure.py
+++ b/configure.py
@@ -43,7 +43,7 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
-_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16]
+_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18]
 
 _DEFAULT_PROMPT_ASK_ATTEMPTS = 10
 
@@ -1555,6 +1555,9 @@ def main():
   check_bazel_version('0.15.0')
 
   reset_tf_configure_bazelrc()
+  # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later
+  write_to_bazelrc('import %workspace%/tools/bazel.rc')
+
   cleanup_makefile()
   setup_python(environ_cp)
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 11b42f349df89c605f4ce1130033a85c920258c9..17577afecb74b7008db5a282255278b35ed138a6 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -43,6 +43,11 @@ TENSORFLOW_API_INIT_FILES_V2 = (
     TENSORFLOW_API_INIT_FILES + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
+# @unused
+TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT = (
+    TENSORFLOW_API_INIT_FILES_V1 + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+)
+
 # Config setting used when building for products
 # which requires restricted licenses to be avoided.
 config_setting(
@@ -213,31 +218,31 @@ config_setting(
 #
 config_setting(
     name = "no_aws_support",
-    define_values = {"no_aws_support": "false"},
+    define_values = {"no_aws_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_gcp_support",
-    define_values = {"no_gcp_support": "false"},
+    define_values = {"no_gcp_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_hdfs_support",
-    define_values = {"no_hdfs_support": "false"},
+    define_values = {"no_hdfs_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_ignite_support",
-    define_values = {"no_ignite_support": "false"},
+    define_values = {"no_ignite_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_kafka_support",
-    define_values = {"no_kafka_support": "false"},
+    define_values = {"no_kafka_support": "true"},
     visibility = ["//visibility:public"],
 )
 
@@ -350,8 +355,9 @@ package_group(
         "-//third_party/tensorflow/python/estimator",
         "//learning/meta_rank/...",
         "//tensorflow/...",
-        "//tensorflow_estimator/...",
+        "//tensorflow_estimator/contrib/...",
         "//tensorflow_fold/llgtm/...",
+        "//tensorflow_text/...",
         "//third_party/py/tensor2tensor/...",
     ],
 )
@@ -553,18 +559,24 @@ genrule(
     }),
     outs = ["__init__.py"],
     cmd = select({
-        "api_version_2": "cp $(@D)/_api/v2/__init__.py $(OUTS)",
-        "//conditions:default": "cp $(@D)/_api/v1/__init__.py $(OUTS)",
+        "api_version_2": "cp $(@D)/_api/v2/v2.py $(OUTS)",
+        "//conditions:default": "cp $(@D)/_api/v1/v1.py $(OUTS)",
     }),
 )
 
 gen_api_init_files(
     name = "tf_python_api_gen_v1",
-    srcs = ["api_template_v1.__init__.py"],
+    srcs = [
+        "api_template_v1.__init__.py",
+        "compat_template_v1.__init__.py",
+    ],
     api_version = 1,
+    compat_api_versions = [1],
+    compat_init_templates = ["compat_template_v1.__init__.py"],
     output_dir = "_api/v1/",
-    output_files = TENSORFLOW_API_INIT_FILES_V1,
+    output_files = TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT,
     output_package = "tensorflow._api.v1",
+    root_file_name = "v1.py",
     root_init_template = "api_template_v1.__init__.py",
 )
 
@@ -580,6 +592,7 @@ gen_api_init_files(
     output_dir = "_api/v2/",
     output_files = TENSORFLOW_API_INIT_FILES_V2,
     output_package = "tensorflow._api.v2",
+    root_file_name = "v2.py",
     root_init_template = "api_template.__init__.py",
 )
 
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 0d49756838505289a960a6cabeb7cab02fad995b..2efb8846c6837a3935e0a8439a18838cb2bea804 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -34,7 +34,8 @@ from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-
 
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__))  # pylint: disable=undefined-variable
+# We're using bitwise, but there's nothing special about that.
+_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: disable=undefined-variable
 if _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 16f633643d4726f6e2d1a23c3b192d48dbbc8f14..f653e581bf3beda9fdbf8fb7905a4f9fe170e7fb 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -60,6 +60,7 @@ tf_cuda_library(
             "//tensorflow/core:framework",
             "//tensorflow/core:lib",
             "//tensorflow/core:op_gen_lib",
+            "//tensorflow/core/distributed_runtime:server_lib",
         ],
     }),
 )
@@ -95,6 +96,7 @@ tf_cuda_library(
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
+            "//tensorflow/core/distributed_runtime:server_lib",
         ],
     }) + select({
         "//tensorflow:with_xla_support": [
@@ -119,7 +121,8 @@ tf_cuda_library(
         ":c_api",
         ":c_api_internal",
         "//tensorflow/c/eager:c_api",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
+        "//tensorflow/c/eager:c_api_internal",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/contrib/tpu:all_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -172,6 +175,28 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "kernels",
+    srcs = [
+        "kernels.cc",
+    ],
+    hdrs = [
+        "kernels.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            ":c_api",
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            "//tensorflow/core:framework",
+        ],
+    }),
+)
+
 # -----------------------------------------------------------------------------
 # Tests
 
@@ -199,7 +224,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["c_api_test.cc"],
     data = [
-        ":test_op.so",
+        ":test_op1.so",
         "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     ],
     kernels = [":test_op_kernel"],
@@ -207,7 +232,10 @@ tf_cuda_cc_test(
         "//tensorflow:darwin": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
-    tags = ["noasan"],
+    tags = [
+        "no_oss",  # http://b/119522529
+        "noasan",
+    ],
     # We must ensure that the dependencies can be dynamically linked since
     # the shared library must be able to use core:framework.
     # linkstatic = tf_kernel_tests_linkstatic(),
@@ -218,6 +246,7 @@ tf_cuda_cc_test(
         "//tensorflow/cc:grad_ops",
         "//tensorflow/cc/saved_model:signature_constants",
         "//tensorflow/cc/saved_model:tag_constants",
+        "//tensorflow/compiler/jit",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
@@ -235,7 +264,7 @@ tf_cuda_cc_test(
 
 tf_cc_test(
     name = "c_api_experimental_test",
-    size = "small",
+    size = "medium",
     srcs = ["c_api_experimental_test.cc"],
     data = ["testdata/tf_record"],
     linkopts = select({
@@ -246,8 +275,11 @@ tf_cc_test(
     # the shared library must be able to use core:framework.
     # linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
+        ":c_api",
         ":c_api_experimental",
         ":c_test_util",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -284,8 +316,8 @@ tf_cc_test(
 )
 
 tf_custom_op_library(
-    name = "test_op.so",
-    srcs = ["test_op.cc"],
+    name = "test_op1.so",
+    srcs = ["test_op1.cc"],
 )
 
 tf_kernel_library(
@@ -298,6 +330,30 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
+tf_cuda_cc_test(
+    name = "kernels_test",
+    size = "small",
+    srcs = ["kernels_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    tags = ["noasan"],
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api",
+        ":kernels",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Python API target
 
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 4540dcd6638a58c25628dccd2fa78f1fe06bef1d..f13e8777dff164bcd8eedf46310ae846abd0c804 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -2810,4 +2810,71 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
   }
   return ret;
 }
+
+// TF_Server functions ----------------------------------------------
+
+#ifndef __ANDROID__
+TF_Server::TF_Server(std::unique_ptr<tensorflow::ServerInterface> server)
+    : target(server->target()), server(std::move(server)) {}
+#endif  // __ANDROID__
+
+TF_Server* TF_NewServer(const void* proto, size_t proto_len,
+                        TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+  return nullptr;
+#else
+  tensorflow::ServerDef server_def;
+  if (!server_def.ParseFromArray(proto, static_cast<int>(proto_len))) {
+    status->status = InvalidArgument(
+        "Could not parse provided bytes into a ServerDef protocol buffer");
+    return nullptr;
+  }
+
+  std::unique_ptr<tensorflow::ServerInterface> out_server;
+  status->status = tensorflow::NewServer(server_def, &out_server);
+  if (!status->status.ok()) return nullptr;
+
+  return new TF_Server(std::move(out_server));
+#endif
+}
+
+void TF_ServerStart(TF_Server* server, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+#else
+  status->status = server->server->Start();
+#endif
+}
+
+void TF_ServerStop(TF_Server* server, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+#else
+  status->status = server->server->Stop();
+#endif
+}
+
+void TF_ServerJoin(TF_Server* server, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+#else
+  status->status = server->server->Join();
+#endif
+}
+
+const char* TF_ServerTarget(TF_Server* server) {
+#ifdef __ANDROID__
+  return nullptr;
+#else
+  return server->target.c_str();
+#endif
+}
+
+void TF_DeleteServer(TF_Server* server) { delete server; }
+
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index da8ad1cec59e328b9f1a77f81416651a618e97d3..3d56268110edbe96616201d15a69cc8c84d3115a 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -1668,6 +1668,47 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status);
 TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp(
     const char* name, TF_Status* status);
 
+// --------------------------------------------------------------------------
+// In-process TensorFlow server functionality, for use in distributed training.
+// A Server instance encapsulates a set of devices and a Session target that
+// can participate in distributed training. A server belongs to a cluster
+// (specified by a ClusterSpec), and corresponds to a particular task in a
+// named job. The server can communicate with any other server in the same
+// cluster.
+
+// In-process TensorFlow server.
+typedef struct TF_Server TF_Server;
+
+// Creates a new in-process TensorFlow server configured using a serialized
+// ServerDef protocol buffer provided via `proto` and `proto_len`.
+//
+// The server will not serve any requests until TF_ServerStart is invoked.
+// The server will stop serving requests once TF_ServerStop or
+// TF_DeleteServer is invoked.
+TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto,
+                                              size_t proto_len,
+                                              TF_Status* status);
+
+// Starts an in-process TensorFlow server.
+TF_CAPI_EXPORT extern void TF_ServerStart(TF_Server* server, TF_Status* status);
+
+// Stops an in-process TensorFlow server.
+TF_CAPI_EXPORT extern void TF_ServerStop(TF_Server* server, TF_Status* status);
+
+// Blocks until the server has been successfully stopped (via TF_ServerStop or
+// TF_ServerClose).
+TF_CAPI_EXPORT extern void TF_ServerJoin(TF_Server* server, TF_Status* status);
+
+// Returns the target string that can be provided to TF_SetTarget() to connect
+// a TF_Session to `server`.
+//
+// The returned string is valid only until TF_DeleteServer is invoked.
+TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server);
+
+// Destroy an in-process TensorFlow server, frees memory. If server is running
+// it will be stopped and joined.
+TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index fabe2fa0f60bc8baafa7f83802da74bb7ab93c6d..69de4cb711ef89734af3729c5e5518c14a7f5738 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -15,13 +15,18 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 
+#include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/net.h"
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
@@ -51,8 +56,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
     // These XLA flags are needed to trigger XLA properly from C (more generally
     // non-Python) clients. If this API is called again with `enable` set to
     // false, it is safe to keep these flag values as is.
-    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
-        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    tensorflow::MarkForCompilationPassFlags* flags =
+        tensorflow::GetMarkForCompilationPassFlags();
     flags->tf_xla_cpu_global_jit = true;
     flags->tf_xla_min_cluster_size = 1;
   } else {
@@ -71,8 +76,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
     // These XLA flags are needed to trigger XLA properly from C (more generally
     // non-Python) clients. If this API is called again with `enable` set to
     // false, it is safe to keep these flag values as is.
-    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
-        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    tensorflow::MarkForCompilationPassFlags* flags =
+        tensorflow::GetMarkForCompilationPassFlags();
     flags->tf_xla_cpu_global_jit = true;
     flags->tf_xla_min_cluster_size = 1;
   } else {
@@ -8739,8 +8744,55 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
-TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
-                                                      const char* errMsg) {
+struct TFE_ExecuteOpNotification {
+  TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {}
+  tensorflow::Notification n;
+  std::unique_ptr<tensorflow::Thread> thread;
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status;
+};
+
+TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(TFE_Op* op,
+                                                    TFE_TensorHandle** retvals,
+                                                    int* num_retvals,
+                                                    TF_Status* status) {
+  TFE_ExecuteOpNotification* n = new TFE_ExecuteOpNotification;
+
+  n->thread.reset(op->operation.EagerContext()->TFEnv()->StartThread(
+      tensorflow::ThreadOptions(), "ExecuteOpThread",
+      [op, retvals, num_retvals, n]() {
+        TFE_Execute(op, retvals, num_retvals, n->status.get());
+        n->n.Notify();
+      }));
+
+  return n;
+}
+
+void TFE_ExecuteOpNotificationWaitAndDelete(
+    TFE_ExecuteOpNotification* notification, TF_Status* status) {
+  if (notification == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Passed in notification is a nullptr.");
+
+    return;
+  }
+  if (notification->thread == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Passed in notification didn't start a thread correctly. Cleaning up "
+        "this notification. Please re-execute the operation to get a new "
+        "notification.");
+
+    delete notification;
+    return;
+  }
+
+  notification->n.WaitForNotification();
+
+  status->status = notification->status->status;
+
+  delete notification;
+}
+
+void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) {
   status->status = tensorflow::errors::Internal(errMsg);
 }
 
@@ -8800,3 +8852,21 @@ const char* TF_GetNumberAttrForOpListInput(const char* op_name, int input_index,
   // The returned string is owned by OpRegistry, so liveness is not a concern.
   return input_arg.number_attr().c_str();
 }
+
+int TF_OpIsStateful(const char* op_type, TF_Status* status) {
+  const tensorflow::OpRegistrationData* op_reg_data;
+  status->status =
+      tensorflow::OpRegistry::Global()->LookUp(op_type, &op_reg_data);
+  if (!status->status.ok()) {
+    return 0;
+  }
+  return op_reg_data->op_def.is_stateful();
+}
+
+void TF_InitMain(const char* usage, int* argc, char*** argv) {
+  tensorflow::port::InitMain(usage, argc, argv);
+}
+
+int TF_PickUnusedPortOrDie() {
+  return tensorflow::internal::PickUnusedPortOrDie();
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 6639b0be72bdf81d0e3c806770364d7bc5082ad2..c04cd441bfbd89dafc8b3f0882ab06cd98a1b6fb 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -180,6 +180,25 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
+typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification;
+
+// Allows invoking a kernel asynchronously, and explicitly returns a
+// notification that can be waited upon. This always executes the kernel in a
+// new thread.
+// 1. `retvals` and `num_retvals` can only be consumed after
+// `TFE_ExecuteOp` returns successfully. They shouldn't be used
+// if the return is unsuccessful
+// 2. These new APIs cannot be used together with the TFE context level async
+// support.
+TF_CAPI_EXPORT extern TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(
+    TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
+    TF_Status* status);
+
+// Waits to complete the op execution, and cleans up the notification.
+// Errors reported by op execution are set in `status`.
+TF_CAPI_EXPORT extern void TFE_ExecuteOpNotificationWaitAndDelete(
+    TFE_ExecuteOpNotification* notification, TF_Status* status);
+
 TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
                                                       const char* errMsg);
 
@@ -209,6 +228,19 @@ TF_CAPI_EXPORT extern void TF_AttrBuilderCheckCanRunOnDevice(
 TF_CAPI_EXPORT extern const char* TF_GetNumberAttrForOpListInput(
     const char* op_name, int input_index, TF_Status* status);
 
+// Returns 1 if the op is stateful, 0 otherwise. The return value is undefined
+// if the status is not ok.
+TF_CAPI_EXPORT extern int TF_OpIsStateful(const char* op_type,
+                                          TF_Status* status);
+
+// Platform specific initialization routine. Very few platforms actually require
+// this to be called.
+TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv);
+
+// Platform-specific implementation to return an unused port. (This should used
+// in tests only.)
+TF_CAPI_EXPORT int TF_PickUnusedPortOrDie();
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index c6effd39697e0397278770b53e98508074f99862..daa7701b7fe7e8ce757b6504329cf6434ad39778 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 #include "tensorflow/c/c_test_util.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -162,5 +164,137 @@ protocol: "grpc"
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI_EXPERIMENTAL, IsStateful) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  int assign = TF_OpIsStateful("AssignAddVariableOp", status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  EXPECT_EQ(assign, 1);
+  int id = TF_OpIsStateful("Identity", status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  EXPECT_EQ(id, 0);
+}
+
+TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Simple) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+
+  TFE_Op* matmul_op = MatMulOp(ctx, m, m);
+
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+
+  auto* r =
+      TFE_ExecuteOpInNewThread(matmul_op, &retvals[0], &num_retvals, status);
+
+  TFE_ExecuteOpNotificationWaitAndDelete(r, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TFE_DeleteOp(matmul_op);
+  TFE_DeleteTensorHandle(m);
+
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
+// Perform a send/recv test. Recv blocks, so they need to be executed
+// asynchronously.
+TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  // Returns a 2x2 float32 Tensor on the CPU, with data 1., 2., 3., 4.
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+
+  // Build a send op.
+  TFE_Op* send_op = TFE_NewOp(ctx, "_Send", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(send_op, m, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  string tensor_name = "Tensor";
+  TFE_OpSetAttrType(send_op, "T", TF_FLOAT);
+  TFE_OpSetAttrString(send_op, "tensor_name", tensor_name.c_str(),
+                      tensor_name.size());
+  string send_device = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_OpSetAttrString(send_op, "send_device", send_device.c_str(),
+                      send_device.size());
+  TFE_OpSetAttrInt(send_op, "send_device_incarnation", 1234);
+  string recv_device = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_OpSetAttrString(send_op, "recv_device", recv_device.c_str(),
+                      recv_device.size());
+  TFE_OpSetAttrBool(send_op, "client_terminated", true);
+
+  // Build a recv op.
+  TFE_Op* recv_op = TFE_NewOp(ctx, "_Recv", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_OpSetAttrType(recv_op, "tensor_type", TF_FLOAT);
+  TFE_OpSetAttrString(recv_op, "tensor_name", tensor_name.c_str(),
+                      tensor_name.size());
+  TFE_OpSetAttrString(recv_op, "send_device", send_device.c_str(),
+                      send_device.size());
+  TFE_OpSetAttrInt(recv_op, "send_device_incarnation", 1234);
+  TFE_OpSetAttrString(recv_op, "recv_device", recv_device.c_str(),
+                      recv_device.size());
+  TFE_OpSetAttrBool(recv_op, "client_terminated", true);
+
+  TFE_TensorHandle* send_retvals;
+  int send_num_retvals = 0;
+  auto* send_result = TFE_ExecuteOpInNewThread(send_op, &send_retvals,
+                                               &send_num_retvals, status);
+
+  TFE_TensorHandle* recv_retvals[1] = {nullptr};
+  int recv_num_retvals = 1;
+  auto* recv_result = TFE_ExecuteOpInNewThread(recv_op, &recv_retvals[0],
+                                               &recv_num_retvals, status);
+
+  TFE_ExecuteOpNotificationWaitAndDelete(send_result, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_ExecuteOpNotificationWaitAndDelete(recv_result, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(recv_retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, product[0]);
+  EXPECT_EQ(2, product[1]);
+  EXPECT_EQ(3, product[2]);
+  EXPECT_EQ(4, product[3]);
+
+  TFE_DeleteOp(send_op);
+  TFE_DeleteOp(recv_op);
+  TFE_DeleteTensorHandle(m);
+
+  TFE_DeleteTensorHandle(recv_retvals[0]);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index f68f8a3e90a971b5e4a024feaf26ba498afc48da..28b9f8df9c873ee394eb6a241dd9ac06ba6c8796 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -392,26 +392,26 @@ Status ProcessInputs(
     EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
   input_tensors->reserve(ninputs);
   for (int i = 0; i < ninputs; ++i) {
-    const Node& node = inputs[i].oper->node;
+    Node* node = &inputs[i].oper->node;
     int idx = inputs[i].index;
 
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        fn_body->graph.IsValidOutputTensor(&node, idx),
+        fn_body->graph.IsValidOutputTensor(node, idx),
         "Encountered while processing input ", i, " into function '", fn_name,
         "'");
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx),
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx),
                                     "Encountered while processing input ", i,
                                     " into function '", fn_name, "'");
 
-    input_tensors->emplace_back(&node, idx);
+    input_tensors->emplace_back(node, idx);
 
-    const auto& iter = input_nodes->find(&node);
+    const auto& iter = input_nodes->find(node);
     if (iter == input_nodes->end()) {
-      input_nodes->insert({&node, {idx}});
+      input_nodes->insert({node, {idx}});
     } else {
       auto& indices = iter->second;
       if (std::find(indices.begin(), indices.end(), idx) != indices.end()) {
-        return InvalidArgument("TF_Output ", node.name(), ":", idx,
+        return InvalidArgument("TF_Output ", node->name(), ":", idx,
                                " appears more than once in the input list");
       }
       indices.push_back(idx);
@@ -428,16 +428,16 @@ Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name,
     EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
   output_tensors->reserve(noutputs);
   for (int i = 0; i < noutputs; ++i) {
-    const Node& node = outputs[i].oper->node;
+    Node* node = &outputs[i].oper->node;
     int idx = outputs[i].index;
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        fn_body->graph.IsValidOutputTensor(&node, idx),
+        fn_body->graph.IsValidOutputTensor(node, idx),
         "Encountered while processing output ", i, " from function '", fn_name,
         "'");
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx),
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx),
                                     "Encountered while creating function '",
                                     fn_name, "'");
-    output_tensors->emplace_back(&node, idx);
+    output_tensors->emplace_back(node, idx);
   }
   return Status::OK();
 }
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 95652a11378d6276b5ba6540a07baa15aa77cc1c..5ba26d3c585350aa510f9970cbfc246a9a108543 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #ifndef __ANDROID__
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
 #endif
 #include "tensorflow/core/common_runtime/shape_refiner.h"
@@ -179,6 +180,15 @@ struct TF_ApiDefMap {
   tensorflow::mutex lock;
 };
 
+#ifndef __ANDROID__
+struct TF_Server {
+  TF_Server(std::unique_ptr<tensorflow::ServerInterface> server);
+
+  const tensorflow::string target;
+  std::unique_ptr<tensorflow::ServerInterface> server;
+};
+#endif
+
 namespace tensorflow {
 
 class TensorCApi {
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index b0dc0363fdb266a7bb8babcd41ac469b5e763551..d5934a10395ae094f65d3bc8b6cd7b94dbd32410 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -187,15 +187,26 @@ TEST(CAPI, LibraryLoadFunctions) {
   // tf_cuda_cc_test() bazel rule and remove the next line.
   if (!GPUDeviceName().empty()) return;
 
-  // Load the library.
-  TF_Status* status = TF_NewStatus();
-  TF_Library* lib =
-      TF_LoadLibrary("tensorflow/c/test_op.so", status);
-  TF_Code code = TF_GetCode(status);
-  string status_msg(TF_Message(status));
-  TF_DeleteStatus(status);
-  ASSERT_EQ(TF_OK, code) << status_msg;
+#if !defined(TENSORFLOW_NO_SHARED_OBJECTS)
+  {
+    // Load the library.
+    TF_Status* status = TF_NewStatus();
+    TF_Library* lib =
+        TF_LoadLibrary("tensorflow/c/test_op1.so", status);
+    TF_Code code = TF_GetCode(status);
+    string status_msg(TF_Message(status));
+    TF_DeleteStatus(status);
+    ASSERT_EQ(TF_OK, code) << status_msg;
 
+    // Test op list.
+    TF_Buffer op_list_buf = TF_GetOpList(lib);
+    tensorflow::OpList op_list;
+    EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length));
+    ASSERT_EQ(op_list.op_size(), 1);
+    EXPECT_EQ("TestCApi1", op_list.op(0).name());
+    TF_DeleteLibraryHandle(lib);
+  }
+#endif  // !defined(TENSORFLOW_NO_SHARED_OBJECTS)
   {
     TF_Buffer* op_list_buffer = TF_GetAllOpList();
     tensorflow::OpList op_list;
@@ -210,19 +221,6 @@ TEST(CAPI, LibraryLoadFunctions) {
     EXPECT_TRUE(found);
     TF_DeleteBuffer(op_list_buffer);
   }
-
-#if !defined(TENSORFLOW_NO_SHARED_OBJECTS)
-  {
-    // Test op list.
-    TF_Buffer op_list_buf = TF_GetOpList(lib);
-    tensorflow::OpList op_list;
-    EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length));
-    ASSERT_EQ(op_list.op_size(), 1);
-    EXPECT_EQ("TestCApi", op_list.op(0).name());
-  }
-#endif  // !defined(TENSORFLOW_NO_SHARED_OBJECTS)
-
-  TF_DeleteLibraryHandle(lib);
 }
 
 void TestEncodeDecode(int line, const std::vector<string>& data) {
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 3ee31a6a7ac641bbd3fc4c05568b61e433a1d523..ba3d8533db7623b8fa7fdf35093abcd1450776b1 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -69,7 +69,7 @@ tf_cuda_library(
     name = "c_api_internal",
     hdrs = ["c_api_internal.h"],
     visibility = [
-        "//learning/deepmind/courier:__pkg__",
+        "//learning/deepmind/courier:__subpackages__",
         "//tensorflow:internal",
     ],
     deps = [
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 3554ec0bf3202b54bfc38d67e51b89df19832302..192044915f06e3644aebb200a229cce5f220752b 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/platform/host_info.h"
 #ifdef TENSORFLOW_EAGER_USE_XLA
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #endif  // TENSORFLOW_EAGER_USE_XLA
@@ -404,8 +405,7 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) {
         "The passed in handle is a nullptr");
     return nullptr;
   }
-  tensorflow::Device* d = nullptr;
-  status->status = h->handle->OpDevice(&d);
+  tensorflow::Device* d = h->handle->op_device();
   return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
                         : d->name().c_str();
 }
@@ -459,13 +459,20 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
                   TF_Status* status) {
   const char* name = op_or_function_name;  // Shorthand
   const tensorflow::AttrTypeMap* types;
-  status->status = tensorflow::AttrTypeMapForOp(name, &types);
-  if (status->status.ok()) return new TFE_Op(ctx, name, types);
-  if (TF_GetCode(status) == TF_NOT_FOUND) {
-    if (ctx->context.FindFunctionByName(name)) {
-      status->status = tensorflow::Status::OK();
-      return new TFE_Op(ctx, name, nullptr);
+  bool is_function = false;
+  status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function);
+  if (status->status.ok()) {
+    if (is_function && !ctx->context.FindFunctionByName(name)) {
+      status->status = tensorflow::errors::NotFound(
+          "'", name,
+          "' is neither a type of a primitive operation nor a name "
+          "of a function registered in binary running on ",
+          tensorflow::port::Hostname(),
+          ". Make sure the operation or function is "
+          "registered in the binary running in this process.");
+      return nullptr;
     }
+    return new TFE_Op(ctx, name, is_function, types);
   }
   return nullptr;
 }
@@ -498,12 +505,6 @@ void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
                               unsigned char* is_list, TF_Status* status) {
   TF_AttrType ret;
-  if (op->operation.is_function()) {
-    status->status = tensorflow::errors::Unimplemented(
-        "TODO(apassos): Support for attributes for TensorFlow functions is not "
-        "ready yet.");
-    return TF_ATTR_INT;  // The compiler requires that we return something.
-  }
   status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
                                               attr_name, &ret, is_list);
   return ret;
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index 5006b76f1981d068e99a2c081115ebb3a66d8c7f..52b0824552855860dfb138f3ac9a5d3afa7dc965 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -57,13 +57,9 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
     return nullptr;
   }
 
-  tensorflow::Device* device;
-  status->status = handle->handle->Device(&device);
-  if (!status->status.ok()) {
-    return nullptr;
-  }
-
 #ifdef TENSORFLOW_EAGER_USE_XLA
+  tensorflow::Device* device = handle->handle->device();
+
   // If tensor resides on an XLA device, use XLA device's PaddedShapeFn.
   tensorflow::XlaDevice* xla_device =
       dynamic_cast<tensorflow::XlaDevice*>(device);
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 104d52430cf7aa14d4d2a335a1b96e667f21ce87..67bc1bcd24605f8363d6a7c8d5d6a0836a42fc82 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -79,10 +79,6 @@ struct TFE_TensorHandle {
                    tensorflow::Device* op_device)
       : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {}
 
-  TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype,
-                   tensorflow::EagerContext* ctx)
-      : handle(new tensorflow::TensorHandle(node_id, dtype, ctx)) {}
-
   TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
 
   tensorflow::TensorHandle* handle;
@@ -97,10 +93,9 @@ struct TFE_TensorDebugInfo {
 };
 
 struct TFE_Op {
-  // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
-  // primitive operation.
-  TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : operation(&ctx->context, op, t) {}
+  TFE_Op(TFE_Context* ctx, const char* op, bool is_function,
+         const tensorflow::AttrTypeMap* t)
+      : operation(&ctx->context, op, is_function, t) {}
 
   tensorflow::EagerOperation operation;
 };
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 55331022b9dbd0696928fa44430f340f371432ac..0045bb5622647974a3c9f2cdf35bc21e126b4f52 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -589,9 +589,22 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
   TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
   const int num_devices = TF_DeviceListCount(devices);
+  bool has_gpu0 = false;
+  bool has_gpu1 = false;
+  for (int i = 0; i < num_devices; ++i) {
+    const char* dev = TF_DeviceListName(devices, i, status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    string device_name(dev);
+    if (device_name.find("GPU:0") != string::npos) {
+      has_gpu0 = true;
+    }
+    if (device_name.find("GPU:1") != string::npos) {
+      has_gpu1 = true;
+    }
+  }
 
   const char* kCPUDevice = "CPU:0";
-  if (num_devices < 3) {
+  if (!has_gpu0 || !has_gpu1) {
     TF_DeleteDeviceList(devices);
     TF_DeleteTensor(t);
     TFE_DeleteTensorHandle(hcpu);
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 5ba55a203ff70cc64c07e96b5a869a1f11c9334e..5c11f51e8749de84547ae873f5f55ebd42bc4b3d 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -141,8 +141,9 @@ class GradientTape {
   // null. The result is populated with one tensor per target element.
   Status ComputeGradient(
       const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
-      gtl::ArraySlice<int64> target_tensor_ids,
-      gtl::ArraySlice<int64> source_tensor_id,
+      const gtl::ArraySlice<int64> target_tensor_ids,
+      const gtl::ArraySlice<int64> source_tensor_ids,
+      const gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
       gtl::ArraySlice<Gradient*> output_gradients,
       std::vector<Gradient*>* result);
 
@@ -396,6 +397,7 @@ template <typename Gradient, typename BackwardFunction, typename TapeTensor>
 Status InitialGradients(
     const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
     gtl::ArraySlice<int64> target_tensor_ids,
+    gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
     gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
     const OpTape<BackwardFunction, TapeTensor>& op_tape,
     gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
@@ -425,8 +427,13 @@ Status InitialGradients(
               "none of operations outputs match expected tensor");
         }
       } else {
-        // No record of the target tensor found on the tape, so no gradient
-        // needs to be computed from it. Do nothing.
+        // This target tensor was not generated by any operation recorded on
+        // the tape, so no gradient needs to be computed from it unless this
+        // target is also a source.
+        auto source_tensor = sources_that_are_targets.find(id);
+        if (source_tensor != sources_that_are_targets.end()) {
+          (*result)[id].push_back(vspace.Ones(source_tensor->second));
+        }
       }
     } else {
       (*result)[id].push_back(output_gradients[i]);
@@ -467,8 +474,9 @@ constexpr int kMinAggregateBytes = 128 * 1024 * 1024;
 template <typename Gradient, typename BackwardFunction, typename TapeTensor>
 Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
     const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
-    gtl::ArraySlice<int64> target_tensor_ids,
-    gtl::ArraySlice<int64> source_tensor_ids,
+    const gtl::ArraySlice<int64> target_tensor_ids,
+    const gtl::ArraySlice<int64> source_tensor_ids,
+    const gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
     gtl::ArraySlice<Gradient*> output_gradients,
     std::vector<Gradient*>* result) {
   gtl::FlatSet<int64> sources_set(source_tensor_ids.begin(),
@@ -478,7 +486,8 @@ Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
   gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
-  Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
+  Status s = InitialGradients(vspace, target_tensor_ids,
+                              sources_that_are_targets, output_gradients,
                               tensor_tape_, state.op_tape, &gradients);
   auto cleanup = [this, &state]() {
     if (!persistent_) {
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca69345264607ac689fb556b4f5c9bc08ea5eb88
--- /dev/null
+++ b/tensorflow/c/kernels.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/c/kernels.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+// This file forms the basis of a stable ABI for third-party kernel
+// implementations. It is crucial that changes to this file are made cautiously
+// and with a focus on maintaining both source and binary compatibility.
+
+struct TF_KernelBuilder {
+  ::tensorflow::KernelDefBuilder* cc_builder;
+
+  void* (*create_function)(TF_OpKernelConstruction*);
+  void (*compute_function)(void*, TF_OpKernelContext*);
+  void (*delete_function)(void*);
+};
+
+TF_KernelBuilder* TF_NewKernelBuilder(
+    const char* op_name, const char* device_name,
+    void* (*create_func)(TF_OpKernelConstruction*),
+    void (*compute_func)(void*, TF_OpKernelContext*),
+    void (*delete_func)(void*)) {
+  TF_KernelBuilder* result = new TF_KernelBuilder;
+  result->cc_builder = new ::tensorflow::KernelDefBuilder(op_name);
+  result->cc_builder->Device(device_name);
+  result->create_function = create_func;
+  result->compute_function = compute_func;
+  result->delete_function = delete_func;
+  return result;
+}
+
+void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) {
+  DCHECK_NE(builder, nullptr);
+  delete builder->cc_builder;
+  delete builder;
+}
+
+namespace tensorflow {
+namespace {
+
+// An OpKernel whose methods delegate to C function pointers.
+class COpKernel : public OpKernel {
+ public:
+  explicit COpKernel(OpKernelConstruction* ctx,
+                     void* (*create_func)(TF_OpKernelConstruction*),
+                     void (*compute_func)(void*, TF_OpKernelContext*),
+                     void (*delete_func)(void*))
+      : OpKernel(ctx), compute_func_(compute_func), delete_func_(delete_func) {
+    if (create_func != nullptr) {
+      c_kernel_ =
+          (*create_func)(reinterpret_cast<TF_OpKernelConstruction*>(ctx));
+    } else {
+      c_kernel_ = nullptr;
+    }
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    (*compute_func_)(c_kernel_, reinterpret_cast<TF_OpKernelContext*>(ctx));
+  }
+
+  ~COpKernel() override {
+    if (delete_func_ != nullptr) {
+      (*delete_func_)(c_kernel_);
+    }
+  }
+
+ private:
+  void (*compute_func_)(void*, TF_OpKernelContext* context);
+  void (*delete_func_)(void*);
+  void* c_kernel_;
+};
+
+// A KernelFactory that returns COpKernel instances.
+class KernelBuilderFactory
+    : public ::tensorflow::kernel_factory::OpKernelFactory {
+ public:
+  explicit KernelBuilderFactory(TF_KernelBuilder* builder)
+      : builder_(builder) {}
+  ::tensorflow::OpKernel* Create(
+      ::tensorflow::OpKernelConstruction* context) override {
+    return new ::tensorflow::COpKernel(context, builder_->create_function,
+                                       builder_->compute_function,
+                                       builder_->delete_function);
+  }
+  ~KernelBuilderFactory() override { TF_DeleteKernelBuilder(builder_); }
+
+ private:
+  TF_KernelBuilder* builder_;
+};
+}  // namespace
+}  // namespace tensorflow
+
+void TF_RegisterKernelBuilder(const char* name, TF_KernelBuilder* builder,
+                              TF_Status* status) {
+  using tensorflow::register_kernel::Name;
+
+  tensorflow::kernel_factory::OpKernelRegistrar(
+      builder->cc_builder->Build(), name,
+      absl::make_unique<tensorflow::KernelBuilderFactory>(builder));
+
+  TF_SetStatus(status, TF_OK, "");
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..2518789a3c141755d0b3373d53642c487331f68b
--- /dev/null
+++ b/tensorflow/c/kernels.h
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_KERNELS_H_
+#define TENSORFLOW_C_KERNELS_H_
+
+#include "tensorflow/c/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --------------------------------------------------------------------------
+// C API for TensorFlow Kernels.
+//
+// This API allows developers to register custom kernel implementations for
+// TensorFlow.
+//
+// See c_api.h header comments for a discussion about API conventions.
+//
+// Users wishing to extend TensorFlow with new kernels will call
+// `TF_NewKernelBuilder`. The resulting kernel builder can be registered with
+// `TF_RegisterKernelBuilder`, which will allow TF to construct user-provided
+// kernels when necessary.
+
+struct TF_KernelBuilder;
+struct TF_OpKernelConstruction;
+struct TF_OpKernelContext;
+
+// Allocates a new kernel builder and returns a pointer to it.
+//
+// If non-null, TensorFlow will call create_func when it needs to instantiate
+// the kernel. The pointer returned by create_func will be passed to
+// compute_func and delete_func, thereby functioning as a "this" pointer for
+// referring to kernel instances.
+//
+// The TF_OpKernelConstruction pointer passed to create_func is owned by
+// TensorFlow and will be deleted once create_func returns. It must not be used
+// after this.
+//
+// When TensorFlow needs to perform a computation with this kernel, it will
+// call compute_func. This function will receive the pointer returned by
+// create_func (or null if no create_func was provided), along with the inputs
+// to the computation.
+//
+// The TF_OpKernelContext pointer received by compute_func is owned by
+// TensorFlow and will be deleted once compute_func returns. It must not be used
+// after this.
+//
+// Finally, when TensorFlow no longer needs the kernel, it will call
+// delete_func if one is provided. This function will receive the pointer
+// returned in `create_func` or nullptr if no `create_func` was provided.
+//
+// The caller should pass the result of this function to
+// TF_RegisterKernelBuilder, which will take ownership of the pointer. If, for
+// some reason, the kernel builder will not be registered, the caller should
+// delete it with TF_DeleteKernelBuilder.
+TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewKernelBuilder(
+    const char* op_name, const char* device_name,
+    void* (*create_func)(TF_OpKernelConstruction*),
+    void (*compute_func)(void*, TF_OpKernelContext*),
+    void (*delete_func)(void*));
+
+// Register the given kernel builder with the TensorFlow runtime. If
+// registration fails, the given status will be populated.
+//
+// This call takes ownership of the `builder` pointer.
+TF_CAPI_EXPORT extern void TF_RegisterKernelBuilder(const char* kernel_name,
+                                                    TF_KernelBuilder* builder,
+                                                    TF_Status* status);
+
+// Deletes the given TF_KernelBuilder. This should be called only if the kernel
+// builder is not registered with TensorFlow via TF_RegisterKernelBuilder.
+TF_CAPI_EXPORT extern void TF_DeleteKernelBuilder(TF_KernelBuilder* builder);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif  // TENSORFLOW_C_KERNELS_H_
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e706c7c1d96ee1781d8efc0f28c5e0cbcbc80861
--- /dev/null
+++ b/tensorflow/c/kernels_test.cc
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/kernels.h"
+
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/node_def.pb_text.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+struct MyCustomKernel {
+  bool created;
+  bool compute_called;
+};
+
+static bool delete_called = false;
+
+static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
+  LOG(INFO) << "Wow, actually got into creation";
+  struct MyCustomKernel* s = new struct MyCustomKernel;
+  s->created = true;
+  s->compute_called = false;
+  return s;
+}
+
+static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  s->compute_called = true;
+}
+
+static void MyDeleteFunc(void* kernel) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  EXPECT_TRUE(s->created);
+  EXPECT_TRUE(s->compute_called);
+  delete_called = true;
+  delete s;
+}
+
+// Tests registration of a single C kernel and checks that calls through the
+// C/C++ boundary are being made.
+TEST(TestKernel, TestRegisterKernelBuilder) {
+  const char* kernel_name = "SomeKernelName";
+  const char* op_name = "FooOp";
+  const char* device_name = "barDev";
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(
+      op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(kernel_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_Buffer* buf = TF_GetRegisteredKernelsForOp("FooOp", status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    ::tensorflow::KernelList list;
+    list.ParseFromArray(buf->data, buf->length);
+    ASSERT_EQ(1, list.kernel_size());
+    ASSERT_EQ("barDev", list.kernel(0).device_type());
+    TF_DeleteBuffer(buf);
+    TF_DeleteStatus(status);
+  }
+
+  REGISTER_OP("FooOp")
+      .Input("input1: double")
+      .Input("input2: uint8")
+      .Output("output1: uint8");
+
+  {
+    ::tensorflow::NodeDef def;
+    def.set_op("FooOp");
+    def.set_device("bar");
+    def.add_input("input1");
+    def.add_input("input2");
+    ::tensorflow::Status status;
+    std::unique_ptr<::tensorflow::OpKernel> kernel =
+        ::tensorflow::CreateOpKernel(::tensorflow::DeviceType("barDev"),
+                                     nullptr, nullptr, def, 1, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+    kernel->Compute(nullptr);
+  }
+
+  ASSERT_TRUE(delete_called);
+}
diff --git a/tensorflow/c/test_op1.cc b/tensorflow/c/test_op1.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b22cc9aef2b344282f45340ff12ee849935a26f9
--- /dev/null
+++ b/tensorflow/c/test_op1.cc
@@ -0,0 +1,23 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+REGISTER_OP("TestCApi1").Doc(R"doc(Used to test C API)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index c18b07603ae3841d3581741ab5a43f2e8b628356..a09becc49b10d2c58f98fbcc11df5190f794c1d4 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -170,6 +170,7 @@ cc_library_with_android_deps(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -488,6 +489,7 @@ tf_gen_op_wrappers_cc(
         "image_ops",
         "io_ops",
         "linalg_ops",
+        "list_ops",
         "logging_ops",
         "lookup_ops",
         "manip_ops",
@@ -516,6 +518,8 @@ tf_gen_op_wrappers_cc(
         ":array_ops",
         ":const_op",
         ":math_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
     ],
 )
 
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index c6abe2f41b9b5ec2faee6f65b429ff606f8ac08e..ec116f68cf4b61c9b2d15065916ad9169017b659 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -193,6 +193,15 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir,
 
 Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def,
                         std::vector<AssetFileDef>* asset_file_defs) {
+  // With SavedModel v2, we write asset file def into metagraph instead of
+  // collection, so read from metagraph first.
+  if (meta_graph_def.asset_file_def_size() > 0) {
+    for (const auto& asset : meta_graph_def.asset_file_def()) {
+      asset_file_defs->push_back(asset);
+    }
+    return Status::OK();
+  }
+  // Fall back to read from collection to be backward compatible with v1.
   const auto& collection_def_map = meta_graph_def.collection_def();
   const auto assets_it = collection_def_map.find(kSavedModelAssetsKey);
   if (assets_it == collection_def_map.end()) {
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 6c29f09cde7ee17c11cb44ce48d8e9128daae4d0..16151e77737429f4fbf690fc34b12a70bacebdc4 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -93,7 +93,7 @@ cc_library(
         ":tfcompile_lib",
         "//tensorflow/compiler/tf2xla:tf2xla_proto",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index b95b063348c5cdfdcaed635ba527e9f0bfd6092d..d548de8c44285f6d21dd778db464a31e1b19645b 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/flags.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -103,7 +103,7 @@ Status Main(const MainFlags& flags) {
     return errors::InvalidArgument("Must specify --cpp_class");
   }
   codegen_opts.gen_hlo_profile_printer_data =
-      xla::legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile();
+      xla::GetDebugOptionsFromFlags().xla_hlo_profile();
   TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &codegen_opts.class_name,
                                    &codegen_opts.namespaces));
 
@@ -132,7 +132,7 @@ int main(int argc, char** argv) {
 
   std::vector<tensorflow::Flag> flag_list;
   AppendMainFlags(&flag_list, &flags);
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
 
   tensorflow::string usage = tensorflow::tfcompile::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 0c41e095c7bc73e517d4c11c590a21439db1e3da..682c0f0cb05c8c83acac28c8f3abf4f5e355e7c0 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -21,7 +21,6 @@ package(
 )
 
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
@@ -52,6 +51,7 @@ cc_library(
     deps = [
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",
     ],
@@ -65,6 +65,7 @@ cc_library(
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/xla/service:gpu_plugin",
     ]),
     alwayslink = 1,
@@ -75,10 +76,10 @@ cc_library(
     srcs = ["xla_cpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":flags",
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_ops",
-        "//tensorflow/compiler/jit/legacy_flags:xla_device_flags",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",  # buildcleaner: keep
@@ -190,6 +191,7 @@ cc_library(
         "//tensorflow/core/kernels:resource_variable_ops",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:shape_ops",
+        "//tensorflow/core/kernels:stack",
         "//tensorflow/core/kernels:variable_ops",
         "//tensorflow/core/kernels/data:generator_dataset_op",
         "//tensorflow/core/kernels/data:iterator_ops",
@@ -208,6 +210,18 @@ cc_library(
 
 # Internal targets below this point.
 
+cc_library(
+    name = "flags",
+    srcs = ["flags.cc"],
+    hdrs = ["flags.h"],
+    visibility = [":friends"],
+    deps = [
+        "//tensorflow/compiler/xla:parse_flags_from_env",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "common",
     srcs = [
@@ -241,6 +255,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -253,6 +268,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/core:core_cpu",
@@ -263,6 +279,22 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "xla_compilation_cache_test",
+    srcs = [
+        "xla_compilation_cache_test.cc",
+    ],
+    deps = [
+        ":xla_compilation_cache",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
 
@@ -468,6 +500,7 @@ cc_library(
     deps = [
         ":common",
         ":encapsulate_util",
+        ":flags",
         ":shape_inference_helpers",
         ":union_find",
         ":xla_cluster_util",
@@ -475,8 +508,6 @@ cc_library(
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope_internal",
         "//tensorflow/compiler/jit/graphcycles",
-        "//tensorflow/compiler/jit/legacy_flags:build_xla_ops_pass_flags",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
@@ -500,6 +531,7 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -524,25 +556,6 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
-cc_library(
-    name = "producer_consumer_queue",
-    hdrs = ["producer_consumer_queue.h"],
-    deps = ["//tensorflow/core:lib"],
-)
-
-tf_cc_test(
-    name = "producer_consumer_queue_test",
-    size = "small",
-    srcs = ["producer_consumer_queue_test.cc"],
-    deps = [
-        ":producer_consumer_queue",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cc_test(
     name = "deadness_analysis_test",
     size = "small",
@@ -606,6 +619,7 @@ tf_cc_test(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
         "//tensorflow/compiler/tf2xla/cc:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -648,31 +662,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "xla_launch_util_test",
-    size = "small",
-    srcs = ["xla_launch_util_test.cc"],
-    deps = [
-        ":common",
-        ":xla_compilation_cache",
-        ":xla_launch_util",
-        ":xla_tensor",
-        "//tensorflow/compiler/tf2xla:common",
-        "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core/kernels:variable_ops",
-    ],
-)
-
 cc_library(
     name = "xla_fusion_optimizer",
     srcs = ["xla_fusion_optimizer.cc"],
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 054f31ba3352b2215e6b0448c8ec8a70cb98b8e5..9f4042630edaec1b9519b6434d859a48372e8b15 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/control_flow_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
-#include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
@@ -214,7 +214,8 @@ Status NodeRequiresCompilation(Node* n, bool* result) {
     return errors::Internal("Could not find compilation device ",
                             device_type.type());
   }
-  *result = registration->requires_compilation;
+  *result = registration->autoclustering_policy ==
+            XlaOpRegistry::AutoclusteringPolicy::kAlways;
   return Status::OK();
 }
 
@@ -319,10 +320,10 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) {
                     return IsXlaCompiledKernel(*n);
                   });
 
-  bool lazy_compilation_enabled = enable_lazy_compilation_
-                                      ? *enable_lazy_compilation_
-                                      : legacy_flags::GetBuildXlaOpsPassFlags()
-                                            .tf_xla_enable_lazy_compilation;
+  bool lazy_compilation_enabled =
+      enable_lazy_compilation_
+          ? *enable_lazy_compilation_
+          : GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation;
 
   for (Node* n : xla_compiled_kernels) {
     TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 617e31488c7daeb714c0ff7056b786e4eaf7873f..8a73101c184e6190921fd7729742922bd96f4bcf 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -127,7 +127,8 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   Output loop_cond =
       ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
   ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
-  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
+  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
+                           latch.output_false);
   Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
                             latch.output_true, increment_by);
   Output next_iteration =
@@ -191,7 +192,8 @@ DependentInductionVar CreateDependentLoopInvariantValue(
                                             value, frame_name);
   ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value});
   ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
-  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
+  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
+                           latch.output_false);
   Output next_iteration = ops::NextIteration(
       root.WithOpName(prefix + "/next_iteration"), latch.output_true);
   CHECK(root.graph()
diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc
index 28ec37b1b9c8a1a306b5e778bac5b6ba01c2c997..bcc3213285bee2a2094bd6c39b37ba95874d90ed 100644
--- a/tensorflow/compiler/jit/encapsulate_util.cc
+++ b/tensorflow/compiler/jit/encapsulate_util.cc
@@ -86,7 +86,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
       continue;
     } else if (src_xla_computation && !dst_xla_computation) {
       if (src_outside_compilation) {
-        // Case 1d: outside compilation to host computation control edge.
+        // Case 1c: outside compilation to host computation control edge.
         edges_to_remove.push_back(e);
 
         TF_RETURN_IF_ERROR(AppendToListAttr<string>(
@@ -94,7 +94,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
       }
     } else if (!src_xla_computation && dst_xla_computation) {
       if (dst_outside_compilation) {
-        // Case 1d: host computation control to outside compilation edge.
+        // Case 1c: host computation control to outside compilation edge.
         edges_to_remove.push_back(e);
 
         TF_RETURN_IF_ERROR(AppendToListAttr<string>(
@@ -103,40 +103,24 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
     } else {  // src_xla_computation && dst_xla_computation
       if (*src_xla_computation != *dst_xla_computation) {
         if (src_outside_compilation && dst_outside_compilation) {
-          // Case 1c: outside compilation to outside compilation control edge.
+          // Case 1b: outside compilation to outside compilation control edge.
           edges_to_remove.push_back(e);
 
           TF_RETURN_IF_ERROR(AppendToListAttr<string>(
               e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
         } else if (src_outside_compilation && !dst_outside_compilation) {
-          // Case 1b: outside compilation to another XLA computaition control
+          // Case 1a: outside compilation to another XLA computaition control
           // edge.
           TF_RETURN_IF_ERROR(AppendToListAttr<string>(
               e->src(), kXlaConnectedToOtherXlaComputationAttrName,
               *dst_xla_computation));
         } else if (!src_outside_compilation && dst_outside_compilation) {
-          // Case 1b: another XLA computaition to outside compilation control
+          // Case 1a: another XLA computaition to outside compilation control
           // edge.
           TF_RETURN_IF_ERROR(AppendToListAttr<string>(
               e->dst(), kXlaConnectedFromOtherXlaComputationAttrName,
               *src_xla_computation));
         }
-      } else {  // *src_xla_computation == *dst_xla_computation
-        if (src_outside_compilation && dst_outside_compilation) {
-          if (*src_outside_compilation != *dst_outside_compilation) {
-            // Case 1c: outside compilation to outside compilation control edge.
-            edges_to_remove.push_back(e);
-
-            TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-                e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-          }
-        } else if (src_outside_compilation && !dst_outside_compilation) {
-          // Case 1a: outside compilation to its XLA computation control edge.
-          ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true);
-        } else if (!src_outside_compilation && dst_outside_compilation) {
-          // Case 1a: XLA computation to outside compilation in it control edge.
-          ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true);
-        }
       }
     }
   }
@@ -181,12 +165,6 @@ Status ProcessXlaToXlaDataEdges(Graph* g,
         edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
         VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
       }
-    } else {  // *src_xla_computation == *dst_xla_computation
-      if (src_outside_compilation && dst_outside_compilation &&
-          *src_outside_compilation != *dst_outside_compilation) {
-        edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
-        VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
-      }
     }
   }
 
@@ -594,14 +572,242 @@ Status AddControlDependencies(
   return Status::OK();
 }
 
+// Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PreprocessEdgesBetweenOutsideCompilations` for details.
+Status PreprocessControlEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather edges to remove. We should not remove the edge while iterating.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->edges()) {
+    if (!e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (src_outside_compilation && dst_outside_compilation) {
+      if (*src_outside_compilation != *dst_outside_compilation) {
+        // Case 1a: outside compilation to outside compilation control edge.
+        edges_to_remove.push_back(e);
+
+        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+            e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName,
+            e->src()->name()));
+      }
+    } else if (src_outside_compilation && !dst_outside_compilation) {
+      // Case 1b: outside compilation to its XLA computation control edge.
+      ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true);
+    } else if (!src_outside_compilation && dst_outside_compilation) {
+      // Case 1b: XLA computation to outside compilation in it control edge.
+      ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true);
+    }
+  }
+
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PreprocessEdgesBetweenOutsideCompilations` for details.
+Status PreprocessDataEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather edges between outside compilation and host computation. Notice that
+  // we do not store `Edge*` directly because we remove some nodes while adding
+  // Identity nodes, and those Edge pointers might be invalidated.
+  struct EdgeInfo {
+    int dst_input, dst_node_id;
+  };
+  std::vector<EdgeInfo> edges;
+  for (const Edge* e : g->edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (src_outside_compilation && dst_outside_compilation &&
+        *src_outside_compilation != *dst_outside_compilation) {
+      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
+      VLOG(4) << "Oc -> oc edge: " << e->DebugString();
+    }
+  }
+
+  // Remove the edge from host to outside compilation. Add a placeholder as
+  // outside compilation node input.
+  std::map<string, Node*> placeholders;
+  for (int i = 0; i < edges.size(); i++) {
+    Node* dst = g->FindNodeId(edges[i].dst_node_id);
+    const Edge* e;
+    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
+    Node* src = e->src();
+    int src_output = e->src_output(), dst_input = e->dst_input();
+    g->RemoveEdge(e);
+
+    // Find or create placeholder node.
+    string new_name = absl::StrCat(src->name(), "_oc_to_oc_placeholder");
+    auto iter = placeholders.find(new_name);
+    Node* placeholder_node;
+    if (iter == placeholders.end()) {
+      NodeDefBuilder placeholder_builder(new_name, "Placeholder");
+      placeholder_builder.Attr("dtype", src->output_type(src_output));
+      string outside_compilation_attr;
+      TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(),
+                                     outside_compilation_attr_name,
+                                     &outside_compilation_attr));
+      placeholder_builder.Attr(outside_compilation_attr_name,
+                               outside_compilation_attr);
+      placeholder_builder.Attr(kOutsideCompilationOriginalNodeAttrName,
+                               src->name());
+      placeholder_builder.Attr(kOutsideCompilationSrcOutputAttrName,
+                               src_output);
+      NodeDef placeholder_def;
+      TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def));
+      Status s;
+      placeholder_node = g->AddNode(placeholder_def, &s);
+      TF_RETURN_IF_ERROR(s);
+      placeholders[new_name] = placeholder_node;
+    } else {
+      placeholder_node = iter->second;
+    }
+    g->AddEdge(placeholder_node, 0, dst, dst_input);
+
+    // Replace `e->dst()` because its input node changed.
+    NodeDef new_def = dst->def();
+    *new_def.mutable_input(dst_input) = placeholder_node->name();
+    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
+
+    // Other edge in `edges` might have `e->dst()` as src or dst
+    // node. Before removing `e->dst()`, replace those edges with
+    // corresponding edges for `dst_replace_node`.
+    for (int j = i + 1; j < edges.size(); j++) {
+      if (edges[j].dst_node_id == edges[i].dst_node_id) {
+        edges[j].dst_node_id = dst_replace_node->id();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PostprocessEdgesBetweenOutsideCompilations` for details.
+Status PostprocessDataEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather all outside compilation to outside compilation nodes.
+  std::vector<Node*> placeholder_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "Placeholder" &&
+        HasNodeAttr(n->def(), kOutsideCompilationOriginalNodeAttrName)) {
+      placeholder_nodes.push_back(n);
+    }
+  }
+
+  // Remove the placeholder nodes, and reconnect original edge.
+  auto node_name_index = g->BuildNodeNameIndex();
+  for (auto n : placeholder_nodes) {
+    string node_name;
+    int node_src_output;
+    TF_RETURN_IF_ERROR(GetNodeAttr(
+        n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name));
+    TF_RETURN_IF_ERROR(GetNodeAttr(
+        n->attrs(), kOutsideCompilationSrcOutputAttrName, &node_src_output));
+    auto iter = node_name_index.find(node_name);
+    if (iter == node_name_index.end()) {
+      return errors::Internal(
+          "Cannot find original node for oc -> host placeholder node ",
+          node_name);
+    }
+
+    // Change all usage node to use the original node instead.
+    Node* original_node = iter->second;
+    std::vector<const Edge*> control_edges;
+    std::vector<OutEdgeInfo> data_edges;
+    for (auto e : n->out_edges()) {
+      if (e->IsControlEdge()) {
+        control_edges.push_back(e);
+      } else {
+        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
+      }
+    }
+    for (const Edge* e : control_edges) {
+      g->AddControlEdge(original_node, e->dst());
+      g->RemoveEdge(e);
+    }
+    for (int i = 0; i < data_edges.size(); i++) {
+      Node* dst = data_edges[i].dst;
+      NodeDef new_def = dst->def();
+      int dst_input = data_edges[i].dst_input;
+      *new_def.mutable_input(dst_input) =
+          absl::StrCat(original_node->name(), ":", node_src_output);
+      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
+
+      const Edge* edge_to_replace = nullptr;
+      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
+      g->RemoveEdge(edge_to_replace);
+      g->AddEdge(original_node, node_src_output, replace_node, dst_input);
+
+      // Other edges might have `dst` as dst node. Update those edges with
+      // `replace_node`.
+      for (int j = i + 1; j < data_edges.size(); j++) {
+        if (data_edges[j].dst == dst) {
+          data_edges[j].dst = replace_node;
+        }
+      }
+
+      // Other placeholder node might have `dst` as original node. Update
+      // `node_name_index` with `replace_node`.
+      node_name_index[replace_node->name()] = replace_node;
+    }
+
+    // Remove placeholder node.
+    g->RemoveNode(n);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PostprocessEdgesBetweenOutsideCompilations` for details.
+Status PostprocessControlEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  auto node_name_index = g->BuildNodeNameIndex();
+
+  // Reconnect outside compilation to outside compilation control edge.
+  for (Node* n : g->nodes()) {
+    std::vector<string> control_deps;
+    Status s =
+        GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName,
+                    &control_deps);
+    if (!s.ok()) {
+      if (s.code() != error::NOT_FOUND) {
+        return s;
+      } else {
+        continue;
+      }
+    } else {
+      n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName);
+      for (const string& control_input : control_deps) {
+        auto iter = node_name_index.find(control_input);
+        if (iter == node_name_index.end()) {
+          return errors::Internal("Cannot find original node for ",
+                                  control_input);
+        }
+        g->AddControlEdge(iter->second, n);
+      }
+    }
+  }
+  return Status::OK();
+}
 }  // namespace
 
 const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes";
 
-const char kXlaConnectedToXlaComputationAttrName[] =
-    "_xla_connected_to_xla_computation";
-const char kXlaConnectedFromXlaComputationAttrName[] =
-    "_xla_connected_from_xla_computation";
 const char kXlaConnectedToOtherXlaComputationAttrName[] =
     "_xla_connected_to_other_xla_computation";
 const char kXlaConnectedFromOtherXlaComputationAttrName[] =
@@ -616,6 +822,15 @@ const char kHostToOutsideCompilationOriginalNodeAttrName[] =
     "_xla_host_to_oc_node_name";
 const char kHostToOutsideCompilationSrcOutputAttrName[] =
     "_xla_host_to_oc_src_output";
+const char kXlaConnectedToXlaComputationAttrName[] =
+    "_xla_connected_to_xla_computation";
+const char kXlaConnectedFromXlaComputationAttrName[] =
+    "_xla_connected_from_xla_computation";
+const char kOutsideCompilationOriginalNodeAttrName[] =
+    "_xla_oc_to_oc_node_name";
+const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output";
+const char kXlaControlDependenciesWithinXlaClusterAttrName[] =
+    "_xla_control_dependencies_within_xla_cluster";
 
 Status PerformStaticShapeInferenceBeforeEncapsulation(
     Graph* g, const string& xla_computation_attr_name,
@@ -699,4 +914,39 @@ Status PostprocessForEncapsulation(
   return Status::OK();
 }
 
+Status PreprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Remove edges from source node to outside compilation nodes, and edges
+  // from outside compilation nodes to sink node.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->source_node()->out_edges()) {
+    if (HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) {
+      edges_to_remove.push_back(e);
+    }
+  }
+  for (const Edge* e : g->sink_node()->in_edges()) {
+    if (HasNodeAttr(e->src()->def(), outside_compilation_attr_name)) {
+      edges_to_remove.push_back(e);
+    }
+  }
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+
+  TF_RETURN_IF_ERROR(PreprocessControlEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(PreprocessDataEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  return Status::OK();
+}
+
+Status PostprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h
index a3b193eea745d4e44781225130216253c19371da..e363bc5754ac395bae262dc67a780a0173efaf5e 100644
--- a/tensorflow/compiler/jit/encapsulate_util.h
+++ b/tensorflow/compiler/jit/encapsulate_util.h
@@ -44,14 +44,6 @@ Status PerformStaticShapeInferenceBeforeEncapsulation(
     Graph* g, const string& xla_computation_attr_name,
     const string& outside_compilation_attr_name);
 
-// Attribute indicating that some ops in this node's XLA computation has control
-// dependency on this node. Attribute value will always be "true".
-extern const char kXlaConnectedToXlaComputationAttrName[];
-
-// Attribute indicating that this node has control dependency on some ops in
-// this node's XLA computation. Attribute value will always be "true".
-extern const char kXlaConnectedFromXlaComputationAttrName[];
-
 // Attribute indicating that some ops in other XLA computation has control
 // dependency on this node. Attribute value will be a list of string (XLA
 // computation names).
@@ -81,6 +73,14 @@ extern const char kOutsideCompilationToHostOriginalNodeAttrName[];
 // int (src_output for original edge).
 extern const char kOutsideCompilationToHostSrcOutputAttrName[];
 
+// Attribute indicating that some ops in this node's XLA computation has control
+// dependency on this node. Attribute value will always be "true".
+extern const char kXlaConnectedToXlaComputationAttrName[];
+
+// Attribute indicating that this node has control dependency on some ops in
+// this node's XLA computation. Attribute value will always be "true".
+extern const char kXlaConnectedFromXlaComputationAttrName[];
+
 // Attribute indicating that this is an Placeholder node added to act as a
 // temporary input node for an host node. Attribute value will be string
 // (original input node name).
@@ -91,19 +91,31 @@ extern const char kHostToOutsideCompilationOriginalNodeAttrName[];
 // for original edge).
 extern const char kHostToOutsideCompilationSrcOutputAttrName[];
 
-// Preprocesses the graph for encapsulation. It will perform the following
-// operations in order:
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// string (original input node name).
+extern const char kOutsideCompilationOriginalNodeAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// int (src_output for original edge).
+extern const char kOutsideCompilationSrcOutputAttrName[];
+
+// Attribute indicating that this node has control dependencies on some other
+// nodes within the same XLA cluster. Attribute value will be a list of string
+// (node names).
+extern const char kXlaControlDependenciesWithinXlaClusterAttrName[];
+
+// Preprocesses edges between different XLA clusters for encapsulation. It will
+// perform the following operations in order:
 //
-// 1a. For control edges between outside compilation and its XLA computation,
-//     add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the
-//     outside compilation node.
-// 1b. For control edges between outside compilation and another XLA
+// 1a. For control edges between outside compilation and another XLA
 //     computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName
 //     = XLA computation node name" to the outside compilation node.
-// 1c. For control edges between different outside compilations, remove the edge
-//     and add attr "kXlaControlDependenciesAttrName = src node name" to dst
-//     node.
-// 1d. For control edges between outside compilation and host computation,
+// 1b. For control edges between different outside compilations (in different
+//     XLA computations), remove the edge and add attr
+//     "kXlaControlDependenciesAttrName = src node name" to dst node.
+// 1c. For control edges between outside compilation and host computation,
 //     remove the edge and add attr "kXlaControlDependenciesAttrName = src node
 //     name" to dst node.
 // 2. For data edges between different XLA computations, if either src or dst
@@ -117,6 +129,25 @@ Status PreprocessForEncapsulation(Graph* g,
 
 // Information for XLA computation.
 struct XlaClusterInfo {
+  // Add an explicitly-defined default constructor for this class.
+  //
+  // The compiler may delete the default constructor here because
+  // host_compute_core is a const member whose type (std::map) doesn't
+  // necessarily have a user provided constructor -- while libc++ and
+  // libstdc++ 4.8 provide a user defined default constructor, libstdc++ at
+  // least >= 7.3 does not. See also c++11 [class.ctor] p5.
+  //
+  // TODO(klimek): In c++17 we'll be able to initialize host_compute_core
+  // without losing aggregate initialization, which allows us to get rid of
+  // the constructor definitions again.
+  XlaClusterInfo() {}
+  XlaClusterInfo(const string& cluster_name,
+                 const NameAttrList& func_name_attrs, Node* node,
+                 const std::map<string, int>& host_compute_core)
+      : cluster_name(cluster_name),
+        func_name_attrs(func_name_attrs),
+        node(node),
+        host_compute_core(host_compute_core) {}
   // XLA cluster name. It might be different from `func_name`.
   const string cluster_name;
   // Name and attributes of XLA computation function.
@@ -127,26 +158,53 @@ struct XlaClusterInfo {
   const std::map<string, int> host_compute_core;
 };
 
-// Postprocesses the graph for encapsulation. This function reverts what
-// `PreprocessForEncapsulation` did. It will perform the following operations in
-// order:
+// Postprocesses edges between different XLA clusters for encapsulation. This
+// function reverts what `PreprocessForEncapsulation` did. It will perform the
+// following operations in order:
 //
 // 1. Remove Placeholder nodes between outside compilation and host computation
 //     (created in `PreprocessForEncapsulation` step 3).
 // 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2.
-// 3a. Reconnect control edges between different outside compilations (marked by
-//     `PreprocessForEncapsulation` step 1c) and control edges between outside
-//     compilation and host computation (marked by `PreprocessForEncapsulation`
-//     step 1d).
-// 3b. Reconnect control edges between outside compilation and another XLA
-//     computation (marked by `PreprocessForEncapsulation` step 1b).
-// Notice that control edges marked by `PreprocessForEncapsulation` step 1a are
-// not handled here. They are handled in `RewriteOutsideCompilationSubgraphFn`.
+// 3a. Reconnect control edges between outside compilation and another XLA
+//     computation (marked by `PreprocessForEncapsulation` step 1a).
+// 3b. Reconnect control edges between different outside compilations (marked by
+//     `PreprocessForEncapsulation` step 1b).
+// 3c. Reconnect control edges between outside compilation and host computation
+//     (marked by `PreprocessForEncapsulation` step 1c).
 Status PostprocessForEncapsulation(
     Graph* g, const string& xla_computation_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters);
 
+// Preprocesses edges within the same XLA cluster. It will perform the following
+// operations in order:
+//
+// 0.  Remove edges from source node to outside compilation nodes, and edges
+//     from outside compilation nodes to sink node.
+// 1a. For edges between different outside compilation clusters, remove the edge
+//     and add attr "kXlaControlDependenciesWithinXlaClusterAttrName = src node
+//     name" to dst node.
+// 1b. For control edges between outside compilation and its XLA computation,
+//     add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the
+//     outside compilation node.
+// 2.  For data edges between different outside compilations, remove the edge
+//     and create a Placeholder node as dst node's input.
+Status PreprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name);
+
+// Postprocesses edges within the same XLA cluster. This function reverts what
+// `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the
+// following operations in order:
+//
+// 1. Remove Placeholder nodes between different outside compilations (created
+//    in `PreprocessEdgesBetweenOutsideCompilations` step 2).
+// 2a. Reconnect control edges between different outside compilations (marked by
+//     `PreprocessEdgesBetweenOutsideCompilations` step 1a).
+// Notice that control edges marked by
+// `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here.
+// They are handled in `RewriteOutsideCompilationSubgraphFn`.
+Status PostprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 7255df3112916b7abcc98ff8204efc8c02209b13..25c32cef01d7f9877a35001457539f2ad189192f 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -107,28 +107,19 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) {
   identity4_node->AddAttr("_xla", "1");
   identity4_node->AddAttr("_oc", "0");
   identity5_node->AddAttr("_xla", "1");
-  // Case 1a: control edges between outside compilation and its XLA computation.
-  g.AddControlEdge(add_node, identity0_node);
-  g.AddControlEdge(identity0_node, identity1_node);
-  // Case 1b: control edges between outside compilation and another XLA
+  // Case 1a: control edges between outside compilation and another XLA
   // computation.
   g.AddControlEdge(identity0_node, identity3_node);
   g.AddControlEdge(identity1_node, identity4_node);
-  // Case 1c: control edges between different outside compilations.
+  // Case 1b: control edges between different outside compilations.
   g.AddControlEdge(identity0_node, identity4_node);
-  // Case 1d: control edges between outside compilation and host computation.
+  // Case 1c: control edges between outside compilation and host computation.
   g.AddControlEdge(const0_node, identity0_node);
   g.AddControlEdge(identity0_node, identity2_node);
 
   TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
 
-  // Case 1a: add attr "_xla_connected_{from/to}_xla_computation = true" to the
-  // outside compilation node.
-  EXPECT_TRUE(HasNodeAttr(identity0_node->def(),
-                          kXlaConnectedFromXlaComputationAttrName));
-  EXPECT_TRUE(HasNodeAttr(identity0_node->def(),
-                          kXlaConnectedToXlaComputationAttrName));
-  // Case 1b: add attr "_xla_control_deps_{from/to} = XLA computation node name"
+  // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name"
   // to the outside compilation node.
   std::vector<string> attr;
   TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
@@ -140,13 +131,13 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) {
                           kXlaConnectedFromOtherXlaComputationAttrName, &attr));
   EXPECT_EQ(attr.size(), 1);
   EXPECT_EQ(attr[0], "0");
-  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
+  // Case 1b: add attr "_xla_control_deps = src node name" to dst node.
   attr.clear();
   TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
                           kXlaControlDependenciesAttrName, &attr));
   EXPECT_EQ(attr.size(), 1);
   EXPECT_EQ(attr[0], "identity0");
-  // Case 1d: add attr "_xla_control_deps = src node name" to dst node.
+  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
   attr.clear();
   TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
                           kXlaControlDependenciesAttrName, &attr));
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index 70b019d35fc80c975bc23ef42d61e3e36e4d0924..e3c7e2f89be9b37b51a633dabb099969c181013f 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -366,7 +366,7 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
 //    replace this node with compilation result node.
 // 3) all outside compilation graphs.
 Status ConstructHostGraph(
-    const string& xla_cluster_name,
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
     const std::vector<string>& outside_compilation_host_graphs,
     FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph) {
   host_graph->reset(new Graph(fld));
@@ -394,12 +394,12 @@ Status ConstructHostGraph(
   for (const string& host_func : outside_compilation_host_graphs) {
     VLOG(4) << "Expanding host graph " << host_func;
     FunctionBody* host_fbody = nullptr;
-    TF_RETURN_IF_ERROR(
-        FunctionDefToBodyHelper(*fld->Find(host_func), AttrSlice(), fld,
-                                [&](const string& op, const OpDef** sig) {
-                                  return fld->LookUpOpDef(op, sig);
-                                },
-                                &host_fbody));
+    TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+        *fld->Find(host_func), AttrSlice(), fld,
+        [&](const string& op, const OpDef** sig) {
+          return fld->LookUpOpDef(op, sig);
+        },
+        &host_fbody));
     std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
 
     // We use ReverseDFS() to copy nodes. Make sure all nodes are reverse
@@ -411,52 +411,53 @@ Status ConstructHostGraph(
     node_map[host_fbody->graph->source_node()] = (*host_graph)->source_node();
     node_map[host_fbody->graph->sink_node()] = (*host_graph)->sink_node();
     Status s;
-    ReverseDFS(*host_fbody->graph, /*enter=*/nullptr,
-               [&](const Node* n) {
-                 if (!s.ok()) {
-                   return;
-                 }
-
-                 Node* copy;
-                 if (node_map.find(n) != node_map.end()) {
-                   // Already copied this node.
-                   copy = node_map.at(n);
-                 } else if (IsKeyPlaceholderNode(*n)) {
-                   // Change a).
-                   copy = key_placeholder;
-                   node_map[n] = copy;
-                 } else {
-                   // Copy the node.
-                   NodeDef copy_def = n->def();
-                   // Change c).
-                   copy_def.clear_device();
-                   copy = (*host_graph)->AddNode(copy_def, &s);
-                   if (!s.ok()) {
-                     return;
-                   }
-                   node_map[n] = copy;
-                 }
-
-                 // Only handle input edges. Output edges will be added later as
-                 // its output nodes' input edges.
-                 for (auto e : n->in_edges()) {
-                   if (node_map.find(e->src()) == node_map.end()) {
-                     s = errors::Internal("Cannot find node image for ",
-                                          e->src()->DebugString());
-                     return;
-                   }
-                   (*host_graph)
-                       ->AddEdge(node_map[e->src()], e->src_output(), copy,
-                                 e->dst_input());
-                 }
-
-                 // Change b).
-                 if (copy->type_string() == "_XlaRecvAtHost" ||
-                     copy->type_string() == "_XlaSendFromHost") {
-                   (*host_graph)->AddControlEdge(copy, sequencer);
-                 }
-               },
-               NodeComparatorID());
+    ReverseDFS(
+        *host_fbody->graph, /*enter=*/nullptr,
+        [&](const Node* n) {
+          if (!s.ok()) {
+            return;
+          }
+
+          Node* copy;
+          if (node_map.find(n) != node_map.end()) {
+            // Already copied this node.
+            copy = node_map.at(n);
+          } else if (IsKeyPlaceholderNode(*n)) {
+            // Change a).
+            copy = key_placeholder;
+            node_map[n] = copy;
+          } else {
+            // Copy the node.
+            NodeDef copy_def = n->def();
+            // Change c).
+            copy_def.clear_device();
+            copy = (*host_graph)->AddNode(copy_def, &s);
+            if (!s.ok()) {
+              return;
+            }
+            node_map[n] = copy;
+          }
+
+          // Only handle input edges. Output edges will be added later as
+          // its output nodes' input edges.
+          for (auto e : n->in_edges()) {
+            if (node_map.find(e->src()) == node_map.end()) {
+              s = errors::Internal("Cannot find node image for ",
+                                   e->src()->DebugString());
+              return;
+            }
+            (*host_graph)
+                ->AddEdge(node_map[e->src()], e->src_output(), copy,
+                          e->dst_input());
+          }
+
+          // Change b).
+          if (copy->type_string() == "_XlaRecvAtHost" ||
+              copy->type_string() == "_XlaSendFromHost") {
+            (*host_graph)->AddControlEdge(copy, sequencer);
+          }
+        },
+        NodeComparatorID());
     if (!s.ok()) {
       return s;
     }
@@ -475,6 +476,10 @@ Status ConstructHostGraph(
       host_graph->get(),
       std::unordered_set<const Node*>{(*host_graph)->sink_node()});
 
+  // Postprocess edges between different outside compilations.
+  TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations(
+      host_graph->get(), outside_compilation_attr_name));
+
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_host_graph_for_",
@@ -800,6 +805,11 @@ Status ExtractOutsideCompilationForFunction(
       },
       &fbody));
   std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+
+  // Preprocess edges between different outside compilations. They will be
+  // restored in `ConstructHostGraph()`.
+  TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
+      fbody->graph, outside_compilation_attr_name));
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_for_func_before_", func_name),
@@ -838,7 +848,12 @@ Status ExtractOutsideCompilationForFunction(
         FunctionDef shape_inference_fdef = *xla_fdef;
         shape_inference_fdef.mutable_signature()->set_name(
             shape_inference_graph);
-        TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef));
+        if (fld->Find(shape_inference_graph)) {
+          TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph,
+                                                  shape_inference_fdef));
+        } else {
+          TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef));
+        }
       }
     }
   }
@@ -854,8 +869,9 @@ Status ExtractOutsideCompilationForFunction(
 
   // Construct host graph.
   if (!outside_compilation_host_graphs.empty()) {
-    TF_RETURN_IF_ERROR(ConstructHostGraph(
-        xla_cluster_name, outside_compilation_host_graphs, fld, host_graph));
+    TF_RETURN_IF_ERROR(
+        ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name,
+                           outside_compilation_host_graphs, fld, host_graph));
   }
 
   // Remove the outside compilation graphs from function library.
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index c5bd64f004ef98853955372680277e04c16bdc9e..bff956100da661b679b4557fce53671e6cef88c5 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -290,21 +290,18 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes));
   EXPECT_EQ(shapes.size(), 1);
   EXPECT_EQ(shapes[0].dim_size(), 1);
-  // Check XlaHostCompute nodes' "shape_inference_graph" attr. "0" should have a
-  // non-empty value, and "1" should have an empty value.
+  // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have
+  // empty values.
   string shape_inference_graph;
   TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph,
-            "_outside_compilation_shape_inference_cluster_0");
+  EXPECT_EQ(shape_inference_graph, "");
   TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
   EXPECT_EQ(shape_inference_graph, "");
 
   // Check `shape_inference_graphs`.
-  EXPECT_EQ(shape_inference_graphs.size(), 1);
-  EXPECT_EQ(shape_inference_graphs[0],
-            "_outside_compilation_shape_inference_cluster_0");
+  EXPECT_EQ(shape_inference_graphs.size(), 0);
 
   // Check `host_graph`: verify we have key placeholder and sequencer.
   Node *key_placeholder = nullptr, *sequencer = nullptr;
@@ -333,8 +330,8 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
       send_recv_nodes.push_back(n);
     }
   }
-  EXPECT_EQ(num_send_from_host, 2);
-  EXPECT_EQ(num_recv_at_host, 2);
+  EXPECT_EQ(num_send_from_host, 1);
+  EXPECT_EQ(num_recv_at_host, 1);
   for (Node *n : send_recv_nodes) {
     Node *input_node;
     TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node));
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98e344b3a080aa8aab27cd41564a90427bac151e
--- /dev/null
+++ b/tensorflow/compiler/jit/flags.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <mutex>  // NOLINT
+
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace {
+
+BuildXlaOpsPassFlags* build_ops_flags;
+DumpGraphFlags* dump_graph_flags;
+MarkForCompilationPassFlags* mark_for_compilation_flags;
+XlaDeviceFlags* device_flags;
+XlaOpsCommonFlags* ops_flags;
+
+std::vector<Flag>* flag_list;
+std::once_flag flags_init;
+
+void AppendDumpGraphFlagsInternal(std::vector<Flag>* flag_list) {
+  std::vector<Flag> new_flags = {
+      Flag("tf_dump_graph_prefix", &dump_graph_flags->tf_dump_graph_prefix,
+           "Path prefix to which graphs dumped during debugging should be "
+           "written."),
+  };
+  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
+}
+
+void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
+  std::vector<Flag> new_flags = {
+      Flag("tf_xla_auto_jit", &mark_for_compilation_flags->tf_xla_auto_jit,
+           "Control compilation of operators into XLA computations on CPU and "
+           "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
+           "things very likely to be improved; 2 = on for everything.  "
+           "Experimental."),
+      Flag("tf_xla_min_cluster_size",
+           &mark_for_compilation_flags->tf_xla_min_cluster_size,
+           "Minimum number of operators in an XLA compilation. Ignored for "
+           "operators placed on an XLA device or operators explicitly marked "
+           "for compilation."),
+      Flag("tf_xla_max_cluster_size",
+           &mark_for_compilation_flags->tf_xla_max_cluster_size,
+           "Maximum number of operators in an XLA compilation."),
+      Flag("tf_xla_clustering_debug",
+           &mark_for_compilation_flags->tf_xla_clustering_debug,
+           "Dump graphs during XLA compilation."),
+      Flag("tf_xla_cpu_global_jit",
+           &mark_for_compilation_flags->tf_xla_cpu_global_jit,
+           "Enables global JIT compilation for CPU via SessionOptions."),
+      Flag("tf_xla_clustering_fuel",
+           &mark_for_compilation_flags->tf_xla_clustering_fuel,
+           "Places an artificial limit on the number of ops marked as "
+           "eligible for clustering."),
+      Flag("tf_xla_fusion_only",
+           &mark_for_compilation_flags->tf_xla_fusion_only,
+           "enable fusion of element-wise operations only using XLA when "
+           "global_jit_level is ON*.")};
+  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
+}
+
+void AllocateAndParseFlags() {
+  build_ops_flags = new BuildXlaOpsPassFlags;
+  build_ops_flags->tf_xla_enable_lazy_compilation = true;
+
+  dump_graph_flags = new DumpGraphFlags;
+  dump_graph_flags->tf_dump_graph_prefix = "/tmp/";
+
+  mark_for_compilation_flags = new MarkForCompilationPassFlags;
+  mark_for_compilation_flags->tf_xla_auto_jit = 0;
+  mark_for_compilation_flags->tf_xla_min_cluster_size = 2;
+  mark_for_compilation_flags->tf_xla_max_cluster_size =
+      std::numeric_limits<int32>::max();
+  mark_for_compilation_flags->tf_xla_clustering_debug = false;
+  mark_for_compilation_flags->tf_xla_cpu_global_jit = false;
+  mark_for_compilation_flags->tf_xla_clustering_fuel =
+      std::numeric_limits<int64>::max();
+  mark_for_compilation_flags->tf_xla_fusion_only = false;
+
+  device_flags = new XlaDeviceFlags;
+  device_flags->tf_xla_compile_on_demand = false;
+
+  ops_flags = new XlaOpsCommonFlags;
+  ops_flags->tf_xla_always_defer_compilation = false;
+
+  flag_list = new std::vector<Flag>({
+      Flag("tf_xla_enable_lazy_compilation",
+           &build_ops_flags->tf_xla_enable_lazy_compilation, ""),
+
+      Flag("tf_xla_compile_on_demand", &device_flags->tf_xla_compile_on_demand,
+           "Switch a device into 'on-demand' mode, where instead of "
+           "autoclustering ops are compiled one by one just-in-time."),
+
+      Flag("tf_xla_always_defer_compilation",
+           &ops_flags->tf_xla_always_defer_compilation, ""),
+  });
+  AppendDumpGraphFlagsInternal(flag_list);
+  AppendMarkForCompilationPassFlagsInternal(flag_list);
+  xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list);
+}
+
+}  // namespace
+
+const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return *build_ops_flags;
+}
+
+DumpGraphFlags* GetDumpGraphFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return dump_graph_flags;
+}
+
+MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return mark_for_compilation_flags;
+}
+
+XlaDeviceFlags* GetXlaDeviceFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return device_flags;
+}
+
+const XlaOpsCommonFlags& GetXlaOpsCommonFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return *ops_flags;
+}
+
+void AppendMarkForCompilationPassFlags(std::vector<Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  AppendMarkForCompilationPassFlagsInternal(flag_list);
+}
+
+void AppendDumpGraphFlags(std::vector<Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  AppendDumpGraphFlagsInternal(flag_list);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/flags.h
similarity index 56%
rename from tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
rename to tensorflow/compiler/jit/flags.h
index 2affda6ab4e0fbad32a246744fa5b38aeb629c1b..5ddea588eef5270880d91623dc05893da265960a 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
-
-// Legacy flags for the XLA bridge's mark_for_compilation_pass module.
+#ifndef TENSORFLOW_COMPILER_JIT_FLAGS_H_
+#define TENSORFLOW_COMPILER_JIT_FLAGS_H_
 
 #include <vector>
 
@@ -24,16 +22,9 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// mark_for_compilation_pass module.
-void AppendMarkForCompilationPassFlags(
-    std::vector<tensorflow::Flag>* flag_list);
 
-// The values of flags associated with the XLA bridge's
-// mark_for_compilation_pass module.
-typedef struct {
+// Flags associated with the XLA bridge's mark_for_compilation_pass module.
+struct MarkForCompilationPassFlags {
   int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
                           // computations on CPU and GPU devices.  0 = use
                           // ConfigProto setting; -1 = off; 1 = on for things
@@ -55,14 +46,58 @@ typedef struct {
                             // is set to ON* and overrides its behavior. If
                             // true, enable fusion of element-wise operations
                             // only using XLA.
-} MarkForCompilationPassFlags;
+};
+
+// Flags associated with the XLA bridge's xla_device module.
+struct XlaDeviceFlags {
+  // Switch the CPU device into "on-demand" mode, where instead of
+  // autoclustering ops are compiled one by one just-in-time.
+  // Enabling this mode by a legacy flag is a temporary mechanism. When this
+  // feature is battle-tested, we will switch this to be a session option.
+  bool tf_xla_compile_on_demand;
+};
+
+// Flags common to the _Xla* ops and their kernels.
+struct XlaOpsCommonFlags {
+  // If true, _XlaCompile always refuses to compile the cluster, which means the
+  // XLA clusters always run in the TF executor.  Defaults to false.
+  bool tf_xla_always_defer_compilation;
+};
 
-// Return a pointer to the MarkForCompilationPassFlags struct;
+// Flags for the build_xla_ops pass.
+struct BuildXlaOpsPassFlags {
+  // Enables lazy compilation for TF/XLA (only when auto-clustering) if true.
+  // Defaults to true.
+  bool tf_xla_enable_lazy_compilation;
+};
+
+// Flags for the XLA bridge's dump_graph module.
+struct DumpGraphFlags {
+  // Path prefix to which graphs dumped during debugging should be written.
+  string tf_dump_graph_prefix;
+};
+
+// Return a pointer to the DumpGraphFlags struct;
 // repeated calls return the same pointer.
 // This should be called only after Flags::Parse() has returned.
+
+// Getters for flags structs defined above.  The first call to any of these
+// parses TF_XLA_FLAGS for all of them.  Those functions which return a pointer
+// always return the same pointer.
 MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
+const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
+XlaDeviceFlags* GetXlaDeviceFlags();
+const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
+DumpGraphFlags* GetDumpGraphFlags();
+
+// Appends the flag definitions associated with
+// MarkForCompilationPassFlags/DumpGraphFlags to `flag_list`.
+//
+// Has the side-effect of parsing TF_XLA_FLAGS if that hasn't happened yet.
+void AppendMarkForCompilationPassFlags(
+    std::vector<tensorflow::Flag>* flag_list);
+void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
 
-}  // namespace legacy_flags
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
+#endif  // TENSORFLOW_COMPILER_JIT_FLAGS_H_
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
index bd8719b7f1acb79e0b0cd91f2f0de0d66d8dab46..ce53f70b79d97ab087fefe542920b33f883632a2 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
@@ -18,11 +18,12 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_replace.h"
+#include "absl/types/optional.h"
 #include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/math_ops.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
@@ -34,14 +35,30 @@ limitations under the License.
 
 namespace tensorflow {
 namespace {
-Status GetTensorFromConstOp(Node* n, Tensor* out_tensor) {
-  TF_RET_CHECK(n->type_string() == "Const");
+
+// StatusOrOptional<T> instances hold
+//
+//  - A non-OK Status to indicate an error that needs to be propagated out of
+//    this pass (e.g. the Graph is malformed).
+//
+//  - A nullopt to indicate the function that created the instance failed to do
+//    what it set out to do but this is not actually an error
+//    (e.g. TryToGetTensorFromConstOp was passed a non-Const node).
+//
+//  - A T to indicate a successful operation.
+template <class T>
+using StatusOrOptional = xla::StatusOr<absl::optional<T>>;
+
+StatusOrOptional<Tensor> TryToGetTensorFromConstOp(Node* n) {
+  if (n->type_string() != "Const") {
+    return {absl::nullopt};
+  }
+
   const TensorProto* proto = nullptr;
   TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "value", &proto));
   Tensor tensor(proto->dtype());
   TF_RET_CHECK(tensor.FromProto(*proto));
-  *out_tensor = std::move(tensor);
-  return Status::OK();
+  return {tensor};
 }
 
 struct SliceInputs {
@@ -70,7 +87,7 @@ std::vector<int64> IntTensorAsVector(const Tensor& t) {
 
 // Packages up the inputs to a Slice operation into an instance of
 // `SliceInputs`.
-Status GetSliceInputs(Node* slice, SliceInputs* slice_inputs) {
+StatusOrOptional<SliceInputs> GetSliceInputs(Node* slice) {
   const int kSliceInputIndex = 0;
   const int kSliceBeginIndex = 1;
   const int kSliceSizeIndex = 2;
@@ -81,23 +98,27 @@ Status GetSliceInputs(Node* slice, SliceInputs* slice_inputs) {
   TF_RETURN_IF_ERROR(slice->input_edge(kSliceSizeIndex, &slice_size_edge));
   const Edge* slice_begin_edge;
   TF_RETURN_IF_ERROR(slice->input_edge(kSliceBeginIndex, &slice_begin_edge));
-  slice_inputs->input =
+
+  SliceInputs slice_inputs;
+  slice_inputs.input =
       Output(slice_input_edge->src(), slice_input_edge->src_output());
-  slice_inputs->begin =
+  slice_inputs.begin =
       Output(slice_begin_edge->src(), slice_begin_edge->src_output());
-  slice_inputs->size =
+  slice_inputs.size =
       Output(slice_size_edge->src(), slice_size_edge->src_output());
 
-  Tensor tf_slice_size;
-  TF_RETURN_IF_ERROR(
-      GetTensorFromConstOp(slice_inputs->size.node(), &tf_slice_size));
+  TF_ASSIGN_OR_RETURN(absl::optional<Tensor> tf_slice_size,
+                      TryToGetTensorFromConstOp(slice_inputs.size.node()));
+  if (!tf_slice_size.has_value()) {
+    return {absl::nullopt};
+  }
 
-  if (tf_slice_size.dims() != 1) {
-    return errors::Internal("Expected vector for the slice size input.");
+  if (tf_slice_size->dims() != 1) {
+    return {absl::nullopt};
   }
 
-  slice_inputs->size_as_vector = IntTensorAsVector(tf_slice_size);
-  return Status::OK();
+  slice_inputs.size_as_vector = IntTensorAsVector(*tf_slice_size);
+  return {slice_inputs};
 }
 
 // Casts `x` to a DT_INT64 if it isn't one already.
@@ -187,8 +208,12 @@ Status ComputeSliceSize(const Scope& host_scope,
     DCHECK_EQ(slice_size.back().type(), DT_INT64);
   }
 
-  *size = ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
-                      ops::Const(host_scope.WithOpName("concat_axis"), 0));
+  // Trivial ConcatV2 nodes (with exactly one input) are disallowed.
+  *size =
+      slice_size.size() == 1
+          ? slice_size[0]
+          : ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
+                        ops::Const(host_scope.WithOpName("concat_axis"), 0));
   return Status::OK();
 }
 
@@ -221,6 +246,9 @@ Status ConvertTensorFlowSliceToStaticShapedSlice(
                      .WithOpName("static_shaped_slice"),
                  slice_inputs_int64.input, slice_inputs_int64.begin, slice_size)
           .node();
+
+  TF_RETURN_IF_ERROR(main_scope.status());
+
   std::vector<string> compile_time_const_inputs;
   compile_time_const_inputs.push_back("size");
   (*result)->AddAttr(kXlaCompileTimeConstantInputsAttr,
@@ -263,10 +291,9 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs,
   return Status::OK();
 }
 
-// Returns true if `n` is a slice we can rewrite to have a static shape
-// (i.e. have the output shape only depend on the "size" input).  Fills in
-// `slice_inputs` in the process.
-bool IsRewritableSlice(Node* n, SliceInputs* slice_inputs) {
+// Return true if `n` is a slice we can rewrite to have a static shape
+// (i.e. have the output shape only depend on the "size" input).
+xla::StatusOr<bool> IsRewritableSlice(Node* n) {
   if (n->type_string() != "Slice") {
     return false;
   }
@@ -276,8 +303,9 @@ bool IsRewritableSlice(Node* n, SliceInputs* slice_inputs) {
     return false;
   }
 
-  if (!GetSliceInputs(n, slice_inputs).ok()) {
-    // Could not parse slice inputs.  E.g. the sizes input was not a constant.
+  TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
+                      GetSliceInputs(n));
+  if (!slice_inputs.has_value()) {
     return false;
   }
 
@@ -288,17 +316,20 @@ bool IsRewritableSlice(Node* n, SliceInputs* slice_inputs) {
 }
 
 Status FindAndRewriteSlices(Graph* g, bool* changed) {
-  std::vector<std::pair<Node*, SliceInputs>> slices_to_rewrite;
+  std::vector<Node*> slices_to_rewrite;
   for (Node* n : g->nodes()) {
-    SliceInputs slice_inputs;
-    if (IsRewritableSlice(n, &slice_inputs)) {
-      slices_to_rewrite.push_back({n, std::move(slice_inputs)});
+    TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n));
+    if (is_rewritable) {
+      slices_to_rewrite.push_back(n);
     }
   }
 
-  for (const auto& pair : slices_to_rewrite) {
-    TF_RETURN_IF_ERROR(RewriteSlice(g, pair.first, pair.second,
-                                    *GetXlaClusterForNode(*pair.first)));
+  for (Node* n : slices_to_rewrite) {
+    TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
+                        GetSliceInputs(n));
+    TF_RET_CHECK(slice_inputs.has_value());
+    TF_RETURN_IF_ERROR(
+        RewriteSlice(g, n, *slice_inputs, *GetXlaClusterForNode(*n)));
   }
 
   if (!slices_to_rewrite.empty()) {
@@ -314,8 +345,7 @@ Status FindAndRewriteSlices(Graph* g, bool* changed) {
 
 Status IncreaseDynamismForAutoJitPass::Run(
     const GraphOptimizationPassOptions& options) {
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   if (flags->tf_xla_clustering_debug) {
     dump_graph::DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass",
                                 **options.graph, options.flib_def);
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
index 0f6f612e967035f6af3e4aff2a499d5cedd018af..a2f1b831ad7605237e23c15cc43b337e06265553 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+using ::testing::_;
 using testing::matchers::AssignedDevice;
 using testing::matchers::Attr;
 using testing::matchers::Const;
@@ -142,6 +143,26 @@ TEST(SliceToDynamicSliceRewriteTest, Basic) {
   EXPECT_THAT(static_shaped_slice, m_dynamic_slice);
 }
 
+TEST(SliceToDynamicSliceRewriteTest, SliceFromVector) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  EXPECT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(result->nodes(), Not(Contains(NodeWith(Op("ConcatV2")))));
+}
+
 TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) {
   Scope root = Scope::NewRootScope()
                    .ExitOnError()
@@ -166,18 +187,18 @@ TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) {
                        CtrlDeps(NodeWith(Op("Placeholder"), Name("control")))));
 }
 
+int64 ToInt64(int v) { return static_cast<int64>(v); }
+
 TEST(SliceToDynamicSliceRewriteTest, Int64Indices) {
   Scope root = Scope::NewRootScope()
                    .ExitOnError()
                    .WithAssignedDevice(kDeviceName)
                    .WithXlaCluster("cluster_0");
 
-  auto to_int64 = [](int v) { return static_cast<int64>(v); };
-
   Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
   Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
   Output size =
-      ops::Const(root.WithOpName("size"), {to_int64(-1), to_int64(500)});
+      ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(500)});
   Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
 
   std::unique_ptr<Graph> result;
@@ -252,13 +273,35 @@ TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithNonConstSize) {
                                     Attr(kXlaCompileTimeConstantInputsAttr)))));
 }
 
+TEST(SliceToDynamicSliceRewriteTest, ScalarSlice) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
+  Output size = ops::Const<int64>(root.WithOpName("size"), {});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(static_shaped_slice,
+              NodeWith(Op("Slice"), Attr(kXlaCompileTimeConstantInputsAttr),
+                       Inputs(_, _, Out(NodeWith(Name(size.node()->name()))))));
+}
+
 TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
   Scope root = Scope::NewRootScope()
                    .ExitOnError()
                    .WithAssignedDevice(kDeviceName)
                    .WithXlaCluster("cluster_0");
 
-  auto to_int64 = [](int v) { return static_cast<int64>(v); };
+  auto ToInt64 = [](int v) { return static_cast<int64>(v); };
 
   Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
   Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
@@ -271,7 +314,7 @@ TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
       ops::Slice(root.WithOpName("slice"), input, begin, size_placeholder);
 
   Output size =
-      ops::Const(root.WithOpName("size"), {{to_int64(-1)}, {to_int64(500)}});
+      ops::Const(root.WithOpName("size"), {{ToInt64(-1)}, {ToInt64(500)}});
   TF_ASSERT_OK(root.graph()->UpdateEdge(size.node(), 0, slice.node(), 2));
 
   std::unique_ptr<Graph> result;
@@ -281,5 +324,82 @@ TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
               Not(Contains(NodeWith(Op("Slice"),
                                     Attr(kXlaCompileTimeConstantInputsAttr)))));
 }
+
+TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceInput) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size_a = ops::Const(root.WithOpName("size_a"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size_a);
+
+  Output size_b = ops::Const(root.WithOpName("size_a"), {-1, 200});
+  Output slice_with_slice_input = ops::Slice(
+      root.WithOpName("slice_with_slice_input"), slice, begin, size_b);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(),
+      "slice_with_slice_input/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT)
+      << "Expected DT_FLOAT, was "
+      << DataType_Name(static_shaped_slice->output_type(0));
+  EXPECT_THAT(
+      static_shaped_slice,
+      NodeWith(
+          Op("Slice"),
+          Inputs(Out(NodeWith(
+                     Op("Slice"),
+                     Name("slice/static_shaped_slice/static_shaped_slice"))),
+                 _, _)));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceBegin) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input_float =
+      ops::Placeholder(root.WithOpName("input_float"), DT_FLOAT);
+  Output input_i64 = ops::Placeholder(root.WithOpName("input_i64"), DT_INT64);
+
+  Output begin_begin =
+      ops::Placeholder(root.WithOpName("begin_begin"), DT_INT32);
+  Output begin_size = ops::Const(root.WithOpName("begin_size"), {-1});
+  Output begin =
+      ops::Slice(root.WithOpName("begin"), input_i64, begin_begin, begin_size);
+
+  Output size =
+      ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(200)});
+  Output slice_with_slice_begin = ops::Slice(
+      root.WithOpName("slice_with_slice_begin"), input_float, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(),
+      "slice_with_slice_begin/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT)
+      << "Expected DT_FLOAT, was "
+      << DataType_Name(static_shaped_slice->output_type(0));
+  EXPECT_THAT(
+      static_shaped_slice,
+      NodeWith(
+          Op("Slice"),
+          Inputs(_,
+                 Out(NodeWith(
+                     Op("Slice"),
+                     Name("begin/static_shaped_slice/static_shaped_slice"))),
+                 _)));
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
index 107d521077c3fe2ac72d113d46e2566c78c9fafb..f79bdc1e2e8d82c9144d1bb9923ad36d8541cbdb 100644
--- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
+++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
@@ -44,11 +44,8 @@ REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 26,
 REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10,
                       MarkForCompilationPass);
 
-// TODO(b/111210515): IncreaseDynamismForAutoJitPass creates slices with index
-// type DT_INT64 which do not have a kernel on GPU.
-//
-// REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
-//                       IncreaseDynamismForAutoJitPass);
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
+                      IncreaseDynamismForAutoJitPass);
 
 REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
                       PartiallyDeclusterPass);
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 830db9ebdd92608c375ad778eced833e26729325..0583774714c6db7a2fa515fc8a0d304e1898db97 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -12,10 +12,10 @@ cc_library(
     hdrs = ["xla_ops.h"],
     deps = [
         "//tensorflow/compiler/jit:common",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/jit:xla_compilation_cache",
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/jit:xla_launch_util",
-        "//tensorflow/compiler/jit/legacy_flags:xla_ops_common_flags",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index 6bcae1dcc3dcf87faa5317e0064c4c0cf80af465..ad71df5a694a5f8da94675049df1062a7edb6253 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
-#include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -39,12 +39,22 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that
+// in error case, it returns RET instead of void.
+#define OP_REQUIRES_OK_RETURN(CTX, RET, ...)                \
+  do {                                                      \
+    ::tensorflow::Status _s(__VA_ARGS__);                   \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      return RET;                                           \
+    }                                                       \
+  } while (0)
+
 namespace tensorflow {
 
 namespace {
 
-Status PlatformInfoFromContext(OpKernelConstruction* ctx,
-                               XlaPlatformInfo* result) {
+XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) {
   DeviceType device_type = ctx->device_type();
   se::Platform::Id platform_id = nullptr;
   const XlaDevice::Metadata* xla_device_metadata = nullptr;
@@ -76,16 +86,16 @@ Status PlatformInfoFromContext(OpKernelConstruction* ctx,
   }
 
   if (!device_allocator) {
-    TF_ASSIGN_OR_RETURN(se::Platform* const platform,
-                        se::MultiPlatformManager::PlatformWithId(platform_id));
+    xla::StatusOr<se::Platform*> maybe_platform =
+        se::MultiPlatformManager::PlatformWithId(platform_id);
+    OP_REQUIRES_OK_RETURN(ctx, XlaPlatformInfo(), maybe_platform.status());
+
     xla_allocator = absl::make_unique<XlaAllocator>(
-        platform, ctx->device()->GetAllocator({}));
+        maybe_platform.ValueOrDie(), ctx->device()->GetAllocator({}));
   }
 
-  *result = XlaPlatformInfo(device_type, platform_id, xla_device_metadata,
-                            std::move(xla_allocator), device_allocator);
-
-  return Status::OK();
+  return XlaPlatformInfo(device_type, platform_id, xla_device_metadata,
+                         std::move(xla_allocator), device_allocator);
 }
 
 // A closure describing how to run a compiled version of a TensorFlow function.
@@ -179,9 +189,8 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx,
     : OpKernel(ctx),
       constants_(constants),
       resources_(resources),
-      function_(function) {
-  OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_));
-}
+      function_(function),
+      platform_info_(PlatformInfoFromContext(ctx)) {}
 
 static Status BuildCompilationCache(OpKernelContext* ctx,
                                     const XlaPlatformInfo& platform_info,
@@ -277,8 +286,10 @@ static Status CompileToLocalExecutable(
   // rather than a one-element tuple.
   compile_options.always_return_tuple = false;
 
-  return cache->Compile(options, function, constant_args, *variables, ctx,
-                        compile_options,
+  std::vector<XlaCompiler::Argument> args;
+  TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments(
+      constant_args, *variables, ctx, &args));
+  return cache->Compile(options, function, args, compile_options,
                         lazy ? XlaCompilationCache::CompileMode::kLazy
                              : XlaCompilationCache::CompileMode::kStrict,
                         kernel, executable);
@@ -333,18 +344,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
 }
 
 namespace {
-
-// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that
-// in error case, it returns RET instead of void.
-#define OP_REQUIRES_OK_RETURN(CTX, RET, ...)                \
-  do {                                                      \
-    ::tensorflow::Status _s(__VA_ARGS__);                   \
-    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
-      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
-      return RET;                                           \
-    }                                                       \
-  } while (0)
-
 // Helper static functions to construct parameters for
 // XlaLocalLaunchBase constructor from OpKernelConstruction.
 std::vector<int> ConstantsVector(OpKernelConstruction* ctx) {
@@ -381,7 +380,12 @@ NameAttrList FunctionAttr(OpKernelConstruction* ctx) {
   return *func;
 }
 
-#undef OP_REQUIRES_OK_RETURN
+bool MustCompileAttr(OpKernelConstruction* ctx) {
+  bool must_compile;
+  OP_REQUIRES_OK_RETURN(ctx, false,
+                        ctx->GetAttr("must_compile", &must_compile));
+  return must_compile;
+}
 }  // namespace
 
 XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
@@ -396,10 +400,9 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx)
     : OpKernel(ctx),
       constants_(ConstantsVector(ctx)),
       resources_(ResourcesVector(ctx)),
-      function_(FunctionAttr(ctx)) {
-  OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("must_compile", &must_compile_));
-}
+      function_(FunctionAttr(ctx)),
+      platform_info_(PlatformInfoFromContext(ctx)),
+      must_compile_(MustCompileAttr(ctx)) {}
 
 void XlaCompileOp::Compute(OpKernelContext* ctx) {
   VLOG(3) << "XlaCompileOp " << def().name()
@@ -409,13 +412,30 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) {
   xla::LocalExecutable* executable;
   std::map<int, OptionalTensor> variables;
 
-  if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation) {
+  bool cannot_compile_cluster;
+  {
+    mutex_lock guard(cannot_compile_cluster_mu_);
+    cannot_compile_cluster = cannot_compile_cluster_;
+  }
+
+  if (GetXlaOpsCommonFlags().tf_xla_always_defer_compilation ||
+      cannot_compile_cluster) {
     executable = nullptr;
   } else {
-    OP_REQUIRES_OK(ctx, CompileToLocalExecutable(
-                            ctx, function_, platform_info_, resources_,
-                            constants_, /*lazy=*/!must_compile_, &client,
-                            &variables, &kernel, &executable));
+    Status status = CompileToLocalExecutable(
+        ctx, function_, platform_info_, resources_, constants_,
+        /*lazy=*/!must_compile_, &client, &variables, &kernel, &executable);
+    if (must_compile_ || status.code() != error::UNIMPLEMENTED) {
+      OP_REQUIRES_OK(ctx, status);
+    }
+
+    if (status.code() == error::UNIMPLEMENTED) {
+      LOG(WARNING) << "Compilation failed:" << status.ToString()
+                   << ".  Falling back to TF function call.";
+      executable = nullptr;
+      mutex_lock guard(cannot_compile_cluster_mu_);
+      cannot_compile_cluster_ = true;
+    }
   }
 
   AllocatorAttributes host_alloc_attrs;
@@ -452,9 +472,8 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) {
   ctx->set_output(1, compilation_successful);
 }
 
-XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-  OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_));
-}
+XlaRunOp::XlaRunOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx), platform_info_(PlatformInfoFromContext(ctx)) {}
 
 void XlaRunOp::Compute(OpKernelContext* ctx) {
   VLOG(3) << "XlaRunOp " << def().name();
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h
index ac90837e0d90943b93e2cdb01a30fa0837ba94df..7b4d4b5b4737784d4fe277d5bbe9cab79cfaf4c9 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.h
+++ b/tensorflow/compiler/jit/kernels/xla_ops.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_
 #define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_
 
+#include <atomic>
+
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_launch_util.h"
@@ -33,6 +35,7 @@ namespace tensorflow {
 class XlaPlatformInfo {
  public:
   XlaPlatformInfo() : device_type_("") {}
+  XlaPlatformInfo(XlaPlatformInfo&&) = default;
   explicit XlaPlatformInfo(const DeviceType device_type,
                            se::Platform::Id platform_id,
                            const XlaDevice::Metadata* xla_device_metadata,
@@ -110,12 +113,12 @@ class XlaLocalLaunchBase : public OpKernel {
 
  protected:
   // Indexes of compile-time constant inputs
-  std::vector<int> constants_;
+  const std::vector<int> constants_;
   // Indexes of resource inputs
-  std::vector<int> resources_;
+  const std::vector<int> resources_;
 
-  NameAttrList function_;
-  XlaPlatformInfo platform_info_;
+  const NameAttrList function_;
+  const XlaPlatformInfo platform_info_;
 };
 
 // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph
@@ -144,15 +147,23 @@ class XlaCompileOp : public OpKernel {
 
  private:
   // Indexes of compile-time constant inputs
-  std::vector<int> constants_;
+  const std::vector<int> constants_;
   // Indexes of resource inputs
-  std::vector<int> resources_;
+  const std::vector<int> resources_;
 
-  NameAttrList function_;
+  const NameAttrList function_;
 
   XlaPlatformInfo platform_info_;
 
-  bool must_compile_;
+  const bool must_compile_;
+
+  // cannot_compile_cluster_ is set to true if XLA returns an Unimplemented
+  // error when compiling the cluster this _XlaCompile is supposed to compile.
+  // If `cannot_compile_cluster_` is true then we avoid compiling this cluster
+  // on any future calls to _XlaCompile.
+  bool cannot_compile_cluster_ GUARDED_BY(cannot_compile_cluster_mu_) = false;
+
+  mutex cannot_compile_cluster_mu_;
 };
 
 class XlaRunOp : public OpKernel {
@@ -162,7 +173,7 @@ class XlaRunOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  private:
-  XlaPlatformInfo platform_info_;
+  const XlaPlatformInfo platform_info_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD
deleted file mode 100644
index 49ff9a3ddd1fc14ba59209c39e00856986deab2d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/BUILD
+++ /dev/null
@@ -1,65 +0,0 @@
-# Legacy command line flags for the XLA bridge libraries.
-
-# Please do not add more flags to this package.
-
-# The XLA bridge libraries were written in an environment that allowed
-# command-line flags to be scattered freely throughout the libraries.  This
-# model, while initially convenient, leads to a proliferation in unused command
-# line flags in tests and binaries, and serious problems in servers, where one
-# might wish parameters to be different in independent RPC calls to the same
-# routine.
-#
-# Please don't add more flags.  If you're a library author, pass options and
-# parameters explicitly through the library's interface.
-
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
-cc_library(
-    name = "mark_for_compilation_pass_flags",
-    srcs = ["mark_for_compilation_pass_flags.cc"],
-    hdrs = ["mark_for_compilation_pass_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "xla_device_flags",
-    srcs = ["xla_device_flags.cc"],
-    hdrs = ["xla_device_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "build_xla_ops_pass_flags",
-    srcs = ["build_xla_ops_pass_flags.cc"],
-    hdrs = ["build_xla_ops_pass_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "xla_ops_common_flags",
-    srcs = ["xla_ops_common_flags.cc"],
-    hdrs = ["xla_ops_common_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc
deleted file mode 100644
index 73f4dc73ed83e2d1e89ccd6c99970d46b5767104..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <mutex>  // NOLINT
-
-#include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-namespace {
-
-BuildXlaOpsPassFlags* flags;
-std::vector<Flag>* flag_list;
-std::once_flag flags_init;
-
-void AllocateAndParseFlags() {
-  flags = new BuildXlaOpsPassFlags;
-  flags->tf_xla_enable_lazy_compilation = true;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_enable_lazy_compilation",
-           &flags->tf_xla_enable_lazy_compilation, ""),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-}  // namespace
-
-const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
-  std::call_once(flags_init, &AllocateAndParseFlags);
-  return *flags;
-}
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h
deleted file mode 100644
index 9aa5cf64d6db56ae36875ca08d2ae88c73604733..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Flags for the build_xla_ops pass.
-struct BuildXlaOpsPassFlags {
-  // Enables lazy compilation for TF/XLA (only when auto-clustering) if true.
-  // Defaults to true.
-  bool tf_xla_enable_lazy_compilation;
-};
-
-// Parses the flags in BuildXlaOpsPassFlags from the TF_XLA_FLAGS environment
-// variable and returns a reference to the parsed copy.  Parses TF_XLA_FLAGS
-// only the first time this routine is called.
-const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
deleted file mode 100644
index 7277a1d1f8ad5fa045645ead839ab9efa01e89c7..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's mark_for_compilation_pass module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static MarkForCompilationPassFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new MarkForCompilationPassFlags;
-  flags->tf_xla_auto_jit = 0;
-  flags->tf_xla_min_cluster_size = 2;
-  flags->tf_xla_max_cluster_size = std::numeric_limits<int32>::max();
-  flags->tf_xla_clustering_debug = false;
-  flags->tf_xla_cpu_global_jit = false;
-  flags->tf_xla_clustering_fuel = std::numeric_limits<int64>::max();
-  flags->tf_xla_fusion_only = false;
-  flag_list = new std::vector<Flag>(
-      {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
-            "Control compilation of operators into XLA computations on CPU and "
-            "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
-            "things very likely to be improved; 2 = on for everything.  "
-            "Experimental."),
-       Flag("tf_xla_min_cluster_size", &flags->tf_xla_min_cluster_size,
-            "Minimum number of operators in an XLA compilation. Ignored for "
-            "operators placed on an XLA device or operators explicitly marked "
-            "for compilation."),
-       Flag("tf_xla_max_cluster_size", &flags->tf_xla_max_cluster_size,
-            "Maximum number of operators in an XLA compilation."),
-       Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug,
-            "Dump graphs during XLA compilation."),
-       Flag("tf_xla_cpu_global_jit", &flags->tf_xla_cpu_global_jit,
-            "Enables global JIT compilation for CPU via SessionOptions."),
-       Flag("tf_xla_clustering_fuel", &flags->tf_xla_clustering_fuel,
-            "Places an artificial limit on the number of ops marked as "
-            "eligible for clustering."),
-       Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only,
-            "enable fusion of element-wise operations only using XLA when "
-            "global_jit_level is ON*.")});
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// mark_for_compilation_pass module.
-void AppendMarkForCompilationPassFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the MarkForCompilationPassFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
deleted file mode 100644
index 1bb2fce2dbad5bffce2e33b665b7222090d0855a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's xla_device module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static XlaDeviceFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new XlaDeviceFlags;
-  flags->tf_xla_compile_on_demand = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_compile_on_demand", &flags->tf_xla_compile_on_demand,
-           "Switch a device into 'on-demand' mode, where instead of "
-           "autoclustering ops are compiled one by one just-in-time."),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Return a pointer to the XlaDeviceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-XlaDeviceFlags* GetXlaDeviceFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
deleted file mode 100644
index 27b22121ac1e089bd5d5a494e1e3fb60b05bc76d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
-
-// Legacy flags for the XLA bridge's xla_device module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// The values of flags associated with the XLA bridge's
-// xla_device module.
-typedef struct {
-  // Switch the CPU device into "on-demand" mode, where instead of
-  // autoclustering ops are compiled one by one just-in-time.
-  // Enabling this mode by a legacy flag is a temporary mechanism. When this
-  // feature is battle-tested, we will switch this to be a session option.
-  bool tf_xla_compile_on_demand;
-} XlaDeviceFlags;
-
-// Return a pointer to the XlaDeviceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-XlaDeviceFlags* GetXlaDeviceFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc
deleted file mode 100644
index ae17fdffb9b6a574449b7f3155e050b029702db7..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <mutex>  // NOLINT
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-XlaOpsCommonFlags* flags;
-std::vector<Flag>* flag_list;
-std::once_flag flags_init;
-
-void AllocateAndParseFlags() {
-  flags = new XlaOpsCommonFlags;
-  flags->tf_xla_always_defer_compilation = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_always_defer_compilation",
-           &flags->tf_xla_always_defer_compilation, ""),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-const XlaOpsCommonFlags& GetXlaOpsCommonFlags() {
-  std::call_once(flags_init, &AllocateAndParseFlags);
-  return *flags;
-}
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h
deleted file mode 100644
index 7c5c1818ef2d1dcf38c324a2c926db9c4bfa8ef5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Flags common to the _Xla* ops and their kernels.
-struct XlaOpsCommonFlags {
-  // If true, _XlaCompile always refuses to compile the cluster, which means the
-  // XLA clusters always run in the TF executor.  Defaults to false.
-  bool tf_xla_always_defer_compilation;
-};
-
-// Parses the flags in XlaOpsCommonFlags from the TF_XLA_FLAGS environment
-// variable and returns a reference to the parsed copy.  Parses TF_XLA_FLAGS
-// only the first time this routine is called.
-const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 11975a6bb07e03dc3d182beb3748eb2559de7e25..25796435a5c87af5e252981abf96833f4cda9a5e 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
@@ -61,14 +61,40 @@ struct OperationFilter {
   // seeding behavior as TensorFlow's RNG (b/34749654).  So we avoid
   // auto-clustering stateful RNG ops.
   bool allow_stateful_rng_ops;
+
+  // TODO(b/118970344): Whether ControlTrigger ops are allowed.  It is unsound
+  // to cluster ControlTrigger because of how we use deadness analysis.
+  bool allow_control_trigger;
+
+  // Whether ops with dummy implementations are allowed. We avoid
+  // auto-clustering these ops so that the user is not surprised when XLA is
+  // implicitly enabled. If the user explicitly specifies to use XLA, it is fine
+  // to resort to a dummy implementation. Currently Assert and CheckNumerics ops
+  // have dummy XLA implementations.
+  bool allow_dummy_ops;
+
+  // Whether ops that produce or consume DT_VARIANT values are allowed.  We
+  // don't auto-cluster these ops because we don't yet support live-in or
+  // live-out DT_VARIANT values.
+  bool allow_ops_producing_or_consuming_variant;
 };
 
+bool IsDummyImplOp(absl::string_view op_name) {
+  return op_name == "Assert" || op_name == "CheckNumerics";
+}
+
 bool IsStatefulRandomOp(absl::string_view op_name) {
   return op_name == "RandomUniform" || op_name == "RandomShuffle" ||
          op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" ||
          op_name == "TruncatedNormal";
 }
 
+bool OpProducesOrConsumesVariant(const Node& node) {
+  auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; };
+  return absl::c_any_of(node.input_types(), is_variant) ||
+         absl::c_any_of(node.output_types(), is_variant);
+}
+
 bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient
   // is really a kind of function call and will be handled by
@@ -225,6 +251,16 @@ bool IsCompilableCall(const NodeDef& call_def,
         IsStatefulRandomOp(node->type_string())) {
       return false;
     }
+    if (!op_filter.allow_control_trigger && node->IsControlTrigger()) {
+      return false;
+    }
+    if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) {
+      return false;
+    }
+    if (!op_filter.allow_ops_producing_or_consuming_variant &&
+        OpProducesOrConsumesVariant(*node)) {
+      return false;
+    }
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1,
                           lib_runtime)) {
@@ -406,8 +442,7 @@ Status FindCompilationCandidates(
       BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr,
                              &compile_time_const_nodes));
 
-  int64& fuel =
-      legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
+  int64& fuel = GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
 
   // Iterate over nodes in sorted order so that compiler fuel is deterministic.
   // We can't simply pass op_nodes().begin() and op_nodes().end to the
@@ -450,9 +485,15 @@ Status FindCompilationCandidates(
         XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration));
     DeviceType jit_device_type(registration->compilation_device_name);
 
+    bool always_auto_cluster = registration->autoclustering_policy ==
+                               XlaOpRegistry::AutoclusteringPolicy::kAlways;
+
     OperationFilter op_filter;
     op_filter.allow_resource_ops = registration->compile_resource_ops;
-    op_filter.allow_stateful_rng_ops = registration->requires_compilation;
+    op_filter.allow_stateful_rng_ops = always_auto_cluster;
+    op_filter.allow_control_trigger = always_auto_cluster;
+    op_filter.allow_dummy_ops = always_auto_cluster;
+    op_filter.allow_ops_producing_or_consuming_variant = always_auto_cluster;
 
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, op_filter, 0,
@@ -467,6 +508,21 @@ Status FindCompilationCandidates(
       VLOG(2) << "Rejecting " << node->name() << ": stateful random operation";
       continue;
     }
+    if (!op_filter.allow_control_trigger && node->IsControlTrigger()) {
+      VLOG(2) << "Rejecting " << node->name() << ": is a control trigger op";
+      continue;
+    }
+    if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) {
+      VLOG(2) << "Rejecting " << node->name() << ": dummy op ("
+              << node->type_string() << ")";
+      continue;
+    }
+    if (!op_filter.allow_ops_producing_or_consuming_variant &&
+        OpProducesOrConsumesVariant(*node)) {
+      VLOG(2) << "Rejecting " << node->name()
+              << ": produces or consumes DT_VARIANT";
+      continue;
+    }
 
     if (!op_filter.allow_resource_ops &&
         (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) {
@@ -570,8 +626,7 @@ OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
     // To set compilation to be on by default, change the following line.
     global_jit_level = OptimizerOptions::OFF;
   }
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   if (flags->tf_xla_auto_jit == -1 ||
       (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) {
     // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides
@@ -597,11 +652,15 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) {
                                             &registration));
   DeviceType jit_device_type(registration->compilation_device_name);
 
-  // We can always *compile* resource operations and stateful RNGs, even if we
-  // are sometimes unable to auto-cluster them.
+  // We can always *compile* resource operations, stateful RNGs and dummy ops,
+  // even if we are sometimes unable to auto-cluster them.
   OperationFilter op_filter;
   op_filter.allow_resource_ops = true;
   op_filter.allow_stateful_rng_ops = true;
+  op_filter.allow_control_trigger = true;
+  op_filter.allow_dummy_ops = true;
+  op_filter.allow_ops_producing_or_consuming_variant = true;
+
   return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr);
 }
 
@@ -611,12 +670,9 @@ Status MarkForCompilationPass::Run(
   // device ahead of time.
   OptimizerOptions::GlobalJitLevel global_jit_level =
       GetGlobalJitLevel(options);
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
-  bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   bool fusion_only = flags->tf_xla_fusion_only;
 
-  VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
   VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
   VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
@@ -635,9 +691,6 @@ Status MarkForCompilationPass::Run(
       return false;
     }
 
-    // If this device requires a JIT, we must say yes.
-    if (registration->requires_compilation) return true;
-
     // If there is a _XlaCompile annotation, use its value.
     bool compile = false;
     Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
@@ -674,18 +727,21 @@ Status MarkForCompilationPass::Run(
       return false;
     }
 
-    // Otherwise use the value of global_jit_level.
-    // Ignore enable_jit_by_default if global jit compilation for CPU
-    // is explicitly requested via tf_xla_cpu_global_jit flag
-    bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU;
+    // Otherwise use the value of global_jit_level and the device's
+    // autoclustering policy.
     bool should_compile =
-        (ignore_registration || registration->enable_jit_by_default) &&
-        global_jit_level != OptimizerOptions::OFF;
+        registration->autoclustering_policy ==
+            XlaOpRegistry::AutoclusteringPolicy::kAlways ||
+        (registration->autoclustering_policy ==
+             XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally &&
+         global_jit_level != OptimizerOptions::OFF);
     if (!should_compile) {
       if (global_jit_level == OptimizerOptions::OFF) {
         VLOG(2) << "Rejecting " << node->name() << ": global jit disabled.";
       } else {
-        VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled.";
+        VLOG(2)
+            << "Rejecting " << node->name()
+            << ": autoclustering for device only when requested explicitly.";
       }
     }
     return should_compile;
@@ -915,8 +971,7 @@ Status MarkForCompilationPass::RunImpl(
 
   OptimizerOptions::GlobalJitLevel global_jit_level =
       GetGlobalJitLevel(options);
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
 
   // Repeatedly contract edges between clusters that are on the same device,
   // provided the contraction would not create a cycle.
@@ -1073,12 +1128,10 @@ Status MarkForCompilationPass::RunImpl(
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
     // Compile if this is a cluster of >= min_cluster_size compilable operators.
-    // Also, always compile if the operator is placed on a device that requires
-    // compilation, or if it contains at least one op that is marked for
+    // Also, always compile if it contains at least one op that is marked for
     // compilation that is not an Identity op.
     if (effective_cluster_sizes[cluster] >= min_cluster_size ||
-        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) ||
-        registration->requires_compilation) {
+        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation)) {
       string& name = cluster_names[cluster];
 
       if (name.empty()) {
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index ead1cf4fd5faff649e8518aaeb95935ccef4ca52..bf2c5508ea9e987e80093f4c2e15d3ff5191126f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/list_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -817,14 +818,10 @@ TEST(XlaCompilationTest, ClusterControlTrigger) {
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
 
-  ASSERT_FALSE(clusters.empty());
-  string cluster_name = clusters.begin()->second;
-
-  // ctrl_trigger_a has inputs with mismatching deadness so it won't be
-  // clustered.  ctrl_trigger_b is okay to cluster.
-  std::unordered_map<string, string> expected_clusters(
-      {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}});
-  EXPECT_EQ(clusters, expected_clusters);
+  // TODO(b/118970344): ctrl_trigger_a has inputs with mismatching deadness so
+  // it won't be clustered.  ctrl_trigger_b is okay to cluster but we don't
+  // cluster it because of b/118970344.
+  EXPECT_TRUE(clusters.empty());
 }
 
 TEST(XlaCompilationTest, RandomShape) {
@@ -923,9 +920,8 @@ TEST(XlaCompilationTest, RandomShapeOnXlaDevice) {
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
-  EXPECT_NE(clusters["test/shape_rng"], "");
-  EXPECT_NE(clusters["test/reshape"], "");
-  EXPECT_NE(clusters["test/shape_rng"], clusters["test/reshape"]);
+  EXPECT_EQ(clusters["test/shape_rng"], "");
+  EXPECT_EQ(clusters["test/reshape"], "");
 }
 
 TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) {
@@ -1088,7 +1084,7 @@ TEST(XlaCompilationTest, ClusterStatefulRandomOpOnXlaDevice) {
   EXPECT_NE(clusters["test/c"], "");
 }
 
-TEST(XlaCompilationTest, DontAutoclusterStatefulRandomOp) {
+TEST(XlaCompilationTest, DontAutoClusterStatefulRandomOp) {
   Scope root = Scope::NewRootScope().ExitOnError();
   Output shape = ops::Const(root.WithOpName("test/shape_shape"), {200, 200});
   Output a = ops::RandomUniform(root.WithOpName("test/a"), shape, DT_FLOAT);
@@ -1104,5 +1100,128 @@ TEST(XlaCompilationTest, DontAutoclusterStatefulRandomOp) {
   EXPECT_EQ(clusters["test/a"], "");
   EXPECT_EQ(clusters["test/b"], "");
 }
+
+TEST(XlaCompilationTest, ClusterDummyOpsOnXlaDevice) {
+  absl::string_view xla_cpu_device =
+      "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+  Output check =
+      ops::CheckNumerics(root.WithOpName("test/check"), a, "test/check");
+  Output ge = ops::GreaterEqual(root.WithOpName("test/greaterequal"), check, b);
+  Operation assert = ops::Assert(root.WithOpName("test/assert"), ge, {a, b});
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  for (Node* n : graph->nodes()) {
+    if (absl::StartsWith(n->name(), /*prefix=*/"test/")) {
+      n->set_assigned_device_name(string(xla_cpu_device));
+    }
+  }
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["test/check"], "");
+  EXPECT_NE(clusters["test/greaterequal"], "");
+  EXPECT_NE(clusters["test/assert"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterDummyOps) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+  Output check =
+      ops::CheckNumerics(root.WithOpName("test/check"), a, "test/check");
+  Output ge = ops::GreaterEqual(root.WithOpName("test/greaterequal"), check, b);
+  Operation assert = ops::Assert(root.WithOpName("test/assert"), ge, {a, b});
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/assert"], "");
+  EXPECT_EQ(clusters["test/check"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64);
+
+  Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32);
+  Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32);
+
+  Output tensor_list_reserve = ops::TensorListReserve(
+      root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/tensor_list_reserve"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output dummy_input =
+      ops::Placeholder(root.WithOpName("test/dummy_input"), DT_INT64);
+  Output variant_input =
+      ops::Placeholder(root.WithOpName("test/variant_input"), DT_VARIANT);
+
+  // Create one more node so that we don't avoid creating a cluster solely
+  // because it would be trivial.
+  Output dummy_cast =
+      ops::Cast(root.WithOpName("test/dummy_cast"), dummy_input, DT_INT32);
+
+  Output tensor_list_element_shape = ops::TensorListElementShape(
+      root.WithOpName("test/tensor_list_element_shape"), variant_input,
+      DT_INT32);
+
+  root.graph()->AddControlEdge(dummy_cast.node(),
+                               tensor_list_element_shape.node());
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/tensor_list_element_shape"], "");
+}
+
+TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64);
+
+  Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32);
+  Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32);
+
+  Output tensor_list_reserve = ops::TensorListReserve(
+      root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+  for (Node* n : graph->nodes()) {
+    if (absl::StartsWith(n->name(), /*prefix=*/"test/")) {
+      n->set_assigned_device_name(xla_cpu_device);
+    }
+  }
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["test/tensor_list_reserve"], "");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index 5b9610322336acbcede0bef0538043b8ff917c16..42ea3926e16ae791dbe1bede3b8742383db7667c 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -26,6 +26,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace {
+
+bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
+
+namespace reduce_device_to_host_copies {
 Status FindNodesToDecluster(const Graph& graph,
                             absl::flat_hash_set<Node*>* result,
                             absl::Span<Node* const> post_order) {
@@ -133,11 +137,13 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) {
     graph->RemoveEdge(out_edge_to_clone);
   }
 
+  if (n->out_edges().empty()) {
+    graph->RemoveNode(n);
+  }
+
   return Status::OK();
 }
 
-bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
-
 // Clones nodes to outside their cluster to avoid device-to-host copies.  For
 // instance, converts this:
 //
@@ -164,7 +170,7 @@ bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
 // where the ===> arrow has a hostmem source and destination and would entail a
 // device to host copy if the source and destination were not in the same XLA
 // cluster.
-Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph) {
   // When deciding whether to decluster a particular node, we base our decision
   // on if we've decided that some of its consumers have to be declustered too.
   // Iterating the graph in post-order guarantees that consumers have been
@@ -191,6 +197,10 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
     }
   }
 
+  // Recompute post order since PartiallyDeclusterNode may have deleted nodes.
+  post_order.clear();
+  GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(),
+               /*edge_filter=*/NotBackedge);
   nodes_to_partially_decluster.clear();
   TF_RETURN_IF_ERROR(
       FindNodesToDecluster(*graph, &nodes_to_partially_decluster, post_order));
@@ -198,7 +208,9 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
 
   return Status::OK();
 }
+}  // namespace reduce_device_to_host_copies
 
+namespace reduce_recompilation {
 bool IsIntraClusterEdge(const Edge& edge) {
   absl::optional<absl::string_view> src_cluster_name =
       GetXlaClusterForNode(*edge.src());
@@ -210,7 +222,8 @@ bool IsIntraClusterEdge(const Edge& edge) {
 bool IsMustCompileDevice(const DeviceType& device_type) {
   const XlaOpRegistry::DeviceRegistration* registration;
   if (XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
-    return registration->requires_compilation;
+    return registration->autoclustering_policy ==
+           XlaOpRegistry::AutoclusteringPolicy::kAlways;
   }
 
   return false;
@@ -260,7 +273,7 @@ Status MustCompileNode(const Node* n, bool* must_compile) {
 // regress performance in any significant manner.  We will have to revisit this
 // algorith with a more complex cost model if this assumption turns out to be
 // incorrect.
-Status DeclusterNodesToReduceRecompilations(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph) {
   std::vector<bool> compile_time_const_nodes(graph->num_node_ids());
   TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
       *graph, nullptr, &compile_time_const_nodes, IsIntraClusterEdge));
@@ -313,7 +326,7 @@ Status DeclusterNodesToReduceRecompilations(Graph* graph) {
 
   return Status::OK();
 }
-
+}  // namespace reduce_recompilation
 }  // namespace
 
 Status PartiallyDeclusterPass::Run(
@@ -325,8 +338,9 @@ Status PartiallyDeclusterPass::Run(
 
   Graph* graph = options.graph->get();
 
-  TF_RETURN_IF_ERROR(PartiallyDeclusterToRemoveDeviceToHostCopies(graph));
-  TF_RETURN_IF_ERROR(DeclusterNodesToReduceRecompilations(graph));
+  TF_RETURN_IF_ERROR(
+      reduce_device_to_host_copies::PartiallyDeclusterGraph(graph));
+  TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(graph));
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 74d5ef57184197ad6e9e5048722e84863756a3f5..1fc5da5071f7aa6f6dd6636aacd60e33c12431a6 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -437,5 +437,32 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterNonTensorFlowOps) {
   EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0");
 }
 
+TEST(PartiallyDeclusterPassTest, EliminatedUnusedNodes) {
+  const char* const kClusteredProducer0Name = "ClusteredProducer0";
+  const char* const kClusteredProducer1Name = "ClusteredProducer1";
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer_0 =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName(kClusteredProducer0Name));
+    Node* clustered_producer_1 =
+        ops::BinaryOp("FakeBinary", clustered_producer_0, input,
+                      builder.opts().WithName(kClusteredProducer1Name));
+    ops::BinaryOp("FakeBinary", clustered_producer_1, input,
+                  builder.opts().WithName("UnclusteredConsumer"));
+    clustered_producer_0->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_producer_1->AddAttr(kXlaClusterAttr, "cluster_0");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer0Name), nullptr);
+  EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer1Name), nullptr);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
deleted file mode 100644
index 7c8c04152d2f3a0fd46711df24756b7e68b967ea..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/producer_consumer_queue.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
-#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
-
-#include <deque>
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-
-// A thread-safe, first-in-first-out queue.
-template <typename T>
-class ProducerConsumerQueue {
- public:
-  ProducerConsumerQueue()
-      : capacity_(std::numeric_limits<std::size_t>::max()) {}
-  ~ProducerConsumerQueue() = default;
-
-  // Wait until the queue is non-full, then append a copy of v.
-  void Put(const T &v);
-
-  // Wait until the queue is non-empty, then remove and return the head value.
-  T Get();
-
-  // If the queue is non-empty, remove the head value, placing it in *pv, and
-  // return true; otherwise return false.
-  bool TryGet(T *pv);
-
-  // Set the capacity of the queue; the queue is full whenever count() >=
-  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
-  void set_capacity(std::size_t size);
-
-  // Return the capacity of the queue.
-  std::size_t capacity() const;
-
-  // Return the number of elements in the queue.
-  std::size_t count() const;
-
-  // Implementation details follow.  Clients should ignore.
- private:
-  mutable tensorflow::mutex mu_;  // protects all fields below
-  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
-  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
-  std::size_t capacity_ GUARDED_BY(mu_);
-  std::deque<T> queue_ GUARDED_BY(mu_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
-};
-
-// ------------------------------------------------------
-// Implementation details follow.  Clients should ignore.
-
-// Wait until the queue is non-full, then append a copy of v.
-template <typename T>
-void ProducerConsumerQueue<T>::Put(const T &v) {
-  mutex_lock lock(mu_);
-  while (queue_.size() >= capacity_) {
-    non_full_.wait(lock);
-  }
-  queue_.push_back(v);
-  non_empty_.notify_one();
-}
-
-// Wait until the queue is non-empty, then remove and return the head value.
-template <typename T>
-T ProducerConsumerQueue<T>::Get() {
-  mutex_lock lock(mu_);
-  while (queue_.empty()) {
-    non_empty_.wait(lock);
-  }
-  non_full_.notify_one();
-  T result_value = queue_.front();
-  queue_.pop_front();
-  return result_value;
-}
-
-// If the queue is non-empty, remove the head value, placing it in *pv, and
-// return true; otherwise return false.
-template <typename T>
-bool ProducerConsumerQueue<T>::TryGet(T *pv) {
-  mutex_lock lock(mu_);
-  bool got_element = !queue_.empty();
-  if (got_element) {
-    non_full_.notify_one();
-    *pv = queue_.front();
-    queue_.pop_front();
-  }
-  return got_element;
-}
-
-// Set the capacity of the queue; the queue is full whenever count() >=
-// capacity().  The initial value is the maximum size_t.  Requires size > 0.
-template <typename T>
-void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
-  mutex_lock lock(mu_);
-  CHECK_NE(size, 0);
-  capacity_ = size;
-  non_full_.notify_all();
-}
-
-// Return the capacity of the queue.
-template <typename T>
-std::size_t ProducerConsumerQueue<T>::capacity() const {
-  mutex_lock lock(mu_);
-  std::size_t max_elements = capacity_;
-  return max_elements;
-}
-
-// Return the number of elements in the queue.
-template <typename T>
-std::size_t ProducerConsumerQueue<T>::count() const {
-  mutex_lock lock(mu_);
-  std::size_t num_elements = queue_.size();
-  return num_elements;
-}
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
deleted file mode 100644
index f61260c6e52756ee039829afdc7452f5f760c221..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/producer_consumer_queue_test.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/producer_consumer_queue.h"
-
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace {
-
-typedef ProducerConsumerQueue<int> IntQueue;
-
-// Insert integers between low inclusive and high exclusive into q.
-void PushRange(IntQueue *q, int low, int high) {
-  while (low != high) {
-    q->Put(low);
-    VLOG(2) << "Pushing " << low;
-    ++low;
-  }
-}
-
-// Push the numbers between 0 and 999 inclusive from several threads in the
-// pool.
-void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
-  VLOG(1) << "Adding 20-36";
-  pool->Schedule([queue] { PushRange(queue, 20, 36); });
-  VLOG(1) << "Adding 7-20";
-  pool->Schedule([queue] { PushRange(queue, 7, 20); });
-  VLOG(1) << "Adding 36-501";
-  pool->Schedule([queue] { PushRange(queue, 36, 501); });
-  VLOG(1) << "Adding 501-1000";
-  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
-  VLOG(1) << "Adding 0-5";
-  pool->Schedule([queue] { PushRange(queue, 0, 5); });
-  VLOG(1) << "Adding 5-7";
-  pool->Schedule([queue] { PushRange(queue, 5, 7); });
-}
-
-// Pop elements from queue using Get().  Make sure that exactly <high> elements
-// were present and their values are all integers between 0 and high-1
-// inclusive.
-void GetRange(IntQueue *queue, int high) {
-  VLOG(1) << "Testing Wait";
-  std::vector<int> results;
-  for (int i = 0; i != high; ++i) {
-    int r = queue->Get();
-    VLOG(2) << "Waited and got " << r;
-    results.push_back(r);
-  }
-  CHECK_EQ(queue->count(), 0);
-  std::sort(results.begin(), results.end());
-  for (int i = 0; i != high; ++i) {
-    CHECK(results[i] == i);
-  }
-}
-
-// Pop elements from queue using TryGet().  Make sure that exactly <high>
-// elements were present and their values are all integers between 0 and high-1
-// inclusive.
-void TryGetRange(IntQueue *queue, int high) {
-  std::vector<int> results;
-  // Give up if we don't get all the elements back from the queue
-  // in 10 seconds.
-  int timeout = 10;
-  int r;
-  for (int i = 0; i != high; ++i) {
-    while (!queue->TryGet(&r)) {
-      if (!timeout--) {
-        LOG(FATAL) << "Can't find all elements in the queue";
-      }
-      VLOG(1) << "Sleeping for a second...";
-      sleep(1);
-    }
-    VLOG(2) << "Popped " << r;
-    results.push_back(r);
-  }
-  CHECK_EQ(queue->count(), 0);
-  CHECK(!queue->TryGet(&r));
-  std::sort(results.begin(), results.end());
-  for (int i = 0; i != high; ++i) {
-    CHECK_EQ(i, results[i]);
-  }
-}
-
-const int kNumThreads = 15;
-
-TEST(ProducerConsumerQueue, GetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    PushRanges(&queue, &pool);
-  }
-  GetRange(&queue, 1000);
-}
-
-TEST(ProducerConsumerQueue, TryGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    PushRanges(&queue, &pool);
-  }
-  TryGetRange(&queue, 1000);
-}
-
-TEST(ProducerConsumerQueue, ParallelGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    pool.Schedule([&queue] { GetRange(&queue, 1000); });
-    PushRanges(&queue, &pool);
-  }
-}
-
-TEST(ProducerConsumerQueue, ParallelTryGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
-    PushRanges(&queue, &pool);
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 4a5ea9e0a5f8cf79478069931da598099ae4e716..3df5479a55e841380ca7b8cdd0add9fd17487091 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <numeric>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -65,14 +66,14 @@ string XlaCompilationCache::DebugString() {
 
 // Compute a string signature which encodes the shapes of the
 // arguments in the supplied list.
-string XlaCompilationCache::SignatureDebugString(const Signature& sig) {
-  string result = sig.name;
-  for (const auto& a : sig.arg_types) {
+string XlaCompilationCache::Signature::HumanString() const {
+  string result = name;
+  for (const auto& a : arg_types) {
     absl::StrAppend(&result, ",", DataTypeString(a.first),
                     a.second.DebugString());
   }
 
-  for (const auto& v : sig.arg_values) {
+  for (const auto& v : arg_values) {
     absl::StrAppend(&result, "; ", v.DebugString());
   }
   return result;
@@ -84,7 +85,9 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
 
   if (arg_values.size() != other.arg_values.size()) return false;
   for (int i = 0; i < arg_values.size(); ++i) {
-    if (arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) {
+    if (arg_values[i].dtype() != other.arg_values[i].dtype() ||
+        arg_values[i].shape() != other.arg_values[i].shape() ||
+        arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) {
       return false;
     }
   }
@@ -108,96 +111,30 @@ uint64 XlaCompilationCache::Signature::Hash::operator()(
   return h;
 }
 
-Status XlaCompilationCache::BuildSignature(
-    const NameAttrList& function, const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    Signature* signature) {
-  signature->name = Canonicalize(function.name(), AttrSlice(&function.attr()));
-  signature->arg_values.reserve(constant_args.size());
-
-  signature->arg_types.reserve(ctx->num_inputs() - constant_args.size());
-
-  for (int i = 0; i < ctx->num_inputs(); ++i) {
-    if (constant_args.count(i) > 0) {
-      // Use the values of compile time constants in the signature.
-      signature->arg_values.push_back(constant_args.at(i));
-    } else if (variable_args.count(i) > 0) {
-      const OptionalTensor& variable = variable_args.at(i);
-      if (variable.present) {
-        signature->arg_types.emplace_back(variable.value.dtype(),
-                                          variable.value.shape());
-      } else {
-        signature->arg_types.emplace_back(DT_INVALID, TensorShape());
-      }
-    } else {
-      signature->arg_types.emplace_back(ctx->input_dtype(i),
-                                        ctx->input(i).shape());
-    }
-  }
-  return Status::OK();
-}
-
-namespace {
-
-// Builds a XlaCompiler::Argument vector from the arguments to the XlaLaunch op.
-Status BuildArguments(const std::map<int, Tensor>& constant_args,
-                      const std::map<int, OptionalTensor>& variable_args,
-                      OpKernelContext* ctx,
-                      std::vector<XlaCompiler::Argument>* args) {
-  args->resize(ctx->num_inputs());
-
-  for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) {
-    XlaCompiler::Argument& arg = (*args)[input_num];
-    if (constant_args.count(input_num) > 0) {
-      // Handles compile-time constants.
-      const Tensor& input = constant_args.at(input_num);
-      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
-      arg.kind = XlaCompiler::Argument::kConstant;
-      arg.type = input.dtype();
-      arg.shape = input.shape();
-      arg.constant_value = input;
-    } else if (variable_args.count(input_num) == 0) {
-      // Handles the non-constant arguments.
-      const Tensor& input = ctx->input(input_num);
-      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
-      if (input.NumElements() > 0) {
-        arg.kind = XlaCompiler::Argument::kParameter;
-      } else {
-        arg.kind = XlaCompiler::Argument::kConstant;
-        arg.constant_value = input;
-      }
-      arg.type = input.dtype();
-      arg.shape = input.shape();
-    } else {
-      // Handles resource variables.
-      const Tensor& input = ctx->input(input_num);
-      TF_RET_CHECK(input.dtype() == DT_RESOURCE);
-      const OptionalTensor& variable = variable_args.at(input_num);
-      arg.name = variable.name;
-      arg.kind = XlaCompiler::Argument::kResource;
-      arg.resource_kind = XlaResource::kVariable;
-      if (variable.present) {
-        const Tensor& value = variable.value;
-        arg.type = value.dtype();
-        arg.shape = value.shape();
-        arg.initialized = true;
-      } else {
-        // The values of uninitialized variables are not passed as inputs, since
-        // they are meaningless. However, it is legal to assign to a resource
-        // variable for the first time inside the XLA computation, so we do
-        // permit uninitialized variables.
-        arg.initialized = false;
-        arg.type = DT_INVALID;
-        arg.shape = TensorShape();
-      }
+xla::StatusOr<XlaCompilationCache::Signature>
+XlaCompilationCache::BuildSignature(
+    const NameAttrList& function,
+    absl::Span<const XlaCompiler::Argument> args) {
+  Signature signature;
+  signature.name = Canonicalize(function.name(), AttrSlice(&function.attr()));
+  for (const XlaCompiler::Argument& arg : args) {
+    switch (arg.kind) {
+      case XlaCompiler::Argument::kConstant:
+        signature.arg_values.push_back(arg.constant_value);
+        break;
+      case XlaCompiler::Argument::kParameter:
+      case XlaCompiler::Argument::kResource:
+        signature.arg_types.emplace_back(arg.type, arg.shape);
+        break;
+      default:
+        return errors::InvalidArgument(
+            "Unhandled argument kind in XlaCompilationCache: ",
+            arg.HumanString());
     }
   }
-
-  return Status::OK();
+  return std::move(signature);
 }
 
-}  // namespace
-
 Status XlaCompilationCache::BuildExecutable(
     const XlaCompiler::Options& options,
     const XlaCompiler::CompilationResult& result,
@@ -227,25 +164,38 @@ Status XlaCompilationCache::BuildExecutable(
 
 Status XlaCompilationCache::Compile(
     const XlaCompiler::Options& options, const NameAttrList& function,
-    const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    absl::Span<const XlaCompiler::Argument> args,
     const XlaCompiler::CompileOptions& compile_options,
     CompileMode compile_mode,
     const XlaCompiler::CompilationResult** out_compilation_result,
     xla::LocalExecutable** out_executable) {
-  // Set the compile threshold to 1 to implement CompileMode::kStrict.
-  int64 compile_threshold =
-      compile_mode == CompileMode::kLazy ? kDefaultCompilationThreshold : 1;
-  return CompileImpl(options, function, constant_args, variable_args, ctx,
-                     compile_options, /*compile_single_op=*/false,
+  absl::optional<int64> compile_threshold;
+  if (compile_mode == CompileMode::kLazy) {
+    compile_threshold = kDefaultCompilationThreshold;
+  }
+  auto compile_fn = [&](XlaCompiler* compiler,
+                        XlaCompiler::CompilationResult* result) {
+    return compiler->CompileFunction(compile_options, function, args, result);
+  };
+  return CompileImpl(options, function, args, compile_fn,
                      /*compile_threshold=*/compile_threshold,
                      out_compilation_result, out_executable);
 }
 
+static bool IsMegamorphic(int64 compile_count, int64 execution_count) {
+  const int64 kCompileThreshold = 10;
+  const int64 kMinExecutionsPerCompile = 50;
+
+  // This heuristic is trying to capture the following property: have we sunk a
+  // certain minimum amount of compile time into the cluster that didn't quite
+  // "pay off"?
+  return compile_count > kCompileThreshold &&
+         execution_count < kMinExecutionsPerCompile * compile_count;
+}
+
 Status XlaCompilationCache::CompileSingleOp(
     const XlaCompiler::Options& options,
-    const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    absl::Span<const XlaCompiler::Argument> args, OpKernelContext* ctx,
     const XlaCompiler::CompileOptions& compile_options,
     const XlaCompiler::CompilationResult** out_compilation_result,
     xla::LocalExecutable** out_executable) {
@@ -253,54 +203,41 @@ Status XlaCompilationCache::CompileSingleOp(
   NameAttrList name;
   name.set_name(def.op());
   *name.mutable_attr() = def.attr();
-  return CompileImpl(options, name, constant_args, variable_args, ctx,
-                     compile_options,
-                     /*compile_single_op=*/true, /*compile_threshold=*/1,
+  auto compile_op = [&](XlaCompiler* compiler,
+                        XlaCompiler::CompilationResult* result) {
+    std::vector<DataType> result_dtypes(ctx->num_outputs());
+    for (int i = 0; i < result_dtypes.size(); ++i) {
+      result_dtypes[i] = ctx->expected_output_dtype(i);
+    }
+    return compiler->CompileSingleOp(compile_options, ctx->op_kernel().def(),
+                                     args, result_dtypes, result);
+  };
+  return CompileImpl(options, name, args, compile_op,
+                     /*compile_threshold=*/absl::nullopt,
                      out_compilation_result, out_executable);
 }
 
 Status XlaCompilationCache::CompileImpl(
     const XlaCompiler::Options& options, const NameAttrList& function,
-    const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    const XlaCompiler::CompileOptions& compile_options, bool compile_single_op,
-    int64 compile_threshold,
+    absl::Span<const XlaCompiler::Argument> args,
+    const std::function<Status(XlaCompiler* compiler,
+                               XlaCompiler::CompilationResult*)>& compile_fn,
+    absl::optional<int64> compile_threshold,
     const XlaCompiler::CompilationResult** out_compilation_result,
     xla::LocalExecutable** out_executable) {
   DCHECK_NE(out_executable, nullptr);
   VLOG(2) << "XlaCompilationCache::Compile " << DebugString();
 
   if (VLOG_IS_ON(2)) {
-    VLOG(2) << "num_inputs=" << ctx->num_inputs()
-            << " num_constant_args=" << constant_args.size()
-            << " num_variable_args=" << variable_args.size();
-    for (int i = 0; i < ctx->num_inputs(); i++) {
-      TensorShape shape = ctx->input(i).shape();
-      VLOG(2) << i << ": dtype=" << DataTypeString(ctx->input_dtype(i))
-              << " present=" << ctx->has_input(i)
-              << " shape=" << shape.DebugString();
-    }
-    for (auto& iterator : variable_args) {
-      const OptionalTensor& variable = iterator.second;
-      VLOG(2) << "variable present=" << variable.present
-              << " type=" << DataTypeString(variable.value.dtype())
-              << " shape=" << variable.value.shape().DebugString()
-              << " TF arg= " << iterator.first;
-    }
-    VLOG(2) << "num_outputs = " << ctx->num_outputs();
-    for (int i = 0; i < ctx->num_outputs(); i++) {
-      VLOG(2) << i << ": dtype=" << ctx->expected_output_dtype(i);
+    VLOG(2) << "num_inputs=" << args.size();
+    for (int i = 0; i < args.size(); i++) {
+      VLOG(2) << i << ": " << args[i].HumanString();
     }
   }
 
-  TF_RET_CHECK(constant_args.size() + variable_args.size() <=
-               ctx->num_inputs());
-
-  Signature signature;
-  TF_RETURN_IF_ERROR(
-      BuildSignature(function, constant_args, variable_args, ctx, &signature));
+  TF_ASSIGN_OR_RETURN(Signature signature, BuildSignature(function, args));
+  VLOG(2) << "Signature: " << signature.HumanString();
 
-  VLOG(2) << "Signature: " << SignatureDebugString(signature);
   // The outer lock protects the existence of the cache entry. It does not
   // protect the contents of the cache entry.
   Entry* entry;
@@ -319,25 +256,67 @@ Status XlaCompilationCache::CompileImpl(
   // (since they get the benefit of XLA right away without waiting for warmup)
   // and doesn't hurt much for dynamically shaped TensorFlow graphs (we "pay" at
   // most one cluster-compilation's worth of compile time).
-  bool is_first_execution = [&] {
+  bool is_first_execution;
+
+  // We avoid compiling clusters that have "gone megamorphic" i.e. have an
+  // excessive amount of shape dynamism.
+  bool is_megamorphic;
+
+  {
     mutex_lock lock(cluster_compile_stats_mu_);
     auto it =
         cluster_compile_stats_.emplace(function.name(), ClusterCompileStats{})
             .first;
-    return it->second.execution_count++ == 0;
-  }();
+    is_first_execution = it->second.execution_count++ == 0;
+
+    // The is_megamorphic bit is "sticky".  We assume clusters that have been
+    // observed to be megamorphic once stay megamorphic forever.
+    it->second.is_megamorphic |=
+        IsMegamorphic(/*compile_count=*/it->second.compile_count,
+                      /*execution_count=*/it->second.execution_count);
+    is_megamorphic = it->second.is_megamorphic;
+  }
 
   // Acquire the cache entry lock and compile, if necessary.
   // TODO(phawkins): this locking will need to be restructured when we implement
   // cache eviction.
   mutex_lock entry_lock(entry->mu);
   int64 current_request_count = ++entry->request_count;
+  VLOG(2) << "Compilation cache entry hit: " << entry->compiled
+          << " signature: " << signature.HumanString() << " with request count "
+          << current_request_count << " and compile threshold "
+          << compile_threshold.value_or(0);
   if (!entry->compiled) {
-    VLOG(2) << "Compilation cache miss for signature: "
-            << SignatureDebugString(signature) << " with request count "
-            << current_request_count << " and compile threshold "
-            << compile_threshold;
-    if (!is_first_execution && current_request_count < compile_threshold) {
+    const bool should_compile = [&] {
+      if (!compile_threshold.has_value()) {
+        // Lazy compilation is disabled.
+        return true;
+      }
+
+      if (is_megamorphic) {
+        VLOG(3) << "Not compiling cluster " << function.name()
+                << " because it is megamorphic.";
+        return false;
+      }
+
+      if (is_first_execution) {
+        return true;
+      }
+
+      bool reached_compile_threshold =
+          current_request_count >= *compile_threshold;
+      if (!reached_compile_threshold) {
+        VLOG(3)
+            << "Not compiling cluster " << function.name()
+            << " because it has not reached compile threshold; threshold is "
+            << *compile_threshold << " execution count "
+            << current_request_count << ".";
+      }
+      return reached_compile_threshold;
+    }();
+
+    if (!should_compile) {
+      VLOG(2) << "Not compiling for signature: " << signature.HumanString();
       *out_compilation_result = nullptr;
       *out_executable = nullptr;
       return Status::OK();
@@ -347,21 +326,12 @@ Status XlaCompilationCache::CompileImpl(
     const uint64 compile_start_us = env->NowMicros();
     // Do the actual JIT compilation without holding the lock (it can take
     // a long time.)
-    std::vector<XlaCompiler::Argument> args;
-    TF_RETURN_IF_ERROR(
-        BuildArguments(constant_args, variable_args, ctx, &args));
 
     XlaCompiler compiler(options);
     entry->compiled = true;
 
-    if (compile_single_op) {
-      entry->compilation_status =
-          compiler.CompileSingleOp(compile_options, signature.name, ctx, args,
-                                   &entry->compilation_result);
-    } else {
-      entry->compilation_status = compiler.CompileFunction(
-          compile_options, function, args, &entry->compilation_result);
-    }
+    entry->compilation_status =
+        compile_fn(&compiler, &entry->compilation_result);
     TF_RETURN_IF_ERROR(entry->compilation_status);
     CHECK_EQ(entry->executable.get(), nullptr);
     entry->compilation_status =
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index b43e5d40e6402d24b80f7c689018d81e8a5d7f09..846d0c963dbfdf55f51120f2f138d12f5f63839b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -17,9 +17,12 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -30,13 +33,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Struct that represents a possibly-absent Tensor.
-struct OptionalTensor {
-  string name;           // A descriptive name
-  bool present = false;  // Is the tensor present?
-  Tensor value;          // If present, what is the Tensor's value?
-};
-
 // The XlaCompilationCache class caches the results of the XlaCompiler class,
 // which converts a Tensorflow graph into a compiled XLA compilation.
 //
@@ -58,11 +54,7 @@ class XlaCompilationCache : public ResourceBase {
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `constant_args` is a map of tensorflow argument number to its constant
-  //  value.
-  // `variable_args` is a snapshot of the current values of the
-  // resource variable arguments to `function`; uninitialized variables are
-  // represented by an absent OptionalTensor.
+  // `args` is a description of the arguments to the computation.
   //
   // `compile_mode` controls the behavior of the compilation cache on a cache
   // miss.  If `compile_mode` is `kLazy` then, based on some profitability
@@ -78,9 +70,7 @@ class XlaCompilationCache : public ResourceBase {
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
                  const NameAttrList& function,
-                 const std::map<int, Tensor>& constant_args,
-                 const std::map<int, OptionalTensor>& variable_args,
-                 OpKernelContext* ctx,
+                 absl::Span<const XlaCompiler::Argument> args,
                  const XlaCompiler::CompileOptions& compile_options,
                  CompileMode compile_mode,
                  const XlaCompiler::CompilationResult** out_compilation_result,
@@ -90,8 +80,7 @@ class XlaCompilationCache : public ResourceBase {
   // XlaCompiler::CompileFunction.
   Status CompileSingleOp(
       const XlaCompiler::Options& options,
-      const std::map<int, Tensor>& constant_args,
-      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      absl::Span<const XlaCompiler::Argument> args, OpKernelContext* ctx,
       const XlaCompiler::CompileOptions& compile_options,
       const XlaCompiler::CompilationResult** out_compilation_result,
       xla::LocalExecutable** out_executable);
@@ -101,26 +90,6 @@ class XlaCompilationCache : public ResourceBase {
 
   string DebugString() override;
 
- private:
-  // Common implementation of Compile and CompileSingleOp.
-  Status CompileImpl(
-      const XlaCompiler::Options& options, const NameAttrList& function,
-      const std::map<int, Tensor>& constant_args,
-      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-      const XlaCompiler::CompileOptions& compile_options,
-      bool compile_single_op, int64 compile_threshold,
-      const XlaCompiler::CompilationResult** out_compilation_result,
-      xla::LocalExecutable** out_executable);
-
-  // Takes `result` which has been compiled from a Tensorflow subgraph to a
-  // XLA computation already, and generates an XLA LocalExecutable `executable`.
-  Status BuildExecutable(const XlaCompiler::Options& options,
-                         const XlaCompiler::CompilationResult& result,
-                         std::unique_ptr<xla::LocalExecutable>* executable);
-
-  xla::LocalClient* const client_;
-  const DeviceType device_type_;
-
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
   struct Signature {
@@ -137,14 +106,35 @@ class XlaCompilationCache : public ResourceBase {
     struct Hash {
       uint64 operator()(const Signature& signature) const;
     };
+
+    // Returns a human-readable description of the signature.
+    string HumanString() const;
   };
-  static string SignatureDebugString(const Signature& sig);
 
   // Builds the signature for a compilation.
-  Status BuildSignature(const NameAttrList& function,
-                        const std::map<int, Tensor>& constant_args,
-                        const std::map<int, OptionalTensor>& variable_args,
-                        OpKernelContext* ctx, Signature* signature);
+  static xla::StatusOr<Signature> BuildSignature(
+      const NameAttrList& function,
+      absl::Span<const XlaCompiler::Argument> args);
+
+ private:
+  // Common implementation of Compile and CompileSingleOp.
+  Status CompileImpl(
+      const XlaCompiler::Options& options, const NameAttrList& function,
+      absl::Span<const XlaCompiler::Argument> args,
+      const std::function<Status(XlaCompiler* compiler,
+                                 XlaCompiler::CompilationResult*)>& compile_fn,
+      absl::optional<int64> compile_threshold,
+      const XlaCompiler::CompilationResult** out_compilation_result,
+      xla::LocalExecutable** out_executable);
+
+  // Takes `result` which has been compiled from a Tensorflow subgraph to a
+  // XLA computation already, and generates an XLA LocalExecutable `executable`.
+  Status BuildExecutable(const XlaCompiler::Options& options,
+                         const XlaCompiler::CompilationResult& result,
+                         std::unique_ptr<xla::LocalExecutable>* executable);
+
+  xla::LocalClient* const client_;
+  const DeviceType device_type_;
 
   // The value associated with a cache entry.
   struct Entry {
@@ -180,7 +170,13 @@ class XlaCompilationCache : public ResourceBase {
 
     // Cumulative time spent compiling the cluster.
     int64 cumulative_compile_time_us = 0;
+
+    // True if we have decided that this cluster is too dynamic (i.e. its shapes
+    // change too frequently) to profitably JIT compile.  Once a cluster is
+    // tagged megamorphic, it stays megamorphic forever.
+    bool is_megamorphic = false;
   };
+
   mutex cluster_compile_stats_mu_;
 
   // Maps cluster names to compilation statistics for said cluster.
diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..018c7c219f445bdca17f4f8b060e3678fe1be9ee
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_compilation_cache.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(XlaCompilationCacheTest, SignatureEquality) {
+  NameAttrList fn;
+  fn.set_name("afunction");
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kConstant;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({4, 0});
+  args[0].constant_value = Tensor(DT_INT32, {4, 0});
+  TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s1,
+                          XlaCompilationCache::BuildSignature(fn, args));
+
+  args[0].type = DT_FLOAT;
+  args[0].constant_value = Tensor(DT_FLOAT, {4, 0});
+  TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s2,
+                          XlaCompilationCache::BuildSignature(fn, args));
+
+  args[0].shape = TensorShape({0, 4});
+  args[0].constant_value = Tensor(DT_FLOAT, {0, 4});
+  TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s3,
+                          XlaCompilationCache::BuildSignature(fn, args));
+
+  std::vector<XlaCompilationCache::Signature> signatures = {s1, s2, s3};
+  for (int i = 0; i < signatures.size(); ++i) {
+    for (int j = 0; j < signatures.size(); ++j) {
+      EXPECT_EQ(i == j, signatures[i] == signatures[j])
+          << signatures[i].HumanString() << " " << signatures[j].HumanString();
+    }
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 31cb32e3059bc17e3cde36e5c9f90cc78a39e473..1fe612d43d10030675cf307b109e4dcc89cb2d79 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -187,8 +187,13 @@ Status XlaCompileOnDemandOp::Compile(
   compile_options.always_return_tuple = false;
 
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
-  return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
-                                compile_options, result, executable);
+
+  std::vector<XlaCompiler::Argument> args;
+  TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments(
+      constant_arguments, variable_args, ctx, &args));
+
+  return cache->CompileSingleOp(options, args, ctx, compile_options, result,
+                                executable);
 }
 
 void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index cbfeb38805038825917c16684b9c441818972042..9006dd514b166ad8291d2d437305e53de2a093a4 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -17,8 +17,8 @@ limitations under the License.
 // operators using XLA via the XLA "Host" (CPU) backend.
 
 #include "absl/memory/memory.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
-#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -37,13 +37,15 @@ class XlaCpuDeviceFactory : public DeviceFactory {
 Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
-  legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags();
+  XlaDeviceFlags* flags = GetXlaDeviceFlags();
   bool compile_on_demand = flags->tf_xla_compile_on_demand;
 
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
-  registration.requires_compilation = !compile_on_demand;
-  registration.enable_jit_by_default = false;
+  registration.autoclustering_policy =
+      compile_on_demand
+          ? XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested
+          : XlaOpRegistry::AutoclusteringPolicy::kAlways;
   registration.compile_resource_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration);
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 2289abd2df372620c05db900bd46d1cdf6174377..4201ff91a89b1bee370e6a43337c51abe3bf974a 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -218,6 +218,9 @@ XlaDevice::XlaDevice(const SessionOptions& session_options,
 XlaDevice::~XlaDevice() {
   VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this;
   mutex_lock lock(mu_);
+  while (outstanding_asynchronous_operations_ > 0) {
+    outstanding_asynchronous_operations_cv_.wait(lock);
+  }
   if (device_context_) {
     device_context_->Unref();
   }
@@ -384,6 +387,7 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
 
 Status XlaDevice::Sync() {
   VLOG(1) << "XlaDevice::Sync";
+  tracing::ScopedActivity activity("XlaDevice::Sync", /*is_expensive=*/true);
   std::shared_ptr<se::Stream> stream;
   {
     mutex_lock lock(mu_);
@@ -391,13 +395,46 @@ Status XlaDevice::Sync() {
   }
   if (!stream) return Status::OK();
 
-  if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) {
+  Status status = stream->BlockHostUntilDone();
+  {
+    mutex_lock lock(mu_);
+    while (outstanding_asynchronous_operations_ > 0) {
+      outstanding_asynchronous_operations_cv_.wait(lock);
+    }
+  }
+  TF_RETURN_IF_ERROR(status);
+  if (!stream->ok()) {
     return errors::Internal("XlaDevice::Sync() failed.");
   }
   VLOG(1) << "XlaDevice::Sync completed";
   return Status::OK();
 }
 
+void XlaDevice::Sync(const DoneCallback& done) {
+  VLOG(1) << "XlaDevice::Sync (asynchronous)";
+  std::shared_ptr<se::Stream> stream;
+  {
+    mutex_lock lock(mu_);
+    stream = stream_;
+  }
+  if (!stream) {
+    done(Status::OK());
+    return;
+  }
+
+  stream->ThenEnqueueOnBackgroundThread(
+      [this, stream, done](se::StreamExecutor*) {
+        tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
+                                         /*is_expensive=*/true);
+        mutex_lock lock(mu_);
+        while (outstanding_asynchronous_operations_ > 0) {
+          outstanding_asynchronous_operations_cv_.wait(lock);
+        }
+        done(stream->ok() ? Status::OK()
+                          : errors::Internal("XlaDevice::Sync() failed."));
+      });
+}
+
 Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                       const AllocatorAttributes alloc_attrs,
                                       Tensor* tensor) {
@@ -441,12 +478,55 @@ bool XlaDevice::RequiresSyncOnCompletion() const {
   return sync_on_completion_;
 }
 
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    XlaDevice* device)
+    : device_(device) {
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+}
+
+XlaDevice::AsynchronousOperationHandle::~AsynchronousOperationHandle() {
+  if (device_) {
+    mutex_lock lock(device_->mu_);
+    --device_->outstanding_asynchronous_operations_;
+    device_->outstanding_asynchronous_operations_cv_.notify_all();
+  }
+}
+
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    const XlaDevice::AsynchronousOperationHandle& other)
+    : device_(other.device_) {
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+}
+
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    XlaDevice::AsynchronousOperationHandle&& other)
+    : device_(other.device_) {
+  other.device_ = nullptr;
+}
+
+XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
+operator=(const XlaDevice::AsynchronousOperationHandle& other) {
+  device_ = other.device_;
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+  return *this;
+}
+
+XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
+operator=(XlaDevice::AsynchronousOperationHandle&& other) {
+  device_ = other.device_;
+  other.device_ = nullptr;
+  return *this;
+}
+
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
                                                    const char* jit_device) {
   // Any op assigned to the device that isn't rewritten by the graph rewriter
   // gets executed by a n XlaCompileOnDemandOp, which compiles it and executes
   // it just-in-time.
-  kernel_factory::OpKernelRegistrar::Factory factory =
+  OpKernel* (*factory)(OpKernelConstruction*) =
       [](OpKernelConstruction* context) -> OpKernel* {
     return new XlaCompileOnDemandOp(context);
   };
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 8881b697bc863e58006361924f7761c2e5bba493..c8bb276cdb9673fdcba4cc15a9f33ecd3ae96dbb 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -112,6 +112,12 @@ class XlaDevice : public LocalDevice {
     // compute, host-to-device, and device-to-host communication.
     bool use_multiple_streams = false;
 
+    // A function that describes how the on-host shapes of
+    // a) argument and return value, for entry computations
+    // b) variables, for all computations,
+    // should be represented in XLA. Parameters/return values will be shaped
+    // according to this function, and reshaped back to/from their declared
+    // shapes for computations. Must be non-null.
     XlaCompiler::ShapeRepresentationFn shape_representation_fn;
 
     // If padded_shape_fn is empty, a default implementation that returns
@@ -129,6 +135,7 @@ class XlaDevice : public LocalDevice {
   void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                     AsyncOpKernel::DoneCallback done) override;
   Status Sync() override;
+  void Sync(const DoneCallback& done) override;
 
   Status FillContextMap(const Graph* graph,
                         DeviceContextMap* device_context_map) override
@@ -158,7 +165,30 @@ class XlaDevice : public LocalDevice {
 
   bool RequiresSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
+  // A simple RAII handle. On construction the device's
+  // outstanding_asynchronous_operations_ field is incremented; on destruction
+  // it is decremented.
+  class AsynchronousOperationHandle {
+   public:
+    AsynchronousOperationHandle(XlaDevice* device);
+    ~AsynchronousOperationHandle();
+    AsynchronousOperationHandle(const AsynchronousOperationHandle& other);
+    AsynchronousOperationHandle(AsynchronousOperationHandle&& other);
+    AsynchronousOperationHandle& operator=(
+        const AsynchronousOperationHandle& other);
+    AsynchronousOperationHandle& operator=(AsynchronousOperationHandle&& other);
+
+   private:
+    XlaDevice* device_ = nullptr;
+  };
+
+  AsynchronousOperationHandle CreateAsynchronousOperationHandle() {
+    return AsynchronousOperationHandle(this);
+  }
+
  private:
+  friend class AsynchronousOperationHandle;
+
   xla::LocalClient* client() const;
   Allocator* GetAllocatorLocked(AllocatorAttributes attr)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -221,6 +251,11 @@ class XlaDevice : public LocalDevice {
   // True if the device requires XlaDevice::Sync to be called on completion
   // regardless of status.
   bool sync_on_completion_ GUARDED_BY(mu_) = false;
+
+  // Count of outstanding asynchronous operations which must be zero on Sync()
+  // completion.
+  int64 outstanding_asynchronous_operations_ GUARDED_BY(mu_) = 0;
+  condition_variable outstanding_asynchronous_operations_cv_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index eb3cf27624bb76058c8f0cf2e999818434d38d9e..6e6532731e64bd42ee56aa719748988f321e0f17 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -70,9 +70,12 @@ XlaDeviceContext::XlaDeviceContext(
   CHECK(device_to_host_stream_ != nullptr);
   CHECK(stream_ != nullptr);
   if (!shape_representation_fn_) {
-    shape_representation_fn_ =
-        [](const TensorShape& shape,
-           DataType dtype) -> xla::StatusOr<TensorShape> { return shape; };
+    shape_representation_fn_ = [](const TensorShape& shape,
+                                  DataType dtype) -> xla::StatusOr<xla::Shape> {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape));
+      return xla_shape;
+    };
   }
 }
 
@@ -99,7 +102,7 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
   CHECK(xla_tensor);
 
   Status status = [&]() -> Status {
-    TF_ASSIGN_OR_RETURN(TensorShape shape,
+    TF_ASSIGN_OR_RETURN(xla::Shape shape,
                         shape_representation_fn_(device_tensor->shape(),
                                                  device_tensor->dtype()));
 
@@ -111,9 +114,15 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
         xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
                                          stream_->parent()->device_ordinal()));
 
+    // The cpu_tensor and literal that we created here hold the data of host
+    // tensor in descending layout. The layout could be different from layout in
+    // device_tensor (but the logical shape has to be the same). The
+    // transfer_manager is responsible to do corresponding transposing when
+    // transferring the data to device.
     xla::BorrowingLiteral literal(
         static_cast<const char*>(DMAHelper::base(cpu_tensor)),
-        xla_tensor->shaped_buffer().on_host_shape());
+        xla::ShapeUtil::MakeShape(shape.element_type(),
+                                  xla::AsInt64Slice(shape.dimensions())));
 
     VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
             << xla_tensor->shaped_buffer().ToString();
@@ -183,8 +192,15 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
   xla_tensor->WaitForDefinitionEventOnStream(device_to_host_stream_.get());
 
+  // Transfer manager requires the shape of the shaped buffer to be the same as
+  // literal shape except for the layout.  Set the literal to use xla_tensor's
+  // shape as it is derived from the cpu_tensor's shape using
+  // shape_representation_fn_.
   xla::MutableBorrowingLiteral literal;
-  TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(cpu_tensor, &literal));
+  TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(
+      xla::LayoutUtil::GetWithDefaultLayout(
+          xla_tensor->shaped_buffer().on_host_shape()),
+      cpu_tensor, &literal));
 
   TensorReference ref(*device_tensor);
   transfer_manager_->TransferLiteralFromDevice(
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 241ea8f60df8b66a9a39e3e176ecd4119f27d780..adf0f994b84d9fbf918a5b2478aa7d106853e038 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/resource_variable_ops.h"
 #include "tensorflow/core/kernels/sendrecv_ops.h"
 #include "tensorflow/core/kernels/shape_ops.h"
+#include "tensorflow/core/kernels/stack.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
@@ -257,9 +258,27 @@ class XlaAssignVariableOp : public OpKernel {
                               .Device(DEVICE)                                  \
                               .TypeConstraint<string>("T")                     \
                               .HostMemory("input"),                            \
-                          RetvalOp);
+                          RetvalOp);                                           \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(Name("StackV2")                                      \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("max_size")                          \
+                              .HostMemory("handle"),                           \
+                          StackOp);                                            \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")                                  \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("handle")                            \
+                              .TypeConstraint("T", TYPES),                     \
+                          TemplatedStackPushOp</*allow_swapping=*/false>);     \
+  REGISTER_KERNEL_BUILDER(Name("StackPopV2")                                   \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("handle")                            \
+                              .TypeConstraint("elem_type", TYPES),             \
+                          StackPopOp);                                         \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("StackCloseV2").Device(DEVICE).HostMemory("handle"), StackCloseOp);
 
-// TODO(phawkins): currently we do not register the QueueEnqueueMany,
+// TODO(b/118881356): currently we do not register the QueueEnqueueMany,
 // QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read
 // and write the tensors they access in order to concatenate them into a batch.
 // We would need either to call out to an XLA computation to perform the
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 8f28b38b5e15052e9a14bd1ecf1b3047085d98f1..441970169581d53e0d8683b98d26712445b170ea 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -37,8 +37,8 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
                                           std::vector<Device*>* devices) {
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
-  registration.requires_compilation = true;
-  registration.enable_jit_by_default = false;
+  registration.autoclustering_policy =
+      XlaOpRegistry::AutoclusteringPolicy::kAlways;
   registration.compile_resource_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration);
 
@@ -53,24 +53,25 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
     return Status::OK();
   }
 
-  XlaDevice::Options options;
-  options.platform = platform.ValueOrDie();
-  options.device_name_prefix = name_prefix;
-  options.device_name = DEVICE_XLA_GPU;
-  options.device_ordinal = 0;
-  options.compilation_device_name = DEVICE_GPU_XLA_JIT;
-  options.use_multiple_streams = false;
-  auto device = absl::make_unique<XlaDevice>(session_options, options);
-
-  // TODO(b/78468222): Uncomment after fixing this bug
-  // status = device->UseGpuDeviceInfo();
-  // if (!status.ok()) {
-  //  errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
-  //                          " device");
-  //  return status;
-  // }
-
-  devices->push_back(device.release());
+  for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) {
+    XlaDevice::Options options;
+    options.platform = platform.ValueOrDie();
+    options.device_name_prefix = name_prefix;
+    options.device_name = DEVICE_XLA_GPU;
+    options.device_ordinal = i;
+    options.compilation_device_name = DEVICE_GPU_XLA_JIT;
+    options.use_multiple_streams = true;
+    auto device = absl::make_unique<XlaDevice>(session_options, options);
+
+    Status status = device->UseGpuDeviceInfo();
+    if (!status.ok()) {
+      errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
+                              " device number ", i);
+      return status;
+    }
+
+    devices->push_back(device.release());
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index dc37362fd8611577317f62083796fd4d655e7066..e828bae865d630bd40f227943cdabb2d8d95ca48 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -45,8 +45,8 @@ Status XlaInterpreterDeviceFactory::CreateDevices(
 
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
-  registration.requires_compilation = true;
-  registration.enable_jit_by_default = false;
+  registration.autoclustering_policy =
+      XlaOpRegistry::AutoclusteringPolicy::kAlways;
   registration.compile_resource_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER,
                                            registration);
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 6e51bfca4a1504f8f11fe60159cb44b2ae19fa1b..3b0bda4caa161a7561a3098b89420329998ff8a7 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -191,40 +191,6 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
   return Status::OK();
 }
 
-namespace internal {
-// Return the 'index''th subtree of the given ShapedBuffer as a
-// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
-// subtree, and sets the input's buffer pointers to nullptr for the subtree.
-ScopedShapedBuffer ExtractSubShapedBuffer(
-    ShapedBuffer* shaped_buffer, int index,
-    xla::DeviceMemoryAllocator* allocator) {
-  const xla::Shape& on_host_shape = xla::ShapeUtil::GetTupleElementShape(
-      shaped_buffer->on_host_shape(), index);
-  const xla::Shape& on_device_shape = xla::ShapeUtil::GetTupleElementShape(
-      shaped_buffer->on_device_shape(), index);
-
-  ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
-                                 shaped_buffer->platform(),
-                                 shaped_buffer->device_ordinal());
-
-  auto& shape_tree = shaped_buffer->buffers();
-  auto& sub_shape_tree = sub_shaped_buffer.buffers();
-  sub_shape_tree.CopySubtreeFrom(shape_tree,
-                                 /*source_base_index=*/{index},
-                                 /*target_base_index=*/{});
-  shape_tree.ForEachMutableElement(
-      [index](const xla::ShapeIndex& shape_index,
-              tensorflow::se::DeviceMemoryBase* data) {
-        // shape_index is empty for the root node. Ignore that.
-        if (!shape_index.empty() && shape_index[0] == index) {
-          *data = tensorflow::se::DeviceMemoryBase(nullptr, 0);
-        }
-      });
-  return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
-}
-}  // namespace internal
-using internal::ExtractSubShapedBuffer;
-
 XlaComputationLaunchContext::XlaComputationLaunchContext(
     xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
     bool allocate_xla_tensors, bool use_multiple_streams)
@@ -391,8 +357,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
           TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor));
           XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
           if (xla_tensor) {
-            xla_tensor->set_shaped_buffer(ScopedShapedBuffer(
-                ExtractSubShapedBuffer(&output, output_num, xla_allocator_)));
+            xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num}));
             if (use_multiple_streams_) {
               xla_tensor->ResetDefinitionEvent(definition_event, stream);
             }
@@ -445,7 +410,6 @@ Status XlaComputationLaunchContext::PopulateOutputs(
   for (int i = 0; i < kernel->resource_updates.size(); ++i) {
     Allocator* allocator = ctx->device()->GetAllocator({});
     const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
-    se::DeviceMemoryBase buffer = output.buffer({output_num});
 
     if (variable_infos[i].var()->tensor()->dtype() != write.type) {
       return errors::Internal("Mismatched type in variable write");
@@ -455,18 +419,20 @@ Status XlaComputationLaunchContext::PopulateOutputs(
       Tensor output_tensor;
       TF_RETURN_IF_ERROR(
           ctx->allocate_temp(write.type, write.shape, &output_tensor));
-      XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
-      CHECK(xla_tensor);
-      xla_tensor->set_shaped_buffer(
-          ExtractSubShapedBuffer(&output, output_num, xla_allocator_));
-      if (use_multiple_streams_) {
-        xla_tensor->ResetDefinitionEvent(definition_event, stream);
+      if (write.shape.num_elements() > 0) {
+        XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
+        CHECK(xla_tensor);
+        xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num}));
+        if (use_multiple_streams_) {
+          xla_tensor->ResetDefinitionEvent(definition_event, stream);
+        }
       }
       *variable_infos[i].var()->tensor() = output_tensor;
     } else {
+      se::DeviceMemoryBase buffer = output.buffer({output_num});
+      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
       *variable_infos[i].var()->tensor() = output_tensor;
     }
     ++output_num;
@@ -474,4 +440,60 @@ Status XlaComputationLaunchContext::PopulateOutputs(
   return Status::OK();
 }
 
+Status XlaComputationLaunchContext::BuildXlaCompilerArguments(
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    std::vector<XlaCompiler::Argument>* args) {
+  args->resize(ctx->num_inputs());
+
+  for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) {
+    XlaCompiler::Argument& arg = (*args)[input_num];
+    if (constant_args.count(input_num) > 0) {
+      // Handles compile-time constants.
+      const Tensor& input = constant_args.at(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+      arg.kind = XlaCompiler::Argument::kConstant;
+      arg.type = input.dtype();
+      arg.shape = input.shape();
+      arg.constant_value = input;
+    } else if (variable_args.count(input_num) == 0) {
+      // Handles the non-constant arguments.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+      if (input.NumElements() > 0) {
+        arg.kind = XlaCompiler::Argument::kParameter;
+      } else {
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.constant_value = input;
+      }
+      arg.type = input.dtype();
+      arg.shape = input.shape();
+    } else {
+      // Handles resource variables.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() == DT_RESOURCE);
+      const OptionalTensor& variable = variable_args.at(input_num);
+      arg.name = variable.name;
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = XlaResource::kVariable;
+      if (variable.present) {
+        const Tensor& value = variable.value;
+        arg.type = value.dtype();
+        arg.shape = value.shape();
+        arg.initialized = true;
+      } else {
+        // The values of uninitialized variables are not passed as inputs, since
+        // they are meaningless. However, it is legal to assign to a resource
+        // variable for the first time inside the XLA computation, so we do
+        // permit uninitialized variables.
+        arg.initialized = false;
+        arg.type = DT_INVALID;
+        arg.shape = TensorShape();
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 81e205d13f711a701026b82100c17423595919ed..437db019a0eabe66417725148d8b121842e90479 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -35,6 +35,13 @@ limitations under the License.
 namespace tensorflow {
 class XlaAllocator;
 
+// Struct that represents a possibly-absent Tensor.
+struct OptionalTensor {
+  string name;           // A descriptive name
+  bool present = false;  // Is the tensor present?
+  Tensor value;          // If present, what is the Tensor's value?
+};
+
 // Takes a snapshot of the values of resource variable arguments, whose indices
 // are specified in `variable_indices` argument. We snapshot tensors that back
 // resource variables since concurrent updates may modify the shape, and it is
@@ -139,6 +146,13 @@ class XlaComputationLaunchContext {
                               bool allocate_xla_tensors,
                               bool use_multiple_streams);
 
+  // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch
+  // op.
+  static Status BuildXlaCompilerArguments(
+      const std::map<int, Tensor>& constant_args,
+      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      std::vector<XlaCompiler::Argument>* args);
+
   // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
   // `variables` is a map from TensorFlow argument number to resource variable.
   //
@@ -223,17 +237,6 @@ class XlaTensorBuffer : public TensorBuffer {
   Allocator* allocator_;
 };
 
-// Exposed in this header file for microbenchmarking purposes, but this is an
-// internal implementation detail.
-namespace internal {
-// Return the 'index''th subtree of the given ShapedBuffer as a
-// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
-// subtree, and sets the input's buffer pointers to nullptr for the subtree.
-xla::ScopedShapedBuffer ExtractSubShapedBuffer(
-    xla::ShapedBuffer* shaped_buffer, int index,
-    xla::DeviceMemoryAllocator* allocator);
-}  // namespace internal
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc
deleted file mode 100644
index a45932403ec1760d6b985d5357fd6d84fbf257a2..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/xla_launch_util_test.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Contains microbenchmarks for performance critical functions in
-// xla_launch_util.cc.
-
-#include "tensorflow/compiler/jit/xla_launch_util.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/test_benchmark.h"
-
-// Test ExtractSubBuffer with different depths (depth of ShapeTree) and fan-outs
-// (cardinality of each non-leaf node's children).
-void BM_ExtractSubBuffer(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
-  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
-  for (int i = 0; i < depth; ++i) {
-    std::vector<xla::Shape> shapes(fan_out, shape);
-    shape = xla::ShapeUtil::MakeTupleShape(shapes);
-  }
-  xla::ShapedBuffer shaped_buffer(shape, shape, /*platform=*/nullptr,
-                                  /*device_ordinal=*/0);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    // Extract a buffer from approximately the middle of the first level of the
-    // tree.
-    (void)tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer,
-                                                       /*index=*/fan_out / 2,
-                                                       /*allocator=*/nullptr)
-        .release();
-  }
-}
-
-BENCHMARK(BM_ExtractSubBuffer)
-    ->ArgPair(1, 4)
-    ->ArgPair(1, 8)
-    ->ArgPair(1, 32)
-    ->ArgPair(1, 64)
-    ->ArgPair(1, 128)
-    ->ArgPair(1, 256)
-    ->ArgPair(1, 512)
-    ->ArgPair(2, 4)
-    ->ArgPair(2, 8)
-    ->ArgPair(2, 32)
-    ->ArgPair(2, 64)
-    ->ArgPair(2, 128);
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  tensorflow::testing::RunBenchmarks();
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 6f8b198262dfb46b3fd76c52b5c005778cb906eb..d1f7f754c8338487557eda512c56be34c9e958b7 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -43,11 +43,10 @@ namespace tensorflow {
   }
 }
 
-Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+Status XlaTensor::AllocateShapedBuffer(DataType dtype,
+                                       const xla::Shape& on_host_shape,
                                        xla::LocalClient* client,
                                        int device_ordinal) {
-  xla::Shape on_host_shape;
-  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape));
   xla::Shape on_device_shape =
       client->backend().transfer_manager()->HostShapeToDeviceShape(
           on_host_shape);
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 6d7a6fd66c80f6b8c29ad7adb4c9ae8505f5ed81..77e80aa2527ecc2221ac61f7b7e6ebcce0982931 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -50,7 +50,7 @@ class XlaTensor {
   // Assign the internal ShapedBuffer to new memory for the given dtype and
   // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
   // is replaced and the managed memory deallocated.
-  Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+  Status AllocateShapedBuffer(DataType dtype, const xla::Shape& on_host_shape,
                               xla::LocalClient* client, int device_ordinal);
 
   // Some Tensors can have complex on-device shapes, including tuple shapes. To
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 664df006232f399ced0f29bf786c023a9688e64f..2b88a64fed322f662b3ff1d6bf706a813c52c758 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -375,6 +375,27 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "resampler_ops_test",
+    size = "small",
+    srcs = ["resampler_ops_test.py"],
+    disabled_backends = [
+        # TODO(b/74459949) Support BatchDot in CPU backend.
+        "cpu",
+        "cpu_ondemand",
+    ],
+    # TODO(b/112295522): figure out how to make OSS build pass.
+    tags = ["no_oss"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/contrib/resampler:resampler_ops",
+        "//tensorflow/contrib/resampler:resampler_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "dynamic_stitch_test",
     size = "small",
@@ -449,12 +470,11 @@ tf_xla_py_test(
     tags = ["optonly"],
     deps = [
         ":xla_test",
-        "//tensorflow/contrib/signal:signal_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:extra_py_tests_deps",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:spectral_ops",
+        "//tensorflow/python/ops/signal",
     ],
 )
 
@@ -816,8 +836,6 @@ tf_xla_py_test(
     name = "stack_ops_test",
     size = "small",
     srcs = ["stack_ops_test.py"],
-    # Stack ops are not implemented in the on-demand compilation model yet.
-    disabled_backends = ["cpu_ondemand"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py
index 69fb3ec2964a09508e612515b9e291fc14121d68..e9c2d363acab96c0fb968cb7f901ce105ea8703e 100644
--- a/tensorflow/compiler/tests/adagrad_da_test.py
+++ b/tensorflow/compiler/tests/adagrad_da_test.py
@@ -50,8 +50,8 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllClose([0.0, 0.0], var0.eval())
-        self.assertAllClose([0.0, 0.0], var1.eval())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+        self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
@@ -63,9 +63,9 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
         # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534
         # similarly for others.
         self.assertAllCloseAccordingToType(
-            np.array([-0.904534, -1.603567]), var0.eval())
+            np.array([-0.904534, -1.603567]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.094821, -0.189358]), var1.eval())
+            np.array([-0.094821, -0.189358]), self.evaluate(var1))
 
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in self.float_types:
@@ -87,16 +87,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.904534, -1.603567]), var0.eval())
+            np.array([-0.904534, -1.603567]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.094821, -0.189358]), var1.eval())
+            np.array([-0.094821, -0.189358]), self.evaluate(var1))
 
   def testAdagradDAWithL1(self):
     for dtype in self.float_types:
@@ -118,16 +118,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.895489, -1.59555]), var0.eval())
+            np.array([-0.895489, -1.59555]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.085339, -0.17989]), var1.eval())
+            np.array([-0.085339, -0.17989]), self.evaluate(var1))
 
   def testAdagradDAWithL1_L2(self):
     for dtype in self.float_types:
@@ -149,16 +149,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.046907, -0.093659]), var0.eval())
+            np.array([-0.046907, -0.093659]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.004275, -0.009023]), var1.eval())
+            np.array([-0.004275, -0.009023]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index ab69319c59fb07e7ce56c3c287a50a6290effdfd..e26483303c3934fd51675cb1fbc998b276caf527 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -42,17 +42,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
   def testTensorLearningRate(self):
@@ -68,17 +70,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
   def testSharing(self):
@@ -103,18 +107,20 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Mix the first and the second adagrad for 3 steps.
         ada_update1.run()
         ada_update2.run()
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
 
diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py
index 058576b3d4b695209952158769162bb24e7ccfce..8bcff9d379d34f8a6bb8b0fdc60b7588c6d80be9 100644
--- a/tensorflow/compiler/tests/adam_test.py
+++ b/tensorflow/compiler/tests/adam_test.py
@@ -75,23 +75,24 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
@@ -117,23 +118,24 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testSharing(self):
     for dtype in self.float_types:
@@ -162,13 +164,14 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 3 steps of intertwined Adam1 and Adam2.
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           if t % 2 == 0:
             update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
           else:
@@ -178,8 +181,8 @@ class AdamOptimizerTest(xla_test.XLATestCase):
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py
index 3ed1d41b7121f44dd7470f61180f7a7055369174..961b46375c941bdc3922e460a2f58345086dbceb 100644
--- a/tensorflow/compiler/tests/adamax_test.py
+++ b/tensorflow/compiler/tests/adamax_test.py
@@ -78,8 +78,8 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power = opt._get_beta_accumulators()
 
@@ -87,14 +87,17 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
         for t in range(1, 4):
           update.run()
 
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval(), rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, var1.eval(), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var1_np, self.evaluate(var1), rtol=1e-2)
           self.assertEqual("var0_%d/AdaMax:0" % (i,),
                            opt.get_slot(var=var0, name="m").name)
 
@@ -118,22 +121,23 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power = opt._get_beta_accumulators()
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
           update.run()
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py
index 1bc07ace23ccdc83103abe71ee11b72994c75a6d..a37c97e6d374440aeb860b9d02f2d5dd95c91f62 100644
--- a/tensorflow/compiler/tests/addsign_test.py
+++ b/tensorflow/compiler/tests/addsign_test.py
@@ -90,8 +90,8 @@ class AddSignTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 7 steps of AddSign
         # first 4 steps with positive gradient
@@ -125,8 +125,8 @@ class AddSignTest(xla_test.XLATestCase):
 
           # Validate updated params
           self.assertAllCloseAccordingToType(
-              var0_np, var0.eval(), half_rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+              var0_np, self.evaluate(var0), half_rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testDense(self):
     decay_steps = 10
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 4e6dd6abfc9cdbabbbcdf0734be828f0aa28683b..332381c59eed06d5697e58efb1d8fa2b6ef604d2 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
@@ -967,7 +969,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
       self._testBinary(
           array_ops.expand_dims,
           np.array([42], dtype=dtype),
-          np.int32(0),
+          np.array([0], dtype=np.int64),
           expected=np.array([[42]], dtype=dtype))
       self._testBinary(
           array_ops.expand_dims,
@@ -994,15 +996,21 @@ class BinaryOpsTest(xla_test.XLATestCase):
           np.array([[[1, 2], [3, 4]]], dtype=dtype),
           np.int32(3),
           expected=np.array([[[[1], [2]], [[3], [4]]]], dtype=dtype))
+      self._testBinary(
+          array_ops.expand_dims,
+          np.array([[[1, 2], [3, 4]]], dtype=dtype),
+          np.array([2], dtype=np.int64),
+          expected=np.array([[[[1, 2]], [[3, 4]]]], dtype=dtype))
 
   def testPad(self):
-    for dtype in self.numeric_types:
+    for dtype, pad_type in itertools.product(
+        self.numeric_types, [np.int32, np.int64]):
       self._testBinary(
           array_ops.pad,
           np.array(
               [[1, 2, 3], [4, 5, 6]], dtype=dtype),
           np.array(
-              [[1, 2], [2, 1]], dtype=np.int32),
+              [[1, 2], [2, 1]], dtype=pad_type),
           expected=np.array(
               [[0, 0, 0, 0, 0, 0],
                [0, 0, 1, 2, 3, 0],
@@ -1016,7 +1024,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
           np.array(
               [[1, 2, 3], [4, 5, 6]], dtype=dtype),
           np.array(
-              [[0, 3], [2, 1]], dtype=np.int32),
+              [[0, 3], [2, 1]], dtype=pad_type),
           expected=np.array(
               [[7, 7, 1, 2, 3, 7],
                [7, 7, 4, 5, 6, 7],
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index a57d1dc81ea2c9c188b0a3005904738aa8156bf3..15108487cfa8b9f07a5705fa6897fe16375ad7bf 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import stateless_random_ops
 from tensorflow.python.platform import googletest
 
 
@@ -60,7 +61,7 @@ class CategoricalTest(xla_test.XLATestCase):
       random_seed.set_random_seed(1618)
       op = random_ops.multinomial(logits, num_samples,
                                   output_dtype=dtypes.int32)
-      d = sess.run(op)
+      d = self.evaluate(op)
 
     batch_size, num_classes = logits.shape
     freqs_mat = []
@@ -85,9 +86,9 @@ class CategoricalTest(xla_test.XLATestCase):
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
@@ -112,7 +113,7 @@ class CategoricalTest(xla_test.XLATestCase):
             x = random_ops.multinomial(
                 array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
                 output_dtype=output_dtype)
-          y = sess.run(x)
+          y = self.evaluate(x)
           self.assertTrue((y >= 0).sum() == 1000)
           self.assertTrue((y < 20).sum() == 1000)
 
@@ -138,6 +139,36 @@ class CategoricalTest(xla_test.XLATestCase):
       chi2 = self._chi2(probs, freqs)
       self.assertLess(chi2, 1e-3)
 
+  def testStatelessMultinomialIsInRange(self):
+    for dtype in self.float_types:
+      for output_dtype in self.output_dtypes():
+        with self.cached_session() as sess:
+          with self.test_scope():
+            seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+            x = stateless_random_ops.stateless_multinomial(
+                array_ops.ones(shape=[1, 20], dtype=dtype),
+                1000,
+                seed_t,
+                output_dtype=output_dtype)
+          y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+          self.assertTrue((y >= 0).sum() == 1000)
+          self.assertTrue((y < 20).sum() == 1000)
+
+  def testDeterminismMultinomial(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    num_samples = 10
+    with self.cached_session(), self.test_scope():
+      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+      for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                [0.25, 0.75]]):
+        pure = stateless_random_ops.stateless_multinomial(
+            logits, num_samples, seed=seed_t)
+        values = [(seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds]
+        for s0, v0 in values:
+          for s1, v1 in values:
+            self.assertEqual(s0 == s1, np.all(v0 == v1))
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py
index 88bd58b2da6b2892f898ad10f3467d8ce39d6388..ef2d7af69deeebd5f4c4c7225d7027f8f76bf861 100644
--- a/tensorflow/compiler/tests/clustering_test.py
+++ b/tensorflow/compiler/tests/clustering_test.py
@@ -43,7 +43,7 @@ class ClusteringTest(xla_test.XLATestCase):
         input1 = constant_op.constant(val1, name="const1")
         input2 = constant_op.constant(val2, name="const2")
         output = math_ops.add(input1, input2)
-      result = output.eval()
+      result = self.evaluate(output)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testAddFromCpuMultiple(self):
@@ -57,7 +57,7 @@ class ClusteringTest(xla_test.XLATestCase):
       with self.test_scope():
         output = math_ops.add(input1, input2)
       for _ in xrange(10):
-        result = output.eval()
+        result = self.evaluate(output)
         self.assertAllClose(result, expected, rtol=1e-3)
 
   def testDeadlock(self):
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index 2d225ad226cac368042b95eae8fc29e6fd8e82e0..deb9ac186e63a520054993cb56375f152c8c6587 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -72,7 +72,7 @@ class ConcatTest(xla_test.XLATestCase):
       x2 = constant_op.constant(p2)
       with self.test_scope():
         c = array_ops.concat([x1, x2], 0)
-      result = c.eval()
+      result = self.evaluate(c)
     self.assertAllEqual(result[:2, :], p1)
     self.assertAllEqual(result[2:, :], p2)
 
@@ -150,7 +150,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 1)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
     self.assertAllEqual(result, grad_inp)
 
   def testGradientsSimpleAll(self):
@@ -177,7 +177,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 0)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -205,7 +205,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 2)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -242,7 +242,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, concat_dim)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -280,7 +280,7 @@ class ConcatTest(xla_test.XLATestCase):
       with self.test_scope():
         concat_list_t = array_ops.concat([c1, c2], 0)
         concat_tuple_t = array_ops.concat((c1, c2), 0)
-      self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
+      self.assertAllEqual(concat_list_t.eval(), self.evaluate(concat_tuple_t))
 
   def testConcatNoScalars(self):
     with self.cached_session():
@@ -337,7 +337,7 @@ class ConcatOffsetTest(xla_test.XLATestCase):
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
         off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-        ans = sess.run(off)
+        ans = self.evaluate(off)
         self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
 
@@ -350,7 +350,7 @@ class PackTest(xla_test.XLATestCase):
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [[2, 3, 5], [2, 7, 5], [2, 20, 5]])
 
   def testScalars(self):
@@ -360,7 +360,7 @@ class PackTest(xla_test.XLATestCase):
         s1 = constant_op.constant(3, dtypes.int32)
         s2 = constant_op.constant(5, dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [2, 3, 5])
 
   def testEmpty(self):
@@ -370,7 +370,7 @@ class PackTest(xla_test.XLATestCase):
         s1 = constant_op.constant([[]], dtypes.int32)
         s2 = constant_op.constant([[]], dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [[[]], [[]], [[]]])
 
 
diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py
index d59fd0236f4f7da2bbfb3409342c7f70f8f5d1f6..01cc1b6392845be2418c50d55be97487eb290843 100644
--- a/tensorflow/compiler/tests/conv3d_test.py
+++ b/tensorflow/compiler/tests/conv3d_test.py
@@ -85,7 +85,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells = kernel_depth * kernel_height * kernel_width
@@ -135,7 +135,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[3]):
@@ -173,7 +173,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 63cee550fde9d9d4314b1541fba191df776a4da2..76706ad40a0f0e9d033196d2e32e9b6c154268f0 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -106,7 +106,7 @@ class EagerTest(xla_test.XLATestCase):
         three = constant_op.constant(3)
         five = constant_op.constant(5)
         product = three * five
-        self.assertAllEqual(15, sess.run(product))
+        self.assertAllEqual(15, self.evaluate(product))
 
   def testDegenerateSlices(self):
     with self.test_scope():
diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py
index b3e13fbaa6b33bdaa1be123be558059e96de282e..61abf9c9c045b835b3a2e92fc588cd31f3da76ff 100644
--- a/tensorflow/compiler/tests/fft_test.py
+++ b/tensorflow/compiler/tests/fft_test.py
@@ -24,11 +24,10 @@ import numpy as np
 import scipy.signal as sps
 
 from tensorflow.compiler.tests import xla_test
-from tensorflow.contrib.signal.python.ops import spectral_ops as signal
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import signal
 from tensorflow.python.platform import googletest
 
 BATCH_DIMS = (3, 5)
@@ -107,39 +106,39 @@ class FFTTest(xla_test.XLATestCase):
 
   def testFFT(self):
     self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.fft,
-                          spectral_ops.fft)
+                          signal.fft)
 
   def testFFT2D(self):
     self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.fft2,
-                          spectral_ops.fft2d)
+                          signal.fft2d)
 
   def testFFT3D(self):
     self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
                           lambda x: np.fft.fftn(x, axes=(-3, -2, -1)),
-                          spectral_ops.fft3d)
+                          signal.fft3d)
 
   def testIFFT(self):
     self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.ifft,
-                          spectral_ops.ifft)
+                          signal.ifft)
 
   def testIFFT2D(self):
     self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.ifft2,
-                          spectral_ops.ifft2d)
+                          signal.ifft2d)
 
   def testIFFT3D(self):
     self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
                           lambda x: np.fft.ifftn(x, axes=(-3, -2, -1)),
-                          spectral_ops.ifft3d)
+                          signal.ifft3d)
 
   def testRFFT(self):
     self._VerifyFftMethod(
         INNER_DIMS_1D, np.real, lambda x: np.fft.rfft(x, n=x.shape[-1]),
-        lambda x: spectral_ops.rfft(x, fft_length=[x.shape[-1].value]))
+        lambda x: signal.rfft(x, fft_length=[x.shape[-1].value]))
 
   def testRFFT2D(self):
 
     def _tf_fn(x):
-      return spectral_ops.rfft2d(
+      return signal.rfft2d(
           x, fft_length=[x.shape[-2].value, x.shape[-1].value])
 
     self._VerifyFftMethod(
@@ -153,7 +152,7 @@ class FFTTest(xla_test.XLATestCase):
           x, axes=(-3, -2, -1), s=[x.shape[-3], x.shape[-2], x.shape[-1]])
 
     def _tf_fn(x):
-      return spectral_ops.rfft3d(
+      return signal.rfft3d(
           x,
           fft_length=[x.shape[-3].value, x.shape[-2].value, x.shape[-1].value])
 
@@ -162,7 +161,7 @@ class FFTTest(xla_test.XLATestCase):
   def testIRFFT(self):
 
     def _tf_fn(x):
-      return spectral_ops.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
+      return signal.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
 
     self._VerifyFftMethod(
         INNER_DIMS_1D, lambda x: np.fft.rfft(np.real(x), n=x.shape[-1]),
@@ -171,7 +170,7 @@ class FFTTest(xla_test.XLATestCase):
   def testIRFFT2D(self):
 
     def _tf_fn(x):
-      return spectral_ops.irfft2d(
+      return signal.irfft2d(
           x, fft_length=[x.shape[-2].value, 2 * (x.shape[-1].value - 1)])
 
     self._VerifyFftMethod(
@@ -195,7 +194,7 @@ class FFTTest(xla_test.XLATestCase):
           s=[x.shape[-3], x.shape[-2], 2 * (x.shape[-1] - 1)])
 
     def _tf_fn(x):
-      return spectral_ops.irfft3d(
+      return signal.irfft3d(
           x,
           fft_length=[
               x.shape[-3].value, x.shape[-2].value, 2 * (x.shape[-1].value - 1)
diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py
index 8c7edfd277c992c35a81dd5f261256a86352254e..91d77d2f791834346f43aecb60d116ddbf2faa6e 100644
--- a/tensorflow/compiler/tests/fifo_queue_test.py
+++ b/tensorflow/compiler/tests/fifo_queue_test.py
@@ -129,7 +129,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -192,9 +192,9 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 5b197afd655404e4e36a8b3442f8db60cb1d648d..b078053cdbd6d129645734492d34dd25d28ab3ef 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -50,14 +50,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Ftrl for a few steps
     for _ in range(steps):
       ftrl_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivAdagradTest_AdagradPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -65,14 +65,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     adagrad_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Adagrad for a few steps
     for _ in range(steps):
       adagrad_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivGradientDescentTest_FtrlPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -85,14 +85,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Ftrl for a few steps
     for _ in range(steps):
       ftrl_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivGradientDescentTest_GradientDescentPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -100,14 +100,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     sgd_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run GradientDescent for a few steps
     for _ in range(steps):
       sgd_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testFtrlwithoutRegularization(self):
     for dtype in self.float_types:
@@ -124,8 +124,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([0.0, 0.0], var0.eval())
-        self.assertAllClose([0.0, 0.0], var1.eval())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+        self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
         # Run 3 steps FTRL
         for _ in range(3):
@@ -134,12 +134,12 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType(
             np.array([-2.60260963, -4.29698515]),
-            var0.eval(),
+            self.evaluate(var0),
             float_rtol=1e-4,
             half_rtol=1e-2)
         self.assertAllCloseAccordingToType(
             np.array([-0.28432083, -0.56694895]),
-            var1.eval(),
+            self.evaluate(var1),
             float_rtol=1e-5,
             half_rtol=1e-2)
 
@@ -158,8 +158,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 3 steps FTRL
         for _ in range(3):
@@ -167,10 +167,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5,
+            np.array([-2.55607247, -3.98729396]),
+            self.evaluate(var0),
+            1e-5,
+            1e-5,
             float_rtol=1e-4)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5)
+            np.array([-0.28232238, -0.56096673]), self.evaluate(var1), 1e-5,
+            1e-5)
 
   def testFtrlWithL1(self):
     for dtype in self.float_types:
@@ -187,8 +191,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -197,12 +201,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType(
             np.array([-7.66718769, -10.91273689]),
-            var0.eval(),
+            self.evaluate(var0),
             rtol=1e-4,
             bfloat16_rtol=1e-1,
             bfloat16_atol=1e-1)
         self.assertAllCloseAccordingToType(
-            np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4)
+            np.array([-0.93460727, -1.86147261]),
+            self.evaluate(var1),
+            rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -219,8 +225,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -228,9 +234,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5)
+            np.array([-0.24059935, -0.46829352]),
+            self.evaluate(var0),
+            rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5)
+            np.array([-0.02406147, -0.04830509]),
+            self.evaluate(var1),
+            rtol=1e-5)
 
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
@@ -254,8 +264,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -263,9 +273,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-0.22578996, -0.44345799]), var0.eval(), rtol=1e-4)
+            np.array([-0.22578996, -0.44345799]),
+            self.evaluate(var0),
+            rtol=1e-4)
         self.assertAllCloseAccordingToType(
-            np.array([-0.14378493, -0.13229476]), var1.eval(), rtol=1e-4)
+            np.array([-0.14378493, -0.13229476]),
+            self.evaluate(var1),
+            rtol=1e-4)
 
   def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
     """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
@@ -291,8 +305,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         update1 = opt1.apply_gradients([(grads1, var1)])
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([1.0, 2.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -301,7 +315,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # var0 is experiencing L2 shrinkage so it should be smaller than var1
         # in magnitude.
-        self.assertTrue((var0.eval()**2 < var1.eval()**2).all())
+        self.assertTrue((var0.eval()**2 < self.evaluate(var1)**2).all())
         accum0 = list(opt0._slots["accum"].values())[0].eval()
         accum1 = list(opt1._slots["accum"].values())[0].eval()
         # L2 shrinkage should not change how we update grad accumulator.
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index b1891b918c6584abce9da382088ed0037f5319fb..dd9b7f30efedaa45c96e60290b14a42d7f969b34 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -50,7 +50,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_f = Foo(a, b)
-      result = sess.run(call_f)
+      result = self.evaluate(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testNestedFunctions(self):
@@ -76,7 +76,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_g = Foo(a, b)
-      result = sess.run(call_g)
+      result = self.evaluate(call_g)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testFunctionMultipleRetvals(self):
@@ -100,7 +100,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_f = Foo(a, b)
-      result = sess.run(call_f)
+      result = self.evaluate(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testCompileTimeConstantsInDefun(self):
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index 561715ee1c3e0db37169cfd3fb431c0872987d75..6f51ae33a1b0fc8670ddf0cacb03a3b5a9176a91 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -593,6 +593,67 @@ class LazyCompilationTest(test.TestCase):
       self.assertFalse(
           InLabels(RunMetadataLabels(run_metadata_for_new_shape), "_XlaRun"))
 
+  def testIsMegamorphic(self):
+
+    @function.Defun(compiled=True)
+    def CompiledFunction(x):
+      return math_ops.log(x)
+
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
+      x = array_ops.placeholder(dtypes.float32)
+      y = CompiledFunction(x)
+
+      # Make the cluster go megamorphic by running it with lots of shape
+      # signatures where the cluster is executed with each signature only a few
+      # times.  Then check that we don't compile the cluster ever again.
+
+      for shape in range(10, 50):
+        for _ in range(0, 49):
+          sess.run(y, feed_dict={x: [0.] * shape})
+
+      for _ in range(0, 50):
+        run_metadata = config_pb2.RunMetadata()
+        sess.run(
+            y,
+            feed_dict={x: [0.] * 60},
+            run_metadata=run_metadata,
+            options=config_pb2.RunOptions(
+                trace_level=config_pb2.RunOptions.FULL_TRACE))
+        self.assertTrue(
+            InLabels(RunMetadataLabels(run_metadata), "_XlaCompile"))
+        self.assertFalse(InLabels(RunMetadataLabels(run_metadata), "_XlaRun"))
+
+  def testIsNotMegamorphic(self):
+
+    @function.Defun(compiled=True)
+    def CompiledFunction(x):
+      return math_ops.log(x)
+
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
+      x = array_ops.placeholder(dtypes.float32)
+      y = CompiledFunction(x)
+
+      # Run the cluster with lots of shape signatures, but in a way that it
+      # isn't megamorphic (i.e. each shape signature sees a lot of executions).
+      # Then check that the cluster has not been marked as megamorphic.
+
+      for shape in range(10, 50):
+        for _ in range(0, 1000):
+          sess.run(y, feed_dict={x: [0.] * shape})
+
+      for _ in range(0, 10):
+        sess.run(y, feed_dict={x: [0.] * 60})
+
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(
+          y,
+          feed_dict={x: [0.] * 60},
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaCompile"))
+      self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaRun"))
+
 
 if __name__ == "__main__":
   os.environ["TF_XLA_FLAGS"] = ("--tf_xla_enable_lazy_compilation=true " +
diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py
index c6ad67993e8bc196a74c9a328df8c9200c92c575..5dddf6ae4e8c8a3d5e9eb7b2c62298df02a0093c 100644
--- a/tensorflow/compiler/tests/lrn_ops_test.py
+++ b/tensorflow/compiler/tests/lrn_ops_test.py
@@ -120,8 +120,8 @@ class LRNTest(xla_test.XLATestCase):
       with self.test_scope():
         actual = gen_nn_ops.lrn_grad(out_grads, in_image, out_image,
                                      depth_radius, bias, alpha, beta)
-      expected_val = expected.eval()
-      actual_val = actual.eval()
+      expected_val = self.evaluate(expected)
+      actual_val = self.evaluate(actual)
     self.assertAllClose(actual_val, expected_val, rtol=1e-3)
 
 
diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py
index 265c0b6d1412de7be3a5bf5e79129cb330ceb162..fd02a50aff94d2bd2e180a092a27c8195178c5e5 100644
--- a/tensorflow/compiler/tests/lstm_test.py
+++ b/tensorflow/compiler/tests/lstm_test.py
@@ -88,7 +88,7 @@ class LSTMTest(test.TestCase):
                  (basename, m_prev_scalar, c_prev_scalar, pad_scalar))
 
       # Initialize variables and run the unrolled LSTM step.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       return sess.run([m, c])
 
   def testLSTMCell(self):
@@ -173,7 +173,7 @@ class LSTMTest(test.TestCase):
                  (basename, m_init_scalar, c_init_scalar, pad_scalar))
 
       # Initialize variables and run the unrolled LSTM layer.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       return sess.run(out_seq)
 
   def testLSTMLayer(self):
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index f77521a7c49dba39849869ddceb7c0e885147722..3416f7dbd6bdd264bf79785084f981f5b07cb8a9 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -61,37 +61,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
   def testNesterovMomentum(self):
     for dtype in self.float_types:
@@ -115,8 +121,8 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
               var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(
               var1_np, accum1_np, 0.9, 0.1, 0.9)
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
@@ -141,37 +147,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
index 77bb839409f0c323ff6ed2c8d6bd105d3003b398..9671ae0ae973ff82d22744a1feb9b4293d94bbdd 100644
--- a/tensorflow/compiler/tests/placeholder_test.py
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -33,7 +33,7 @@ class PlaceholderTest(xla_test.XLATestCase):
       ph = array_ops.placeholder_with_default(v, shape=[])
       out = ph * 2
       sess.run(variables.variables_initializer([v]))
-      self.assertEqual(8.0, sess.run(out))
+      self.assertEqual(8.0, self.evaluate(out))
 
   def test_placeholder_with_default_fed(self):
     with self.cached_session() as sess, self.test_scope():
diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py
index 86536da7fed0e2309beb32fee9c7c605491592ed..5b35c20027700b34500a31e174061d7087094b61 100644
--- a/tensorflow/compiler/tests/powersign_test.py
+++ b/tensorflow/compiler/tests/powersign_test.py
@@ -91,8 +91,8 @@ class PowerSignTest(xla_test.XLATestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 7 steps of powersign
         # first 4 steps with positive gradient
@@ -125,8 +125,8 @@ class PowerSignTest(xla_test.XLATestCase):
           )
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testDense(self):
     decay_steps = 10
diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py
index c41b4171e26af4f7ad0237d7407a5b3691299595..63cc51a470164915b2614a06d18ca1850bb64a3c 100644
--- a/tensorflow/compiler/tests/proximal_adagrad_test.py
+++ b/tensorflow/compiler/tests/proximal_adagrad_test.py
@@ -45,15 +45,17 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([0.0, 0.0], var0.eval())
-      self.assertAllClose([0.0, 0.0], var1.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+      self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([-2.60260963, -4.29698515]), var0.eval())
-      self.assertAllClose(np.array([-0.28432083, -0.56694895]), var1.eval())
+      self.assertAllClose(
+          np.array([-2.60260963, -4.29698515]), self.evaluate(var0))
+      self.assertAllClose(
+          np.array([-0.28432083, -0.56694895]), self.evaluate(var1))
       opt_vars = opt.variables()
       self.assertStartsWith(opt_vars[0].name, var0._shared_name)
       self.assertStartsWith(opt_vars[1].name, var1._shared_name)
@@ -74,14 +76,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
-      self.assertAllClose(np.array([-1.60261, -2.296985]), var0.eval())
-      self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval())
+      self.assertAllClose(np.array([-1.60261, -2.296985]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.715679, 2.433051]), self.evaluate(var1))
 
   def testProximalAdagradWithL1(self):
     with self.cached_session(), self.test_scope():
@@ -98,14 +100,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Adagrad
       for _ in range(10):
         update.run()
-      self.assertAllClose(np.array([-6.663634, -9.190331]), var0.eval())
-      self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval())
+      self.assertAllClose(np.array([-6.663634, -9.190331]), self.evaluate(var0))
+      self.assertAllClose(np.array([2.959304, 1.029232]), self.evaluate(var1))
 
   def testProximalAdagradWithL1_L2(self):
     with self.cached_session(), self.test_scope():
@@ -122,15 +124,15 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Adagrad.
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval())
-      self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval())
+      self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1))
 
   def applyOptimizer(self, opt, steps=5):
     var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
@@ -141,14 +143,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
     update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
 
-    self.assertAllClose([1.0, 2.0], var0.eval())
-    self.assertAllClose([3.0, 4.0], var1.eval())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
     # Run ProximalAdagrad for a few steps
     for _ in range(steps):
       update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testEquivAdagradwithoutRegularization(self):
     with self.cached_session(), self.test_scope():
diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
index 3d808e6b8a71ef9fa60b671d07bfd907e9f58efc..5aec433be765dd0a04bd7ab10d5c39a5a7f48c5c 100644
--- a/tensorflow/compiler/tests/proximal_gradient_descent_test.py
+++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
@@ -42,15 +42,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([0.0, 0.0], var0.eval())
-      self.assertAllClose([0.0, 0.0], var1.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+      self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Gradient Descent.
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([-0.9, -1.8]), var0.eval())
-      self.assertAllClose(np.array([-0.09, -0.18]), var1.eval())
+      self.assertAllClose(np.array([-0.9, -1.8]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.09, -0.18]), self.evaluate(var1))
 
   def testProximalGradientDescentwithoutRegularization2(self):
     with self.cached_session(), self.test_scope():
@@ -64,15 +64,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Gradient Descent
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([0.1, 0.2]), var0.eval())
-      self.assertAllClose(np.array([3.91, 2.82]), var1.eval())
+      self.assertAllClose(np.array([0.1, 0.2]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.91, 2.82]), self.evaluate(var1))
 
   def testProximalGradientDescentWithL1(self):
     with self.cached_session(), self.test_scope():
@@ -86,15 +86,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps proximal gradient descent.
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-1.988, -3.988001]), var0.eval())
-      self.assertAllClose(np.array([3.67, 2.37]), var1.eval())
+      self.assertAllClose(np.array([-1.988, -3.988001]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.67, 2.37]), self.evaluate(var1))
 
   def testProximalGradientDescentWithL1_L2(self):
     with self.cached_session(), self.test_scope():
@@ -108,15 +108,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Gradient Descent
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval())
-      self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval())
+      self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1))
 
   def applyOptimizer(self, opt, steps=5):
     var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
@@ -127,14 +127,14 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
     update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
 
-    self.assertAllClose([1.0, 2.0], var0.eval())
-    self.assertAllClose([3.0, 4.0], var1.eval())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
     # Run ProximalAdagrad for a few steps
     for _ in range(steps):
       update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testEquivGradientDescentwithoutRegularization(self):
     with self.cached_session(), self.test_scope():
diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py
index 236b1b881dcaffc1a5b0c6395f0605c1d7ef0269..b4d4193e35f9e0e3b23d0242ed076dd811f4ee2b 100644
--- a/tensorflow/compiler/tests/qr_op_test.py
+++ b/tensorflow/compiler/tests/qr_op_test.py
@@ -63,7 +63,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    precision = self.AdjustedNorm(xx.eval() - identity.eval())
+    precision = self.AdjustedNorm(xx.eval() - self.evaluate(identity))
     self.assertTrue(np.all(precision < 5.0))
 
   def _test(self, dtype, shape, full_matrices):
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 36ef6ed5fee78bad10bb1ee0bf3eb7824d05c206..1e913909452d54ed59f33bb0d313fd062570d459 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -46,9 +46,9 @@ class RandomOpsTest(xla_test.XLATestCase):
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
@@ -83,7 +83,7 @@ class RandomOpsTest(xla_test.XLATestCase):
         with self.test_scope():
           x = random_ops.random_uniform(
               shape=[1000], dtype=dtype, minval=-2, maxval=33)
-        y = sess.run(x)
+        y = self.evaluate(x)
         self.assertTrue((y >= -2).sum() == 1000)
         self.assertTrue((y < 33).sum() == 1000)
 
@@ -102,7 +102,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.cached_session() as sess:
         with self.test_scope():
           x = random_ops.truncated_normal(shape=[count], dtype=dtype)
-        y = sess.run(x)
+        y = self.evaluate(x)
 
         def normal_cdf(x):
           return .5 * math.erfc(-x / math.sqrt(2))
@@ -148,7 +148,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.test_scope():
         x = math_ops.range(1 << 16)
         shuffle = random_ops.random_shuffle(x)
-      result = sess.run(shuffle)
+      result = self.evaluate(shuffle)
       expected = range(1 << 16)
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
@@ -159,7 +159,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.test_scope():
         x = array_ops.diag(math_ops.range(20))
         shuffle = random_ops.random_shuffle(x)
-      result = sess.run(shuffle)
+      result = self.evaluate(shuffle)
       expected = np.diag(range(20)).flatten()
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index cfccf5f3d2a0a3f2910b2ac1c2747381b172a685..a6b58020126a3297944f199e99b0801387615564 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -2466,20 +2466,21 @@ TEST_F(OpTest, Pack) {
   });
 }
 
-// TODO(b/31741898): crashes on GPU.
 TEST_F(OpTest, Pad) {
   Repeatedly([this]() {
     auto type = Choose<DataType>(kAllXlaTypes);
     std::vector<int64> t_dims = RandomDims();
 
-    // TODO(b/31741996): re-enable DT_INT64 when bug is fixed.
-    // DataType tpaddings = Choose<DataType>({DT_INT32, DT_INT64});
-    DataType tpaddings = DT_INT32;
+    DataType tpaddings = Choose<DataType>({DT_INT32, DT_INT64});
     std::vector<int64> paddings_vec;
-    std::uniform_int_distribution<int> distribution(0, 7);
     for (int i = 0; i < t_dims.size(); ++i) {
-      paddings_vec.push_back(distribution(generator()));
-      paddings_vec.push_back(distribution(generator()));
+      std::uniform_int_distribution<int> pad_distribution(0, t_dims[i]);
+      int pad_size = pad_distribution(generator());
+      std::uniform_int_distribution<int> lower_distribution(0, pad_size);
+      int low_pad_size = lower_distribution(generator());
+      paddings_vec.push_back(low_pad_size);
+      paddings_vec.push_back(pad_size - low_pad_size);
+      t_dims[i] -= pad_size;
     }
     Tensor paddings;
     CHECK(
diff --git a/tensorflow/compiler/tests/resampler_ops_test.py b/tensorflow/compiler/tests/resampler_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8ca0eab276b39f025d018edebb78eed7a8433bb
--- /dev/null
+++ b/tensorflow/compiler/tests/resampler_ops_test.py
@@ -0,0 +1,205 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for resampler ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.contrib import resampler
+from tensorflow.contrib.resampler.ops import gen_resampler_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ResamplerOpsTest(xla_test.XLATestCase):
+
+  def _assertForwardOpMatchesExpected(self, image_np, warp_np, expected):
+    with self.test_session() as sess, self.test_scope():
+      input_image = array_ops.placeholder(image_np.dtype)
+      warp = array_ops.placeholder(warp_np.dtype)
+      resampled = resampler.resampler(input_image, warp, name='resampler')
+      out = sess.run(resampled, {input_image: image_np, warp: warp_np})
+
+      self.assertAllCloseAccordingToType(
+          expected, out, rtol=5e-3, half_rtol=1e-2, bfloat16_rtol=3e-2)
+
+  def _assertBackwardOpMatchesExpected(self, input_np, warp_np, grad_output_np,
+                                       expected_grad_data, expected_grad_warp):
+    with self.cached_session() as sess, self.test_scope():
+      input_image = array_ops.placeholder(input_np.dtype)
+      warp = array_ops.placeholder(warp_np.dtype)
+      grad_output = array_ops.placeholder(grad_output_np.dtype)
+
+      grad_data, grad_warp = gen_resampler_ops.resampler_grad(
+          input_image, warp, grad_output)
+
+      grad_data_tf, grad_warp_tf = sess.run([grad_data, grad_warp], {
+          input_image: input_np,
+          warp: warp_np,
+          grad_output: grad_output_np
+      })
+
+      self.assertAllCloseAccordingToType(
+          expected_grad_warp, grad_warp_tf, half_rtol=1e-2, bfloat16_rtol=3e-2)
+      self.assertAllCloseAccordingToType(
+          expected_grad_data, grad_data_tf, half_rtol=1e-2, bfloat16_rtol=3e-2)
+
+  def testSimple(self):
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [0, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2]
+      warp_data = [0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[26.42]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+      grad_output = np.ones([1, 1], dtype=dtype)
+
+      expected_grad_data = [[[[0.12], [0.27999997]], [[0.18000001],
+                                                      [0.42000002]]]]
+
+      expected_grad_warp = [[26.60000038, 38.20000076]]
+
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
+  def testMultiChannel(self):
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 3]
+      input_rgb_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+      input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2]
+      warp_data = [0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[59.58000183, 146.94000244, 107.37999725]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+      grad_output = np.ones([1, 3], dtype=dtype)
+
+      expected_grad_data = [[[[0.12, 0.12, 0.12],
+                              [0.27999997, 0.27999997, 0.27999997]],
+                             [[0.18000001, 0.18000001, 0.18000001],
+                              [0.42000002, 0.42000002, 0.42000002]]]]
+
+      expected_grad_warp = [[199, 30]]
+
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
+  def testBatch2Height3byWidth3RGB(self):
+    for dtype in self.float_types:
+      input_shape = [2, 3, 3, 3]
+      input_rgb_data = [
+          0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1, 30, 105, 2, 40, 115,
+          3, 50, 125, 4, 60, 135, 5, 70, 145, 6, 0, 5, 13, 54, 135, 226, 37, 8,
+          234, 90, 255, 1, 30, 105, 2, 40, 115, 3, 50, 125, 4, 60, 135, 5, 70,
+          145, 6
+      ]
+      input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape)
+
+      # 2 batches and 2 samples for each batch.
+      warp_shape = [2, 2, 2]
+      warp_data = [0.7, 0.6, 1, 0.7, 0.9, 1.2, 1.3, 1.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+
+      expected_forward = [[[43.92, 128.4, 65.86], [37.2, 114., 69.2]],
+                          [[40.6, 122.8, 2.5], [51., 126, 4.1]]]
+
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected_forward)
+
+      expected_grad_data = [[[[0.12, 0.12, 0.12],
+                              [0.57999998, 0.57999998, 0.57999998],
+                              [0., 0., 0.]],
+                             [[0.18000001, 0.18000001, 0.18000001],
+                              [1.12, 1.12, 1.12], [0., 0., 0.]],
+                             [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]],
+                            [[[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]],
+                             [[0.08000001, 0.08000001, 0.08000001],
+                              [0.99999988, 0.99999988, 0.99999988],
+                              [0.11999997, 0.11999997, 0.11999997]],
+                             [[0.02000001, 0.02000001, 0.02000001],
+                              [0.60000008, 0.60000008, 0.60000008],
+                              [0.17999998, 0.17999998, 0.17999998]]]]
+      expected_grad_warp = [[[33.39999008, -96.20000458], [-26.10000229,
+                                                           -278.]],
+                            [[-162.99998474, 39.99999619], [21., 63.]]]
+
+      grad_output = np.ones([2, 2, 3], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
+  def testOutOfBoundWarps(self):
+    # (x, y) are both less than 0.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-1, -1, 0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [27.62]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # One of (x, y) is less than 0.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-1, 0.1, 0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [27.62]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # Both of (x, y) are greater than image size.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-0.1, 0.1, 1.2, 2.1]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [0.0]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # One of (x, y) is greater than image size.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [0.1, -0.1, 1.2, 0.1]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [0.0]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py
index 8840a1329a907bddc6ef1cb6dd1c2a6d234def5c..dc3e90b4afa41c08d899ee195d42fb91678bad1c 100644
--- a/tensorflow/compiler/tests/rmsprop_test.py
+++ b/tensorflow/compiler/tests/rmsprop_test.py
@@ -76,7 +76,7 @@ class RmspropTest(xla_test.XLATestCase):
           rms_opt = rmsprop.RMSPropOptimizer(learning_rate, centered=centered)
           rms_update = rms_opt.apply_gradients(
               zip([grads0, grads1], [var0, var1]))
-          variables.global_variables_initializer().run()
+          self.evaluate(variables.global_variables_initializer())
 
           mg0 = rms_opt.get_slot(var0, "mg")
           self.assertEqual(mg0 is not None, centered)
@@ -92,12 +92,12 @@ class RmspropTest(xla_test.XLATestCase):
           self.assertTrue(mom1 is not None)
 
           # Fetch params to validate initial values
-          self.assertAllClose([1.0, 2.0], var0.eval())
-          self.assertAllClose([3.0, 4.0], var1.eval())
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
           # Run 3 steps of RMSProp
           for _ in range(3):
-            rms_update.run()
+            self.evaluate(rms_update)
 
             var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
                 var0_np,
@@ -118,14 +118,14 @@ class RmspropTest(xla_test.XLATestCase):
 
             # Validate updated params
             if centered:
-              self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-              self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-            self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-            self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-            self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-            self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-            self.assertAllCloseAccordingToType(var0_np, var0.eval())
-            self.assertAllCloseAccordingToType(var1_np, var1.eval())
+              self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+              self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+            self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+            self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index 46ca371c8abf1cb4710717a183ee12820c4c4ca0..d7e26d79c4c054860ade5c8960a3bca984e020b0 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -79,7 +79,8 @@ class TensorArrayTest(xla_test.XLATestCase):
       c0 = w2.stack()
 
       self.assertAllEqual(
-          convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval())
+          convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]),
+          self.evaluate(c0))
 
   def testTensorArrayWritePack(self):
     for dtype in self.numeric_tf_types:
@@ -97,7 +98,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
       c0 = w2.stack()
 
-      self.assertAllEqual([3, 0, 1], c0.eval().shape)
+      self.assertAllEqual([3, 0, 1], self.evaluate(c0).shape)
 
   def _testTensorArrayWriteConcat(self, tf_dtype):
     with self.cached_session(), self.test_scope():
@@ -113,8 +114,8 @@ class TensorArrayTest(xla_test.XLATestCase):
       c0 = w2.concat()
 
       self.assertAllEqual(
-          convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0],
-                   [106.0, 107.0], [8.0, 9.0], [204.0, 205.0]]), c0.eval())
+          convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], [106.0, 107.0],
+                   [8.0, 9.0], [204.0, 205.0]]), self.evaluate(c0))
 
   def testTensorArrayWriteConcat(self):
     for dtype in self.numeric_tf_types:
@@ -341,7 +342,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow)
         with self.assertRaisesOpError("TensorArray dtype is "):
-          r0_bad.eval()
+          self.evaluate(r0_bad)
 
         # Test reading from a different index than the one we wrote to
         w0.read(1)
@@ -422,7 +423,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       w2 = h2.write(0, 5.0)
       r2 = w2.read(0)
       r = r1 + r2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def _testTensorArrayGradientWriteReadType(self, dtype):
     with self.cached_session() as session, self.test_scope():
@@ -504,7 +505,7 @@ class TensorArrayTest(xla_test.XLATestCase):
                 [-0.5, 1.5],  # read(0) gradient
                 [20.0, 30.0, 40.0, 50.0],  # concat gradient
             ])
-      grad_vals = sess.run(grad_r)  # 2 + 2 entries
+      grad_vals = self.evaluate(grad_r)  # 2 + 2 entries
 
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
@@ -526,7 +527,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       with ops.control_dependencies([r0_readtwice]):
         r1_readtwice = w_readtwice.read(0)
 
-      self.assertAllEqual([1.0, -1.0], r1_readtwice.eval())
+      self.assertAllEqual([1.0, -1.0], self.evaluate(r1_readtwice))
 
   def _testTensorArrayGradientUnpackRead(self):
     with self.cached_session() as session, self.test_scope():
@@ -592,7 +593,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       s = ta.size()
-      self.assertAllEqual(3, s.eval())
+      self.assertAllEqual(3, self.evaluate(s))
 
   def testWriteCloseTensorArray(self):
     with self.cached_session(), self.test_scope():
@@ -722,7 +723,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
   #     r = acc2.stack()
   #     grad = gradients_impl.gradients(r, [x])[0]
-  #     self.assertAllClose(31.0, grad.eval())
+  #     self.assertAllClose(31.0, self.evaluate(grad))
 
   def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
     with self.cached_session() as session, self.test_scope():
@@ -912,7 +913,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertEqual(0, ta.size().eval())
       ta = ta.unstack(array_ops.zeros([0, 3, 5]))
       packed = ta.stack()
-      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
       self.assertAllEqual([0, 5], ta.concat().eval().shape)
@@ -1041,8 +1042,8 @@ class TensorArrayTest(xla_test.XLATestCase):
           (read0, read1, size0, size1))
 
       # Tests that the control dependencies was added and executed.
-      self.assertEqual(1, v0.eval())
-      self.assertEqual(1, v1.eval())
+      self.assertEqual(1, self.evaluate(v0))
+      self.assertEqual(1, self.evaluate(v1))
 
       # Tests correct TensorArray.
       self.assertEqual(read0_v, 0)
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index dd2c252d383bca9c59033ac07e442b487e4975a6..e776c8a951c7ac24c65408a67007b03ae07e8be0 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -40,6 +40,19 @@ from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
 class VariableOpsTest(xla_test.XLATestCase):
   """Test cases for resource variable operators."""
 
+  def testWriteEmptyShape(self):
+    # Verifies that we can pass an uninitialized variable with an empty shape,
+    # assign it a value, and successfully return it.
+    for dtype in self.numeric_types:
+      with self.test_session() as sess, self.test_scope():
+        zeros = np.zeros([3, 0], dtype=dtype)
+        v = resource_variable_ops.ResourceVariable(zeros)
+        p = array_ops.placeholder(dtype)
+        x = v.assign(p)
+        with ops.control_dependencies([x]):
+          y = v.read_value()
+        self.assertAllClose(zeros, sess.run(y, {p: zeros}))
+
   def testOneWriteOneOutput(self):
     # Regression test for a bug where computations with one non-constant
     # output and one variable update were mishandled.
@@ -216,7 +229,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_add(
               handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[3], [7]])
+      self.assertAllEqual(self.evaluate(read), [[3], [7]])
 
   def testScatterSub(self):
     with self.test_session() as sess, self.test_scope():
@@ -229,7 +242,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_sub(
               handle, [1], constant_op.constant([[2]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[4], [-1]])
+      self.assertAllEqual(self.evaluate(read), [[4], [-1]])
 
   def testScatterMul(self):
     with self.test_session() as sess, self.test_scope():
@@ -242,7 +255,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_mul(
               handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[5]])
+      self.assertEqual(self.evaluate(read), [[5]])
 
   def testScatterDiv(self):
     with self.test_session() as sess, self.test_scope():
@@ -255,7 +268,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_div(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[2]])
+      self.assertAllEqual(self.evaluate(read), [[2]])
 
   def testScatterMin(self):
     with self.test_session() as sess, self.test_scope():
@@ -268,7 +281,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_min(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterMax(self):
     with self.test_session() as sess, self.test_scope():
@@ -281,7 +294,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_max(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[6]])
+      self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterUpdate(self):
     with self.test_session() as sess, self.test_scope():
@@ -294,7 +307,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_update(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterAddScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -307,7 +320,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_add(
               handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterSubScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -320,7 +333,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_sub(
               handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[-1]])
+      self.assertEqual(self.evaluate(read), [[-1]])
 
   def testScatterMulScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -333,7 +346,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_mul(
               handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[5]])
+      self.assertEqual(self.evaluate(read), [[5]])
 
   def testScatterDivScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -346,7 +359,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_div(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[2]])
+      self.assertEqual(self.evaluate(read), [[2]])
 
   def testScatterMinScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -359,7 +372,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_min(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterMaxScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -372,7 +385,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_max(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[6]])
+      self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterNdAddOps(self):
     with self.test_session() as sess, self.test_scope():
@@ -387,7 +400,7 @@ class VariableOpsTest(xla_test.XLATestCase):
       sess.run(gen_state_ops.resource_scatter_nd_add(handle, indices, updates))
       read = resource_variable_ops.read_variable_op(
           handle, dtype=dtypes.float32)
-      self.assertAllClose(expected, sess.run(read))
+      self.assertAllClose(expected, self.evaluate(read))
 
   def testScatterNdUpdateAddOps(self):
     with self.test_session() as sess, self.test_scope():
@@ -403,7 +416,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           gen_state_ops.resource_scatter_nd_update(handle, indices, updates))
       read = resource_variable_ops.read_variable_op(
           handle, dtype=dtypes.float32)
-      self.assertAllClose(expected, sess.run(read))
+      self.assertAllClose(expected, self.evaluate(read))
 
 
 class StridedSliceAssignChecker(object):
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 5fc9a352ff930c7d281ec5c52168580e453c04b0..3458c7f1c40cd70187e209eb40db24245d595d04 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -166,6 +166,7 @@ cc_library(
         "xla_compilation_device.cc",
         "xla_compiler.cc",
         "xla_context.cc",
+        "xla_expression.cc",
         "xla_helpers.cc",
         "xla_op_kernel.cc",
         "xla_op_registry.cc",
@@ -180,6 +181,7 @@ cc_library(
         "xla_compilation_device.h",
         "xla_compiler.h",
         "xla_context.h",
+        "xla_expression.h",
         "xla_helpers.h",
         "xla_op_kernel.h",
         "xla_op_registry.h",
@@ -193,6 +195,7 @@ cc_library(
         ":sharding_util",
         ":side_effect_util",
         ":tf2xla_util",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/jit:xla_cluster_util",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/xla:literal",
@@ -201,13 +204,13 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -217,6 +220,8 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
     alwayslink = 1,
@@ -362,8 +367,12 @@ tf_cc_test(
 
 tf_cc_test(
     name = "xla_compiler_test",
-    srcs = ["xla_compiler_test.cc"],
+    srcs = [
+        "xla_compiler_test.cc",
+        "xla_expression_test.cc",
+    ],
     deps = [
+        ":common",
         ":side_effect_util",
         ":xla_compiler",
         "//tensorflow/cc:cc_ops",
@@ -386,6 +395,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -428,14 +438,13 @@ cc_library(
     name = "dump_graph",
     srcs = [
         "dump_graph.cc",
-        "dump_graph_flags.cc",
-        "dump_graph_flags.h",
     ],
     hdrs = [
         "dump_graph.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
+        "//tensorflow/compiler/jit:flags",
+        "//tensorflow/compiler/xla:parse_flags_from_env",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index 380c6a7e23da92d949b26876836b999bf6406c6c..1de85004a51bea464f8f0166511402e5dd85ac14 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/tf2xla/dump_graph_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -61,8 +61,7 @@ string MakeUniqueFilename(string name) {
 string WriteTextProtoToUniqueFile(
     Env* env, const string& name, const char* proto_type,
     const ::tensorflow::protobuf::Message& proto) {
-  const string& dirname =
-      legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix;
+  const string& dirname = GetDumpGraphFlags()->tf_dump_graph_prefix;
   Status status = env->RecursivelyCreateDir(dirname);
   if (!status.ok()) {
     LOG(WARNING) << "Failed to create " << dirname << " for dumping "
diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.cc b/tensorflow/compiler/tf2xla/dump_graph_flags.cc
deleted file mode 100644
index a6c908ba011afb90fabacc855df8c6afbb35d254..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's dump_graph module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/dump_graph_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static DumpGraphFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new DumpGraphFlags;
-  flags->tf_dump_graph_prefix = "/tmp/";
-  flag_list = new std::vector<Flag>({
-      Flag("tf_dump_graph_prefix", &flags->tf_dump_graph_prefix,
-           "Path prefix to which graphs dumped during debugging should be "
-           "written."),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// dump_graph module.
-void AppendDumpGraphFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the DumpGraphFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-DumpGraphFlags* GetDumpGraphFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.h b/tensorflow/compiler/tf2xla/dump_graph_flags.h
deleted file mode 100644
index 80a3307d920f2cc3d668d507786a02e43589f86f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph_flags.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
-#define TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
-
-// Legacy flags for the XLA bridge's dump_graph module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// dump_graph module.
-void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with the XLA bridge's
-// dump_graph module.
-typedef struct {
-  string tf_dump_graph_prefix;  // Path prefix to which graphs dumped during
-                                // debugging should be written.
-} DumpGraphFlags;
-
-// Return a pointer to the DumpGraphFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-DumpGraphFlags* GetDumpGraphFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index f818d80022da0bad851c896f2714c15b20b22195..3dfd3f854c8646ebbf06d3378201d22e8741b7eb 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -75,6 +75,25 @@ Status FunctionalizeControlFlow(Graph* graph,
   return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library);
 }
 
+Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def,
+                                           FunctionLibraryDefinition* library) {
+  return FunctionalizeControlFlowForGraphDef(/*lookup_library=*/nullptr,
+                                             graph_def, library);
+}
+
+Status FunctionalizeControlFlowForGraphDef(
+    const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def,
+    FunctionLibraryDefinition* library) {
+  FunctionDefLibrary function_lib = graph_def->library();
+  Graph graph(OpRegistry::Global());
+
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph));
+  TF_RETURN_IF_ERROR(FunctionalizeControlFlow(lookup_library, &graph, library));
+  graph.ToGraphDef(graph_def);
+  std::swap(*graph_def->mutable_library(), function_lib);
+  return Status::OK();
+}
+
 Status FunctionalizeControlFlowForFunction(
     const string& func_name, const string& new_func_name,
     const protobuf::Map<string, tensorflow::AttrValue>& attrs,
@@ -242,23 +261,20 @@ Status FunctionalizeControlFlowPass::Run(
       continue;
     }
     const string func_attr = it->second;
-    if (kNodeTypeToFunctionAttrMapping->find(n->type_string()) !=
-        kNodeTypeToFunctionAttrMapping->end()) {
-      NameAttrList func;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func));
-      VLOG(2) << "Graph has node " << n->type_string()
-              << ". Corresponding function: " << func.name();
-      string new_func_name = options.flib_def->UniqueFunctionName(
-          absl::StrCat(func.name(), "_f15n_"));
-      bool modified;
-      TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
-          func.name(), new_func_name, func.attr(), options.flib_def, flr,
-          &canonicalized_name_to_new_name, &modified));
-      if (modified) {
-        n->ClearAttr(func_attr);
-        func.set_name(new_func_name);
-        n->AddAttr(func_attr, func);
-      }
+    NameAttrList func;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func));
+    VLOG(2) << "Graph has node " << n->type_string()
+            << ". Corresponding function: " << func.name();
+    string new_func_name = options.flib_def->UniqueFunctionName(
+        absl::StrCat(func.name(), "_f15n_"));
+    bool modified;
+    TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
+        func.name(), new_func_name, func.attr(), options.flib_def, flr,
+        &canonicalized_name_to_new_name, &modified));
+    if (modified) {
+      n->ClearAttr(func_attr);
+      func.set_name(new_func_name);
+      n->AddAttr(func_attr, func);
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
index ba99205640ccdc83a3a4d50e3ec474907894a835..91d33fa405834d7f1f8f66180583580f4f2e448a 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
@@ -33,6 +33,12 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
                                 Graph* graph,
                                 FunctionLibraryDefinition* library);
 
+Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def,
+                                           FunctionLibraryDefinition* library);
+Status FunctionalizeControlFlowForGraphDef(
+    const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def,
+    FunctionLibraryDefinition* library);
+
 // This pass looks at the graph and all associated FunctionDefs, and turns
 // traditional control flow structure (Switch/Merge/etc.) into functional
 // control flow structure (If/While).
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index c3841f996f801e855da75b23f01d41674ec51c4d..9784985af83a18619d837528f99a60b98a501ec5 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -95,77 +95,87 @@ TEST(FunctionalizeControlFlow, Conditional) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    string op_name;
+    NameAttrList then_fn;
+    NameAttrList else_fn;
+    TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
+    InstantiationResultForTest else_result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(else_fn.name(), library, &else_result));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
+      auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+      auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
+      auto if_op = ops::If(scope.WithOpName(op_name), less,
+                           std::initializer_list<Input>{less, y, x}, {DT_INT32},
+                           then_fn, else_fn);
+      auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-  string op_name;
-  NameAttrList then_fn;
-  NameAttrList else_fn;
-  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
-  InstantiationResultForTest else_result;
-  TF_EXPECT_OK(
-      InstantiateFunctionForTest(else_fn.name(), library, &else_result));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::If(scope.WithOpName(op_name), less,
-                         std::initializer_list<Input>{less, y, x}, {DT_INT32},
-                         then_fn, else_fn);
-    auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // then body.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
-    auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0);
-    auto cond = ops::Const(
-        scope.WithOpName("cond").WithControlDependencies(identity), 17);
-    auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
+    // then body.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
+      auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0);
+      auto cond = ops::Const(
+          scope.WithOpName("cond").WithControlDependencies(identity), 17);
+      auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(then_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}),
+                result.arg_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-  // else body.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
-    auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0);
-    auto cond_1 = ops::Const(
-        scope.WithOpName("cond_1").WithControlDependencies(identity), 23);
-    auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // else body.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
+      auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0);
+      auto cond_1 = ops::Const(
+          scope.WithOpName("cond_1").WithControlDependencies(identity), 23);
+      auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(else_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}),
+                result.arg_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -239,75 +249,77 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
-    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
-    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+    // Condition graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+      auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+      auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
-// @function.Defun(noinline=True)
-// def increment_fn(x):
-//   return [x + 1]
-// Define the above function, and add it to the given graph. It's used as the
-// while loop body in NoinlineLoopBody test.
-Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
+FunctionDef GetNoinlineFunctionDef() {
   FunctionDef fdef = FunctionDefHelper::Create(
       "increment_fn", {"x:int32"}, {"add:int32"}, {},
       {
@@ -316,8 +328,17 @@ Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
       },
       {{"add", "add_0:z:0"}});
   (*fdef.mutable_attr())["_noinline"].set_b(true);
+  return fdef;
+}
+
+// @function.Defun(noinline=True)
+// def increment_fn(x):
+//   return [x + 1]
+// Define the above function, and add it to the given graph. It's used as the
+// while loop body in NoinlineLoopBody test.
+Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
   FunctionDefLibrary fdef_lib;
-  *(fdef_lib.add_function()) = fdef;
+  *(fdef_lib.add_function()) = GetNoinlineFunctionDef();
   TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib));
   NodeDef increment_fn;
   increment_fn.set_name(node_name);
@@ -376,55 +397,88 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
   FunctionLibraryDefinition lookup_lib(graph.flib_def());
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   // Function increment_fn will be copied from lookup_lib to library.
-  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
+  *(optimized_graph_def.mutable_library()->add_function()) =
+      GetNoinlineFunctionDef();
 
-  NameAttrList cond_fn, body_fn;
-  TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+  TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef(
+      &lookup_lib, &optimized_graph_def, &library));
+  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      GraphDef expected;
+      TF_ASSERT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    GraphDef expected;
-    TF_ASSERT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      TF_ASSERT_OK(
+          AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      NodeDef retval;
+      retval.set_name("_retval0_RetVal");
+      retval.set_op(FunctionLibraryDefinition::kRetOp);
+      *retval.add_input() = noinline_node_name;
+      (*retval.mutable_attr())["T"].set_type(DT_INT32);
+      (*retval.mutable_attr())["index"].set_i(0);
+      Status status;
+      scope.graph()->AddNode(retval, &status);
+      TF_ASSERT_OK(status);
+
+      GraphDef expected;
+      TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      // Verify that increment_fn has been copied to library.
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      // Ignore the function library when comparing the graphs.
+      expected.clear_library();
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
+}
 
-  // Body graph.
+TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) {
+  const string& noinline_node_name = "while/increment_fn";
+  Graph graph(OpRegistry::Global());
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto identity = ops::Identity(scope.WithOpName("while/Identity"), source);
     TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    NodeDef retval;
-    retval.set_name("_retval0_RetVal");
-    retval.set_op(FunctionLibraryDefinition::kRetOp);
-    *retval.add_input() = noinline_node_name;
-    (*retval.mutable_attr())["T"].set_type(DT_INT32);
-    (*retval.mutable_attr())["index"].set_i(0);
-    Status status;
-    scope.graph()->AddNode(retval, &status);
-    TF_ASSERT_OK(status);
-
-    GraphDef expected;
-    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+  }
 
-    InstantiationResultForTest result;
-    // Verify that increment_fn has been copied to library.
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+  FunctionLibraryDefinition lookup_lib(graph.flib_def());
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+  graph_def.clear_library();
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    // Ignore the function library when comparing the graphs.
-    expected.clear_library();
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
+  Status status =
+      FunctionalizeControlFlowForGraphDef(&lookup_lib, &graph_def, &library);
+  EXPECT_EQ(tensorflow::error::NOT_FOUND, status.code());
 }
 
 // Tests functionalizing OneLoopVar where the loop value is not used post the
@@ -467,65 +521,72 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
-    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
-    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+    // Condition graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+      auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+      auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -608,86 +669,95 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
+      auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{x, y}, cond_fn, body_fn);
+      auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
+      auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
-    auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{x, y}, cond_fn, body_fn);
-    auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
-    auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+    // Condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+                                         .WithControlDependencies(arg0.output),
+                                     3);
+      auto cond_add =
+          ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
+      auto ten = ops::Const<int32>(scope.WithOpName("while/cond/ten")
                                        .WithControlDependencies(arg0.output),
-                                   3);
-    auto cond_add =
-        ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/cond/ten").WithControlDependencies(arg0.output),
-        10);
-    auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-
-    auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
-    auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
-
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
-        1);
-    auto two = ops::Const<int32>(
-        scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
-        2);
+                                   10);
+      auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
 
-    auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
-    auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
 
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
 
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+
+      auto identity_x =
+          ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
+      auto identity_y =
+          ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
+
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
+          1);
+      auto two = ops::Const<int32>(
+          scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
+          2);
+
+      auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
+      auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+      auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -841,177 +911,192 @@ TEST(FunctionalizeControlFlow, Complex) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList outer_cond_fn, outer_body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
-
-  // Outer graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
-    auto y = ops::Add(scope.WithOpName("y"), x, three);
-
-    auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
-                                TensorShape({}));
-
-    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
-
-    auto while_op = ops::While(scope.WithOpName("outer/LoopCond"),
-                               std::initializer_list<Input>{zero, y, x, var},
-                               outer_cond_fn, outer_body_fn);
-    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Outer condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
-        10);
-    auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Outer body graph.
-  NameAttrList inner_cond_fn, inner_body_fn;
-  {
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
-
-    // Find the inner condition and body names.
-    TF_EXPECT_OK(
-        FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
-
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
-    auto one_j = ops::Const<int32>(
-        scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
-    auto while_op =
-        ops::While(scope.WithOpName("outer/LoopCond_1"),
-                   std::initializer_list<Input>{one_j, arg1, arg2, arg3},
-                   inner_cond_fn, inner_body_fn);
-
-    auto one_outer = ops::Const<int32>(
-        scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
-    auto add_i =
-        ops::Add(scope.WithOpName("outer/add")
-                     .WithControlDependencies(absl::Span<const Operation>{
-                         while_op[0].op(), while_op[1].op()}),
-                 identity_i, one_outer);
-
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
-    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
-    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Inner condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto five = ops::Const<int32>(
-        scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5);
-    auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList outer_cond_fn, outer_body_fn;
     TF_EXPECT_OK(
-        InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Inner body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto identity_j =
-        ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
-    auto identity_k =
-        ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
-
-    auto mul_jk =
-        ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
-    auto add_jkx = ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
-    auto assign = ops::AssignAddVariableOp(
-        scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
-
-    auto one = ops::Const<int32>(
-        scope.WithOpName("outer/inner/One")
-            .WithControlDependencies(
-                absl::Span<const Operation>{assign.operation}),
-        1);
-    auto add_j =
-        ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+        FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
+
+    // Outer graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+      auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+      auto y = ops::Add(scope.WithOpName("y"), x, three);
+
+      auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
+                                  TensorShape({}));
+
+      auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+
+      auto while_op = ops::While(scope.WithOpName("outer/LoopCond"),
+                                 std::initializer_list<Input>{zero, y, x, var},
+                                 outer_cond_fn, outer_body_fn);
+      auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
-    auto retval1 =
-        ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
-    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+    // Outer condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
+          10);
+      auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    // Outer body graph.
+    NameAttrList inner_cond_fn, inner_body_fn;
+    {
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
+
+      // Find the inner condition and body names.
+      TF_EXPECT_OK(
+          FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
+
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
+      auto one_j = ops::Const<int32>(
+          scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
+      auto while_op =
+          ops::While(scope.WithOpName("outer/LoopCond_1"),
+                     std::initializer_list<Input>{one_j, arg1, arg2, arg3},
+                     inner_cond_fn, inner_body_fn);
+
+      auto one_outer = ops::Const<int32>(
+          scope.WithOpName("outer/add/y").WithControlDependencies(identity_i),
+          1);
+      auto add_i =
+          ops::Add(scope.WithOpName("outer/add")
+                       .WithControlDependencies(absl::Span<const Operation>{
+                           while_op[0].op(), while_op[1].op()}),
+                   identity_i, one_outer);
+
+      auto retval0 =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
+      auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
+      auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+                result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+    // Inner condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto five = ops::Const<int32>(
+          scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0),
+          5);
+      auto less_j =
+          ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
+      auto retval =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Inner body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto identity_j =
+          ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
+      auto identity_k =
+          ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
+
+      auto mul_jk =
+          ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
+      auto add_jkx =
+          ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
+      auto assign = ops::AssignAddVariableOp(
+          scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
+
+      auto one = ops::Const<int32>(
+          scope.WithOpName("outer/inner/One")
+              .WithControlDependencies(
+                  absl::Span<const Operation>{assign.operation}),
+          1);
+      auto add_j =
+          ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+
+      auto retval0 =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
+      auto retval1 =
+          ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
+      auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+                result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 706ed4f5bbfac60de4653cc8c326214cd4d8d886..efb75749722893100494e089c0beb96944e9f1d4 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/validate.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -51,12 +52,11 @@ namespace {
 Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
                         const std::vector<const XlaExpression*>& expressions,
                         std::vector<XlaCompiler::Argument>* args) {
-  auto builder = ctx->builder();
   auto client = ctx->compiler()->client();
-  std::vector<bool> compile_time_constant_flags(expressions.size());
+  std::vector<bool> arg_must_be_compile_time_constant(expressions.size());
 
   TF_RETURN_IF_ERROR(
-      BackwardsConstAnalysis(*graph, &compile_time_constant_flags,
+      BackwardsConstAnalysis(*graph, &arg_must_be_compile_time_constant,
                              /*compile_time_const_nodes=*/nullptr));
 
   args->resize(expressions.size());
@@ -65,24 +65,31 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
     arg.type = ctx->input_type(i);
     arg.shape = ctx->InputShape(i);
 
-    if (arg.type == DT_RESOURCE) {
-      return errors::InvalidArgument(
-          "Resource as function argument is not yet implemented.");
-    } else if (expressions[i]->has_constant_value()) {
-      arg.kind = XlaCompiler::Argument::kConstant;
-      arg.constant_value = expressions[i]->constant_value();
-    } else if (compile_time_constant_flags[i]) {
-      arg.kind = XlaCompiler::Argument::kConstant;
-      TF_RET_CHECK(expressions[i]->resource() == nullptr)
-          << "Input with resource is not yet implemented.";
-      TF_ASSIGN_OR_RETURN(auto constant_graph, builder->BuildConstantSubGraph(
-                                                   expressions[i]->handle()));
-      TF_ASSIGN_OR_RETURN(auto literal,
-                          client->ComputeConstant(constant_graph));
-      TF_RETURN_IF_ERROR(
-          LiteralToHostTensor(literal, arg.type, &arg.constant_value));
-    } else {
-      arg.kind = XlaCompiler::Argument::kParameter;
+    switch (expressions[i]->kind()) {
+      case XlaExpression::Kind::kConstant:
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.constant_value = expressions[i]->constant_value();
+        break;
+      case XlaExpression::Kind::kXlaOp:
+        if (arg_must_be_compile_time_constant[i]) {
+          TF_ASSIGN_OR_RETURN(absl::optional<Tensor> value,
+                              expressions[i]->ResolveConstant(client));
+          if (!value.has_value()) {
+            return errors::InvalidArgument(
+                "Argument to function must be a compile-time constant, but "
+                "unable to resolve argument value to a constant.");
+          }
+          arg.kind = XlaCompiler::Argument::kConstant;
+          arg.constant_value = *value;
+        } else {
+          arg.kind = XlaCompiler::Argument::kParameter;
+        }
+        break;
+      case XlaExpression::Kind::kResource:
+        return errors::Unimplemented(
+            "Resource as function argument is not yet implemented.");
+      case XlaExpression::Kind::kInvalid:
+        return errors::InvalidArgument("Invalid function argument");
     }
   }
   return Status::OK();
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 9ee4178f5c213e919255bb33e9b15800a77256e6..d85b4f5ae0cb9c7d2476158a5830f921742ae980 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -178,6 +178,32 @@ tf_kernel_library(
     ],
 )
 
+# A separate cc_library for resampler_ops is needed because resampler is in
+# contrib/, and thus the declaration of resampler cannot be pulled into the deps
+# of xla_ops. Therefore, resampler_ops is its own cc_library target, and its
+# corresponding tf_kernel_library is defined in contrib/resampler/BUILD.
+cc_library(
+    name = "resampler_ops",
+    srcs = ["resampler_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:numeric",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "conv_op_helpers",
     srcs = ["conv_op_helpers.cc"],
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 276d744c096f8996c774964204feaa3762bdb844..2db2514397deca39e6874cf994532a20d2186316 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -14,11 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
@@ -49,13 +51,9 @@ class XlaArgOp : public XlaOpKernel {
     }
 
     const XlaExpression& arg = XlaContext::Get(ctx).args()[index_];
-    if (arg.resource() != nullptr) {
-      ctx->SetResourceOutput(0, arg.resource());
-    } else if (arg.has_constant_value()) {
-      ctx->SetConstantOutput(0, arg.constant_value());
-    } else {
-      ctx->SetOutput(0, arg.handle());
-    }
+    OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid,
+                errors::InvalidArgument("Invalid/missing argument expression"));
+    ctx->SetOutputExpression(0, arg);
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index 9fa57b76f8e3649c03fe41f39638b88cb065ed0e..c022284fec6bc91951170e243ea3609c8d5d0c43 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -94,14 +94,10 @@ class BCastGradArgsOp : public XlaOpKernel {
       OP_REQUIRES(ctx, TensorShapeUtils::IsVector(in_shape),
                   errors::InvalidArgument("In[", i, "] must be a vector.",
                                           in_shape.DebugString()));
-      xla::Literal literal;
-      OP_REQUIRES_OK(ctx, ctx->ConstantInput(i, &literal));
-
-      BCast::Vec vec;
-      for (int64 i = 0; i < in_shape.num_elements(); ++i) {
-        vec.push_back(literal.Get<int>({i}));
-      }
-      shapes.push_back(vec);
+      std::vector<int64> vec;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(i, &vec));
+
+      shapes.push_back(BCast::Vec(vec.begin(), vec.end()));
     }
     BCast bcast(shapes[0], shapes[1]);
     OP_REQUIRES(ctx, bcast.IsValid(),
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index ad85940920ebb82e72331516e3fe46c79f853892..3e398fff951a211f5af42d26983bc7473bddde63 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,10 +21,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -57,8 +60,6 @@ class CategoricalOp : public XlaOpKernel {
     const int64 batch_size = logits_shape.dim_size(0);
     const int64 num_classes = logits_shape.dim_size(1);
 
-    xla::XlaBuilder* builder = ctx->builder();
-
     xla::Shape uniform_shape;
     int class_dimension;
     if (num_samples > 1) {
@@ -83,16 +84,16 @@ class CategoricalOp : public XlaOpKernel {
           xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array);
       class_dimension = 1;
     }
-    xla::XlaOp uniforms =
-        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
-                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(0), &type));
+    xla::XlaOp log_uniforms = GetLogUniforms(uniform_shape, type, ctx);
 
     // Use Gumbel softmax trick to generate categorical samples.
     // See:
     // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/
     // TODO(b/68769470): Switch to using a cumulative sum approach.
     auto softmax_entries =
-        xla::Sub(logits, xla::Log(-xla::Log(uniforms)),
+        xla::Sub(logits, log_uniforms,
                  /*broadcast_dimensions=*/{0, class_dimension});
 
     xla::PrimitiveType xla_output_type;
@@ -107,6 +108,16 @@ class CategoricalOp : public XlaOpKernel {
     ctx->SetOutput(0, argmax);
   }
 
+  virtual xla::XlaOp GetLogUniforms(xla::Shape uniform_shape,
+                                    xla::PrimitiveType type,
+                                    XlaOpKernelContext* ctx) {
+    xla::XlaBuilder* builder = ctx->builder();
+    auto uniforms =
+        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
+                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    return xla::Log(-xla::Log(uniforms));
+  }
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp);
 };
@@ -115,5 +126,48 @@ class CategoricalOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstantInput("num_samples"),
                 CategoricalOp);
 
+class StatelessCategoricalOp : public CategoricalOp {
+ public:
+  explicit StatelessCategoricalOp(OpKernelConstruction* ctx)
+      : CategoricalOp(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, xla::PrimitiveType type,
+                            XlaOpKernelContext* ctx) override {
+    xla::XlaOp seed = ctx->Input(2);
+    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
+    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
+
+    xla::XlaBuilder* builder = ctx->builder();
+    if (uniform_shape.element_type() == xla::BF16) {
+      uniform_shape.set_element_type(xla::F32);
+    }
+    auto uniforms = xla::StatelessRngUniform(
+        {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT),
+        XlaHelpers::One(builder, DT_FLOAT));
+    return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape seed_shape = ctx->InputShape(2);
+    OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    CategoricalOp::Compile(ctx);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp);
+};
+
+REGISTER_XLA_OP(Name("StatelessMultinomial")
+                    .CompileTimeConstantInput("num_samples")
+                    .TypeConstraint("T", {DT_FLOAT, DT_BFLOAT16})
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessCategoricalOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index e28755dd73bf6f8e1518dda2494cade79b7db22e..cd7c7f4a82df7a65829787efcb1fd2f77870e945 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -45,15 +46,13 @@ class ConcatBaseOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape concat_dim_tensor_shape = ctx->InputShape(axis_index_);
-    OP_REQUIRES(
-        ctx, IsLegacyScalar(concat_dim_tensor_shape),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_tensor_shape.DebugString()));
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(axis_index_, &literal));
-    // TODO(annarev): add a helper to support int64 input.
-    const int32 concat_dim = literal.Get<int>({});
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(concat_dim_tensor_shape),
+                errors::InvalidArgument(
+                    "Concat dim tensor should be a scalar, but got shape ",
+                    concat_dim_tensor_shape.DebugString()));
+    int64 concat_dim;
+    OP_REQUIRES_OK(ctx,
+                   ctx->ConstantInputAsIntScalar(axis_index_, &concat_dim));
 
     std::vector<xla::XlaOp> values;
     std::vector<TensorShape> shapes;
@@ -63,9 +62,7 @@ class ConcatBaseOp : public XlaOpKernel {
     const TensorShape& input_shape = shapes[0];
 
     int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(ctx,
-                (0 <= axis && axis < input_dims) ||
-                    (allow_legacy_scalars() && concat_dim == 0),
+    OP_REQUIRES(ctx, 0 <= axis && axis < input_dims,
                 errors::InvalidArgument(
                     "ConcatOp : Expected concatenating dimensions in the range "
                     "[",
@@ -75,14 +72,11 @@ class ConcatBaseOp : public XlaOpKernel {
     // elements.
     std::vector<xla::XlaOp> input_data;
     int output_concat_dim = 0;
-    const bool input_is_scalar = IsLegacyScalar(input_shape);
     for (int i = 0; i < N; ++i) {
       xla::XlaOp handle = values[i];
       const TensorShape& in_shape = shapes[i];
-      const bool in_is_scalar = IsLegacyScalar(in_shape);
       OP_REQUIRES(
-          ctx,
-          in_shape.dims() == input_dims || (input_is_scalar && in_is_scalar),
+          ctx, in_shape.dims() == input_dims,
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
               input_shape.DebugString(), " vs. shape[", i,
@@ -131,11 +125,10 @@ class ConcatOffsetOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape concat_dim_shape = ctx->InputShape(0);
-    OP_REQUIRES(
-        ctx, IsLegacyScalar(concat_dim_shape),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(concat_dim_shape),
+                errors::InvalidArgument(
+                    "Concat dim tensor should be a scalar, but got shape ",
+                    concat_dim_shape.DebugString()));
     for (int i = 1; i < ctx->num_inputs(); ++i) {
       OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ctx->InputShape(i)),
                   errors::InvalidArgument("input ", i,
@@ -162,39 +155,38 @@ class ConcatOffsetOp : public XlaOpKernel {
     //  [0, 5, 0, 0]
     const int32 N = ctx->num_inputs() - 1;
     const TensorShape inp0_shape = ctx->InputShape(1);
-    xla::Literal inp0_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &inp0_literal));
-    const int64 dims = inp0_shape.num_elements();
+    std::vector<int64> inp0_dims;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &inp0_dims));
+    const int64 inp0_rank = inp0_shape.num_elements();
 
-    xla::Literal concat_dim_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &concat_dim_literal));
-    const int64 cdim = concat_dim_literal.Get<int>({});
+    int64 cdim;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &cdim));
 
-    VLOG(1) << "ConcatOffset " << cdim << "," << dims;
-    int32 axis = cdim < 0 ? cdim + dims : cdim;
-    OP_REQUIRES(ctx, FastBoundsCheck(axis, dims),
+    VLOG(1) << "ConcatOffset " << cdim << "," << inp0_rank;
+    int32 axis = cdim < 0 ? cdim + inp0_rank : cdim;
+    OP_REQUIRES(ctx, FastBoundsCheck(axis, inp0_rank),
                 errors::InvalidArgument("Concat dim is out of range: ", axis,
-                                        " vs. ", dims));
+                                        " vs. ", inp0_rank));
     int32 offset = 0;
     for (int i = 0; i < N; ++i) {
       const TensorShape inp_shape = ctx->InputShape(1 + i);
-      OP_REQUIRES(ctx, dims == inp_shape.num_elements(),
-                  errors::InvalidArgument("input ", i, " should contain ", dims,
-                                          " elements, but got ",
+      OP_REQUIRES(ctx, inp0_rank == inp_shape.num_elements(),
+                  errors::InvalidArgument("input ", i, " should contain ",
+                                          inp0_rank, " elements, but got ",
                                           inp_shape.num_elements()));
-      xla::Literal inp_literal;
-      OP_REQUIRES_OK(ctx, ctx->ConstantInput(1 + i, &inp_literal));
+      std::vector<int64> inp_dims;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1 + i, &inp_dims));
 
-      Tensor out_constant(DT_INT32, TensorShape({dims}));
+      Tensor out_constant(DT_INT32, TensorShape({inp0_rank}));
       auto out_vec = out_constant.vec<int32>();
-      for (int64 j = 0; j < dims; ++j) {
+      for (int64 j = 0; j < inp0_rank; ++j) {
         if (j == axis) {
           out_vec(j) = offset;
-          offset += inp_literal.Get<int>({j});
+          offset += inp_dims[j];
         } else {
-          const int32 inp0_element = inp0_literal.Get<int>({j});
-          const int32 inp_element = inp_literal.Get<int>({j});
-          OP_REQUIRES(ctx, (inp0_element == inp_element),
+          const int32 inp0_element = inp0_dims[j];
+          const int32 inp_element = inp_dims[j];
+          OP_REQUIRES(ctx, inp0_element == inp_element,
                       errors::InvalidArgument("input[", i, ",", j,
                                               "] mismatch: ", inp0_element,
                                               " vs. ", inp_element));
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index 2628ef8e2454976aeff3859fa5dc1d8e106f32e1..dff8af800229b9605bb93e0498bc5e5cf012f244 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -42,11 +42,6 @@ class ConstOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     TensorShape shape(proto_.tensor_shape());
 
-    if (proto_.dtype() == DT_STRING) {
-      LOG(WARNING) << "Not computing Const of type DT_STRING";
-      ctx->SetInvalidOutput(0);
-      return;
-    }
     xla::XlaBuilder* b = ctx->builder();
 
     // To avoid blowups for large constants filled with the same value,
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index c9a1be494066e4f935a1d818bc86c86333e34fae..b1046fcc0001a3eb450c82a8545e2cfdf4e43fd0 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/node_def_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index c68b0bfd7961892294c2931e5c4c44de534a7740..29687c7b82f92d9f336854c4575746589c63b64f 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
index e9bdb15aa0c57fd95530798f87c68e2e63e84e1d..35e0625dbb0d4c696d36cce642d6f50f1d220c45 100644
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 namespace {
@@ -33,39 +34,20 @@ class FillOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     // The output of this Op is a tensor of shape 'dims_shape' with each
     // element set to the scalar 'dims_literal'.
-    const TensorShape dims_shape = ctx->InputShape(0);
-    const TensorShape value_shape = ctx->InputShape(1);
+    const TensorShape dims_shape = ctx->InputShape("dims");
+    const TensorShape value_shape = ctx->InputShape("value");
     OP_REQUIRES(
-        ctx, IsLegacyVector(dims_shape),
+        ctx, TensorShapeUtils::IsVector(dims_shape),
         errors::InvalidArgument("dims must be a vector of int32, got shape ",
                                 dims_shape.DebugString()));
-    OP_REQUIRES(ctx, IsLegacyScalar(value_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(value_shape),
                 errors::InvalidArgument("value must be a scalar, got shape ",
                                         value_shape.DebugString()));
-    // Evaluate the 'dims' constant input, reshaping to a vector if it
-    // was a 'legacy' vector (secretly a scalar).
-    xla::Literal dims_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(
-                            0, {dims_shape.num_elements()}, &dims_literal));
 
-    // Convert the dims literal into a vector that we can pass to
-    // XlaBuilder.
-    std::vector<int64> broadcast;
-    broadcast.reserve(dims_literal.shape().dimensions(0));
-    for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) {
-      broadcast.push_back(dims_literal.Get<int>({i}));
-    }
-    // Look up the value input, reshaping to a scalar if it was a
-    // 'legacy' scalar (secretly a vector).
-    xla::XlaOp data = ctx->Input(1);
-    if (value_shape.dims() > 0) {
-      CHECK_EQ(value_shape.dims(), 1);
-      data = xla::Reshape(data, {});
-    }
-    // Emit the actual computation, which broadcasts the scalar to the
-    // desired shape.
-    auto result = xla::Broadcast(data, broadcast);
+    std::vector<int64> dims;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("dims", &dims));
 
+    auto result = xla::Broadcast(ctx->Input("value"), dims);
     ctx->SetOutput(0, result);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index d069373086a6dcd6e7901abfc63d851a731da321..42bf4b06e5da7c6f99ad32ae36131dffd124d103 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -48,9 +48,8 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     // We require that the dimension argument is a constant, since it lets us
     // dispatch to a specialized custom-call function without any run-time
     // overhead, when compiling ahead-of-time.
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal));
-    const int32 dim = literal.Get<int32>({});
+    int64 dim;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &dim));
     OP_REQUIRES(ctx, dim >= 0, errors::InvalidArgument("dim must be >= 0"));
     OP_REQUIRES(
         ctx, dim < input_shape.dims(),
@@ -120,6 +119,10 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
                         ", but got shape: ",
                         input_shape.DebugString()));
     }
+    const DataType dtype = output_type(0);
+    xla::PrimitiveType output_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype, &output_type));
+    output = xla::ConvertElementType(output, output_type);
     ctx->SetOutput(0, output);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
index 8dfd7de591c4a3c4768dd60b41e03d294ad49397..a99b74565dab4587bee999e3d73340ff58d21f77 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
index c0ca881ff82cee04e0c5e35f9a2d5732fabdd8a6..4f980b6d14ed667bdf4756ed740894098cae5919 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index 4833a9662dd12ca72b5715373b549af105625d45..f6b8534f4d7c537e5b708ee000e00cb92123584b 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -41,10 +41,8 @@ class MirrorPadOp : public XlaOpKernel {
     for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0;
          --dimno) {
       auto t_rev = xla::Rev(accum, {dimno});
-      TF_ASSIGN_OR_RETURN(int64 lhs_padding,
-                          pad_literal.GetIntegralAsS64({dimno, 0}));
-      TF_ASSIGN_OR_RETURN(int64 rhs_padding,
-                          pad_literal.GetIntegralAsS64({dimno, 1}));
+      int64 lhs_padding = pad_literal.Get<int64>({dimno, 0});
+      int64 rhs_padding = pad_literal.Get<int64>({dimno, 1});
       int64 dim_size = original_shape.dimensions(dimno);
 
       // Padding amounts on each side must be no more than the size of the
@@ -65,8 +63,8 @@ class MirrorPadOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape pad_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape pad_shape = ctx->InputShape("paddings");
 
     MirrorPadMode mode;
     OP_REQUIRES_OK(ctx, GetNodeAttr(def(), "mode", &mode));
@@ -81,23 +79,19 @@ class MirrorPadOp : public XlaOpKernel {
         TensorShapeUtils::IsMatrix(pad_shape) && pad_shape.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 pad_shape.DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && pad_shape.dim_size(0) == 1)
-            ? 1
-            : dims;
     OP_REQUIRES(
-        ctx, fixed_dims == pad_shape.dim_size(0),
+        ctx, dims == pad_shape.dim_size(0),
         errors::InvalidArgument(
             "The first dimension of paddings must be the rank of inputs",
             pad_shape.DebugString(), " ", input_shape.DebugString()));
 
     // Evaluate the 'padding' constant input, reshaping to a matrix.
     xla::Literal pad_literal;
-    OP_REQUIRES_OK(
-        ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ConstantInputAsInt64Literal("paddings", &pad_literal));
 
     xla::XlaBuilder* b = ctx->builder();
-    auto in0 = ctx->Input(0);
+    auto in0 = ctx->Input("input");
     xla::StatusOr<xla::Shape> in0_shape = b->GetShape(in0);
     OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status());
     xla::StatusOr<xla::XlaOp> accum_status =
diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
index 3f5445b4821b0918f3b220ecfe2be20bccb33dc2..36ea70ac392ff18fb52d400efa886533f8335eba 100644
--- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 namespace {
@@ -29,40 +30,36 @@ class PadOp : public XlaOpKernel {
   explicit PadOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape pad_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape pad_shape = ctx->InputShape("paddings");
     const int dims = input_shape.dims();
     OP_REQUIRES(
         ctx,
         TensorShapeUtils::IsMatrix(pad_shape) && pad_shape.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 pad_shape.DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && pad_shape.dim_size(0) == 1)
-            ? 1
-            : dims;
     OP_REQUIRES(
-        ctx, fixed_dims == pad_shape.dim_size(0),
+        ctx, dims == pad_shape.dim_size(0),
         errors::InvalidArgument(
             "The first dimension of paddings must be the rank of inputs",
             pad_shape.DebugString(), " ", input_shape.DebugString()));
 
-    if (fixed_dims == 0) {
+    xla::XlaOp input = ctx->Input("input");
+    if (dims == 0) {
       // Tensor is rank 0. Return it unchanged.
-      ctx->SetOutput(0, ctx->Input(0));
+      ctx->SetOutput(0, input);
       return;
     }
 
-    // Evaluate the 'padding' constant input, reshaping to a matrix.
     xla::Literal pad_literal;
-    OP_REQUIRES_OK(
-        ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ConstantInputAsInt64Literal("paddings", &pad_literal));
 
     xla::PaddingConfig config;
-    for (int i = 0; i < fixed_dims; ++i) {
+    for (int i = 0; i < dims; ++i) {
       auto* dim = config.add_dimensions();
-      int before = pad_literal.Get<int32>({i, 0});
-      int after = pad_literal.Get<int32>({i, 1});
+      int before = pad_literal.Get<int64>({i, 0});
+      int after = pad_literal.Get<int64>({i, 1});
       OP_REQUIRES(ctx, before >= 0 && after >= 0,
                   errors::InvalidArgument(
                       "Paddings must be non-negative: ", before, " ", after));
@@ -73,12 +70,13 @@ class PadOp : public XlaOpKernel {
     // PadV2 added a "constant_values" input that indicates the pad value.
     xla::XlaOp constant_values;
     if (ctx->num_inputs() == 3) {
-      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)),
-                  errors::InvalidArgument("constant_values must be a scalar."));
-      ctx->SetOutput(0, xla::Pad(ctx->Input(0), ctx->Input(2), config));
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsScalar(ctx->InputShape("constant_values")),
+          errors::InvalidArgument("constant_values must be a scalar."));
+      ctx->SetOutput(0, xla::Pad(input, ctx->Input("constant_values"), config));
     } else {
       auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0));
-      ctx->SetOutput(0, xla::Pad(ctx->Input(0), zero, config));
+      ctx->SetOutput(0, xla::Pad(input, zero, config));
     }
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 415ce9b77ffeac8a6a5f3c23537afb16c1d3567c..8822e29f7e77b1cbc6fa6ca61d0062d9b1b0c36e 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 107fa62967a55dffcfff8728b65338564e5202d2..132160de707911f26389034e16236985bb18e6ad 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -113,11 +113,21 @@ class MeanOp : public XlaReductionOp {
     xla::Add(scalar_lhs, scalar_rhs);
   }
 
-  xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder,
-                            const xla::XlaOp& reduce_output,
-                            int64 num_elements_reduced) override {
-    auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0),
-                                              num_elements_reduced);
+  xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* /*builder*/, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce) override {
+    if (dimensions_to_reduce.empty()) {
+      return reduce_output;
+    }
+    auto divisor = xla::GetDimensionSize(input, dimensions_to_reduce[0]);
+    for (int i = 1; i < dimensions_to_reduce.size(); i++) {
+      auto size = xla::GetDimensionSize(input, dimensions_to_reduce[i]);
+      divisor = xla::Mul(divisor, size);
+    }
+    xla::PrimitiveType type;
+    TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
+    divisor = xla::ConvertElementType(divisor, type);
     return reduce_output / divisor;
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 466e79828d111ee7cadcf713703e8f252c63e62c..8f1667df5b72e9ecf97e5771670ef209dee287a3 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -48,13 +48,14 @@ class XlaReductionOp : public XlaOpKernel {
                             const xla::XlaOp& scalar_rhs) = 0;
 
   // Applies a transformation to the output of the reduction. The desired
-  // computation should be added to 'builder'. Argument 'reduce_output' is the
-  // output of the reduction. 'num_elements_reduced' is the number of elements
-  // that contributed to the reduction. Returns the transformed reduction
-  // output, Defaults to returning 'reduce_output' unchanged.
-  virtual xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder,
-                                    const xla::XlaOp& reduce_output,
-                                    int64 num_elements_reduced);
+  // computation should be added to 'builder'. Argument 'input' is the original
+  // input of the reduction; 'reduce_output' is the output of the reduction.
+  // Returns the transformed reduction output, Defaults to returning
+  // 'reduce_output' unchanged.
+  virtual xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* builder, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce);
 
   void Compile(XlaOpKernelContext* ctx) override;
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 118f2798d559f43acb7f6394a7337426164325ef..e96cabbb853be744dbba7f19fbbd227bb52ebb06 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -37,9 +37,10 @@ XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
 
 // Unless BuildFinalizer is overridden the reduction has no
 // finalizer.
-xla::XlaOp XlaReductionOp::BuildFinalizer(xla::XlaBuilder* builder,
-                                          const xla::XlaOp& reduce_output,
-                                          int64 num_elements_reduced) {
+xla::XlaOp XlaReductionOp::BuildFinalizer(
+    xla::XlaBuilder* /*builder*/, const xla::XlaOp& /*input*/,
+    const xla::XlaOp& reduce_output,
+    const std::vector<int64>& /*dimensions_to_reduce*/) {
   return reduce_output;
 }
 
@@ -71,7 +72,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
 
   absl::InlinedVector<bool, 4> bitmap(data_shape.dims(), false);
   std::vector<int64> xla_axes;
-  int64 num_elements_reduced = 1LL;
   for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) {
     int64 index = axes[i];
     OP_REQUIRES(ctx,
@@ -82,7 +82,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
     index = (index + data_shape.dims()) % data_shape.dims();
     bitmap[index] = true;
     xla_axes.push_back(index);
-    num_elements_reduced *= data_shape.dim_size(index);
   }
 
   std::vector<int64> final_shape;
@@ -119,7 +118,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
 
   auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes);
   auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
-  auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced);
+  auto finalized = BuildFinalizer(b, data, deconverted, xla_axes);
   auto result = keep_dims_ ? xla::Reshape(finalized, final_shape) : finalized;
   ctx->SetOutput(0, result);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a8f33c8f39e47d7bd1f59413be880c51d273cf1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
@@ -0,0 +1,587 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+using xla::XlaOp;
+
+// Calculates the bilinear weight tensor, given basis ratio (px, py) of the
+// sampling position:
+//    W = [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py]
+// 'ratio' tensor has dimensions [batch, dim_0, ...dim_n, 2].
+//
+// The returned tensor has dimensions [batch, dim_0, ... dim_n, 4].
+XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio,
+                      const TensorShape warp_shape,
+                      xla::PrimitiveType xla_type) {
+  auto first_term = xla::ConstantR2<float>(
+      ctx->builder(), {{1.0, 1.0}, {0.0, 1.0}, {1.0, 0.0}, {0.0, 0.0}});
+  first_term = xla::ConvertElementType(first_term, xla_type);
+
+  auto warp_dims = warp_shape.dim_sizes();
+  std::vector<int64> broadcast_dims(warp_dims.begin(), warp_dims.end() - 1);
+  broadcast_dims.push_back(4);
+  broadcast_dims.push_back(2);
+
+  const int64 broadcast_dims_size = broadcast_dims.size();
+
+  std::vector<int64> last_two_dims_indices = {(broadcast_dims_size - 2),
+                                              (broadcast_dims_size - 1)};
+
+  xla::Shape broadcast_shape =
+      xla::ShapeUtil::MakeShape(xla_type, broadcast_dims);
+
+  auto broadcast_first_term =
+      xla::BroadcastInDim(first_term, broadcast_shape, last_two_dims_indices);
+
+  // Ratio is of the same dimension as warp, which is [batch, dim_0,... dim_n,
+  // 2], we broadcast ratio tensor to 'broadcast_dim' by keeping the
+  // [batch, dim_0,...dim_n] dimensions and the [2] dimension as the last
+  // dimension.
+  std::vector<int64> ratio_broadcast_indices(broadcast_dims.size());
+  std::iota(ratio_broadcast_indices.begin(), ratio_broadcast_indices.end(), 0);
+  ratio_broadcast_indices.erase(ratio_broadcast_indices.end() - 2);
+
+  auto broadcast_ratio =
+      xla::BroadcastInDim(ratio, broadcast_shape, ratio_broadcast_indices);
+
+  auto first_term_subtract_weights = broadcast_first_term - broadcast_ratio;
+
+  // Now we have [(1-px, 1-py), (-px, 1-py), (1-px, -py), (px, py)], need to
+  // flip the signs of the second and the third term.
+  auto sign_change = xla::ConstantR2<float>(
+      ctx->builder(), {{1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {1.0, 1.0}});
+  sign_change = xla::ConvertElementType(sign_change, xla_type);
+
+  auto broadcast_sign_change =
+      xla::BroadcastInDim(sign_change, broadcast_shape, last_two_dims_indices);
+
+  auto flipped = first_term_subtract_weights * broadcast_sign_change;
+
+  // Build up the final bilinear weight tensor by multiply reduction, which
+  // gives:
+  //    [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py]
+  // for each 4 neighboring pixels where px and py are the weight of the target
+  // pixel we are sampling from.
+  return xla::Reduce(
+      flipped, xla::One(ctx->builder(), xla_type),
+      xla::CreateScalarMultiplyComputation(xla_type, ctx->builder()),
+      {broadcast_dims_size - 1});
+}
+
+// Concatenates the batch indices to the (x, y) coordinate indices.
+// This is done by first creating an Iota tensor that represents the current
+// batch it is in, then concatenate with the givin (coordinate) indices.
+//
+// The resulting tensor has dimension (batch, dim_0, ... dim_n, 3) where
+// the last dimension of size 3 in turn is [batch_number, x, y].
+// The [batch_number, x, y] dimension is needed because the indices
+// [x,y] alone cannot allow the xla::Gather operation to gather from the input
+// data, which is of dimension [batch, height(y), width(x), channel] with
+// 'batch' being the first dimension.
+XlaOp ConcatenateIota(xla::XlaBuilder* b, XlaOp indices,
+                      const TensorShape& warp_shape) {
+  // We need to create an iota tensor with the same batch dimension.
+  std::vector<int64> dimensions;
+  for (auto dim : warp_shape) {
+    dimensions.push_back(dim.size);
+  }
+  // Except the last dimension, which is of size 1.
+  dimensions.back() = 1;
+
+  auto batch_indices =
+      xla::Iota(b, xla::ShapeUtil::MakeShape(xla::U32, dimensions),
+                /*iota_dimension=*/0);
+
+  return xla::ConcatInDim(b, {batch_indices, indices}, dimensions.size() - 1);
+}
+
+// Gathers the 2x2 neighbors of the input starting_indices, and return a
+// tensor of dimension [batch, dim_0, ... dim_n, 4, data_channels].
+// 'gather_indices' is of dimension [batch, dim_0, ..., dim_n, 3] where the last
+// dimension of size 3 is (batch_no, x, y).
+XlaOp Gather2by2Neighbors(xla::XlaBuilder* b, XlaOp data, XlaOp gather_indices,
+                          int64 data_channels, int warp_dims) {
+  xla::GatherDimensionNumbers gather_dim_numbers;
+  const int64 neighbor_data_dimensions = warp_dims + 2;
+  // Since the Gather output dimensions are [batch, dim_0, ... dim_n, 2, 2,
+  // data_channels], the offset dimensions for Gather is the last 3 dimensions.
+  gather_dim_numbers.add_offset_dims(neighbor_data_dimensions - 3);
+  gather_dim_numbers.add_offset_dims(neighbor_data_dimensions - 2);
+  gather_dim_numbers.add_offset_dims(neighbor_data_dimensions - 1);
+  // The last dimension of 'gather_indices' is the starting indices for gather.
+  gather_dim_numbers.set_index_vector_dim(warp_dims - 1);
+  gather_dim_numbers.add_collapsed_slice_dims(0);
+  gather_dim_numbers.add_start_index_map(0);
+  // Since input is of dimension [batch, height(y), width(x), channel], and warp
+  // is of dimension [batch, x, y], the ordering of x, y here needs to be
+  // swapped when gathering.
+  gather_dim_numbers.add_start_index_map(2);
+  gather_dim_numbers.add_start_index_map(1);
+  // Data dimensions are [batch, x, y, channel].
+  // Output dimensions are [batch, dim_0, ... dim_n, 2, 2, data_channels].
+  auto neighbors_data = xla::Gather(data, gather_indices, gather_dim_numbers,
+                                    /*slice_sizes=*/{1, 2, 2, data_channels});
+  // Collapse the ...,2,2,... dimensions into ...,4,...
+  return xla::Collapse(neighbors_data, {warp_dims - 1, warp_dims});
+}
+
+// Scatter 'updates' tensor to 'grad_data' based on 'indices'. Returns the
+// resulting tensor of dimension: [batch, dim_0, ...dim_n, 2, 2, data_channels].
+// This function can also be seen as the inverse of 'Gather2by2Neighbors'.
+XlaOp ScatterToGradData(XlaOpKernelContext* ctx, XlaOp grad_data, XlaOp indices,
+                        XlaOp updates, int64 warp_dims,
+                        xla::PrimitiveType xla_type) {
+  xla::ScatterDimensionNumbers scatter_dim_numbers;
+  const int64 neighbor_data_dimensions = warp_dims + 2;
+  // Since the Scatter output dimensions are [batch, dim_0, ... dim_n, 2, 2,
+  // data_channels], the update window dimensions is the last 3 dimensions.
+  scatter_dim_numbers.add_update_window_dims(neighbor_data_dimensions - 3);
+  scatter_dim_numbers.add_update_window_dims(neighbor_data_dimensions - 2);
+  scatter_dim_numbers.add_update_window_dims(neighbor_data_dimensions - 1);
+  scatter_dim_numbers.set_index_vector_dim(warp_dims - 1);
+
+  scatter_dim_numbers.add_inserted_window_dims(0);
+  scatter_dim_numbers.add_scatter_dims_to_operand_dims(0);
+  // Since input is of dimension [batch, height(y), width(x), channel], and warp
+  // is of dimension [batch, x, y], the ordering of x, y here needs to be
+  // swapped when scattering.
+  scatter_dim_numbers.add_scatter_dims_to_operand_dims(2);
+  scatter_dim_numbers.add_scatter_dims_to_operand_dims(1);
+
+  return xla::Scatter(grad_data, indices, updates,
+                      xla::CreateScalarAddComputation(xla_type, ctx->builder()),
+                      scatter_dim_numbers);
+}
+
+// Build computation the backprop into input 'data'.
+// Where input:
+// grad_output is of dimension [batch, dim_0, ...dim_n, channel]
+// ratio is of dimension [batch, dim_0, ...dim_n, 2]
+// gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
+//
+// Output:
+// scatter-add to each 2x2 grad_data neighbor:
+//  grad_data[fx, fy, chan] += output_grad * dx * dy
+//  grad_data[cx, fy, chan] += output_grad * (1 - dx) * dy
+//  grad_data[fx, cy, chan] += output_grad * dx * (1 - dy)
+//  grad_data[cx, cy, chan] += output_grad * (1 - dx) * (1 - dy)
+// where (dx, dy) is (1 - ratio).
+XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
+                        XlaOp gather_indices, xla::PrimitiveType warp_type,
+                        TensorShape warp_shape, int64 data_channels,
+                        xla::Shape data_shape) {
+  // Weights tensor has dimension [batch, dim_0, ... dim_n, 4].
+  auto weights = BilinearWeights(ctx, ratio, warp_shape, warp_type);
+
+  auto warp_dims = warp_shape.dim_sizes();
+  std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
+                                                 warp_dims.end() - 1);
+
+  std::vector<int64> reshaped_weights_dims = warp_dims_without_last_dims;
+  // Reshape the last dimension of size 4 to two dimensions [2, 2].
+  reshaped_weights_dims.push_back(2);
+  reshaped_weights_dims.push_back(2);
+  std::vector<int64> reshape_dims(warp_shape.dims());
+  std::iota(reshape_dims.begin(), reshape_dims.end(), 0);
+  // The dimension is [batch, dim_0,..., dim_n, 2, 2].
+  auto reshaped_weights = xla::Reshape(weights, /*dimensions=*/reshape_dims,
+                                       /*new_sizes=*/reshaped_weights_dims);
+
+  std::vector<int64> weights_with_channels_dims = reshaped_weights_dims;
+  weights_with_channels_dims.push_back(data_channels);
+  auto weights_with_channels_shape =
+      xla::ShapeUtil::MakeShape(warp_type, weights_with_channels_dims);
+  std::vector<int64> reshaped_weights_indices(reshaped_weights_dims.size());
+  std::iota(reshaped_weights_indices.begin(), reshaped_weights_indices.end(),
+            0);
+
+  // The dimension is [batch, dim_0, ..., dim_n, 2, 2, data_channel].
+  auto broadcast_reshaped_weights = xla::BroadcastInDim(
+      reshaped_weights, weights_with_channels_shape, reshaped_weights_indices);
+
+  std::vector<int64> grad_output_indices(warp_dims_without_last_dims.size());
+  std::iota(grad_output_indices.begin(), grad_output_indices.end(), 0);
+  grad_output_indices.push_back(weights_with_channels_dims.size() - 1);
+  XlaOp broadcast_grad_output = xla::BroadcastInDim(
+      grad_output, weights_with_channels_shape, grad_output_indices);
+
+  auto grad_output_multiply_weights =
+      broadcast_grad_output * broadcast_reshaped_weights;
+
+  auto grad_data = xla::ConstantLiteral(
+      ctx->builder(), xla::Literal::CreateFromShape(data_shape));
+
+  return ScatterToGradData(ctx, grad_data, gather_indices,
+                           grad_output_multiply_weights, warp_shape.dims(),
+                           warp_type);
+}
+
+// Build computation for the backprop into input 'warp'.
+// Where input:
+// warp is of dimension [batch, dim_0, ...dim_n, 2]
+// grad_output is of dimension [batch, dim_0, ...dim_n, channel]
+// ratio is of dimension [batch, dim_0, ...dim_n, 2]
+// gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
+// data is of dimension [batch, x, y, channel]
+//
+// Output (simplified by ignoring the batch dimensions):
+// Since the forward path has:
+//    output = dot(weights * neighbors)
+// The backprop into warp will therefore be:
+//    grad_warp = output_grad * d_output / d_warp
+//              = output_grad * (d_weights / d_warp * neighbors + d_neighbors /
+//              d_warp * weight)
+// Where:
+//    d_weights / d_warp_x = [-(1 - py), (1 - py), -py, py]
+//    d_weights / d_warp_y = [-(1 - px), -px, (1-px), px]
+// and
+//    d_neighbors / d_warp_x = 0
+//
+// Therefore:
+//    grad_warp_x = py * (img_cxcy - img_fxcy) + (1-py) * (img_cxfy-img_fxfy)
+//    grad_warp_y = px * (img_cxcy - img_cxfy) + (1-px) * (img_fxcy-img_fxfy)
+//
+// where (px, py) is warp, (fx, fy) is the left top corner and (cx, cy) is the
+// bottom right corner in a 2x2 neighborhood.
+XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
+                        XlaOp gather_indices, XlaOp data,
+                        TensorShape warp_shape, int64 data_channels,
+                        xla::PrimitiveType data_type) {
+  auto warp_dims = warp_shape.dim_sizes();
+  std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
+                                                 warp_dims.end() - 1);
+
+  std::vector<int64> neighbor_broadcast_dims = warp_dims_without_last_dims;
+  neighbor_broadcast_dims.push_back(4);
+
+  // With dimension [batch, dim_0, ...dim_n, 4]
+  auto neighbor_broadcast_shape =
+      xla::ShapeUtil::MakeShape(data_type, neighbor_broadcast_dims);
+
+  // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
+  auto neighbors_data = Gather2by2Neighbors(
+      ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
+
+  const int64 last_warp_dim = warp_shape.dims() - 1;
+
+  // Since we will be creating the dot product of:
+  //  lhs: [batch, dim_0, ...dim_n, 4]
+  // and
+  //  rhs: [batch, dim_0, ...dim_n, 4, data_channels]
+  // we choose the last dimension of lhs and the second last dimension of rhs,
+  // with size 4, as the contracting dimension.
+  xla::DotDimensionNumbers dot_dims;
+  for (int i = 0; i < warp_shape.dims() - 1; ++i) {
+    dot_dims.add_lhs_batch_dimensions(i);
+    dot_dims.add_rhs_batch_dimensions(i);
+  }
+  dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1);
+  dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1);
+
+  // img_cxcy - img_fxcy
+  auto bottom_right_minus_bottom_left = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {0, 0, -1, 1}), data_type),
+          neighbor_broadcast_shape, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // img_cxfy - img_fxfy
+  auto top_right_minus_top_left = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {-1, 1, 0, 0}), data_type),
+          neighbor_broadcast_shape, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // img_cxcy - img_cxfy
+  auto bottom_right_minus_top_right = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {0, -1, 0, 1}), data_type),
+          neighbor_broadcast_shape, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // img_fxcy - img_fxfy
+  auto bottom_left_minus_top_left = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {-1, 0, 1, 0}), data_type),
+          neighbor_broadcast_shape, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // Slice out x and y.
+  auto weight_x = xla::SliceInDim(ratio, /*start_index=*/0, /*limit_index=*/1,
+                                  /*stride=*/1, /*dimno=*/last_warp_dim);
+  auto weight_y = xla::SliceInDim(ratio, /*start_index=*/1, /*limit_index=*/2,
+                                  /*stride=*/1, /*dimno=*/last_warp_dim);
+
+  // Build 1 - y and 1 - x.
+  auto one_minus_y = xla::One(ctx->builder(), data_type) - weight_y;
+  auto one_minus_x = xla::One(ctx->builder(), data_type) - weight_x;
+
+  auto x_before_reduce =
+      grad_output * weight_y * bottom_right_minus_bottom_left +
+      one_minus_y * top_right_minus_top_left;
+
+  std::vector<int64> reshaped_sizes = warp_dims_without_last_dims;
+  reshaped_sizes.push_back(1);
+
+  std::vector<int64> reshaped_dims(warp_dims_without_last_dims.size());
+  std::iota(reshaped_dims.begin(), reshaped_dims.end(), 0);
+
+  // Reduce-add along the channel dimension.
+  auto x_result =
+      xla::Reduce(x_before_reduce, xla::Zero(ctx->builder(), data_type),
+                  xla::CreateScalarAddComputation(data_type, ctx->builder()),
+                  {last_warp_dim});
+  // Reshape before concatenating with y values.
+  XlaOp reshaped_x = xla::Reshape(x_result, reshaped_dims, reshaped_sizes);
+
+  auto y_before_reduce = grad_output * weight_x * bottom_right_minus_top_right +
+                         one_minus_x * bottom_left_minus_top_left;
+  // Reduce-add along the channel dimension.
+  auto y_result =
+      xla::Reduce(y_before_reduce, xla::Zero(ctx->builder(), data_type),
+
+                  xla::CreateScalarAddComputation(data_type, ctx->builder()),
+                  {last_warp_dim});
+  XlaOp reshaped_y = xla::Reshape(y_result, reshaped_dims, reshaped_sizes);
+
+  return xla::ConcatInDim(ctx->builder(), {reshaped_x, reshaped_y},
+                          last_warp_dim);
+}
+
+class ResamplerOp : public XlaOpKernel {
+ public:
+  explicit ResamplerOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape data_shape = ctx->InputShape("data");
+    OP_REQUIRES(ctx, data_shape.dims() == 4,
+                errors::InvalidArgument("data must be 4-dimensional",
+                                        data_shape.DebugString()));
+    const int64 data_channels = data_shape.dim_size(3);
+    xla::PrimitiveType data_type = ctx->input_xla_type(0);
+
+    TensorShape warp_shape = ctx->InputShape("warp");
+    OP_REQUIRES(ctx, warp_shape.dims() >= 2,
+                errors::InvalidArgument("warp must be at least 2-dimensional",
+                                        warp_shape.DebugString()));
+    for (int size : warp_shape.dim_sizes()) {
+      OP_REQUIRES(ctx, size > 0,
+                  errors::InvalidArgument("warp sizes must be positive, got [",
+                                          size, "]"));
+    }
+    const int64 last_warp_dim = warp_shape.dims() - 1;
+    // Last dimension of warp shape must be of size 2.
+    OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2,
+                errors::InvalidArgument(
+                    "the last dimension of warp must be exactly size 2."));
+    xla::PrimitiveType warp_type = ctx->input_xla_type(1);
+
+    XlaOp data = ctx->Input("data");
+    XlaOp warp = ctx->Input("warp");
+
+    // Find the coordinates of the top left corner for the 2x2 region to be
+    // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the
+    // last dimension of size 2 in turn is [x, y].
+    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+
+    auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
+
+    // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
+    auto neighbors_data = Gather2by2Neighbors(
+        ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
+
+    // Dimensions are [batch, dim_0, ... dim_n, 2].
+    XlaOp ratio = warp - xla::ConvertElementType(top_left, data_type);
+
+    // Obtain the bilinear blending weights, the dimension is [batch, dim_0,
+    // ...dim_n, 4].
+    auto weights = BilinearWeights(ctx, ratio, warp_shape, data_type);
+
+    // Since we will be creating the dot product of:
+    //  lhs: [batch, dim_0, ...dim_n, 4]
+    // and
+    //  rhs: [batch, dim_0, ...dim_n, 4, data_channels]
+    // we choose the last dimension of lhs and the second last dimension of rhs,
+    // with size 4, as the contracting dimension.
+    xla::DotDimensionNumbers dot_dims;
+    for (int i = 0; i < warp_shape.dims() - 1; ++i) {
+      dot_dims.add_lhs_batch_dimensions(i);
+      dot_dims.add_rhs_batch_dimensions(i);
+    }
+    dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1);
+    dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1);
+
+    // The dimension is [batch, dim_0, ...dim_n, data_channels].
+    auto blended_pixels = xla::DotGeneral(weights, neighbors_data, dot_dims,
+                                          /*precision_config=*/nullptr);
+
+    // Handle out of boundary cases by constructing a predicate mask array based
+    // on the in-bound condition, and output 0 for the blended pixel value if
+    // out-bound. The dimension is the same as top_left: [batch, dim_0,
+    // ...dim_n, 2] where the last dimension of size 2 is the [x, y] coordinate.
+
+    auto is_ge_zero = xla::Ge(warp, xla::ZerosLike(warp));
+
+    auto is_lt_image_size = xla::Lt(
+        warp,
+        xla::ConvertElementType(
+            xla::ConstantR1<float>(
+                ctx->builder(),
+                {/*width=*/static_cast<float>(data_shape.dim_size(2) - 1),
+                 /*height=*/static_cast<float>(data_shape.dim_size(1) - 1)}),
+            warp_type),
+        /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+
+    auto is_in_bound_x_y = xla::And(is_ge_zero, is_lt_image_size);
+    // Reduce along last dimension. The resulting dimension is:
+    // [batch, dim_0, ...dim_n].
+    auto is_in_bound = xla::Reduce(
+        is_in_bound_x_y, xla::ConstantR0<bool>(ctx->builder(), true),
+        xla::CreateScalarAndComputation(xla::PrimitiveType::PRED,
+                                        ctx->builder()),
+        {last_warp_dim});
+
+    // Broadcast 'is_in_bound' to the same dimension as 'blended_pixels', which
+    // is the dimension of the result:
+    //  [batch, dim_0, ...dim_n, data_channels].
+    auto warp_dims = warp_shape.dim_sizes();
+    std::vector<int64> result_dims(warp_dims.begin(), warp_dims.end() - 1);
+    result_dims.push_back(data_channels);
+    xla::Shape broadcasted_shape =
+        xla::ShapeUtil::MakeShape(xla::PrimitiveType::PRED, result_dims);
+
+    std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+    std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+    auto broadcasted_is_in_bound =
+        xla::BroadcastInDim(is_in_bound, broadcasted_shape, broadcasted_dims);
+
+    // Set out of bound samples to zero.
+    auto zeros =
+        xla::Broadcast(xla::Zero(ctx->builder(), data_type), result_dims);
+    auto result = xla::Select(broadcasted_is_in_bound, blended_pixels, zeros);
+
+    ctx->SetOutput(0, result);
+  }
+};
+
+REGISTER_XLA_OP(Name("Resampler"), ResamplerOp);
+
+class ResamplerGradOp : public XlaOpKernel {
+ public:
+  explicit ResamplerGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    DataType output_dtype;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype));
+  }
+
+  // TODO(b/112295522): note that sampling from image boundary is not currently
+  // being handled properly.
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape data_shape_tf = ctx->InputShape("data");
+    OP_REQUIRES(ctx, data_shape_tf.dims() == 4,
+                errors::InvalidArgument("data must be 4-dimensional",
+                                        data_shape_tf.DebugString()));
+    const int64 data_channels = data_shape_tf.dim_size(3);
+    xla::PrimitiveType data_type = ctx->input_xla_type(0);
+
+    TensorShape warp_shape = ctx->InputShape("warp");
+    OP_REQUIRES(ctx, warp_shape.dims() >= 2,
+                errors::InvalidArgument("warp must be at least 2-dimensional",
+                                        warp_shape.DebugString()));
+    for (int size : warp_shape.dim_sizes()) {
+      OP_REQUIRES(ctx, size > 0,
+                  errors::InvalidArgument("warp sizes must be positive, got [",
+                                          size, "]"));
+    }
+    // Last dimension of warp shape must be of size 2.
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
+                errors::InvalidArgument(
+                    "the last dimension of warp must be exactly size 2."));
+    xla::PrimitiveType warp_type = ctx->input_xla_type(1);
+
+    TensorShape output_grad_shape = ctx->InputShape("grad_output");
+    OP_REQUIRES(
+        ctx, output_grad_shape.dims() >= 2,
+        errors::InvalidArgument("output_grad must be at least 2-dimensional",
+                                output_grad_shape.DebugString()));
+
+    // Dimensions are [batch, x, y, channel].
+    XlaOp data = ctx->Input("data");
+    xla::Shape data_shape = TensorShapeToXLAShape(data_type, data_shape_tf);
+
+    // Dimensions are [batch, dim_0, ...dim_n, 2].
+    XlaOp warp = ctx->Input("warp");
+    // Dimensions are [batch, dim_0, ...dim_n, channel].
+    XlaOp grad_output = ctx->Input("grad_output");
+
+    // Find the top left corner coordinate for the region to be sampled from.
+    // The dimensions are [batch, dim_0, ... dim_n, 2] where the last dimension
+    // of size 2 in turn is [x, y].
+    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+
+    // Dimensions are [batch, dim_0, ... dim_n, 2]
+    XlaOp ratio = warp - xla::ConvertElementType(top_left, warp_type);
+
+    // Indices for gathering neighboring pixels.
+    auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
+
+    auto grad_data =
+        CalculateGradData(ctx, grad_output, ratio, gather_indices, warp_type,
+                          warp_shape, data_channels, data_shape);
+
+    auto grad_warp =
+        CalculateGradWarp(ctx, grad_output, ratio, gather_indices, data,
+                          warp_shape, data_channels, data_type);
+
+    ctx->SetOutput(0, grad_data);
+    ctx->SetOutput(1, grad_warp);
+  }
+};
+
+REGISTER_XLA_OP(Name("ResamplerGrad"), ResamplerGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index 47a4eac20669c225a653bd3f1f00eeafd0845a42..fa1b6b91710f5507f41f3f69b0715398ae879aaf 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 namespace {
@@ -36,7 +37,7 @@ class ReshapeOp : public XlaOpKernel {
     const TensorShape input_shape = ctx->InputShape(0);
     const TensorShape sizes_shape = ctx->InputShape(1);
     // Preliminary validation of sizes.
-    OP_REQUIRES(ctx, IsLegacyVector(sizes_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(sizes_shape),
                 errors::InvalidArgument("sizes input must be 1-D, not shape ",
                                         sizes_shape.DebugString()));
     const int64 num_dims = sizes_shape.num_elements();
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index e172c649325adb6f7761ce0be141f21e8d545bc1..6970dd0a00641c9f88571561501fb3454fb3eab3 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -46,61 +47,8 @@ class RetvalOp : public XlaOpKernel {
       // compilation.
       OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input));
     } else {
-      xla::XlaOp input = ctx->Input(0);
-      const TensorShape input_shape = ctx->InputShape(0);
-      DataType input_type = ctx->input_type(0);
-      XlaContext& tc = XlaContext::Get(ctx);
-
-      if (input_type == DT_RESOURCE) {
-        XlaResource* resource;
-        OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
-        ctx->SetStatus(tc.AddResourceRetval(index_, resource));
-        return;
-      }
-
-      auto is_constant = ctx->builder()->IsConstant(input);
-      if (!is_constant.ok()) {
-        ctx->SetStatus(is_constant.status());
-        return;
-      }
-
-      if (tc.resolve_compile_time_constants() &&
-          (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) {
-        xla::Literal literal;
-        OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal));
-        OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal));
-      } else {
-        TensorShape shape = ctx->InputShape(0);
-        ctx->SetStatus(is_constant.status());
-        TensorShape representation_shape;
-        if (tc.is_entry_computation()) {
-          xla::StatusOr<TensorShape> shape_or_status =
-              tc.RepresentationShape(shape, ctx->input_type(0));
-          if (!shape_or_status.ok()) {
-            ctx->SetStatus(shape_or_status.status());
-            return;
-          } else {
-            representation_shape = shape_or_status.ValueOrDie();
-          }
-        } else {
-          representation_shape = shape;
-        }
-
-        xla::XlaOp output = input;
-        if (tc.is_entry_computation()) {
-          output = xla::Reshape(input, representation_shape.dim_sizes());
-        } else {
-          // The core from which a return value is returned depends on the
-          // device assignment of the input to the retval. Since we can't change
-          // the device assignment of "input" at this point, we must always
-          // introduce an operator here, even if the shape does not change.
-          // TODO(b/76097077): propagate device assignments onto arguments and
-          // return values of functions, and then reshape unconditionally.
-          output =
-              xla::GetTupleElement(xla::Tuple(ctx->builder(), {output}), 0);
-        }
-        tc.AddRetval(index_, dtype_, shape, output);
-      }
+      XlaContext& xla_context = XlaContext::Get(ctx);
+      xla_context.SetRetval(index_, ctx->InputExpression(0));
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index 56b80cb4a299c07157a166208b96ad369075aa83..2ceadaf79c5cef35ad50aa84a0d66a46527a6458 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -51,14 +51,11 @@ class ReverseOp : public XlaOpKernel {
     }
     // XlaBuilder::Rev() requires concrete values for dimensions arg.
     xla::Literal lax;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax));
-    std::vector<bool> revdims(x_shape.dims());
-    std::copy(lax.data<bool>().begin(), lax.data<bool>().end(),
-              revdims.begin());
-    std::vector<int64> dimensions;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &lax));
 
+    std::vector<int64> dimensions;
     for (int d = 0; d < x_shape.dims(); ++d) {
-      if (revdims[d]) {
+      if (lax.Get<bool>({d})) {
         dimensions.push_back(d);
       }
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index 7ff3e9163811434e8d621795c22bf8304ba7a1ed..d7b38e86cc985d608116488f9e76756a8e904f9c 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index 379f4aeb0fc7bbfff59696726f5af231b1294c49..b1fa2915d59e4e5e2f2523e20e9a37898d087117 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -30,31 +30,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-template <typename T>
-Status GetValue(int index, XlaOpKernelContext* ctx, T* value) {
-  xla::Literal literal;
-  TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal));
-  *value = literal.Get<T>({});
-  return Status::OK();
-}
-
-Status GetIntValue(int index, XlaOpKernelContext* ctx, int64* value) {
-  xla::Literal literal;
-  TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal));
-  switch (literal.shape().element_type()) {
-    case xla::S32:
-      *value = literal.Get<int32>({});
-      break;
-    case xla::S64:
-      *value = literal.Get<int64>({});
-      break;
-    default:
-      return errors::InvalidArgument("Invalid argument type for argument",
-                                     index);
-  }
-  return Status::OK();
-}
-
 // The type-specific part of the implementation of Range.
 template <typename T>
 xla::StatusOr<xla::XlaOp> CreateRangeTensor(
@@ -98,13 +73,13 @@ class RangeOp : public XlaOpKernel {
     const TensorShape start_in_shape = ctx->InputShape(0);
     const TensorShape limit_in_shape = ctx->InputShape(1);
     const TensorShape delta_in_shape = ctx->InputShape(2);
-    OP_REQUIRES(ctx, IsLegacyScalar(start_in_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_in_shape),
                 errors::InvalidArgument("start must be a scalar, not shape ",
                                         start_in_shape.DebugString()));
-    OP_REQUIRES(ctx, IsLegacyScalar(limit_in_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(limit_in_shape),
                 errors::InvalidArgument("limit must be a scalar, not shape ",
                                         limit_in_shape.DebugString()));
-    OP_REQUIRES(ctx, IsLegacyScalar(delta_in_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(delta_in_shape),
                 errors::InvalidArgument("delta must be a scalar, not shape ",
                                         delta_in_shape.DebugString()));
     xla::Literal start, limit, delta;
@@ -147,9 +122,9 @@ class LinSpaceOp : public XlaOpKernel {
   explicit LinSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape start_in_shape = ctx->InputShape(0);
-    const TensorShape stop_in_shape = ctx->InputShape(1);
-    const TensorShape num_in_shape = ctx->InputShape(2);
+    const TensorShape start_in_shape = ctx->InputShape("start");
+    const TensorShape stop_in_shape = ctx->InputShape("stop");
+    const TensorShape num_in_shape = ctx->InputShape("num");
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_in_shape),
                 errors::InvalidArgument("start must be a scalar, not shape ",
                                         start_in_shape.DebugString()));
@@ -163,16 +138,20 @@ class LinSpaceOp : public XlaOpKernel {
     DataType type = ctx->input_type(0);
 
     int64 num;
-    OP_REQUIRES_OK(ctx, GetIntValue(2, ctx, &num));
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("num", &num));
     OP_REQUIRES(ctx, num > 0,
                 errors::InvalidArgument("Requires num > 0: ", num));
     Tensor out_constant(type, TensorShape({num}));
 
+    xla::Literal start_literal;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput("start", &start_literal));
+    xla::Literal stop_literal;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput("stop", &stop_literal));
+
     switch (type) {
       case DT_FLOAT: {
-        float start, stop;
-        OP_REQUIRES_OK(ctx, GetValue(0, ctx, &start));
-        OP_REQUIRES_OK(ctx, GetValue(1, ctx, &stop));
+        float start = start_literal.GetFirstElement<float>();
+        float stop = stop_literal.GetFirstElement<float>();
         auto flat = out_constant.flat<float>();
         if (num == 1) {
           flat(0) = start;
@@ -185,9 +164,8 @@ class LinSpaceOp : public XlaOpKernel {
         break;
       }
       case DT_DOUBLE: {
-        double start, stop;
-        OP_REQUIRES_OK(ctx, GetValue(0, ctx, &start));
-        OP_REQUIRES_OK(ctx, GetValue(1, ctx, &stop));
+        double start = start_literal.GetFirstElement<double>();
+        double stop = stop_literal.GetFirstElement<double>();
         auto flat = out_constant.flat<double>();
         if (num == 1) {
           flat(0) = start;
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 37b026aeb058b464acd74264766f187b787914aa..12830816ec16c9797f0fe4d8f3f13f5a8176161d 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
@@ -108,21 +109,16 @@ class ExpandDimsOp : public XlaOpKernel {
   explicit ExpandDimsOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape dim_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape dim_shape = ctx->InputShape("dim");
 
-    // TODO(phawkins): the standard implementation of ExpandDimsOp seems to
-    // accept legacy scalars, even when they should be forbidden by the graphdef
-    // version.
-    OP_REQUIRES(ctx, dim_shape.num_elements() == 1,
+    std::vector<int64> dims;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector("dim", &dims));
+    OP_REQUIRES(ctx, dims.size() == 1,
                 errors::InvalidArgument(absl::StrCat(
                     "dim input to ExpandDims must be a scalar; got ",
                     dim_shape.DebugString())));
-
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {1}, &literal));
-
-    int dim = literal.data<int32>()[0];
+    int dim = dims[0];
 
     OP_REQUIRES(ctx,
                 (dim >= -1 - input_shape.dims() && dim <= input_shape.dims()),
@@ -148,7 +144,7 @@ class ExpandDimsOp : public XlaOpKernel {
     dim = std::min<int32>(dim, existing_dims_size);
     new_shape.emplace(new_shape.begin() + dim, 1);
 
-    ctx->SetOutput(0, xla::Reshape(ctx->Input(0), new_shape));
+    ctx->SetOutput(0, xla::Reshape(ctx->Input("input"), new_shape));
   }
 };
 REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstantInput("dim"),
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 34980ead81815c2818a259e096148fcce9c9a3b1..88da64e5a217a0c026106f03cb26958f6738446c 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mem.h"
@@ -42,8 +43,8 @@ class SliceOp : public XlaOpKernel {
 
     OP_REQUIRES(
         ctx,
-        IsLegacyVector(begin_tensor_shape) &&
-            IsLegacyVector(size_tensor_shape) &&
+        TensorShapeUtils::IsVector(begin_tensor_shape) &&
+            TensorShapeUtils::IsVector(size_tensor_shape) &&
             begin_tensor_shape.num_elements() == input_shape.dims() &&
             size_tensor_shape.num_elements() == input_shape.dims(),
         errors::InvalidArgument(
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 230a343f7966f19cda44991a747287ba675fca4c..7a0e240400b344ab25743997ce3baad81bd5f476 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -35,26 +35,16 @@ class SplitOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const int32 num_split = num_outputs();
-    const TensorShape index_shape = ctx->InputShape(0);
+    const TensorShape split_dim_shape = ctx->InputShape("split_dim");
     const TensorShape input_shape = ctx->InputShape(1);
 
-    xla::Literal literal_index;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal_index));
-
-    int32 split_dim_orig;
-    if (index_shape.dims() == 0) {
-      split_dim_orig = literal_index.Get<int>({});
-    } else {
-      OP_REQUIRES(
-          ctx, index_shape.dims() == 1,
-          errors::InvalidArgument("split_index input to Split Op must be a "
-                                  "scalar or a vector with 1 element"));
-      OP_REQUIRES(
-          ctx, index_shape.dim_size(0) == 1,
-          errors::InvalidArgument("split_index input to Split Op must be a "
-                                  "scalar or a vector with 1 element"));
-      split_dim_orig = literal_index.Get<int>({0});
-    }
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(split_dim_shape),
+        errors::InvalidArgument("split_dim must be a scalar but has rank ",
+                                split_dim_shape.dims()));
+    int64 split_dim_orig;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &split_dim_orig));
+
     int32 split_dim = split_dim_orig < 0 ? split_dim_orig + input_shape.dims()
                                          : split_dim_orig;
     OP_REQUIRES(ctx, 0 <= split_dim && split_dim < input_shape.dims(),
@@ -138,7 +128,6 @@ class SplitVOp : public XlaOpKernel {
     // Check that sizes are correct.
     int total_split_size = 0;
     int neg_one_dim = -1;
-    std::vector<int64> split_sizes_vec(num_split, -1);
     const TensorShape split_size_shape = ctx->InputShape(1);
     OP_REQUIRES(ctx,
                 split_size_shape.dims() == 1 &&
@@ -150,12 +139,11 @@ class SplitVOp : public XlaOpKernel {
                     split_size_shape.dims(), "-D and ",
                     split_size_shape.num_elements(), " elements"));
     // Get the dimension of this split.
-    xla::Literal split_size_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &split_size_literal));
+    std::vector<int64> split_sizes;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &split_sizes));
 
     for (int i = 0; i < num_split; ++i) {
-      int slice_size;
-      slice_size = split_size_literal.Get<int>({i});
+      int64 slice_size = split_sizes[i];
       if (slice_size == -1) {
         OP_REQUIRES(
             ctx, neg_one_dim == -1,
@@ -164,7 +152,6 @@ class SplitVOp : public XlaOpKernel {
                                     i));
         neg_one_dim = i;
       } else {
-        split_sizes_vec[i] = slice_size;
         total_split_size += slice_size;
       }
     }
@@ -183,7 +170,7 @@ class SplitVOp : public XlaOpKernel {
                                 total_split_size));
 
     if (neg_one_dim >= 0) {
-      split_sizes_vec[neg_one_dim] =
+      split_sizes[neg_one_dim] =
           input_shape.dim_size(split_dim) - total_split_size;
     }
 
@@ -195,7 +182,7 @@ class SplitVOp : public XlaOpKernel {
     std::vector<int64> strides(input_shape.dims(), 1);
     for (int i = 0; i < num_split; ++i) {
       TensorShape output_shape(input_shape);
-      int slice_size = split_sizes_vec[i];
+      int slice_size = split_sizes[i];
       output_shape.set_dim(split_dim, slice_size);
 
       // Slice out the ith split from the split dimension.
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index d79cdad9fa2dabe1e236741955499b845064148f..7b96b43ad834c28aa0283c5ef4ac516618ca5134 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -126,7 +126,9 @@ class StackOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
 };
 
-REGISTER_XLA_OP(Name("StackV2").CompileTimeConstantInput("max_size"), StackOp);
+REGISTER_XLA_OP(
+    Name("StackV2").CompileTimeConstantInput("max_size").CompilationOnly(),
+    StackOp);
 
 class StackPushOp : public XlaOpKernel {
  public:
@@ -173,7 +175,7 @@ class StackPushOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackPushOp);
 };
 
-REGISTER_XLA_OP(Name("StackPushV2"), StackPushOp);
+REGISTER_XLA_OP(Name("StackPushV2").CompilationOnly(), StackPushOp);
 
 class StackPopOp : public XlaOpKernel {
  public:
@@ -227,7 +229,7 @@ class StackPopOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackPopOp);
 };
 
-REGISTER_XLA_OP(Name("StackPopV2"), StackPopOp);
+REGISTER_XLA_OP(Name("StackPopV2").CompilationOnly(), StackPopOp);
 
 class StackCloseOp : public XlaOpKernel {
  public:
@@ -241,7 +243,7 @@ class StackCloseOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackCloseOp);
 };
 
-REGISTER_XLA_OP(Name("StackCloseV2"), StackCloseOp);
+REGISTER_XLA_OP(Name("StackCloseV2").CompilationOnly(), StackCloseOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 7b2cd5a5b08d80284d172c2ed5d6be4c355e76e0..e1c764f3d5c28cf0d812519e4a16786e1f2d3a3a 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/macros.h"
@@ -44,7 +45,7 @@ class TileOp : public XlaOpKernel {
     const TensorShape multiples_shape = ctx->InputShape("multiples");
 
     OP_REQUIRES(
-        ctx, IsLegacyVector(multiples_shape),
+        ctx, TensorShapeUtils::IsVector(multiples_shape),
         errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                 multiples_shape.DebugString()));
     OP_REQUIRES(ctx, input_shape.dims() == multiples_shape.num_elements(),
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index 48a211942d7c4405bf68189e641ee184db36b0ba..c9b324a243e4cc3ec64daa3ca0d285336a0d0154 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -37,8 +37,8 @@ class TransposeOp : public XlaOpKernel {
       : XlaOpKernel(ctx), conjugate_(conjugate) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape perm_tensor_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("x");
+    const TensorShape perm_tensor_shape = ctx->InputShape("perm");
 
     // Preliminary validation of sizes.
     OP_REQUIRES(ctx, TensorShapeUtils::IsVector(perm_tensor_shape),
@@ -52,19 +52,15 @@ class TransposeOp : public XlaOpKernel {
                                         ". But input(1) is a vector of size ",
                                         perm_tensor_shape.num_elements()));
 
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {dims}, &literal));
-
-    std::vector<int32> perm(dims);
-    std::copy(literal.data<int32>().begin(), literal.data<int32>().end(),
-              perm.begin());
+    std::vector<int64> perm;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("perm", &perm));
 
     std::vector<int64> transposed_order;
     // Check whether permutation is a permutation of integers of [0 .. dims).
     absl::InlinedVector<bool, 8> bits(dims);
     bool is_identity = true;
     for (int i = 0; i < dims; ++i) {
-      const int32 d = perm[i];
+      const int64 d = perm[i];
       OP_REQUIRES(
           ctx, 0 <= d && d < dims,
           errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
@@ -83,9 +79,9 @@ class TransposeOp : public XlaOpKernel {
     xla::XlaOp transposed;
     // 0-D, 1-D, and identity transposes do nothing.
     if (dims <= 1 || is_identity) {
-      transposed = ctx->Input(0);
+      transposed = ctx->Input("x");
     } else {
-      transposed = xla::Transpose(ctx->Input(0), transposed_order);
+      transposed = xla::Transpose(ctx->Input("x"), transposed_order);
     }
 
     // Conjugate the transposed result if this is ConjugateTransposeOp.
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index 0bdfc05726105e2d18362a691cbe2aab00bf77f3..a0ea6422d732b00fc1b8cf855d9c9ad603b87c82 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -80,24 +80,8 @@ XLAJIT_MAKE_UNARY(Invert, xla::Not(x));
 XLAJIT_MAKE_UNARY(LogicalNot, xla::Not(x));
 XLAJIT_MAKE_UNARY(Neg, -x);
 
-// Implements Banker's rounding: numbers that are equidistant between two
-// integers are rounded towards even.
-xla::XlaOp RoundToEven(xla::XlaOp x) {
-  auto half = xla::ScalarLike(x, 0.5);
-  auto one = xla::ScalarLike(x, 1.0);
-  auto two = xla::ScalarLike(x, 2.0);
-
-  auto round_val = xla::Floor(x);
-  auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * xla::Floor(half * x);
-  auto is_odd = xla::Eq(nearest_even_int, one);
-  return xla::Select(xla::Or(xla::Gt(fraction, half),
-                             xla::And(xla::Eq(fraction, half), is_odd)),
-                     round_val + one, round_val);
-}
-
-XLAJIT_MAKE_UNARY(Rint, RoundToEven(x));
-XLAJIT_MAKE_UNARY(Round, RoundToEven(x));
+XLAJIT_MAKE_UNARY(Rint, xla::RoundToEven(x));
+XLAJIT_MAKE_UNARY(Round, xla::RoundToEven(x));
 
 XLAJIT_MAKE_UNARY(Rsqrt, xla::Rsqrt(x));
 
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 20103ec3ae00b57723e05326dbbb1b0f6e1a671a..67d08290033361f16dfff42b06af9b253e84963a 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -32,6 +32,12 @@ Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
   return Status::OK();
 }
 
+xla::StatusOr<xla::Literal> HostTensorToLiteral(const Tensor& host_tensor) {
+  xla::BorrowingLiteral literal;
+  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(host_tensor, &literal));
+  return literal.Clone();
+}
+
 Status HostTensorToMutableBorrowingLiteral(
     Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) {
   xla::Shape xla_shape;
diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h
index 1db7470ee2a839099454b772d4833492e033bc92..a153dddee6127ff9c0858220f2d8a735ab3f0e19 100644
--- a/tensorflow/compiler/tf2xla/literal_util.h
+++ b/tensorflow/compiler/tf2xla/literal_util.h
@@ -30,6 +30,11 @@ namespace tensorflow {
 // 'host_tensor'.
 Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
                                     xla::BorrowingLiteral* literal);
+
+// Returns a Literal with the contents of 'host_tensor', backed by its own
+// storage (i.e., not reusing 'host_tensor's buffers.)
+xla::StatusOr<xla::Literal> HostTensorToLiteral(const Tensor& host_tensor);
+
 // Returns a MutableBorrowingLiteral that utilizes the same underlying buffer
 // owned by 'host_tensor', but is mutable via the xla::Literal methods.
 Status HostTensorToMutableBorrowingLiteral(
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index 8b559c87506a6b519e2ad1d1bf22ab30c0ff161d..c9f486edc8d30954619db0967c988fe8e26938de 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -3,6 +3,7 @@ licenses(["notice"])  # Apache 2.0
 package(
     default_visibility = [
         "//learning/deepmind/public/wavenet/python:__subpackages__",
+        "//learning/deepmind/research/alphastar:__subpackages__",
         "//learning/tfx:__subpackages__",
         "//tensorflow:internal",
     ],
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index cb7843850c352eee2e55baf52a0c4445dc861d7b..ddb284966eeb97cc7c9d3ed77fb313e567975e59 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -124,13 +124,4 @@ Status XlaCompilationDevice::MakeTensorFromProto(
       "XLACompilationDevice::MakeTensorFromProto should not be called");
 }
 
-XlaExpression::XlaExpression() = default;
-
-void XlaExpression::set_handle(const xla::XlaOp& h) { handle_ = h; }
-
-void XlaExpression::set_constant_value(Tensor value) {
-  has_constant_value_ = true;
-  constant_value_ = std::move(value);
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h
index a6e78825334fec748be5fee80669649df699d2fb..de6a3356e05d8ab45c269d7c6c653853d2c63a79 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.h
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h
@@ -18,9 +18,6 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/tf2xla/xla_resource.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -38,8 +35,8 @@ class XlaCompilationAllocator;
 // This is a 'dummy' TensorFlow device that is only used to execute a
 // subgraph of XLA compilation Ops to construct a compiled version
 // of the subgraph's computation. It has a 'dummy' allocator that
-// backs each Tensor with metadata indicating the computation the
-// Tensor represents.
+// backs each Tensor with an XlaExpression. The shape of the Tensor
+// matches the shape of XlaExpression.
 //
 // We deliberately don't register a device factory because we *never*
 // want placement to put Ops on a compilation device. The device is created
@@ -67,40 +64,6 @@ class XlaCompilationDevice : public LocalDevice {
   std::unique_ptr<XlaCompilationAllocator> allocator_;
 };
 
-// A XlaExpression wraps an XLA computation. Each Tensor on an
-// XlaCompilationDevice contains an XlaExpression, and the shape of the Tensor
-// matches the shape of the subcomputation in the XlaOp. Each
-// expression is either a constant, or a function of previously-compiled
-// expressions.
-class XlaExpression {
- public:
-  XlaExpression();
-
-  // handle() stores the XLA handle of the computation that the
-  // expression represents.
-  void set_handle(const xla::XlaOp& h);
-  const xla::XlaOp& handle() const { return handle_; }
-
-  void set_constant_value(Tensor value);
-  bool has_constant_value() const { return has_constant_value_; }
-  const Tensor& constant_value() const { return constant_value_; }
-
-  void set_resource(XlaResource* resource) { resource_ = resource; }
-  XlaResource* resource() const { return resource_; }
-
- private:
-  // The XLA handle of the expression's computation.
-  xla::XlaOp handle_;
-
-  // If this expression is a constant with a known value, 'constant_value' is a
-  // host-memory Tensor containing the value. Used to avoid invoking XLA for
-  // expressions that are trivially constant.
-  bool has_constant_value_ = false;
-  Tensor constant_value_;
-
-  XlaResource* resource_ = nullptr;  // Not owned.
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 425e769346ffcbc548495d93cb7adc779f860110..66206909a92fddbac4e77e5d2d8164fcbb46f317 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -287,11 +287,6 @@ class XlaCompiledCpuFunction {
 
   // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]]
   // for XLA generated code to be able to find it.
-  //
-  // For now we need to keep around the args_ array because there is code that
-  // depends on args() returning a void**.  However, in the future we may remove
-  // args_ in favor of using buffer_table_ as the sole storage for the
-  // arguments.
   const int32* const arg_index_table_;
 
   // The number of incoming arguments.
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index e177a5f07f5607a0f9de75e6a999ee492cd9db4f..8036bc684401ff31c07ac381098e05fb8b7ee76a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -36,10 +36,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -48,7 +51,7 @@ namespace {
 
 // Checks that arguments `args` match types `types`.
 Status CheckSignature(const DataTypeVector& types,
-                      const std::vector<XlaCompiler::Argument>& args) {
+                      absl::Span<const XlaCompiler::Argument> args) {
   if (args.size() != types.size()) {
     return errors::Internal("Compilation arguments have ", args.size(),
                             " elements while function has ", types.size());
@@ -63,6 +66,262 @@ Status CheckSignature(const DataTypeVector& types,
   return Status::OK();
 }
 
+// Uses the _Arg and _Retval nodes in the graph to determine a core assignment
+// for each argument and return value.
+xla::StatusOr<std::pair<std::map<int, int>, std::map<int, int>>>
+ComputeArgAndRetvalCores(const Graph& graph) {
+  auto get_sharding_for_node = [](const Node* n) -> xla::StatusOr<int> {
+    TF_ASSIGN_OR_RETURN(
+        auto sharding,
+        ParseShardingFromDevice(*n, std::numeric_limits<int32>::max()));
+    if (sharding.has_value()) {
+      TF_RET_CHECK(sharding.value().type() ==
+                   xla::OpSharding::Type::OpSharding_Type_MAXIMAL);
+      return sharding.value().tile_assignment_devices(0);
+    } else {
+      return -1;
+    }
+  };
+  std::map<int, int> arg_cores;
+  std::map<int, int> retval_cores;
+  for (const Node* n : graph.nodes()) {
+    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+      TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
+      if (core < 0) continue;
+      int index;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+      TF_RET_CHECK(index >= 0) << "Negative _Arg index";
+      arg_cores[index] = core;
+    } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) {
+      TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
+      if (core < 0) continue;
+      int index;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+      TF_RET_CHECK(index >= 0) << "Negative _Retval index";
+      TF_ASSIGN_OR_RETURN(retval_cores[index], get_sharding_for_node(n));
+      retval_cores[index] = core;
+    }
+  }
+  return std::make_pair(std::move(arg_cores), std::move(retval_cores));
+}
+
+Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
+                    XlaCompilationDevice* device, FunctionLibraryRuntime* flib,
+                    int64 step_id) {
+  // Resource cleanup is a bit messy. XlaContext is a ref-countd resource; the
+  // resource manager takes ownership via Create, and unrefs via Cleanup.  We
+  // explicitly add a reference to ensure the refcount at entry is maintained at
+  // all exit points; Create and Cleanup are always called in this function.
+  //
+  // The Executor requires us to use ScopedStepContainer. We wrap it in a
+  // unique_ptr so we can capture the cleanup status in the end.
+  xla_context->Ref();
+  Status status;
+  auto step_container = absl::make_unique<ScopedStepContainer>(
+      step_id, [&status, device](const string& name) {
+        status = device->resource_manager()->Cleanup(name);
+      });
+  TF_RETURN_IF_ERROR(device->resource_manager()->Create(
+      step_container->name(), XlaContext::kXlaContextResourceName,
+      xla_context));
+
+  GraphCompiler graph_compiler(device, graph.get(), flib, step_container.get());
+  TF_RETURN_IF_ERROR(graph_compiler.Compile());
+  // Explicitly clean up the step container, to capture the cleanup status.
+  step_container.reset();
+  return Status::OK();
+}
+
+// Builds the XLA computation.
+// - `args` is the list of input arguments
+// - `retvals` is the list of retvals produced by _Retval operators, in index
+//   order.
+// - `args_core` and `retval_cores` are mapping from arg/return indices to core
+//   assignments.
+// - If `return_updated_values_for_all_resources` is true, all resources will be
+//   included in `resource_updates`, regardless of whether their value changed.
+// - Sets `*num_nonconst_outputs` to the number of outputs of the `computation`.
+// - Sets `*resource_updates` to a description of resources whose values are
+//   written by the computation; the variable writes are the last
+// - `resource_updates.size()` return values from the computation. Each entry in
+//   `resource_updates` is a ResourceUpdate, whose `index` is the index of a
+//   resource variable argument to the computation to be updated, and `type` is
+//   the type of the final output.
+Status BuildComputation(
+    const std::vector<XlaCompiler::Argument>& args,
+    const std::vector<XlaExpression>& retvals,
+    const std::map<int, int>& arg_cores, const std::map<int, int>& retval_cores,
+    const std::vector<std::unique_ptr<XlaResource>>& resources,
+    std::unique_ptr<xla::XlaOp> token_output,
+    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
+    bool return_updated_values_for_all_resources, bool always_return_tuple,
+    xla::XlaBuilder* builder, xla::XlaComputation* computation,
+    int* num_computation_outputs, int* num_nonconst_outputs,
+    std::vector<XlaCompiler::OutputDescription>* outputs,
+    std::vector<XlaCompiler::ResourceUpdate>* resource_updates,
+    xla::Shape* output_shape) {
+  // Attach a common operator name as metadata. This has no semantic effect — it
+  // merely makes the HLO graph more readable when visualized via TensorBoard,
+  // since TensorBoard forms groups out of operators with similar names.
+  xla::OpMetadata retval_metadata;
+  retval_metadata.set_op_name("XLA_Retvals");
+  builder->SetOpMetadata(retval_metadata);
+  auto cleanup = gtl::MakeCleanup([builder]() { builder->ClearOpMetadata(); });
+
+  // Builds a no-op XLA computation. We need to set the sharding of outputs, but
+  // cannot change the sharding of the existing output op. To do this, we build
+  // a new identity op to which shardings can be applied.
+  auto identity_op = [builder](xla::XlaOp op) {
+    return xla::GetTupleElement(xla::Tuple(builder, {op}), 0);
+  };
+
+  std::vector<xla::XlaOp> elems;
+  elems.reserve(retvals.size());
+
+  // Keeps track of which retvals have layout to update. The first element is
+  // the output index, second element is the new layout.
+  std::vector<std::pair<int64, xla::Layout>> retval_to_update_layout;
+  for (int i = 0; i < retvals.size(); ++i) {
+    XlaCompiler::OutputDescription& output = (*outputs)[i];
+    const XlaExpression& retval = retvals[i];
+    output.type = retval.dtype();
+    switch (retval.kind()) {
+      case XlaExpression::Kind::kConstant:
+        output.is_constant = true;
+        output.constant_value = retval.constant_value();
+        output.shape = output.constant_value.shape();
+        break;
+
+      case XlaExpression::Kind::kXlaOp: {
+        output.is_constant = false;
+        TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape());
+        xla::XlaOp value = retval.handle();
+        auto it = retval_cores.find(i);
+        xla::XlaScopedShardingAssignment assign_sharding(
+            builder, it == retval_cores.end()
+                         ? absl::optional<xla::OpSharding>()
+                         : xla::sharding_builder::AssignDevice(it->second));
+        if (shape_representation_fn) {
+          // If there is a shape representation function, reshape the output
+          // tensor to the shape given by the representation shape function.
+          TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn(
+                                                    output.shape, output.type));
+          value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions()));
+          retval_to_update_layout.emplace_back(elems.size(), shape.layout());
+        } else if (it != retval_cores.end()) {
+          // Apply the sharding to the output, if there is a core assignment.
+          value = identity_op(value);
+        }
+
+        elems.push_back(value);
+        break;
+      }
+
+      case XlaExpression::Kind::kResource:
+        output.is_constant = false;
+        output.input_index = retval.resource()->arg_num();
+        output.shape = retval.resource()->shape();
+        break;
+
+      case XlaExpression::Kind::kInvalid:
+        return errors::InvalidArgument(
+            "Invalid expression returned by computation. "
+            "This probably means a return value was not set.");
+    }
+  }
+  *num_nonconst_outputs = elems.size();
+
+  // Add return values for resources whose values have changed.
+  std::vector<const XlaResource*> arg_resources;
+  arg_resources.reserve(resources.size());
+  for (const auto& resource : resources) {
+    if (resource->arg_num() >= 0) {
+      arg_resources.push_back(resource.get());
+    }
+  }
+  std::sort(arg_resources.begin(), arg_resources.end(),
+            [](const XlaResource* a, const XlaResource* b) {
+              return a->arg_num() < b->arg_num();
+            });
+
+  for (const XlaResource* resource : arg_resources) {
+    DCHECK_LT(resource->arg_num(), args.size());
+    const XlaCompiler::Argument& arg = args[resource->arg_num()];
+    auto it = arg_cores.find(resource->arg_num());
+    const int core = it == arg_cores.end() ? -1 : it->second;
+    bool modified = !resource->value().IsIdenticalTo(resource->initial_value());
+    // TensorArray gradients were modified if their values changed or there are
+    // any newly created gradients.
+    for (const auto& grad : resource->tensor_array_gradients()) {
+      modified =
+          modified ||
+          !grad.second->value().IsIdenticalTo(grad.second->initial_value()) ||
+          arg.tensor_array_gradients.count(grad.first) == 0;
+    }
+    if (return_updated_values_for_all_resources || modified) {
+      resource_updates->emplace_back();
+      XlaCompiler::ResourceUpdate& update = resource_updates->back();
+      update.input_index = resource->arg_num();
+      update.type = resource->type();
+      update.shape = resource->shape();
+      update.modified = modified;
+      for (const auto& grad : resource->tensor_array_gradients()) {
+        update.tensor_array_gradients_accessed.insert(grad.first);
+      }
+
+      // Request that the value be returned on a specific core.
+      xla::XlaScopedShardingAssignment assign_sharding(
+          builder, core == -1 ? absl::optional<xla::OpSharding>()
+                              : xla::sharding_builder::AssignDevice(core));
+
+      xla::XlaOp handle;
+      TF_RETURN_IF_ERROR(resource->Pack(&handle, builder));
+
+      // Ensures the correct sharding is applied to the output.
+      handle = identity_op(handle);
+
+      elems.push_back(handle);
+    }
+  }
+
+  // If we have token output, append it as the last one.
+  if (token_output) {
+    elems.push_back(*token_output);
+  }
+
+  *num_computation_outputs = elems.size();
+
+  // Builds the XLA computation. We *always* form a tuple here to ensure that
+  // the output value is the last thing added into the XLA computation, even
+  // if there is only one output value.
+  auto tuple = xla::Tuple(builder, elems);
+  if (!always_return_tuple && elems.size() == 1) {
+    xla::GetTupleElement(tuple, 0);
+  }
+
+  xla::StatusOr<xla::XlaComputation> computation_status = builder->Build();
+  if (!computation_status.ok()) {
+    return computation_status.status();
+  }
+  *computation = computation_status.ConsumeValueOrDie();
+
+  TF_ASSIGN_OR_RETURN(const auto& program_shape,
+                      computation->GetProgramShape());
+  *output_shape = program_shape.result();
+  // Update the output layout to the layout of retval.
+  for (auto& update : retval_to_update_layout) {
+    if (!always_return_tuple && elems.size() == 1) {
+      *output_shape->mutable_layout() = update.second;
+      continue;
+    }
+
+    xla::Shape* output_sub_shape =
+        xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first});
+    *output_sub_shape->mutable_layout() = update.second;
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 bool XlaCompiler::Argument::operator==(
@@ -83,6 +342,39 @@ bool XlaCompiler::Argument::operator==(
   return constant_value.tensor_data() == other.constant_value.tensor_data();
 }
 
+string XlaCompiler::Argument::HumanString() const {
+  string common;
+  if (!name.empty()) {
+    common = absl::StrCat(" name=", name);
+  }
+  absl::StrAppend(&common, " type=", DataTypeString(type),
+                  " shape=", shape.DebugString());
+  switch (kind) {
+    case kInvalid:
+      return "invalid";
+    case kConstant:
+      return absl::StrCat("kind=constant", common,
+                          " value=", constant_value.DebugString());
+    case kResource: {
+      string output = absl::StrCat("kind=resource", common, " resource_kind=",
+                                   XlaResource::KindToString(resource_kind),
+                                   " initialized=", initialized);
+      if (tensor_array_size >= 0) {
+        absl::StrAppend(&output, " tensor_array_size=", tensor_array_size);
+      }
+      if (!tensor_array_gradients.empty()) {
+        absl::StrAppend(&output, " tensor_array_gradients=",
+                        absl::StrJoin(tensor_array_gradients, ","));
+      }
+      return output;
+    }
+    case kParameter:
+      return absl::StrCat("kind=parameter", common);
+    case kToken:
+      return absl::StrCat("token", common);
+  }
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
@@ -110,8 +402,13 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
 
   // The default shape representation function is the identity.
   if (!options_.shape_representation_fn) {
-    options_.shape_representation_fn = [](const TensorShape& shape,
-                                          DataType type) { return shape; };
+    options_.shape_representation_fn =
+        [](const TensorShape& shape,
+           DataType dtype) -> xla::StatusOr<xla::Shape> {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape));
+      return xla_shape;
+    };
   }
 }
 
@@ -171,15 +468,16 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
   return graph;
 }
 
-Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
-                                    const NameAttrList& function,
-                                    std::vector<XlaCompiler::Argument> args,
-                                    XlaCompiler::CompilationResult* result) {
+Status XlaCompiler::CompileFunction(
+    const XlaCompiler::CompileOptions& options, const NameAttrList& function,
+    absl::Span<const XlaCompiler::Argument> args,
+    XlaCompiler::CompilationResult* result) {
   const string function_id =
       Canonicalize(function.name(), AttrSlice(&function.attr()));
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
 
-  auto it = cache_.find({function_id, args});
+  const std::vector<XlaCompiler::Argument> arg_vector(args.begin(), args.end());
+  auto it = cache_.find({function_id, arg_vector});
   if (it != cache_.end()) {
     *result = it->second;
     return Status::OK();
@@ -212,14 +510,16 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
   // lowest-numbered core that consumes the argument. We choose the
   // lowest-numbered core so the assignment is deterministic.
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) == "_Arg") {
+    if (absl::string_view(n->type_string()) ==
+        FunctionLibraryDefinition::kArgOp) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/true));
     }
   }
   // Do _Retval as a second loop, in case the retval's input is an _Arg (which
   // may have gotten a device assignment from the first loop).
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) == "_Retval") {
+    if (absl::string_view(n->type_string()) ==
+        FunctionLibraryDefinition::kRetOp) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/false));
     }
   }
@@ -235,7 +535,7 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
       CompileGraph(options, function_id, std::move(graph), args, result));
   VLOG(1) << "====================================================";
 
-  cache_[{function_id, args}] = *result;
+  cache_[{function_id, arg_vector}] = *result;
   return Status::OK();
 }
 
@@ -247,25 +547,24 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
     case XlaCompiler::Argument::kConstant:
       LOG(FATAL) << "Unreachable case";
     case XlaCompiler::Argument::kParameter: {
-      TensorShape shape;
       if (is_entry_computation) {
         TF_ASSIGN_OR_RETURN(
-            shape, options_.shape_representation_fn(arg.shape, arg.type));
+            *xla_shape, options_.shape_representation_fn(arg.shape, arg.type));
       } else {
-        shape = arg.shape;
+        TF_RETURN_IF_ERROR(
+            TensorShapeToXLAShape(arg.type, arg.shape, xla_shape));
       }
-      return TensorShapeToXLAShape(arg.type, shape, xla_shape);
+      return Status::OK();
     }
     case XlaCompiler::Argument::kResource: {
       TF_RET_CHECK(arg.initialized);
 
       switch (arg.resource_kind) {
         case XlaResource::kVariable: {
-          TF_ASSIGN_OR_RETURN(
-              TensorShape representation_shape,
-              options_.shape_representation_fn(arg.shape, arg.type));
-          return TensorShapeToXLAShape(arg.type, representation_shape,
-                                       xla_shape);
+          TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn(
+                                              arg.shape, arg.type));
+
+          return Status::OK();
         }
         case XlaResource::kTensorArray: {
           if (arg.tensor_array_size < 0) {
@@ -314,175 +613,16 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
   }
 }
 
-namespace {
-
-Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
-                    XlaCompilationDevice* device, FunctionLibraryRuntime* flib,
-                    int64 step_id) {
-  // Resource cleanup is a bit messy. XlaContext is a ref-countd resource; the
-  // resource manager takes ownership via Create, and unrefs via Cleanup.  We
-  // explicitly add a reference to ensure the refcount at entry is maintained at
-  // all exit points; Create and Cleanup are always called in this function.
-  //
-  // The Executor requires us to use ScopedStepContainer. We wrap it in a
-  // unique_ptr so we can capture the cleanup status in the end.
-  xla_context->Ref();
-  Status status;
-  auto step_container = absl::make_unique<ScopedStepContainer>(
-      step_id, [&status, device](const string& name) {
-        status = device->resource_manager()->Cleanup(name);
-      });
-  TF_RETURN_IF_ERROR(device->resource_manager()->Create(
-      step_container->name(), XlaContext::kXlaContextResourceName,
-      xla_context));
-
-  GraphCompiler graph_compiler(device, graph.get(), flib, step_container.get());
-  TF_RETURN_IF_ERROR(graph_compiler.Compile());
-  // Explicitly clean up the step container, to capture the cleanup status.
-  step_container.reset();
-  return Status::OK();
-}
-
-// Builds the XLA computation.
-// `args` is the list of input arguments, `retvals` is the list of retvals
-// produced by _Retval operators, in index order.
-// If `return_updated_values_for_all_resources` is true, all resources will be
-// included in `resource_updates`, regardless of whether their value changed.
-// Sets `*num_nonconst_outputs` to the number of outputs of the `computation`.
-// Sets `*resource_updates` to a description of resources whose values are
-// written by the computation; the variable writes are the last
-// `resource_updates.size()` return values from the computation. Each entry in
-// `resource_updates` is a (input_index, type) pair, where `input_index` is the
-// index of a resource variable argument to the computation, and `type` is the
-// type of the final output.
-Status BuildComputation(
-    const std::vector<XlaCompiler::Argument>& args,
-    const std::vector<int>& arg_cores,
-    const std::vector<XlaContext::Retval>& retvals,
-    const std::vector<std::unique_ptr<XlaResource>>& resources,
-    std::unique_ptr<xla::XlaOp> token_output,
-    bool return_updated_values_for_all_resources, bool always_return_tuple,
-    xla::XlaBuilder* builder, xla::XlaComputation* computation,
-    int* num_computation_outputs, int* num_nonconst_outputs,
-    std::vector<XlaCompiler::OutputDescription>* outputs,
-    std::vector<XlaCompiler::ResourceUpdate>* resource_updates) {
-  std::vector<xla::XlaOp> elems;
-  elems.reserve(retvals.size());
-  for (int i = 0; i < retvals.size(); ++i) {
-    XlaCompiler::OutputDescription& output = (*outputs)[i];
-    output.type = retvals[i].type;
-    output.shape = retvals[i].shape;
-    const XlaExpression& retval = retvals[i].expression;
-    if (retval.has_constant_value()) {
-      output.is_constant = true;
-      output.constant_value = retval.constant_value();
-    } else if (retval.resource() != nullptr) {
-      output.is_constant = false;
-      output.input_index = retval.resource()->arg_num();
-    } else {
-      output.is_constant = false;
-      elems.push_back(retval.handle());
-    }
-  }
-  *num_nonconst_outputs = elems.size();
-
-  // Add return values for resources whose values have changed.
-  std::vector<const XlaResource*> arg_resources;
-  arg_resources.reserve(resources.size());
-  for (const auto& resource : resources) {
-    if (resource->arg_num() >= 0) {
-      arg_resources.push_back(resource.get());
-    }
-  }
-  std::sort(arg_resources.begin(), arg_resources.end(),
-            [](const XlaResource* a, const XlaResource* b) {
-              return a->arg_num() < b->arg_num();
-            });
-
-  // Attach a common operator name as metadata. This has no semantic effect — it
-  // merely makes the HLO graph more readable when visualized via TensorBoard,
-  // since TensorBoard forms groups out of operators with similar names.
-  xla::OpMetadata retval_metadata;
-  retval_metadata.set_op_name("XLA_Retvals");
-  builder->SetOpMetadata(retval_metadata);
-
-  for (const XlaResource* resource : arg_resources) {
-    const XlaCompiler::Argument& arg = args[resource->arg_num()];
-    const int core = arg_cores[resource->arg_num()];
-    DCHECK_LT(resource->arg_num(), arg_cores.size());
-    bool modified = !resource->value().IsIdenticalTo(resource->initial_value());
-    // TensorArray gradients were modified if their values changed or there are
-    // any newly created gradients.
-    for (const auto& grad : resource->tensor_array_gradients()) {
-      modified =
-          modified ||
-          !grad.second->value().IsIdenticalTo(grad.second->initial_value()) ||
-          arg.tensor_array_gradients.count(grad.first) == 0;
-    }
-    if (return_updated_values_for_all_resources || modified) {
-      resource_updates->emplace_back();
-      XlaCompiler::ResourceUpdate& update = resource_updates->back();
-      update.input_index = resource->arg_num();
-      update.type = resource->type();
-      update.shape = resource->shape();
-      update.modified = modified;
-      for (const auto& grad : resource->tensor_array_gradients()) {
-        update.tensor_array_gradients_accessed.insert(grad.first);
-      }
-
-      // Request that the value be returned on a specific core.
-      xla::XlaScopedShardingAssignment assign_sharding(
-          builder, core == -1 ? absl::optional<xla::OpSharding>()
-                              : xla::sharding_builder::AssignDevice(core));
-
-      xla::XlaOp handle;
-      TF_RETURN_IF_ERROR(resource->Pack(&handle, builder));
-
-      // Since we can't change the sharding metadata of <value> as this point,
-      // create a tuple/get-tuple-element combination so that sharding
-      // assignment will be placed on this value, which will cause the resource
-      // update to be returned from the same device that provided the resource.
-      handle = xla::GetTupleElement(xla::Tuple(builder, {handle}), 0);
-      elems.push_back(handle);
-    }
-  }
-
-  // If we have token output, append it as the last one.
-  if (token_output) {
-    elems.push_back(*token_output);
-  }
-
-  *num_computation_outputs = elems.size();
-
-  // Builds the XLA computation. We *always* form a tuple here to ensure that
-  // the output value is the last thing added into the XLA computation, even
-  // if there is only one output value.
-  auto tuple = xla::Tuple(builder, elems);
-  if (!always_return_tuple && elems.size() == 1) {
-    xla::GetTupleElement(tuple, 0);
-  }
-  builder->ClearOpMetadata();
-
-  xla::StatusOr<xla::XlaComputation> computation_status = builder->Build();
-  if (!computation_status.ok()) {
-    return computation_status.status();
-  }
-  *computation = computation_status.ConsumeValueOrDie();
-  return Status::OK();
-}
-
-}  // namespace
-
 // Builds XLA computations for each of the arguments to the computation.
 // `args` are the arguments to the computation.
 Status XlaCompiler::BuildArguments(
     const Graph& graph, const std::vector<XlaCompiler::Argument>& args,
     bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context,
-    std::vector<int>* arg_cores, std::vector<XlaExpression>* arg_expressions,
+    const std::map<int, int>& arg_cores,
+    std::vector<XlaExpression>* arg_expressions,
     std::vector<int>* input_mapping, std::vector<xla::Shape>* input_shapes,
     bool is_entry_computation) {
   arg_expressions->resize(args.size());
-  *arg_cores = std::vector<int>(args.size(), -1);
 
   // Argument numbers of arguments and resources that are to be passed to the
   // XLA computation as runtime parameters.
@@ -504,7 +644,7 @@ Status XlaCompiler::BuildArguments(
             arg.resource_kind, i, arg.name, arg.type, arg.shape, xla::XlaOp(),
             /*tensor_array_size=*/arg.tensor_array_size,
             /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource));
-        arg_expression.set_resource(resource);
+        arg_expression = XlaExpression::Resource(resource);
         if (arg.initialized) {
           input_mapping->push_back(i);
         }
@@ -516,7 +656,7 @@ Status XlaCompiler::BuildArguments(
         break;
       }
       case XlaCompiler::Argument::kConstant:
-        arg_expression.set_constant_value(arg.constant_value);
+        arg_expression = XlaExpression::Constant(arg.constant_value);
         break;
       case XlaCompiler::Argument::kInvalid:
         return errors::Internal(
@@ -541,26 +681,6 @@ Status XlaCompiler::BuildArguments(
     *input_shapes = arg_shapes;
   }
 
-  // Use the _Arg nodes in the graph to resolve core assignments.
-  for (const Node* n : graph.nodes()) {
-    if (absl::string_view(n->type_string()) != "_Arg") continue;
-    int index;
-    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
-    TF_RET_CHECK(index >= 0 && index < args.size())
-        << "_Arg out of bounds: " << index << " vs " << args.size();
-    TF_ASSIGN_OR_RETURN(
-        auto sharding,
-        ParseShardingFromDevice(*n, std::numeric_limits<int32>::max()));
-    if (sharding.has_value()) {
-      TF_RET_CHECK(sharding.value().type() ==
-                   xla::OpSharding::Type::OpSharding_Type_MAXIMAL);
-      const int core = sharding.value().tile_assignment_devices(0);
-      if ((*arg_cores)[index] == -1 || core < (*arg_cores)[index]) {
-        (*arg_cores)[index] = core;
-      }
-    }
-  }
-
   // Attach a common operator name as metadata. This has no semantic effect — it
   // merely makes the HLO graph more readable when visualized via TensorBoard,
   // since TensorBoard forms groups out of operators with similar names.
@@ -576,11 +696,10 @@ Status XlaCompiler::BuildArguments(
       xla::OpSharding tuple_sharding;
       tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
       for (int64 parameter : *input_mapping) {
-        const int core = (*arg_cores)[parameter];
-        const int root_device = 0;
+        auto it = arg_cores.find(parameter);
+        const int core = it == arg_cores.end() ? 0 : it->second;
         *tuple_sharding.add_tuple_shardings() =
-            core == -1 ? xla::sharding_builder::AssignDevice(root_device)
-                       : xla::sharding_builder::AssignDevice(core);
+            xla::sharding_builder::AssignDevice(core);
       }
       xla::XlaScopedShardingAssignment assign_tuple_sharding(builder,
                                                              tuple_sharding);
@@ -589,7 +708,8 @@ Status XlaCompiler::BuildArguments(
       tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple");
     }
     for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-      const int core = (*arg_cores)[input_mapping->at(i)];
+      auto it = arg_cores.find(i);
+      const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
           builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
@@ -597,7 +717,8 @@ Status XlaCompiler::BuildArguments(
     }
   } else {
     for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-      const int core = (*arg_cores)[input_mapping->at(i)];
+      auto it = arg_cores.find(i);
+      const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
           builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
@@ -632,14 +753,14 @@ Status XlaCompiler::BuildArguments(
         // TODO(b/76097077): propagate device assignments onto arguments and
         // return values of functions, and then reshape unconditionally.
         if (is_entry_computation) {
-          arg_expression.set_handle(
-              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()));
+          arg_expression = XlaExpression::XlaOp(
+              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()), arg.type);
         } else {
-          arg_expression.set_handle(arg_handles[i]);
+          arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         }
         break;
       case XlaCompiler::Argument::kToken: {
-        arg_expression.set_handle(arg_handles[i]);
+        arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         break;
       }
       case XlaCompiler::Argument::kConstant:
@@ -653,46 +774,48 @@ Status XlaCompiler::BuildArguments(
 }
 
 Status XlaCompiler::CompileSingleOp(
-    const XlaCompiler::CompileOptions& options, string const& name,
-    OpKernelContext* ctx, const std::vector<XlaCompiler::Argument>& args,
-    CompilationResult* result) {
+    const XlaCompiler::CompileOptions& options, const NodeDef& node_def,
+    absl::Span<const XlaCompiler::Argument> args,
+    absl::Span<const DataType> result_types, CompilationResult* result) {
   // TODO(b/74182462): We implement this by creating a new dummy Graph including
   // _Arg nodes, and let CompileGraph walk it. This could be optimized.
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   Status status;
   // First create the actual node we care about computing.
-  Node* main_node = graph->AddNode(ctx->op_kernel().def(), &status);
+  Node* main_node = graph->AddNode(node_def, &status);
   TF_RETURN_IF_ERROR(status);
 
   // Create dummy _Arg nodes. Link these to `node` and also via a control
   // dependency edge to the _SOURCE node.
-  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+  for (int64 i = 0; i < args.size(); ++i) {
     Node* node;
-    string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_arg");
-    Status status = NodeBuilder(name, "_Arg")
-                        .ControlInput(graph->source_node())
-                        .Attr("T", ctx->input_dtype(i))
-                        .Attr("index", i)
-                        .Finalize(graph.get(), &node);
+    string arg_name = absl::StrCat("_arg", i);
+    Status status =
+        NodeBuilder(arg_name, FunctionLibraryDefinition::kArgOp)
+            .ControlInput(graph->source_node())
+            .Attr("T", args[i].kind == Argument::kResource ? DT_RESOURCE
+                                                           : args[i].type)
+            .Attr("index", i)
+            .Finalize(graph.get(), &node);
     TF_RETURN_IF_ERROR(status);
     graph->AddEdge(node, 0, main_node, i);
   }
 
   // Similarly with return values, create dummy _Retval nodes fed by `node`.
-  for (int64 i = 0; i < ctx->num_outputs(); ++i) {
+  for (int64 i = 0; i < result_types.size(); ++i) {
     Node* node;
-    string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_retval");
-    Status status = NodeBuilder(name, "_Retval")
+    string retval_name = absl::StrCat("_retval", i);
+    Status status = NodeBuilder(retval_name, FunctionLibraryDefinition::kRetOp)
                         .Input(main_node, i)
-                        .Attr("T", ctx->expected_output_dtype(i))
+                        .Attr("T", result_types[i])
                         .Attr("index", i)
                         .Finalize(graph.get(), &node);
     TF_RETURN_IF_ERROR(status);
   }
   FixupSourceAndSinkEdges(graph.get());
 
-  return CompileGraph(options, name, std::move(graph), args, result);
+  return CompileGraph(options, node_def.name(), std::move(graph), args, result);
 }
 
 namespace {
@@ -747,12 +870,38 @@ Status ValidateGraph(const Graph* graph,
   return Status::OK();
 }
 
+// Converts the value of any expressions whose values are known at compile-time
+// to constants.
+Status ResolveConstantExpressionsToConstants(
+    xla::Client* client, absl::Span<XlaExpression> expressions) {
+  for (XlaExpression& expression : expressions) {
+    if (expression.kind() == XlaExpression::Kind::kXlaOp) {
+      TF_ASSIGN_OR_RETURN(absl::optional<Tensor> constant,
+                          expression.ResolveConstant(client));
+      if (constant.has_value()) {
+        expression = XlaExpression::Constant(*constant);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void ConvertConstantsToExpressions(xla::XlaBuilder* builder,
+                                   absl::Span<XlaExpression> expressions) {
+  for (XlaExpression& expression : expressions) {
+    if (expression.kind() == XlaExpression::Kind::kConstant) {
+      expression =
+          XlaExpression::XlaOp(expression.AsXlaOp(builder), expression.dtype());
+    }
+  }
+}
+
 }  // namespace
 
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  string const& name,
                                  std::unique_ptr<Graph> graph,
-                                 const std::vector<XlaCompiler::Argument>& args,
+                                 absl::Span<const XlaCompiler::Argument> args,
                                  CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate XlaBuilder.";
 
@@ -774,13 +923,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                    options_.device_type, name));
 
   xla::XlaBuilder builder(name);
-  XlaContext* context = new XlaContext(
-      this, &builder, options_.allow_cpu_custom_calls,
-      options.resolve_compile_time_constants, options.is_entry_computation,
-      &options_.shape_representation_fn);
+  XlaContext* context =
+      new XlaContext(this, &builder, options_.allow_cpu_custom_calls,
+                     &options_.shape_representation_fn);
   core::ScopedUnref context_unref(context);
 
-  std::vector<XlaCompiler::Argument> real_args(args);
+  std::vector<XlaCompiler::Argument> real_args(args.begin(), args.end());
   int token_input_index = -1;
   std::unique_ptr<xla::XlaOp> token_output;
   if (options.add_token_input_output) {
@@ -792,10 +940,14 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
     real_args.push_back(token_arg);
   }
 
+  std::map<int, int> arg_cores;
+  std::map<int, int> retval_cores;
+  TF_ASSIGN_OR_RETURN(std::tie(arg_cores, retval_cores),
+                      ComputeArgAndRetvalCores(*graph));
+
   std::vector<XlaExpression> arg_expressions;
-  std::vector<int> arg_cores;
   TF_RETURN_IF_ERROR(BuildArguments(
-      *graph, real_args, options.use_tuple_arg, &builder, context, &arg_cores,
+      *graph, real_args, options.use_tuple_arg, &builder, context, arg_cores,
       &arg_expressions, &result->input_mapping, &result->xla_input_shapes,
       options.is_entry_computation));
   context->set_args(std::move(arg_expressions));
@@ -843,28 +995,27 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   int num_computation_outputs;
   result->computation = std::make_shared<xla::XlaComputation>();
   result->outputs.resize(context->retvals().size());
+  std::vector<XlaExpression> retvals = context->retvals();
+  if (options.resolve_compile_time_constants) {
+    TF_RETURN_IF_ERROR(ResolveConstantExpressionsToConstants(
+        client(), absl::Span<XlaExpression>(retvals)));
+  } else {
+    ConvertConstantsToExpressions(&builder, absl::Span<XlaExpression>(retvals));
+  }
   TF_RETURN_IF_ERROR(BuildComputation(
-      real_args, arg_cores, context->retvals(), context->resources(),
-      std::move(token_output), options.return_updated_values_for_all_resources,
+      real_args, retvals, arg_cores, retval_cores, context->resources(),
+      std::move(token_output),
+      options.is_entry_computation ? options_.shape_representation_fn
+                                   : ShapeRepresentationFn{},
+      options.return_updated_values_for_all_resources,
       options.always_return_tuple, &builder, result->computation.get(),
       &num_computation_outputs, &num_nonconst_outputs, &result->outputs,
-      &result->resource_updates));
+      &result->resource_updates, &result->xla_output_shape));
 
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
-
-  // Compute the XLA output shape, if there is a computation with non-constant
-  // outputs.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> computation_shape,
-                      client()->GetComputationShape(*result->computation));
-
-  result->xla_output_shape.Swap(computation_shape->mutable_result());
   VLOG(2) << "XLA output shape: "
-          << xla::ShapeUtil::HumanString(result->xla_output_shape);
-
-  // Tensorflow expects a major-to-minor order of results.
-  xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
-
+          << xla::ShapeUtil::HumanStringWithLayout(result->xla_output_shape);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 2cc603a58016a509fafdf6f95423dd6c0864cce3..63426124686e1b92a3534b7e365b8282008b8455 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -18,10 +18,13 @@ limitations under the License.
 
 #include <stack>
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/device.h"
@@ -118,7 +121,7 @@ class XlaCompiler {
 
     // The type of the argument. If the argument is a resource, this
     // is the type of the variable's value, not DT_RESOURCE.
-    DataType type;
+    DataType type = DT_INVALID;
 
     // The shape of the argument. For:
     // * a parameter: the shape of the parameter.
@@ -155,6 +158,9 @@ class XlaCompiler {
     std::set<string> tensor_array_gradients;
 
     bool operator==(const Argument& other) const;
+
+    // Returns a human-readable summary of the argument.
+    string HumanString() const;
   };
 
   // Options pertaining to an individual call to CompileGraph() or
@@ -259,8 +265,7 @@ class XlaCompiler {
     std::shared_ptr<xla::XlaComputation> computation;
   };
 
-  typedef std::function<xla::StatusOr<TensorShape>(const TensorShape&,
-                                                   DataType)>
+  typedef std::function<xla::StatusOr<xla::Shape>(const TensorShape&, DataType)>
       ShapeRepresentationFn;
   struct Options {
     // Name of the compilation device to use. It must be set by the caller.
@@ -316,22 +321,23 @@ class XlaCompiler {
 
   Status CompileFunction(const CompileOptions& options,
                          const NameAttrList& fn_name_attrs,
-                         std::vector<Argument> args, CompilationResult* result);
+                         absl::Span<const Argument> args,
+                         CompilationResult* result);
 
   // Compiles a tensorflow::Graph into an xla::XlaComputation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
   Status CompileGraph(const CompileOptions& options, string const& name,
                       std::unique_ptr<Graph> graph,
-                      const std::vector<Argument>& args,
+                      absl::Span<const Argument> args,
                       CompilationResult* result);
 
-  // Compiles a single Op, given by an OpKernelContext, into an
+  // Compiles a single Op, given by `node_def`, into an
   // xla::XlaComputation. Similar to CompileFunction but takes a single Op as
   // input.
-  Status CompileSingleOp(const CompileOptions& options, string const& name,
-                         OpKernelContext* ctx,
-                         const std::vector<Argument>& args,
+  Status CompileSingleOp(const CompileOptions& options, const NodeDef& node_def,
+                         absl::Span<const Argument> args,
+                         absl::Span<const DataType> result_types,
                          CompilationResult* result);
 
   // Returns the shape of the XLA parameter for an argument 'arg'.
@@ -411,7 +417,8 @@ class XlaCompiler {
   Status BuildArguments(const Graph& graph,
                         const std::vector<XlaCompiler::Argument>& args,
                         bool use_tuple_arg, xla::XlaBuilder* builder,
-                        XlaContext* context, std::vector<int>* arg_cores,
+                        XlaContext* context,
+                        const std::map<int, int>& arg_cores,
                         std::vector<XlaExpression>* arg_expressions,
                         std::vector<int>* input_mapping,
                         std::vector<xla::Shape>* input_shapes,
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 4ef154f856b9284a6c97f2c3072b198ccfb5e517..eba5d77efabd752f8476c27e95610343c54ea460 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -909,6 +911,82 @@ TEST_F(XlaCompilerTest, Variables) {
   RunAndCheckVariablesComputation(client_, result);
 }
 
+TEST_F(XlaCompilerTest, ResultLayoutSingle) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Retval(scope.WithOpName("RET"), a, 0);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  // Sets the representation function to return a non-default layout.
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  auto compile_options = XlaCompiler::CompileOptions();
+  compile_options.always_return_tuple = false;
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "id", std::move(graph),
+                                     args, &result));
+  EXPECT_TRUE(xla::ShapeUtil::Equal(
+      result.xla_output_shape,
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1})));
+}
+
+TEST_F(XlaCompilerTest, ResultLayoutMultiple) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Retval(scope.WithOpName("RET1"), a, 0);
+  auto c = ops::_Retval(scope.WithOpName("RET2"), a, 1);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  // Sets the representation function to return a non-default layout.
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "id",
+                                     std::move(graph), args, &result));
+  xla::Shape result_shape =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
+
+  EXPECT_TRUE(xla::ShapeUtil::Equal(
+      result.xla_output_shape,
+      xla::ShapeUtil::MakeTupleShape({result_shape, result_shape})));
+}
+
 // Tests a simple graph that reads and writes a variable.
 TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) {
   Scope scope = Scope::NewRootScope().ExitOnError();
@@ -1018,9 +1096,11 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
 
   // Compiles the graph.
   XlaCompiler::Options options = DefaultOptions();
-  options.shape_representation_fn = [](const TensorShape& shape,
-                                       DataType type) {
-    return TensorShape({shape.num_elements()});
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::PrimitiveType ptype;
+    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype));
+    return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()});
   };
   XlaCompiler compiler(options);
 
@@ -1086,9 +1166,11 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
 
   // Compiles the graph.
   XlaCompiler::Options options = DefaultOptions();
-  options.shape_representation_fn = [](const TensorShape& shape,
-                                       DataType type) {
-    return TensorShape({shape.num_elements()});
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::PrimitiveType ptype;
+    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype));
+    return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()});
   };
   XlaCompiler compiler(options);
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 20e1ee2ddb390edd3a7d881022c68072a69193dc..43095fbb47351617a0de12a088c947106ccaa641 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -64,63 +64,23 @@ void XlaContext::set_args(std::vector<XlaExpression> args) {
 
 XlaContext::XlaContext(
     XlaCompiler* compiler, xla::XlaBuilder* builder,
-    bool allow_cpu_custom_calls, bool resolve_compile_time_constants,
-    bool is_entry_computation,
-    const std::function<xla::StatusOr<TensorShape>(
+    bool allow_cpu_custom_calls,
+    const std::function<xla::StatusOr<xla::Shape>(
         const TensorShape&, DataType)>* shape_representation_fn)
     : compiler_(compiler),
       builder_(builder),
       allow_cpu_custom_calls_(allow_cpu_custom_calls),
-      resolve_compile_time_constants_(resolve_compile_time_constants),
-      is_entry_computation_(is_entry_computation),
       shape_representation_fn_(shape_representation_fn) {}
 
 string XlaContext::DebugString() { return "TLA JIT context"; }
 
-// This is called by the Retval Op to associate a computed value
-// with a specific return value of the subgraph.
-void XlaContext::AddRetval(int retval_index, DataType type,
-                           const TensorShape& shape, const xla::XlaOp& handle) {
-  VLOG(1) << "Added retval index " << retval_index << " to XLA computation";
-  // Add the return value to the list being built up.
-  if (retvals_.size() <= retval_index) {
-    retvals_.resize(retval_index + 1);
+void XlaContext::SetRetval(int index, const XlaExpression& expression) {
+  if (retvals_.size() <= index) {
+    retvals_.resize(index + 1);
   }
-  XlaExpression e;
-  e.set_handle(handle);
-  retvals_[retval_index] = Retval{type, shape, e};
+  retvals_[index] = expression;
 }
 
-Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
-                                  const xla::LiteralSlice& literal) {
-  VLOG(1) << "Adding retval index " << retval_index
-          << " with non-data-dependent tensor to XLA computation";
-  if (retvals_.size() <= retval_index) {
-    retvals_.resize(retval_index + 1);
-  }
-  Tensor value;
-  TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value));
-  XlaExpression e;
-  e.set_constant_value(value);
-  retvals_[retval_index] = Retval{dtype, value.shape(), e};
-  return Status::OK();
-}
-
-Status XlaContext::AddResourceRetval(int retval_index, XlaResource* resource) {
-  VLOG(1) << "Adding retval index " << retval_index << " with resource "
-          << resource->name() << ":" << resource->shape().DebugString()
-          << " to XLA computation";
-  if (retvals_.size() <= retval_index) {
-    retvals_.resize(retval_index + 1);
-  }
-  XlaExpression e;
-  e.set_resource(resource);
-  retvals_[retval_index] = Retval{DT_RESOURCE, resource->shape(), e};
-  return Status::OK();
-}
-
-xla::XlaBuilder* XlaContext::builder() { return builder_; }
-
 Status XlaContext::CreateResource(
     XlaResource::Kind kind, int arg_num, string name, DataType type,
     TensorShape shape, const xla::XlaOp& handle, int64 tensor_array_size,
@@ -133,7 +93,7 @@ Status XlaContext::CreateResource(
   return Status::OK();
 }
 
-xla::StatusOr<TensorShape> XlaContext::RepresentationShape(
+xla::StatusOr<xla::Shape> XlaContext::RepresentationShape(
     const TensorShape& shape, DataType type) const {
   return (*shape_representation_fn_)(shape, type);
 }
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 4da891634e97dd67af0ef09ef33dbc7a4d19743b..dbfd344c9bad8a5d05abb6a3b902ed3baebbe02a 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -20,8 +20,8 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -46,9 +46,8 @@ class XlaContext : public ResourceBase {
   // Creates a new XlaContext. See the documentation on the class data fields
   // for descriptions of the arguments.
   XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder,
-             bool allow_cpu_custom_calls, bool resolve_compile_time_constants,
-             bool is_entry_computation,
-             const std::function<xla::StatusOr<TensorShape>(
+             bool allow_cpu_custom_calls,
+             const std::function<xla::StatusOr<xla::Shape>(
                  const TensorShape&, DataType)>* shape_representation_fn);
 
   // Virtual method defined by ResourceBase.
@@ -57,37 +56,19 @@ class XlaContext : public ResourceBase {
   XlaCompiler* compiler() const { return compiler_; }
 
   // Returns the XlaBuilder that Ops use for compiling new expressions.
-  xla::XlaBuilder* builder();
+  xla::XlaBuilder* builder() { return builder_; }
 
   bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
 
-  bool resolve_compile_time_constants() const {
-    return resolve_compile_time_constants_;
-  }
-  bool is_entry_computation() const { return is_entry_computation_; }
-
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
-  struct Retval {
-    DataType type;
-    TensorShape shape;
-    // An XlaExpression representing the Retval's value.
-    XlaExpression expression;
-  };
-  const std::vector<Retval>& retvals() { return retvals_; }
-
-  // This is called by the Retval Op to associate a computed value
-  // with a specific return value of the subgraph.
-  void AddRetval(int retval_index, DataType type, const TensorShape& shape,
-                 const xla::XlaOp& handle);
+  const std::vector<XlaExpression>& retvals() { return retvals_; }
 
-  // As for Retval, but for return values that are compile-time constants.
-  Status AddConstRetval(int retval_index, DataType dtype,
-                        const xla::LiteralSlice& literal);
-
-  // As for Retval, but for return values that are resource handles.
-  Status AddResourceRetval(int retval_index, XlaResource* resource);
+  // Sets a return value.
+  // Since we do not always know in advance how many return values there are,
+  // grows the return values vector to size index+1 if it is smaller.
+  void SetRetval(int index, const XlaExpression& expression);
 
   // Creates a resource with resource `kind` and initial value `handle`. `name`
   // is a descriptive name for use in error messages. See the `XlaResource`
@@ -105,8 +86,8 @@ class XlaContext : public ResourceBase {
 
   // Returns the XLA shape to be used to represent a variable of TF `shape`
   // and `type`, or of an argument or return value of a top-level computation.
-  xla::StatusOr<TensorShape> RepresentationShape(const TensorShape& shape,
-                                                 DataType type) const;
+  xla::StatusOr<xla::Shape> RepresentationShape(const TensorShape& shape,
+                                                DataType type) const;
 
   // Get an XLA lambda to compute Max. This is cached in the
   // XlaContext since it may be used by multiple Ops. There is a
@@ -140,31 +121,19 @@ class XlaContext : public ResourceBase {
   // Allow ops to emit CustomCall operations for CPU.
   const bool allow_cpu_custom_calls_;
 
-  // If true, constant return values are returned as Tensors instead of
-  // run-time computation outputs.
-  const bool resolve_compile_time_constants_;
-
   // Arguments to the Tensorflow graph, indexed by _Arg index.
   // Includes both compile-time constant arguments and runtime parameters.
   std::vector<XlaExpression> args_;
 
   // Return values of the Tensorflow graph, indexed by _Retval index.
-  std::vector<Retval> retvals_;
+  std::vector<XlaExpression> retvals_;
 
   // Holds ownership of resources. The resources are not ordered.
   std::vector<std::unique_ptr<XlaResource>> resources_;
 
-  // Is this a top-level computation, or an inner computation (e.g., a while
-  // body)?
-  const bool is_entry_computation_;
-
-  // A function that describes how the shapes of
-  // a) argument and return value, for entry computations
-  // b) variables, for all computations,
-  // should be represented in XLA. Parameters/return values will be shaped
-  // according to this function, and reshaped back to/from their declared shapes
-  // for computations. Must be non-null.
-  const std::function<xla::StatusOr<TensorShape>(const TensorShape&, DataType)>*
+  // Describes the on-host shapes of parameters and return values. Also see:
+  // XlaDevice::Options::shape_representation_fn.
+  const std::function<xla::StatusOr<xla::Shape>(const TensorShape&, DataType)>*
       shape_representation_fn_;
 
   // Cache of prebuilt computations indexed by their type.
diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca0309166b7c73d1a5a818091e2a30fa112a4de4
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_expression.cc
@@ -0,0 +1,145 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
+
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+XlaExpression::XlaExpression() = default;
+
+XlaExpression XlaExpression::Invalid() {
+  XlaExpression e;
+  e.kind_ = Kind::kInvalid;
+  return e;
+}
+
+XlaExpression XlaExpression::Constant(Tensor value) {
+  XlaExpression e;
+  e.kind_ = Kind::kConstant;
+  e.dtype_ = value.dtype();
+  e.constant_value_ = value;
+  return e;
+}
+
+XlaExpression XlaExpression::XlaOp(xla::XlaOp value, DataType dtype) {
+  XlaExpression e;
+  e.kind_ = Kind::kXlaOp;
+  e.dtype_ = dtype;
+  e.handle_ = value;
+  return e;
+}
+
+XlaExpression XlaExpression::Resource(XlaResource* resource) {
+  XlaExpression e;
+  e.kind_ = Kind::kResource;
+  e.dtype_ = DT_RESOURCE;
+  e.resource_ = resource;
+  return e;
+}
+
+string XlaExpression::HumanString() const {
+  switch (kind_) {
+    case Kind::kInvalid:
+      return "invalid";
+    case Kind::kConstant:
+      return "constant";
+    case Kind::kXlaOp:
+      return "xla_op";
+    case Kind::kResource:
+      return "resource";
+  }
+}
+
+xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const {
+  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+    switch (kind_) {
+      case Kind::kConstant: {
+        xla::BorrowingLiteral literal;
+        TF_RETURN_IF_ERROR(
+            HostTensorToBorrowingLiteral(constant_value_, &literal));
+        return xla::ConstantLiteral(builder, literal);
+      }
+      case Kind::kXlaOp:
+        if (builder != handle_.builder()) {
+          return errors::InvalidArgument(
+              "Mismatched builders in XlaExpression::AsXlaOp");
+        }
+        return handle_;
+      default:
+        return errors::InvalidArgument("AsXlaOp called on XlaExpression: ",
+                                       HumanString());
+    }
+  });
+}
+
+xla::StatusOr<absl::optional<Tensor>> XlaExpression::ResolveConstant(
+    xla::Client* client) const {
+  switch (kind()) {
+    case Kind::kConstant:
+      return {constant_value()};
+    case Kind::kXlaOp:
+      break;
+    case Kind::kResource:
+    case Kind::kInvalid:
+      return errors::InvalidArgument(
+          "ResolveConstant called on XlaExpression: ", HumanString());
+  }
+
+  TF_ASSIGN_OR_RETURN(bool is_constant,
+                      handle().builder()->IsConstant(handle()));
+  if (!is_constant) return {absl::nullopt};
+
+  TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph,
+                      handle().builder()->BuildConstantSubGraph(handle()));
+
+  TF_ASSIGN_OR_RETURN(TensorShape shape, GetShape());
+
+  // The XLA layout is specified minor to major, and TensorFlow uses a major to
+  // minor order.
+  std::vector<int64> layout_indices(shape.dims());
+  std::iota(layout_indices.rbegin(), layout_indices.rend(), 0);
+  xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices);
+  TF_ASSIGN_OR_RETURN(xla::Literal literal,
+                      client->ComputeConstant(constant_graph, &layout));
+  Tensor tensor;
+  TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype(), &tensor));
+  return {tensor};
+}
+
+xla::StatusOr<TensorShape> XlaExpression::GetShape() const {
+  switch (kind_) {
+    case Kind::kConstant:
+      return constant_value().shape();
+    case Kind::kXlaOp: {
+      TF_ASSIGN_OR_RETURN(xla::Shape xla_shape,
+                          handle().builder()->GetShape(handle()));
+      TensorShape shape;
+      TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, &shape));
+      return shape;
+    }
+    case Kind::kResource:
+      return TensorShape({});
+    case Kind::kInvalid:
+      return errors::InvalidArgument(
+          "GetShape() called on invalid XlaExpression");
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h
new file mode 100644
index 0000000000000000000000000000000000000000..bed6761d362a98d344003c1edea342e68c31ef07
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_expression.h
@@ -0,0 +1,115 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_
+#define TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// A XlaExpression represents a symbolic TensorFlow value in a TF->XLA
+// compilation.
+// An expression is one of:
+// * a constant tensor.
+// * an xla::XlaOp, representing a symbolic XLA value.
+// * a resource, e.g., a variable, represented as an XlaResource pointer.
+//
+// Constant tensors are mostly an optimization to avoid passing large constants
+// to XLA, but are also sometimes used to represent tensors that have no XLA
+// representation, for example, DT_STRING tensors. A canonical use case might be
+// an error message string.
+class XlaExpression {
+ public:
+  enum class Kind {
+    kInvalid,
+    kConstant,
+    kXlaOp,
+    kResource,
+  };
+
+  XlaExpression();
+  XlaExpression(const XlaExpression&) = default;
+  XlaExpression& operator=(const XlaExpression&) = default;
+
+  // Builds an invalid expression. (Same as the default constructor, but makes
+  // the intent clearer.)
+  static XlaExpression Invalid();
+
+  // Builds a constant XLA expression.
+  static XlaExpression Constant(Tensor value);
+
+  // Builds a XlaOp expression. Since the mapping from TF data types to XLA
+  // types is not 1-1, the TF type must also be provided; in general it cannot
+  // be derived from the XLA type.
+  static XlaExpression XlaOp(xla::XlaOp value, DataType dtype);
+
+  // Builds a resource expression.
+  static XlaExpression Resource(XlaResource* resource);
+
+  Kind kind() const { return kind_; }
+
+  DataType dtype() const { return dtype_; }
+
+  // handle() returns the XlaOp that backs a kXlaOp expression.
+  const xla::XlaOp& handle() const { return handle_; }
+
+  const Tensor& constant_value() const { return constant_value_; }
+
+  XlaResource* resource() const { return resource_; }
+
+  // Returns a human-readable summary of the expression.
+  string HumanString() const;
+
+  // Returns the value of a kConstant or kXlaOp as an xla::XlaOp. Returns
+  // an erroneous XlaOp if the expression is not a constant or an expression.
+  xla::XlaOp AsXlaOp(xla::XlaBuilder* builder) const;
+
+  // If a kXlaOp or kConstant expression can be resolved to a compile-time
+  // constant, returns the value as a host-memory Tensor. Returns an empty
+  // optional if it cannot be resolved. Returns an error if passed a resource
+  // expression.
+  xla::StatusOr<absl::optional<Tensor>> ResolveConstant(
+      xla::Client* client) const;
+
+  // Returns the shape of the tensor.
+  // The shape of a resource is the shape of a resource handle (i.e., a scalar),
+  // not the shape of the resource's value.
+  xla::StatusOr<TensorShape> GetShape() const;
+
+ private:
+  Kind kind_ = Kind::kInvalid;
+
+  DataType dtype_ = DT_INVALID;
+
+  // The XLA handle of the expression's computation, if kind_ == kXlaOp.
+  xla::XlaOp handle_;
+
+  // The value of the constant, if kind_ == kConstant.
+  Tensor constant_value_;
+
+  // The resource, if kind_ == kResource. Not owned.
+  XlaResource* resource_ = nullptr;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_
diff --git a/tensorflow/compiler/tf2xla/xla_expression_test.cc b/tensorflow/compiler/tf2xla/xla_expression_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84202c931390f2d68f6d381aef0752bfff00a53d
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_expression_test.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaExpressionTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    client_ = xla::ClientLibrary::LocalClientOrDie();
+    builder_ = absl::make_unique<xla::XlaBuilder>("acomputation");
+    constant_ = test::AsScalar<int32>(42);
+    op_ = xla::ConstantR0<int32>(builder_.get(), 7);
+    non_constant_op_ = xla::Parameter(
+        builder_.get(), 0, xla::ShapeUtil::MakeShape(xla::F32, {}), "x");
+    resource_ = absl::make_unique<XlaResource>(
+        XlaResource::kVariable, /*arg_num=*/0, /*name=*/string("avariable"),
+        DT_INT32, TensorShape({17, 3}), op_, /*tensor_array_size=*/-1,
+        /*tensor_array_gradients=*/std::set<string>(),
+        /*tensor_array_multiple_writes_aggregate=*/false);
+  }
+
+  xla::Client* client_;
+  std::unique_ptr<xla::XlaBuilder> builder_;
+  Tensor constant_;
+  xla::XlaOp op_;
+  xla::XlaOp non_constant_op_;
+  std::unique_ptr<XlaResource> resource_;
+};
+
+TEST_F(XlaExpressionTest, Kind) {
+  EXPECT_TRUE(XlaExpression::Kind::kInvalid == XlaExpression().kind());
+  EXPECT_TRUE(XlaExpression::Kind::kInvalid == XlaExpression::Invalid().kind());
+  EXPECT_TRUE(XlaExpression::Kind::kConstant ==
+              XlaExpression::Constant(constant_).kind());
+  EXPECT_TRUE(XlaExpression::Kind::kXlaOp ==
+              XlaExpression::XlaOp(op_, DT_INT32).kind());
+  EXPECT_TRUE(XlaExpression::Kind::kResource ==
+              XlaExpression::Resource(resource_.get()).kind());
+}
+
+TEST_F(XlaExpressionTest, HumanString) {
+  EXPECT_EQ("invalid", XlaExpression().HumanString());
+  EXPECT_EQ("invalid", XlaExpression::Invalid().HumanString());
+  EXPECT_EQ("constant", XlaExpression::Constant(constant_).HumanString());
+  EXPECT_EQ("xla_op", XlaExpression::XlaOp(op_, DT_INT32).HumanString());
+  EXPECT_EQ("resource", XlaExpression::Resource(resource_.get()).HumanString());
+}
+
+TEST_F(XlaExpressionTest, AsXlaOp) {
+  xla::XlaOp op_as_op =
+      XlaExpression::XlaOp(op_, DT_INT32).AsXlaOp(builder_.get());
+  EXPECT_TRUE(op_.IsIdenticalTo(op_as_op));
+
+  xla::XlaOp const_as_op =
+      XlaExpression::Constant(constant_).AsXlaOp(builder_.get());
+  TF_ASSERT_OK_AND_ASSIGN(xla::XlaComputation computation,
+                          builder_->BuildConstantSubGraph(const_as_op));
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal value,
+                          client_->ComputeConstant(computation));
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(xla::LiteralUtil::CreateR0<int32>(42),
+                                          value));
+}
+
+TEST_F(XlaExpressionTest, GetShape) {
+  EXPECT_FALSE(XlaExpression().GetShape().ok());
+  EXPECT_FALSE(XlaExpression::Invalid().GetShape().ok());
+
+  TF_ASSERT_OK_AND_ASSIGN(TensorShape resource_shape,
+                          XlaExpression::Resource(resource_.get()).GetShape());
+  EXPECT_EQ(TensorShape({}), resource_shape);
+
+  TF_ASSERT_OK_AND_ASSIGN(TensorShape op_shape,
+                          XlaExpression::XlaOp(op_, DT_INT32).GetShape());
+  EXPECT_EQ(TensorShape({}), op_shape);
+
+  TF_ASSERT_OK_AND_ASSIGN(TensorShape constant_shape,
+                          XlaExpression::Constant(constant_).GetShape());
+  EXPECT_EQ(TensorShape({}), constant_shape);
+}
+
+TEST_F(XlaExpressionTest, ResolveConstant) {
+  EXPECT_FALSE(XlaExpression().ResolveConstant(client_).ok());
+  EXPECT_FALSE(XlaExpression::Invalid().ResolveConstant(client_).ok());
+  EXPECT_FALSE(
+      XlaExpression::Resource(resource_.get()).ResolveConstant(client_).ok());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      absl::optional<Tensor> op_constant,
+      XlaExpression::XlaOp(op_, DT_INT32).ResolveConstant(client_));
+  ASSERT_TRUE(op_constant.has_value());
+  test::ExpectTensorEqual<int32>(test::AsScalar<int32>(7), *op_constant);
+
+  TF_ASSERT_OK_AND_ASSIGN(absl::optional<Tensor> op_nonconstant,
+                          XlaExpression::XlaOp(non_constant_op_, DT_FLOAT)
+                              .ResolveConstant(client_));
+  EXPECT_FALSE(op_nonconstant.has_value());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      absl::optional<Tensor> constant_constant,
+      XlaExpression::Constant(constant_).ResolveConstant(client_));
+  ASSERT_TRUE(constant_constant.has_value());
+  test::ExpectTensorEqual<int32>(constant_, *constant_constant);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 9a34cd8c6ae2dc6d52a3cc69168df96f5322c6da..af378bc95c096082ff5cd963b9d6156f4351cd8d 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/types.h"
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index dd3498ef7aa242d3ad946cae5f60bc2c8853a342..8dd8def0549f2b39d4c9863bb535f19703c3ef22 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
@@ -43,32 +44,36 @@ xla::XlaBuilder* XlaOpKernelContext::builder() const {
 static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) {
   const XlaExpression* expression =
       reinterpret_cast<const XlaExpression*>(tensor.tensor_data().data());
-  CHECK(expression->handle().valid() || expression->resource() != nullptr);
-  VLOG(1) << "Fetched T" << expression->handle();
+  CHECK(expression->kind() != XlaExpression::Kind::kInvalid)
+      << expression->HumanString();
   return expression;
 }
 
-// Retrieves an uninitialized XlaExpression from a newly-allocated tensor.
-static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor) {
+// Assigns an XlaExpression to a tensor on an XLA compilation device.
+static void AssignExpressionToTensor(Tensor* tensor,
+                                     const XlaExpression& value) {
   const XlaExpression* expression =
       reinterpret_cast<const XlaExpression*>(tensor->tensor_data().data());
-  CHECK(!expression->handle().valid());
-  return const_cast<XlaExpression*>(expression);
+  CHECK(expression->kind() == XlaExpression::Kind::kInvalid)
+      << expression->HumanString();
+  *const_cast<XlaExpression*>(expression) = value;
 }
 
-// Retrieves the XlaOp from an input Tensor to an Op. This computation was
-// constructed by an Op that executed previously and created the output Tensor
-// using CreateOutputTensorFromComputation or CreateConstantOutputTensor.
-static const xla::XlaOp& GetComputationFromTensor(const Tensor& tensor) {
-  return CastExpressionFromTensor(tensor)->handle();
+const XlaExpression& XlaOpKernelContext::InputExpression(int index) {
+  return *CastExpressionFromTensor(context_->input(index));
 }
 
-const xla::XlaOp& XlaOpKernelContext::Input(int index) {
-  return GetComputationFromTensor(context_->input(index));
+const XlaExpression& XlaOpKernelContext::InputExpression(
+    absl::string_view name) {
+  return *CastExpressionFromTensor(GetInputTensorByName(name));
 }
 
-const xla::XlaOp& XlaOpKernelContext::Input(absl::string_view name) {
-  return GetComputationFromTensor(GetInputTensorByName(name));
+xla::XlaOp XlaOpKernelContext::Input(int index) {
+  return InputExpression(index).AsXlaOp(builder());
+}
+
+xla::XlaOp XlaOpKernelContext::Input(absl::string_view name) {
+  return InputExpression(name).AsXlaOp(builder());
 }
 
 TensorShape XlaOpKernelContext::InputShape(int index) {
@@ -125,77 +130,18 @@ Status XlaOpKernelContext::ConstantInput(absl::string_view name,
 Status XlaOpKernelContext::ConstantInputReshaped(
     int index, absl::Span<const int64> new_dims,
     xla::Literal* constant_literal) {
-  const Tensor& tensor = context_->input(index);
-  TensorShape new_shape(new_dims);
-  if (tensor.NumElements() != new_shape.num_elements()) {
-    return errors::InvalidArgument(
-        context_->op_kernel().name(), " input ", index, " has shape ",
-        tensor.shape().DebugString(),
-        " but was asked to be reshaped to incompatible shape ",
-        new_shape.DebugString());
-  }
-  const XlaExpression* expression = CastExpressionFromTensor(tensor);
-
-  auto copy_tensor_to_literal = [](const Tensor& tensor,
-                                   xla::Literal* literal) {
-    xla::Shape literal_shape;
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), &literal_shape));
-
-    *literal = xla::Literal(literal_shape);
-
-    // memcpy over the payload ...
-    // TODO(phawkins): handle string types.
-    size_t total_bytes = tensor.TotalBytes();
-    if (total_bytes > 0) {
-      void* dst_ptr = literal->untyped_data();
-      const void* src_ptr = DMAHelper::base(&tensor);
-      memcpy(dst_ptr, src_ptr, total_bytes);
-    }
-    return Status::OK();
-  };
-
-  // If the tensor has a known constant value, there is no need to invoke XLA.
-  if (expression->has_constant_value()) {
-    Tensor temp(tensor.dtype());
-    if (!temp.CopyFrom(expression->constant_value(), new_shape)) {
-      // This should never happen. The constant should have a shape compatible
-      // with the enclosing Tensor.
-      return errors::Internal("Incompatible shapes in ConstantInputReshaped.");
-    }
-
-    return copy_tensor_to_literal(temp, constant_literal);
-  }
-
-  // Make sure we treat zero-element tensors as constant.
-  if (new_shape.num_elements() == 0) {
-    Tensor temp(tensor.dtype(), new_shape);
-
-    return copy_tensor_to_literal(temp, constant_literal);
-  }
-
-  xla::XlaOp handle = expression->handle();
-  if (new_shape != tensor.shape()) {
-    // Reshape the handle to the desired shape.
-    handle = xla::Reshape(handle, new_shape.dim_sizes());
-  }
-
-  // The XLA layout is specified minor to major, and TensorFlow's minor
-  // dimension is the last one.
-  std::vector<int64> layout_indices(new_shape.dims());
-  std::iota(layout_indices.rbegin(), layout_indices.rend(), 0);
-  xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices);
-
-  xla::StatusOr<bool> is_constant = builder()->IsConstant(handle);
-  if (!is_constant.ok()) {
-    Status status = is_constant.status();
+  XlaExpression e = InputExpression(index);
+  xla::StatusOr<absl::optional<Tensor>> constant_or_status =
+      e.ResolveConstant(compiler()->client());
+  if (!constant_or_status.ok()) {
+    Status status = constant_or_status.status();
     errors::AppendToMessage(&status, "while evaluating input ", index, " of ",
                             context_->op_kernel().type_string(),
                             " operator as a compile-time constant.");
     return status;
   }
-
-  if (!is_constant.ValueOrDie()) {
+  absl::optional<Tensor> constant = constant_or_status.ValueOrDie();
+  if (!constant.has_value()) {
     return errors::InvalidArgument(
         "Input ", index, " to ", context_->op_kernel().type_string(),
         " operator must be a compile-time constant.\n"
@@ -208,25 +154,16 @@ Status XlaOpKernelContext::ConstantInputReshaped(
         "stateful operation such as a random number generator.");
   }
 
-  // Ask the XLA compiler to evaluate the data handle to a literal.
-  xla::StatusOr<xla::XlaComputation> constant_graph =
-      builder()->BuildConstantSubGraph(handle);
-  if (!constant_graph.ok()) {
-    return errors::Internal(
-        "Error getting a compile-time constant graph for ",
-        context_->op_kernel().name(), " input ", index,
-        ".\nError: ", constant_graph.status().error_message());
-  }
-  xla::StatusOr<xla::Literal> computed = compiler()->client()->ComputeConstant(
-      constant_graph.ValueOrDie(), &layout);
-  if (!computed.ok()) {
-    return errors::Internal("Error evaluating ", context_->op_kernel().name(),
-                            " input ", index,
-                            " as a compile-time constant.\nError: ",
-                            computed.status().error_message());
+  Tensor temp(constant->dtype());
+  if (!temp.CopyFrom(*constant, TensorShape(new_dims))) {
+    return errors::InvalidArgument(
+        context_->op_kernel().name(), " input ", index, " has shape ",
+        constant->shape().DebugString(),
+        " but was asked to be reshaped to incompatible shape ",
+        TensorShape(new_dims).DebugString());
   }
-  *constant_literal = std::move(computed).ValueOrDie();
 
+  TF_ASSIGN_OR_RETURN(*constant_literal, HostTensorToLiteral(temp));
   return Status::OK();
 }
 
@@ -322,6 +259,15 @@ Status XlaOpKernelContext::ConstantInputReshapedToIntVector(
   return LiteralToInt64Vector(literal, out);
 }
 
+Status XlaOpKernelContext::ConstantInputReshapedToIntVector(
+    absl::string_view name, std::vector<int64>* out) {
+  TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name));
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInputReshaped(
+      index, {InputShape(index).num_elements()}, &literal));
+  return LiteralToInt64Vector(literal, out);
+}
+
 Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
                                                        xla::Literal* out) {
   xla::Literal literal;
@@ -372,7 +318,7 @@ Status XlaOpKernelContext::InputList(absl::string_view name,
   handles->clear();
   shapes->clear();
   for (const Tensor& input : inputs) {
-    handles->push_back(GetComputationFromTensor(input));
+    handles->push_back(CastExpressionFromTensor(input)->AsXlaOp(builder()));
     shapes->push_back(input.shape());
   }
   return Status::OK();
@@ -413,9 +359,12 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
 
   XlaContext& xla_context = XlaContext::Get(ctx);
   TF_ASSIGN_OR_RETURN(
-      TensorShape representation_shape,
+      xla::Shape representation_shape,
       xla_context.RepresentationShape(variable->shape(), variable->type()));
-  if (representation_shape == variable->shape()) {
+  xla::Shape xla_shape;
+  TF_RETURN_IF_ERROR(
+      TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape));
+  if (xla::ShapeUtil::Compatible(xla_shape, representation_shape)) {
     *value = variable->value();
   } else {
     *value = xla::Reshape(variable->value(), variable->shape().dim_sizes());
@@ -455,90 +404,53 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
   return Status::OK();
 }
 
-Status XlaOpKernelContext::allocate_output(int index, const xla::Shape& shape,
-                                           Tensor** output) {
-  // The step's default allocator is the dummy XlaCompilationAllocator which
-  // simply allocates a metadata buffer to hold the expression to which it
-  // corresponds.
-  if (expected_output_dtype(index) == DT_VARIANT) {
-    // tensor_data() is not supported for variant Tensor (i.e.,
-    // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the
-    // XlaExpression inside the Tensor's tensor_data() does not work for
-    // variant. Instead construct a uint8 tensor and store the expression in its
-    // value.
-    // TODO(jpienaar): This should be refactored to stop masquerading
-    // XlaExpressions as Tensors.
-    *output = new Tensor();
-    TensorShape tensor_shape;
-    TF_RETURN_IF_ERROR(
-        context_->allocate_temp(DT_UINT8, tensor_shape, *output));
-    context_->set_output(index, **output);
-  } else {
-    TensorShape tensor_shape;
-    TF_RETURN_IF_ERROR(XLAShapeToTensorShape(shape, &tensor_shape));
-    TF_RETURN_IF_ERROR(context_->allocate_output(index, tensor_shape, output));
+void XlaOpKernelContext::SetOutputExpression(int index,
+                                             const XlaExpression& expression) {
+  Status status = [&] {
+    // The step's default allocator is the dummy XlaCompilationAllocator which
+    // simply allocates a metadata buffer to hold the expression to which it
+    // corresponds.
+    Tensor* output = nullptr;
+    // Provides a special behavior for DT_VARIANT: a variant is treated as
+    // DT_UINT8 scalar as the type to allow mapping for variant to more generic
+    // types.
+    if (expression.dtype() == DT_VARIANT) {
+      // tensor_data() is not supported for variant Tensor (i.e.,
+      // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the
+      // XlaExpression inside the Tensor's tensor_data() does not work for
+      // variant. Instead construct a uint8 tensor and store the expression in
+      // its value.
+      // TODO(jpienaar): This should be refactored to stop masquerading
+      // XlaExpressions as Tensors.
+      output = new Tensor();
+      TensorShape tensor_shape;
+      TF_RETURN_IF_ERROR(
+          context_->allocate_temp(DT_UINT8, tensor_shape, output));
+      context_->set_output(index, *output);
+    } else {
+      TF_ASSIGN_OR_RETURN(TensorShape shape, expression.GetShape());
+      TF_RETURN_IF_ERROR(context_->allocate_output(index, shape, &output));
+    }
+    AssignExpressionToTensor(output, expression);
+    return Status::OK();
+  }();
+  if (!status.ok()) {
+    SetStatus(status);
   }
-  return Status::OK();
 }
 
 void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) {
-  // Makes the host Tensor that will refer to the expression.
-  Tensor* output = nullptr;
-  auto shape_or = builder()->GetShape(handle);
-  if (!shape_or.ok()) {
-    SetStatus(shape_or.status());
-    return;
-  }
-
-  OP_REQUIRES_OK(context_,
-                 allocate_output(index, shape_or.ValueOrDie(), &output));
-
-  // The expression is stored in the tensor's data buffer. Fill in the
-  // fields now.
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_handle(handle);
+  SetOutputExpression(
+      index,
+      XlaExpression::XlaOp(handle, context_->expected_output_dtype(index)));
 }
 
 void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
-  const TensorShape& shape = constant.shape();
-
-  xla::BorrowingLiteral literal;
-  OP_REQUIRES_OK(context_, HostTensorToBorrowingLiteral(constant, &literal));
-
-  xla::XlaOp handle = xla::ConstantLiteral(builder(), literal);
-  CHECK(handle.valid());
-
-  // Make the Tensor that will refer to the expression.
-  Tensor* output = nullptr;
-  // The step's default allocator is the dummy XlaCompilationAllocator which
-  // simply allocates a metadata buffer to hold the expression to which it
-  // corresponds.
-  OP_REQUIRES_OK(context_, context_->allocate_output(index, shape, &output));
-
-  // The expression is stored in the tensor's data buffer. Fill in the
-  // fields now.
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_handle(handle);
-  expression->set_constant_value(constant);
-}
-
-void XlaOpKernelContext::SetInvalidOutput(int index) {
-  Tensor* output = nullptr;
-  OP_REQUIRES_OK(context_,
-                 context_->allocate_output(index, TensorShape({}), &output));
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  xla::XlaOp handle;
-  expression->set_handle(handle);
+  SetOutputExpression(index, XlaExpression::Constant(constant));
 }
 
 void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
-  Tensor* output = nullptr;
-  // The shape of the output tensor is the shape of the resource itself
-  // (i.e., a scalar), not the shape of the resource's value.
-  OP_REQUIRES_OK(context_,
-                 context_->allocate_output(index, TensorShape(), &output));
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_resource(resource);
+  SetOutputExpression(index, XlaExpression::Resource(resource));
 }
 
 Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) {
@@ -570,10 +482,13 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
   TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape));
 
   XlaContext& xla_context = XlaContext::Get(ctx);
-  TF_ASSIGN_OR_RETURN(TensorShape representation_shape,
+  TF_ASSIGN_OR_RETURN(xla::Shape representation_shape,
                       xla_context.RepresentationShape(shape, type));
-  if (shape != representation_shape) {
-    handle = xla::Reshape(handle, representation_shape.dim_sizes());
+  xla::Shape xla_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+  if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) {
+    handle = xla::Reshape(handle,
+                          xla::AsInt64Slice(representation_shape.dimensions()));
   }
   return variable->SetValue(handle);
 }
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index aa00a454968ad29495e34dc080e55b62bb0b5f7b..c06efa2c474c5ec3cb5d75d94ba15d4096faa085 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -88,9 +88,9 @@ class XlaOpKernelContext {
   // Returns input `index` as a XlaOp. Unlike
   // OpKernelContext::Input returns a symbolic value rather than a concrete
   // Tensor.
-  const xla::XlaOp& Input(int index);
+  xla::XlaOp Input(int index);
   // Returns input `name` as a XlaOp.
-  const xla::XlaOp& Input(absl::string_view name);
+  xla::XlaOp Input(absl::string_view name);
 
   // Returns true if all inputs are the same shape, otherwise sets the
   // status to a non-OK value and returns false.
@@ -111,14 +111,6 @@ class XlaOpKernelContext {
   Status ConstantInput(int index, xla::Literal* constant_literal);
   Status ConstantInput(absl::string_view name, xla::Literal* constant_literal);
 
-  // Evaluates input `index`, reshapes it to `new_shape` if new_shape !=
-  // InputShape(index), and stores it in `*constant_literal`. If the input
-  // cannot be evaluated, e.g., because it depends on unbound parameters,
-  // returns a non-Ok status. If InputShape(index).num_elements() !=
-  // new_shape.num_elements(), returns an error status.
-  Status ConstantInputReshaped(int index, absl::Span<const int64> new_dims,
-                               xla::Literal* constant_literal);
-
   // Converts a constant scalar int32 or int64 tensor into an int64.
   Status ConstantInputAsIntScalar(int index, int64* out);
   Status ConstantInputAsIntScalar(absl::string_view name, int64* out);
@@ -134,6 +126,8 @@ class XlaOpKernelContext {
   // Reshapes and converts a constant int32 or int64 tensor into a vector of
   // int64s.
   Status ConstantInputReshapedToIntVector(int index, std::vector<int64>* out);
+  Status ConstantInputReshapedToIntVector(absl::string_view name,
+                                          std::vector<int64>* out);
 
   // Converts a constant int32 or int64 Tensor into an xla int64 Literal.
   Status ConstantInputAsInt64Literal(int index, xla::Literal* out);
@@ -148,6 +142,10 @@ class XlaOpKernelContext {
   Status ConstantInputList(absl::string_view name,
                            std::vector<xla::Literal>* literals);
 
+  // Returns an XlaExpression describing the value of 'index'.
+  const XlaExpression& InputExpression(int index);
+  const XlaExpression& InputExpression(absl::string_view name);
+
   // Outputs
 
   int num_outputs() const { return context_->num_outputs(); }
@@ -165,9 +163,8 @@ class XlaOpKernelContext {
   // SetConstantOutput where possible.
   void SetConstantOutput(int index, const Tensor& host_tensor);
 
-  // Sets output `index` to an invalid value.
-  // Any subsequent attempt to consume this output will cause an error.
-  void SetInvalidOutput(int index);
+  // Returns an XlaExpression describing the value of 'index'.
+  void SetOutputExpression(int index, const XlaExpression& expression);
 
   // Status handling.
   void SetStatus(const Status& status) { context_->SetStatus(status); }
@@ -255,10 +252,13 @@ class XlaOpKernelContext {
   // Returns the tensor of input `name`.
   const Tensor& GetInputTensorByName(absl::string_view name);
 
-  // Wraps OpKernelContext's allocate_output method while providing special
-  // behavior for DT_VARIANT: a variant is treated as DT_UINT8 scalar as the
-  // type to allow mapping for variant to more generic types.
-  Status allocate_output(int index, const xla::Shape& shape, Tensor** output);
+  // Evaluates input `index`, reshapes it to `new_shape` if new_shape !=
+  // InputShape(index), and stores it in `*constant_literal`. If the input
+  // cannot be evaluated, e.g., because it depends on unbound parameters,
+  // returns a non-Ok status. If InputShape(index).num_elements() !=
+  // new_shape.num_elements(), returns an error status.
+  Status ConstantInputReshaped(int index, absl::Span<const int64> new_dims,
+                               xla::Literal* constant_literal);
 
   OpKernelContext* const context_;
 };
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 9f00de708cc5aceb2c1e397663bc3bba8705bda4..14237df69081016817fbd1a5332f22996e7f264d 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -129,21 +130,26 @@ XlaOpRegistry::~XlaOpRegistry() = default;
   // Lazily register the CPU and GPU JIT devices the first time
   // GetCompilationDevice is called.
   static void* registration_init = [&registry]() {
+    MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
+    bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+
     mutex_lock lock(registry.mutex_);
     if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_CPU)).ok()) {
       DeviceRegistration& registration =
           registry.compilation_devices_[DEVICE_CPU];
       registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
-      registration.requires_compilation = false;
-      registration.enable_jit_by_default = false;
+      registration.autoclustering_policy =
+          cpu_global_jit
+              ? XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally
+              : XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested;
       registration.compile_resource_ops = false;
     }
     if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_GPU)).ok()) {
       DeviceRegistration& registration =
           registry.compilation_devices_[DEVICE_GPU];
       registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
-      registration.requires_compilation = false;
-      registration.enable_jit_by_default = true;
+      registration.autoclustering_policy =
+          XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally;
       registration.compile_resource_ops = false;
     }
     return nullptr;
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 45a40c0acc07805b422591fd7ea3fcb131db8471..0bdd4a1085445420a5147756daac4a54f4725f11 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -66,19 +66,26 @@ class XlaOpRegistry {
  public:
   typedef OpKernel* (*Factory)(OpKernelConstruction*);
 
+  enum class AutoclusteringPolicy {
+    // Enable autoclustering if the user requests it, e.g., via
+    // experimental_jit_scope. Does not autocluster if the JIT is enabled
+    // globally (e.g., via the OptimizerOptions in the TF session
+    // configuration.)
+    kIfExplicitlyRequested,
+    // Enable autoclustering if explicitly requested, or if the JIT is enabled
+    // globally in the session options, or via TF_XLA_FLAGS=--tf_xla_auto_jit=N.
+    kIfEnabledGlobally,
+    // Always try to autocluster ops placed on this device.
+    kAlways,
+  };
+
   // Describes how to compile operators assigned to a device.
   struct DeviceRegistration {
     // The name of the an XLA compilation device to use to compile code.
     string compilation_device_name;
 
-    // Do operators assigned to this device require compilation?
-    bool requires_compilation;
-
-    // If !requires_compilation, should we try to JIT operators on this device
-    // when XLA JIT compilation is enabled globally via the SessionOptions?
-    // (It is still possible to explicitly mark operators to JIT compile, even
-    // if enable_jit_by_default is false.)
-    bool enable_jit_by_default;
+    // When should we autocluster operators assigned to this device?
+    AutoclusteringPolicy autoclustering_policy;
 
     // Enable compilation of operators that use DT_RESOURCE types?
     bool compile_resource_ops = false;
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index 63b09c8f02a60e91576544d13227d29f56d3e88c..a322eb9015e829fd468133f3de6c12aad7e4ff74 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -26,6 +26,19 @@ limitations under the License.
 
 namespace tensorflow {
 
+/*static*/ absl::string_view XlaResource::KindToString(XlaResource::Kind kind) {
+  switch (kind) {
+    case XlaResource::kInvalid:
+      return "invalid";
+    case XlaResource::kVariable:
+      return "variable";
+    case XlaResource::kStack:
+      return "stack";
+    case XlaResource::kTensorArray:
+      return "tensorarray";
+  }
+}
+
 XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
                          TensorShape shape, const xla::XlaOp& initial_value,
                          int64 tensor_array_size,
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index aa9ce1b171f11ea0de4db0123098729c1c97f93a..857b9a928bb824656f637b2b1ca2fc02a1bef139 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -35,6 +36,7 @@ class XlaResource {
     kTensorArray,
     kStack,
   };
+  static absl::string_view KindToString(Kind kind);
 
   XlaResource(Kind kind, int arg_num, string name, DataType type,
               TensorShape shape, const xla::XlaOp& initial_value,
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index d6b60c5f9916520ba7585824171aad1548610da6..d914e97b6bd4506251dc4be504d6ab427590e615 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -68,7 +68,7 @@ cc_library(
     visibility = [":friends"],
     deps = [
         ":xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
     ],
 )
 
@@ -735,6 +735,72 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "parse_flags_from_env",
+    srcs = ["parse_flags_from_env.cc"],
+    hdrs = ["parse_flags_from_env.h"],
+    deps =
+        [
+            "//tensorflow/compiler/xla:types",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/strings:str_format",
+            "@com_google_absl//absl/types:span",
+        ],
+)
+
+tf_cc_test(
+    name = "parse_flags_from_env_test",
+    srcs = ["parse_flags_from_env_test.cc"],
+    deps =
+        [
+            ":parse_flags_from_env",
+            "//tensorflow/compiler/xla:types",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+            "@com_google_absl//absl/strings:str_format",
+        ],
+)
+
+cc_library(
+    name = "debug_options_flags",
+    srcs = [
+        "debug_options_flags.cc",
+        "debug_options_parsers.h",
+    ],
+    hdrs = ["debug_options_flags.h"],
+    deps =
+        [
+            ":parse_flags_from_env",
+            "//tensorflow/compiler/xla:xla_proto",
+            "//tensorflow/compiler/xla/service:hlo",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "@com_google_absl//absl/strings",
+        ],
+)
+
+tf_cc_test(
+    name = "debug_options_parsers_test",
+    size = "small",
+    srcs = [
+        "debug_options_parsers.h",
+        "debug_options_parsers_test.cc",
+    ],
+    deps =
+        [
+            "//tensorflow/compiler/xla:xla_proto",
+            "//tensorflow/compiler/xla/service:hlo",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+            "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/strings:str_format",
+        ],
+)
+
 # -----------------------------------------------------------------------------
 
 # This is a headers target that extra XLA devices can use to prevent circular dependencies.  Devices that are compiled as separate shared objects can also use it to prevent linking of library code.
diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h
index 782c966b4c57672d137569a318fb20ace14d493b..e4aca98f67d50287a83afc6f41a59458f3df2da2 100644
--- a/tensorflow/compiler/xla/array2d.h
+++ b/tensorflow/compiler/xla/array2d.h
@@ -104,7 +104,7 @@ std::unique_ptr<Array2D<NativeT>> MakeLinspaceArray2D(double from, double to,
   int64 count = n1 * n2;
   NativeT step =
       static_cast<NativeT>((count > 1) ? (to - from) / (count - 1) : 0);
-  auto set = [&array, n1, n2](int64 index, NativeT value) {
+  auto set = [&array, n2](int64 index, NativeT value) {
     (*array)(index / n2, index % n2) = value;
   };
   for (int64 i = 0; i < count - 1; ++i) {
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 0cbe68d7efd9fe2ea46b312763437e1b8c986d25..42da0ebf4992884187bbe21701a44d8ba2fccd64 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -68,6 +68,7 @@ cc_library(
     deps = [
         ":global_data",
         ":xla_computation",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:service_interface",
@@ -76,7 +77,6 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
@@ -236,13 +236,13 @@ tf_cc_test(
     deps = [
         ":xla_builder",
         ":xla_computation",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index f5f8d5c6b1fe265069992fe92acaa229647d4e8c..eef2844e0df6aaf509881535f41493673fbeeee5 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -210,11 +210,10 @@ StatusOr<XlaComputation> Client::LoadSnapshot(const HloSnapshot& module) {
   return XlaComputation(module.hlo().hlo_module());
 }
 
-StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
-    const XlaComputation& computation, absl::Span<GlobalData* const> arguments,
-    const ExecutionOptions* execution_options,
-    ExecutionProfile* execution_profile) {
-  ExecuteGraphRequest request;
+StatusOr<ExecutionHandle> Client::Compile(
+    const XlaComputation& computation, absl::Span<const Shape> argument_shapes,
+    const ExecutionOptions* execution_options) {
+  CompileRequest request;
   *request.mutable_computation() = computation.proto();
 
   if (execution_options == nullptr) {
@@ -222,6 +221,34 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
   } else {
     *request.mutable_execution_options() = *execution_options;
   }
+  if (request.execution_options().device_handles_size() > 1) {
+    return InvalidArgument(
+        "Compiling with multiple device handles is not supported. Use "
+        "'Execute' instead.");
+  }
+
+  // The argument shapes affect how the computation is compiled.
+  for (const auto& arg_shape : argument_shapes) {
+    *request.add_input_shape_with_layout() = arg_shape;
+  }
+
+  CompileResponse response;
+  VLOG(1) << "making compile request: " << request.ShortDebugString();
+  Status s = stub_->Compile(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+  TF_RET_CHECK(response.has_handle());
+  return response.handle();
+}
+
+StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
+    const ExecutionHandle& handle, absl::Span<GlobalData* const> arguments,
+    ExecutionProfile* execution_profile) {
+  ExecuteRequest request;
+  *request.mutable_handle() = handle;
   for (GlobalData* argument : arguments) {
     CHECK(argument != nullptr) << "Argument pointers must not be null.";
     *request.add_arguments() = argument->handle();
@@ -229,7 +256,7 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
 
   ExecuteResponse response;
   VLOG(1) << "making execute request: " << request.ShortDebugString();
-  Status s = stub_->ExecuteGraph(&request, &response);
+  Status s = stub_->Execute(&request, &response);
   VLOG(1) << "done with request";
 
   if (!s.ok()) {
@@ -238,15 +265,62 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
 
   if (execution_profile != nullptr) {
     *execution_profile = response.profile();
+  }
+
+  return absl::make_unique<GlobalData>(stub_, response.output());
+}
+
+StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
+    const XlaComputation& computation, absl::Span<GlobalData* const> arguments,
+    const ExecutionOptions* execution_options,
+    ExecutionProfile* execution_profile) {
+  if (execution_options != nullptr &&
+      execution_options->device_handles_size() > 1) {
+    std::vector<XlaComputationInstance> computation_instances = {
+        XlaComputationInstance{
+            computation,
+            std::vector<GlobalData*>(arguments.begin(), arguments.end()),
+            *execution_options, execution_profile}};
+    TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances));
+    // The result selection is a bit hacky, but better than assuming it is
+    // device 0.
+    //
+    // TODO(b/118493728): Allow Execute to return one result per computation.
+    for (int64 i = 0; i < results.size(); i++) {
+      TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i]));
+      if (!ShapeUtil::IsEmptyTuple(shape)) {
+        VLOG(3) << "Fetching result from device " << i << ": "
+                << ShapeUtil::HumanString(shape);
+        return std::move(results[i]);
+      }
+    }
+    TF_RET_CHECK(!results.empty());
+    VLOG(1) << "Defaulting to device 0 result";
+    return std::move(results[0]);
+  }
+
+  // The argument shapes affect how the computation is compiled.
+  std::vector<Shape> arg_shapes(arguments.size());
+  for (int i = 0; i < arguments.size(); i++) {
+    TF_ASSIGN_OR_RETURN(arg_shapes[i], GetShape(*arguments[i]));
+  }
+
+  TF_ASSIGN_OR_RETURN(auto handle,
+                      Compile(computation, arg_shapes, execution_options));
+
+  TF_ASSIGN_OR_RETURN(auto result,
+                      Execute(handle, arguments, execution_profile));
+
+  if (execution_profile != nullptr) {
     if (VLOG_IS_ON(1)) {
       TF_ASSIGN_OR_RETURN(
           auto execution_stats,
-          ExecutionStatsAsString(computation, response.profile()));
+          ExecutionStatsAsString(computation, *execution_profile));
       VLOG(1) << execution_stats;
     }
   }
 
-  return absl::make_unique<GlobalData>(stub_, response.output());
+  return std::move(result);
 }
 
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
@@ -274,10 +348,11 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   }
 
   std::vector<std::unique_ptr<GlobalData>> outputs;
-  for (size_t i = 0; i < computations.size(); ++i) {
+  for (size_t i = 0; i < response.responses_size(); ++i) {
     outputs.push_back(
         absl::make_unique<GlobalData>(stub_, response.responses(i).output()));
-    if (computations[i].execution_profile != nullptr) {
+    if (i < computations.size() &&
+        computations[i].execution_profile != nullptr) {
       *computations[i].execution_profile = response.responses(i).profile();
     }
   }
@@ -390,8 +465,7 @@ StatusOr<string> Client::ExecutionStatsAsString(
     const XlaComputation& computation, const ExecutionProfile& profile) {
   TF_ASSIGN_OR_RETURN(
       auto computation_stats,
-      GetComputationStats(computation,
-                          legacy_flags::GetDebugOptionsFromFlags()));
+      GetComputationStats(computation, GetDebugOptionsFromFlags()));
   int64 total_flops =
       computation_stats.flop_count() + computation_stats.transcendental_count();
   if (profile.compute_time_ns() > 0) {
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index 6f4d33c469f1f885cfeef546e3981dc3417ef71f..d0ac4703c632e0e01d3c8911594b46fedf28930d 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -40,6 +40,31 @@ class Client {
   explicit Client(ServiceInterface* stub);
   virtual ~Client();
 
+  // Compile the computation with the given argument shapes and returns the
+  // handle to the compiled executable. The compiled executable is cached on the
+  // service, and the returned handle can be used for exection without
+  // re-compile.
+  // * The shape and layout of the arguments being executed with will affect how
+  //   the computation is compiled. If argument_shapes is empty, the parameters'
+  //   shape and layout will be used in the compilation.
+  // * If execution_options is not nullptr, these options are passed to the
+  //   service to affect how it compiles our computation.  (The pointer does not
+  //   need to live beyond this call.)
+  // * If execution_options.device_handles should be empty. If you need
+  //   non-empty device handles, call 'Execute' instead.
+  StatusOr<ExecutionHandle> Compile(
+      const XlaComputation& computation,
+      absl::Span<const Shape> argument_shapes,
+      const ExecutionOptions* execution_options = nullptr);
+
+  // Executes the compiled executable for the given handle with the given
+  // arguments and returns the global data that was produced from the execution.
+  // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
+  //   will be filled with profile data from the execution.
+  StatusOr<std::unique_ptr<GlobalData>> Execute(
+      const ExecutionHandle& handle, absl::Span<GlobalData* const> arguments,
+      ExecutionProfile* execution_profile = nullptr);
+
   // Executes the computation with the given arguments and returns the global
   // data that was produced from the execution.
   // * If execution_options is not nullptr, these options are passed to the
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index f833ddcd3235e08e2d0d3c0b9921e96ef871c89e..c5733bc66deb8d55a9186ad1893abaf17ed6909e 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -164,7 +164,6 @@ cc_library(
     deps = [
         ":constants",
         ":math",
-        ":numeric",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -178,8 +177,9 @@ cc_library(
     srcs = ["sorting.cc"],
     hdrs = ["sorting.h"],
     deps = [
-        ":numeric",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
     ],
@@ -188,10 +188,6 @@ cc_library(
 xla_test(
     name = "sorting_test",
     srcs = ["sorting_test.cc"],
-    blacklisted_backends = [
-        "cpu",
-        "gpu",
-    ],
     tags = ["enable_for_xla_interpreter"],
     deps = [
         ":sorting",
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index d3d7edb42a38595bbf9fdb36e0dd946ae5df51f9..08a887a6e4660cb2528f0ec7244b7ccc540808d2 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -265,6 +265,22 @@ XlaOp Digamma(XlaOp input) {
   return result;
 }
 
+// Implements Banker's rounding: numbers that are equidistant between two
+// integers are rounded towards even.
+XlaOp RoundToEven(XlaOp x) {
+  auto half = xla::ScalarLike(x, 0.5);
+  auto one = xla::ScalarLike(x, 1.0);
+  auto two = xla::ScalarLike(x, 2.0);
+
+  auto round_val = xla::Floor(x);
+  auto fraction = x - round_val;
+  auto nearest_even_int = round_val - two * xla::Floor(half * x);
+  auto is_odd = xla::Eq(nearest_even_int, one);
+  return xla::Select(xla::Or(xla::Gt(fraction, half),
+                             xla::And(xla::Eq(fraction, half), is_odd)),
+                     round_val + one, round_val);
+}
+
 // Trigonometric functions.
 
 // acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index a6cafd42077367bf23ffa1f45eab31c01dc31b16..3f06d04b9ae98b3aa75e68cd07810b2b4c24d280 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -51,6 +51,10 @@ XlaOp Lgamma(XlaOp input);
 // Computes an approximation of the digamma function.
 XlaOp Digamma(XlaOp input);
 
+// Rounds the given number to even when the number is equidistant between two
+// integers.
+XlaOp RoundToEven(XlaOp x);
+
 // Trigonometric functions
 
 // Computes the arc cosine of 'x'.
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index 14c259a7fa2a47642663b65d2785e5bbdc040cfd..ae2ea225d1aadd7b3a794eabeca866c498f34760 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -136,5 +136,17 @@ XLA_TEST_F(MathTest, Digamma) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_F(MathTest, RoundToEven) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR1<float>(
+      &builder, {-1.4, -1.5, -2.5, -0.5, 0, 0.5, 1.5, 2.5, 3.5, 4.5});
+  RoundToEven(x);
+
+  std::vector<float> expected = {-1.0, -2.0, -2.0, -0.0, 0,
+                                 0.0,  2.0,  2.0,  4.0,  4.0};
+
+  ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/numeric.h
index efd8cdc25724198633e0bf1c48c4e7d9e4b4c9e1..f62fdab4b0e5e84347cfaa1424a8c2e5c58dd3ce 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.h
+++ b/tensorflow/compiler/xla/client/lib/numeric.h
@@ -22,9 +22,6 @@ limitations under the License.
 
 namespace xla {
 
-// Returns a rank 1 tensor of `type` containing values [0, 1, 2, ...].
-XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size);
-
 // Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere
 // else.
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index c6f68c8ee2f5198017c37abeb9551478f52a99f4..85b9e1827dcef5ed907d893277deb5a52f8f30e9 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "absl/base/casts.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index 0475fd9c94f6e390b5169cfe2cbba8eae28ddc18..e8553a08bb014e790822a14e128686b60b8d6b7c 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 
@@ -23,13 +25,12 @@ XlaOp TopK(XlaOp input, int64 k) {
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
     int last_dim = input_shape.dimensions_size() - 1;
-    int last_dim_size = input_shape.dimensions(last_dim);
 
-    XlaOp iota_s32 = Iota(builder, S32, last_dim_size);
+    Shape iota_shape =
+        ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
+    XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    std::vector<int64> broadcast_dims(input_dims.begin(), input_dims.end() - 1);
-    XlaOp broadcast_s32 = Broadcast(iota_s32, broadcast_dims);
-    XlaOp sort_result = Sort(Neg(input), {broadcast_s32});
+    XlaOp sort_result = Sort(Neg(input), {iota_s32});
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
     std::vector<int64> limit_indices(input_dims.begin(), input_dims.end());
     limit_indices[last_dim] = k;
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index fef98c9923096e21a755c6d730de2c7c10852b2d..ebb30d3acc492a115f4980aaa4d2d08f73683864 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -56,5 +56,13 @@ XLA_TEST_F(SortingTest, TopKFullSort) {
   ComputeAndCompareR1<float>(&builder, inputs, {});
 }
 
+XLA_TEST_F(SortingTest, TopKFullSortWithDuplicates) {
+  XlaBuilder builder(TestName());
+  XlaOp a;
+  auto a_data = CreateR1Parameter<int>({1, 1, 2, 2, 1}, 0, "a", &builder, &a);
+  xla::GetTupleElement(xla::TopK(a, 5), 1);
+  ComputeAndCompareR1<int>(&builder, {2, 3, 0, 1, 4}, {a_data.get()});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index f96b6c9c261a9686fb647e3da0dcc933cd1f70df..aaa5d6989eefb94edb8921d13f96e3705aa3e3a4 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -310,4 +310,28 @@ StatusOr<int> LocalClient::ReplicaNumberToDeviceOrdinal(int replica_number) {
   return local_service_->ReplicaNumberToDeviceOrdinal(replica_number);
 }
 
+StatusOr<TransferToServerResponse> LocalClient::TransferToLocalServer(
+    const ::xla::BorrowingLiteral& literal, int device_oridinal) {
+  const ::xla::Shape& shape = literal.shape();
+
+  TF_ASSIGN_OR_RETURN(
+      ::xla::ScopedShapedBuffer shaped_buffer,
+      backend().transfer_manager()->AllocateScopedShapedBuffer(
+          shape, backend().memory_allocator(), device_oridinal));
+  TF_ASSIGN_OR_RETURN(auto stream,
+                      mutable_backend()->BorrowStream(device_oridinal));
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+      stream.get(), literal, shaped_buffer));
+  std::vector<::xla::ScopedShapedBuffer> replicated_buffer;
+  replicated_buffer.emplace_back(std::move(shaped_buffer));
+  ::xla::TransferToServerResponse result;
+  TF_ASSIGN_OR_RETURN(*result.mutable_data(),
+                      local_service_->RegisterReplicatedBuffers(
+                          std::move(replicated_buffer),
+                          absl::StrCat("TransferToServer literal of shape ",
+                                       ::xla::ShapeUtil::HumanString(shape))));
+
+  return result;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index feb2f8ec9dab5bf13afdc866d10ccbe74f8edcb9..ddb36680e8b185b053368baffa6f1d5cac50dc07 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -60,8 +60,8 @@ class LocalExecutable {
   // Validates that the given arguments and options satisfy various constraints
   // of the computation.
   //
-  // The given ExecutableRunOptions override any values from legacy_flags
-  // (TF_XLA_FLAGS environment variable).
+  // The given ExecutableRunOptions override any values from TF_XLA_FLAGS
+  // environment variable.
   Status ValidateExecutionOptions(
       const absl::Span<const ShapedBuffer* const> arguments,
       const ExecutableRunOptions& run_options, const Backend& backend);
@@ -69,8 +69,8 @@ class LocalExecutable {
   // Records the computation in a SessionModule proto with the arguments used to
   // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
   //
-  // The given ServiceExecutableRunOptions override any values from legacy_flags
-  // (TF_XLA_FLAGS environment variable).
+  // The given ServiceExecutableRunOptions override any values from TF_XLA_FLAGS
+  // environment variable.
   StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const absl::Span<const ShapedBuffer* const> arguments);
@@ -114,8 +114,8 @@ class LocalClient : public Client {
   // Build and return a LocalExecutable object. The executable is compiled using
   // the given XlaComputation, argument layouts and options.
   //
-  // The given ExecutableBuildOptions override any values from legacy_flags
-  // (TF_XLA_FLAGS environment variable).
+  // The given ExecutableBuildOptions override any values from TF_XLA_FLAGS
+  // environment variable.
   StatusOr<std::unique_ptr<LocalExecutable>> Compile(
       const XlaComputation& computation,
       const absl::Span<const Shape* const> argument_layouts,
@@ -129,6 +129,10 @@ class LocalClient : public Client {
       const Literal& literal, int device_ordinal,
       DeviceMemoryAllocator* allocator = nullptr);
 
+  // Transfer the BorrowingLiteral to the device with the given ordinal.
+  StatusOr<TransferToServerResponse> TransferToLocalServer(
+      const ::xla::BorrowingLiteral& literal, int device_oridinal);
+
   // Copy the data from the device contained in the given ShapedBuffer and
   // return as a Literal.
   StatusOr<Literal> ShapedBufferToLiteral(const ShapedBuffer& shaped_buffer);
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index f9c23b44810a52ae4dd40cc838e6cb575cb44445..f508ffb9c958ecfae7aea2c232e04001bd826a19 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -239,6 +239,19 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
   visited->insert(op_handle);
 }
 
+Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
+                                     ShapeIndex dynamic_size_param_index,
+                                     int64 target_param_num,
+                                     ShapeIndex target_param_index,
+                                     int64 target_dim_num) {
+  TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind(
+      DynamicParameterBinding::DynamicParameter{dynamic_size_param_num,
+                                                dynamic_size_param_index},
+      DynamicParameterBinding::DynamicDimension{
+          target_param_num, target_param_index, target_dim_num}));
+  return Status::OK();
+}
+
 XlaComputation XlaBuilder::BuildAndNoteError() {
   DCHECK(parent_builder_ != nullptr);
   auto build_status = Build();
@@ -297,6 +310,9 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   }
   module->add_computations()->Swap(&entry);
 
+  *(module->mutable_dynamic_parameter_binding()) =
+      dynamic_parameter_binding_.ToProto();
+
   // Clear data held by this builder.
   this->instructions_.clear();
   this->handle_to_index_.clear();
@@ -2305,6 +2321,19 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
   });
 }
 
+XlaOp XlaBuilder::GetDimensionSize(const XlaOp& operand, int64 dimension) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferGetDimensionSizeShape(operand_shape, dimension));
+    instr.add_dimensions(dimension);
+    return AddInstruction(std::move(instr), HloOpcode::kGetDimensionSize,
+                          {operand});
+  });
+}
+
 StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
   TF_RETURN_IF_ERROR(first_error_);
 
@@ -3158,4 +3187,8 @@ XlaOp Iota(XlaBuilder* builder, const Shape& shape, int64 iota_dimension) {
   return builder->Iota(shape, iota_dimension);
 }
 
+XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension) {
+  return operand.builder()->GetDimensionSize(operand, dimension);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 908a616b4ead8820b5df991c3bc0b2f6724087ef..78c90dbccc486370377408d54406f4a896f60816 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -263,35 +264,30 @@ class XlaBuilder {
   // evaluating the computation.
   StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
+  // Sets up binding which indicates that the `target_dim_num` in the subshape
+  // `target_param_index` of parameter `target_param_num` is a dynamic dimension
+  // and its real dynamic size is represented by `dynamic_param_index` in
+  // parameter `dynamic_param_num`.
+  //
+  // TODO(b/119520625): Remove this API once we have more dynamic shape infra
+  // ready.
+  Status SetDynamicBinding(int64 dynamic_size_param_num,
+                           ShapeIndex dynamic_size_param_index,
+                           int64 target_param_num,
+                           ShapeIndex target_param_index, int64 target_dim_num);
+
  private:
   // Build helper which takes the id of the root operation..
   StatusOr<XlaComputation> Build(int64 root_id);
 
-  // Enqueues a "retrieve parameter value" instruction for a parameter that was
-  // passed to the computation.
+  // Description for the methods below can be found in the corresponding public
+  // functions section in this file.
+
   XlaOp Parameter(int64 parameter_number, const Shape& shape,
                   const string& name);
 
-  // Enqueues a constant with the value of the given literal onto the
-  // computation.
   XlaOp ConstantLiteral(const LiteralSlice& literal);
 
-  // Enqueues a constant onto the computation. Methods are templated on the
-  // native host type (NativeT) which corresponds to a specific XLA
-  // PrimitiveType as given in the following table:
-  //
-  //  Native Type   PrimitiveType
-  // -----------------------------
-  //   bool           PRED
-  //   int32          S32
-  //   int64          S64
-  //   uint32         U32
-  //   uint64         U64
-  //   float          F32
-  //   double         F64
-  //
-  // Note: not all primitive types defined in xla_data.proto have a
-  // corresponding native type yet.
   template <typename NativeT>
   XlaOp ConstantR0(NativeT value);
   template <typename NativeT>
@@ -321,181 +317,78 @@ class XlaBuilder {
   template <typename NativeT>
   XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
 
-  // Enqueues a rank one constant (vector) onto the computation. The vector has
-  // size 'length' and every element has the value 'value'.
   template <typename NativeT>
   XlaOp ConstantR1(int64 length, NativeT value);
 
-  // Adds dimensions to an array by duplicating the data in the array.
-  //
-  // The new dimensions are inserted on the left, i.e. if
-  // broadcast_sizes has values {a0, ..., aN} and the operand shape
-  // has dimensions {b0, ..., bM} then the shape of the output has
-  // dimensions {a0, ..., aN, b0, ..., bM}.
-  //
-  // The new dimensions index into copies of the operand, i.e.
-  //
-  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
   XlaOp Broadcast(const XlaOp& operand,
                   absl::Span<const int64> broadcast_sizes);
 
   XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
                        const absl::Span<const int64> broadcast_dimensions);
 
-  // Enqueues a pad operation onto the computation that pads the given value on
-  // the edges as well as between the elements of the input. padding_config
-  // specifies the padding amount for each dimension.
   XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
             const PaddingConfig& padding_config);
 
-  // Enqueues an operation onto the computation that flattens the operand based
-  // on the dimension order (major/slowest-varying to minor/fastest-varying)
-  // given, followed by reshaping it into the shape with the given dimension
-  // sizes (also major to minor). Conceptually, this is a limited form of
-  // "shape casting".
   XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> dimensions,
                 absl::Span<const int64> new_sizes);
 
-  // Enqueues an operation onto the computation that collapses the operand, from
-  // first to last dimension (C order), then reshapes it to the given dimension
-  // sizes. Conceptually, this is a limited form of "shape casting".
   XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> new_sizes);
 
-  // Wrapper for Reshape.
-  // Enqueues an operation to collapse the provided dimensions; e.g. an
-  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-  // be a consecutive, in-order subsequence of the operand dimensions.
-  //
-  // Note that collapsing a single dimension does nothing:
-  //
-  //    {256} collapsing {0} => {256}
-  //    {1} collapsing {0} => {1}
-  //
-  // Collapsing multiple dimensions produces a single result dimension:
-  //
-  //    {256, 2} collapsing {0,1} => {512}
-  //    {256, 2, 3} collapsing {0,1} => {512, 3}
-  //
-  // This could potentially cause data to be moved -- it provides a more
-  // structured form of reshaping than an arbitrary Reshape operation.
   XlaOp Collapse(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  // Enqueues a slice operation onto the computation that slices the operand
-  // from the start indices to the limit indices; e.g.
-  //
-  //        x
-  //   [ 0 1 2 3 ]
-  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-  //   [ 8 9 a b ]
-  //
-  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-  // range notation.
-  // The strides parameter determines the stride over the slice
   XlaOp Slice(const XlaOp& operand, absl::Span<const int64> start_indices,
               absl::Span<const int64> limit_indices,
               absl::Span<const int64> strides);
 
-  // Enqueues a slice operation in a given dimension, taking all other
-  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-  // for:
-  //
-  //  array[:, 2:4:1, :]
   XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
                    int64 stride, int64 dimno);
 
-  // Enqueues a slice operation onto the computation that slices the 'operand'
-  // from dynamic start indices which are passed in 'start_indices'.
-  // The size of the slice in each dimension is passed in 'slice_sizes',
-  // which specify the end point of exclusive slice intervals in each
-  // dimension [start, start + size).
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo input dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
   XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                      absl::Span<const int64> slice_sizes);
 
-  // Enqueues a dynamic update slice operation onto the computation, which
-  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-  // The shape of 'update' determines the shape of the slice of 'operand'
-  // which is updated.
-  // The indices specified in 'start_indices' specify the offset of the slice
-  // of 'operand' which is updated.
-  //
-  //               update = {10, 11} // calculated at runtime.
-  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
-  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
-  //   [7 8 9]                                                  [7 8  9 ]
-  //
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo update dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
   XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                            const XlaOp& start_indices);
 
-  // Enqueues a concatenate instruction onto the computation. 'operands' must
-  // have >= 1 entry.
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64 dimension);
 
-  // Enqueue a tracing operation onto the computation; the computation will emit
-  // a logging message with the operand.
   void Trace(const string& tag, const XlaOp& operand);
 
-  // Enqueues a conditional-move-like select operation onto the computation;
-  // predicated on pred, selects between on_true and on_false.
   XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
 
-  // Enqueues a tuple-creation instruction onto the computation.
   XlaOp Tuple(absl::Span<const XlaOp> elements);
 
-  // Enqueues a tuple-element-get instruction onto the computation.
   XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
 
-  // Enqueues an equal-to comparison instruction onto the computation.
   XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a not-equal comparison instruction onto the computation.
   XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a greater-or-equal comparison instruction onto the computation.
   XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a greater-than comparison instruction onto the computation.
   XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a less-than comparison instruction onto the computation.
   XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a less-or-equal comparison instruction onto the computation.
   XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a dot instruction onto the computation.
   XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
             const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a general dot instruction onto the computation.
   XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                    const DotDimensionNumbers& dimension_numbers,
                    const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, which uses the
-  // default convolution dimension numbers.
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
              absl::Span<const int64> window_strides, Padding padding,
              int64 feature_group_count = 1,
              const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration in the format returned by MakePadding().
   XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides,
@@ -503,8 +396,6 @@ class XlaBuilder {
       int64 feature_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided dimension numbers configuration.
   XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides, Padding padding,
@@ -512,8 +403,6 @@ class XlaBuilder {
       int64 feature_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration as well as the dimension numbers.
   XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                     absl::Span<const int64> window_strides,
                     absl::Span<const std::pair<int64, int64>> padding,
@@ -521,8 +410,6 @@ class XlaBuilder {
                     int64 feature_group_count = 1,
                     const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration, dilation factors and dimension numbers.
   XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> window_strides,
                            absl::Span<const std::pair<int64, int64>> padding,
@@ -532,80 +419,53 @@ class XlaBuilder {
                            int64 feature_group_count = 1,
                            const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues an FFT instruction onto the computation, of the given type and
-  // with the given FFT length.
   XlaOp Fft(const XlaOp& operand, FftType fft_type,
             absl::Span<const int64> fft_length);
 
-  // Enqueues an infeed instruction onto the computation, which writes data of
-  // the given shape to the infeed buffer of the device.
   XlaOp Infeed(const Shape& shape, const string& config = "");
   XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
                         const string& config = "");
 
-  // Enqueues an outfeed instruction onto the computation. This instruction
-  // generates outgoing data transfers for the given data.
-  //
-  // shape_with_layout communicates the laid out shape that we want to outfeed
-  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-  // will occur.
   void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
                const string& outfeed_config);
   XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
                          const Shape& shape_with_layout,
                          const string& outfeed_config);
 
-  // Enqueues a call instruction onto the computation.
   XlaOp Call(const XlaComputation& computation,
              absl::Span<const XlaOp> operands);
 
-  // Enqueues a custom call instruction onto the computation.
   XlaOp CustomCall(
       const string& call_target_name, absl::Span<const XlaOp> operands,
       const Shape& shape_with_layout, const string& opaque,
       absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
-  // The following methods enqueue element-wise binary arithmetic operations
-  // onto the computation. The shapes of the operands have to match unless one
-  // of the operands is a scalar, or an explicit broadcast dimension is given
-  // (see g3doc for more details).
-
-  // Enqueues a complex compose instruction onto the computation.
   XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                 absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a complex conjugate instruction onto the computation.
   XlaOp Conj(const XlaOp& operand);
 
-  // Enqueues an add instruction onto the computation.
   XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a subtract instruction onto the computation.
   XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a multiply instruction onto the computation.
   XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a divide instruction onto the computation.
   XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a remainder instruction onto the computation.
   XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a max instruction onto the computation.
   XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a min instruction onto the computation.
   XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Element-wise logical operators
   XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
@@ -624,32 +484,23 @@ class XlaBuilder {
   XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
                           absl::Span<const int64> broadcast_dimensions = {});
 
-  // Reduces an array among the provided dimensions, given "computation" as a
-  // reduction operator.
   XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
 
-  // Reduces several arrays simultaneously among the provided dimensions, given
-  // "computation" as a reduction operator.
   XlaOp Reduce(absl::Span<const XlaOp> operands,
                absl::Span<const XlaOp> init_values,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
 
-  // Convenience wrapper around the above that reduces all the dimensions in the
-  // operand shape.
   XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                   const XlaComputation& computation);
 
-  // Enqueues a windowed reduce instruction onto the computation.
   XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
                      const XlaComputation& computation,
                      absl::Span<const int64> window_dimensions,
                      absl::Span<const int64> window_strides, Padding padding);
 
-  // As ReduceWindow(), but the padding is given in the format
-  // returned by MakePadding().
   XlaOp ReduceWindowWithGeneralPadding(
       const XlaOp& operand, const XlaOp& init_value,
       const XlaComputation& computation,
@@ -659,48 +510,22 @@ class XlaBuilder {
       absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
 
-  // Returns the sum of the operand value within each subgroup of replicas. All
-  // replicas supply one input to the sum and all replicas receive the resulting
-  // sum for each subgroup.
   XlaOp CrossReplicaSum(const XlaOp& operand,
                         absl::Span<const ReplicaGroup> replica_groups = {});
 
-  // Enqueues an operation that do an AllReduce of the operand cross cores. Here
-  // AllReduce means doing a reduction on the input operand cross cores and then
-  // broadcasting the reduction result to those cores. The reduction function is
-  // defined by `computation`, which should be a commutative computation on
-  // scalars, e.g., add, min, or max. The way that AllReduce is applied is
-  // configured by:
-  //
-  // - `replica_groups`: each ReplicaGroup contains a list of replica id. If
-  // empty, all replicas belong to one group. Allreduce will be applied within
-  // subgroups. For example, we have 4 replicas, then
-  // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0,
-  // replica 1 and 3 are in subgroup 1.
-  //
-  // - `channel_id`: for Allreduce nodes from different modules, if they have
-  // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will
-  // not be applied cross modules.
-  //
-  // TODO(b/117564385): Rename this to AllReduce when it's ready to use.
   XlaOp CrossReplicaSum(
       const XlaOp& operand, const XlaComputation& computation,
       absl::Span<const ReplicaGroup> replica_groups = {},
       const absl::optional<ChannelHandle>& channel_id = absl::nullopt);
 
-  // Enqueues an operation that do an Alltoall of the operand cross cores.
   XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
                  int64 concat_dimension, int64 split_count,
                  const std::vector<ReplicaGroup>& replica_groups);
 
-  // Enqueues an operation that do an CollectivePermute of the operand cross
-  // cores.
   XlaOp CollectivePermute(
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
-  // Enqueues an operation that scatters the `source` array to the selected
-  // indices of each window.
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                          absl::Span<const int64> window_dimensions,
                          absl::Span<const int64> window_strides,
@@ -708,8 +533,6 @@ class XlaBuilder {
                          const XlaOp& init_value,
                          const XlaComputation& scatter);
 
-  // As SelectAndScatter(), but the padding is given in the format
-  // returned by MakePadding().
   XlaOp SelectAndScatterWithGeneralPadding(
       const XlaOp& operand, const XlaComputation& select,
       absl::Span<const int64> window_dimensions,
@@ -717,222 +540,126 @@ class XlaBuilder {
       absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
       const XlaOp& init_value, const XlaComputation& scatter);
 
-  // Enqueues an abs instruction onto the computation.
   XlaOp Abs(const XlaOp& operand);
 
-  // Enqueues a atan2 instruction onto the computation.
   XlaOp Atan2(const XlaOp& y, const XlaOp& x,
               absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues an exp instruction onto the computation.
   XlaOp Exp(const XlaOp& operand);
 
-  // Enqueues an expm1 instruction onto the computation.
   XlaOp Expm1(const XlaOp& operand);
 
-  // Enqueues a floor instruction onto the computation.
   XlaOp Floor(const XlaOp& operand);
 
-  // Enqueues a ceil instruction onto the computation.
   XlaOp Ceil(const XlaOp& operand);
 
-  // Enqueues a round instruction onto the computation, rounding to nearest even
-  // with half-way cases rounding away from zero.
   XlaOp Round(const XlaOp& operand);
 
-  // Enqueues an log instruction (natural logarithm) onto the computation.
   XlaOp Log(const XlaOp& operand);
 
-  // Enqueues an log1p instruction (log(x+1)) onto the computation.
   XlaOp Log1p(const XlaOp& operand);
 
-  // Enqueues a sign instruction onto the computation.
   XlaOp Sign(const XlaOp& operand);
 
-  // Enqueues a count leading zeros instruction onto the computation.
   XlaOp Clz(const XlaOp& operand);
 
-  // Enqueues a cosine instruction onto the computation.
   XlaOp Cos(const XlaOp& operand);
 
-  // Enqueues a sine instruction onto the computation.
   XlaOp Sin(const XlaOp& operand);
 
-  // Enqueues a tanh instruction onto the computation.
   XlaOp Tanh(const XlaOp& operand);
 
-  // Enqueues a real-part instruction onto the computation.
   XlaOp Real(const XlaOp& operand);
 
-  // Enqueues an imaginary-part instruction onto the computation.
   XlaOp Imag(const XlaOp& operand);
 
-  // Enqueues a lhs^rhs computation onto the computation.
   XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues an operator that tests if the operand's values are finite, i.e.,
-  // not Inf or NaN. Defined only for floating-point types. Returns an array of
-  // booleans with the same shape where entries are true iff the corresponding
-  // entry was NaN.
   XlaOp IsFinite(const XlaOp& operand);
 
-  // Enqueues an iota operation onto the computation.
   XlaOp Iota(const Shape& shape, int64 iota_dimension);
 
-  // Enqueues a rank-1 iota operation onto the computation.
   XlaOp Iota(PrimitiveType type, int64 size);
 
-  // Enqueues a convert instruction onto the computation that changes the
-  // element type of the operand array to primitive_type.
   XlaOp ConvertElementType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  // Enqueues a no-op instruction onto the computation that changes
-  // the element type of the operand array to primitive_type. The
-  // bit-widths of the source and destination element types must be
-  // identical.
   XlaOp BitcastConvertType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  // Enqueues a negate instruction onto the computation.
   XlaOp Neg(const XlaOp& operand);
 
-  // Enqueues a transpose instruction onto the computation.
   XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
 
-  // Enqueues a reverse instruction onto the computation. The order of the
-  // elements in the given dimensions is reversed (i.e., the element at index i
-  // is moved to index dimension_size - 1 - i).
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  // Enqueues a sort (as increasing order) instruction onto the computation.
-  // If only keys are provided:
-  // * If the keys are an rank-1 tensor (an array), the result is a sorted array
-  // of keys, in ascending order.
-  // * If the keys have higher rank, the keys are sorted along the provided
-  // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-  // value of 0 will indepenently sort every column, and a dimension value of 1
-  // will independently sort each row. If no dimension number is provided, then
-  // the last dimension is chosen by default.
-  //
-  // If both keys and values are provided:
-  // * The keys and all values must be tensors with the same dimensions. The
-  // element types of the tensors may be different.
-  // * The result is a tuple that consists of a sorted tensor of keys (along the
-  // provided dimension, as above) as the first element, and tensors with their
-  // corresponding values as the other elements.
   XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
              int64 dimension = -1);
 
-  // Enqueues a clamp instruction onto the computation.
   XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
-  // Enqueues a map instruction onto the computation.
   XlaOp Map(absl::Span<const XlaOp> operands, const XlaComputation& computation,
             absl::Span<const int64> dimensions,
             absl::Span<const XlaOp> static_operands = {});
 
-  // Enqueues a N(mu, sigma) random number generation instruction onto the
-  // computation.
   XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
 
-  // Enqueues a U(a, b) random number generation instruction onto the
-  // computation. Returns values in the semi-open interval [a, b).
   XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
 
-  // Enqueues a while node onto the computation.
   XlaOp While(const XlaComputation& condition, const XlaComputation& body,
               const XlaOp& init);
 
-  // Enqueues a conditional node onto the computation.
   XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                     const XlaComputation& true_computation,
                     const XlaOp& false_operand,
                     const XlaComputation& false_computation);
 
-  // Enqueues a ReducePrecision node onto the computation.
   XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                         const int mantissa_bits);
 
-  // Enqueues a Gather node onto the computation.
   XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                const GatherDimensionNumbers& dimension_numbers,
                absl::Span<const int64> slice_sizes);
 
-  // Enqueues a Scatter node onto the computation.
   XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
                 const XlaOp& updates, const XlaComputation& update_computation,
                 const ScatterDimensionNumbers& dimension_numbers);
 
-  // Enqueues a Send node onto the computation for device-to-device
-  // communication, to send the given operand to a Recv instruction that shares
-  // the same channel handle.
   void Send(const XlaOp& operand, const ChannelHandle& handle);
   XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
                       const ChannelHandle& handle);
 
-  // Enqueues a Send node which sends data to the host.
   XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
                    const Shape& shape_with_layout, const ChannelHandle& handle);
 
-  // Enqueues a Recv node which receives data from the host.
   XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
                      const ChannelHandle& handle);
 
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
   XlaOp CreateToken();
 
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
   XlaOp AfterAll(absl::Span<const XlaOp> tokens);
 
-  // Enqueues a Recv node onto the computation. The data comes from a Send
-  // instruction that shares the same channel handle and its shape must
-  // be the same as the given shape.
   XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
   XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
                       const ChannelHandle& handle);
 
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-  // is the normalized result and batch_mean and batch_var are the mean and
-  // variance, respectively, across batch for the operand.
   XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
                           const XlaOp& offset, float epsilon,
                           int64 feature_index);
 
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-  // computing `mean` and `variance` for each batch inside the operation. It
-  // uses the input `mean` and `variance` instead as estimated values. The
-  // purpose of this op is to reduce latency in inference, hence the name
-  // `BatchNormInference`.
-  //
-  // The output has the same shape as `operand`, and contains the normalized
-  // values for each batch.
   XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
                            const XlaOp& offset, const XlaOp& mean,
                            const XlaOp& variance, float epsilon,
                            int64 feature_index);
 
-  // Calculates the gradients of a batch norm op.
-  //
-  // The inputs `batch_mean` and `batch_var` represent the mean and variance
-  // across the batch.
-  //
-  // Returns a tuple of three elements:
-  //   - grad_operand: Gradient with respect to input `operand`
-  //   - grad_offset: Gradient with respect to input `offset`
-  //   - grad_scale: Gradient with respect to input `scale`
   XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                       const XlaOp& batch_mean, const XlaOp& batch_var,
                       const XlaOp& grad_output, float epsilon,
                       int64 feature_index);
 
+  XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
+
   StatusOr<XlaOp> AddInstruction(HloInstructionProto&& instr, HloOpcode opcode,
                                  absl::Span<const XlaOp> operands = {});
 
@@ -1017,6 +744,9 @@ class XlaBuilder {
   // The instructions of this computation.
   std::vector<HloInstructionProto> instructions_;
 
+  // Dynamic parameter configuration of this computation.
+  DynamicParameterBinding dynamic_parameter_binding_;
+
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
   absl::flat_hash_map<int64, int64> handle_to_index_;
@@ -1355,6 +1085,8 @@ class XlaBuilder {
                                 const string& outfeed_config);
   friend XlaOp CreateToken(XlaBuilder* builder);
   friend XlaOp AfterAll(XlaBuilder* builder, absl::Span<const XlaOp> tokens);
+
+  friend XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
 };
 
 // RAII-style object: sets the current sharding assignment in builder on
@@ -1389,6 +1121,7 @@ class XlaScopedShardingAssignment {
 // Free functions for building XlaOps. The intention is that these will
 // become the public API for building XlaOps rather than calling methods on
 // XlaBuilder directly.
+//
 
 // Enqueues a "retrieve parameter value" instruction for a parameter that was
 // passed to the computation.
@@ -2129,7 +1862,12 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                     const XlaOp& grad_output, float epsilon,
                     int64 feature_index);
 
+// Returns the size of the given dimension of the operand. The operand must be
+// array shaped.
+XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
+
 // Implementation details below this point.
+//
 
 template <typename NativeT>
 XlaOp XlaBuilder::ConstantR0(NativeT value) {
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index dfe5fd5eb23ca51d2a449106a21293405a3dab6f..8aa85c3cd63c9b0aeb55d2cebbb989b6432ac959 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -43,7 +43,7 @@ class XlaBuilderTest : public ::testing::Test {
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
-                            proto, legacy_flags::GetDebugOptionsFromFlags()));
+                            proto, GetDebugOptionsFromFlags()));
     return HloModule::CreateFromProto(proto, config);
   }
 
@@ -54,7 +54,7 @@ class XlaBuilderTest : public ::testing::Test {
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
-                            proto, legacy_flags::GetDebugOptionsFromFlags()));
+                            proto, GetDebugOptionsFromFlags()));
     return HloModule::CreateFromProto(proto, config);
   }
 
@@ -349,6 +349,15 @@ TEST_F(XlaBuilderTest, CollectivePermute) {
   EXPECT_EQ(root->opcode(), HloOpcode::kCollectivePermute);
 }
 
+TEST_F(XlaBuilderTest, GetDimensionSize) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
+  GetDimensionSize(x, 1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kGetDimensionSize);
+}
+
 TEST_F(XlaBuilderTest, ReportError) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
similarity index 96%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
rename to tensorflow/compiler/xla/debug_options_flags.cc
index 3ed3afcfcede20fbf5c7d4f004378817febeb4c7..a40330a9b1fe201b6ec83d1bfe1a21e294e18f55 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -13,17 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 
 #include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 #include <vector>
 #include "absl/strings/str_split.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/compiler/xla/debug_options_parsers.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
 
 namespace xla {
-namespace legacy_flags {
-
 namespace {
 
 DebugOptions* flag_values;
@@ -101,8 +99,8 @@ void AllocateFlags() {
       [](string comma_separated_values) {
         auto* extra_options_map =
             flag_values->mutable_xla_backend_extra_options();
-        impl::parse_xla_backend_extra_options(extra_options_map,
-                                              comma_separated_values);
+        parse_xla_backend_extra_options(extra_options_map,
+                                        comma_separated_values);
         return true;
       };
 
@@ -111,8 +109,8 @@ void AllocateFlags() {
       [](string reduce_precision_option_value) {
         HloReducePrecisionOptions* option_proto =
             flag_values->add_hlo_reduce_precision_options();
-        return impl::parse_xla_reduce_precision_option(
-            option_proto, reduce_precision_option_value);
+        return parse_xla_reduce_precision_option(option_proto,
+                                                 reduce_precision_option_value);
       };
 
   flag_objects = new std::vector<tensorflow::Flag>({
@@ -337,7 +335,7 @@ void AllocateFlags() {
           "behavior to help run tests on the host that run models in parallel "
           "across multiple devices."),
   });
-  ParseFlagsFromEnv(*flag_objects);
+  ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
 
 }  // namespace
@@ -353,5 +351,4 @@ xla::DebugOptions GetDebugOptionsFromFlags() {
   return *flag_values;
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h b/tensorflow/compiler/xla/debug_options_flags.h
similarity index 81%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
rename to tensorflow/compiler/xla/debug_options_flags.h
index b53157f59c61cf4e0850e006ad3656f4be63a936..60e59abc2a2e0f1cce3de1afc928f9fe36f75b33 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
+++ b/tensorflow/compiler/xla/debug_options_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_
+#define TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_
 
 #include <vector>
 
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
-namespace legacy_flags {
 
 // Appends flag definitions for debug options to flag_list.
 void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list);
@@ -32,7 +31,6 @@ void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list);
 // first.
 xla::DebugOptions GetDebugOptionsFromFlags();
 
-}  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/debug_options_parsers.h
similarity index 94%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
rename to tensorflow/compiler/xla/debug_options_parsers.h
index ee7eb019c07cf898e48886955b18710146644cac..80aadfd5ece0e768afaf1842d2b6c5b11c288b55 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
+++ b/tensorflow/compiler/xla/debug_options_parsers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_
+#define TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_
 
 #include <vector>
 #include "absl/strings/numbers.h"
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla.pb.h"
 
 namespace xla {
-namespace legacy_flags {
-namespace impl {
 
 template <typename T>
 void parse_xla_backend_extra_options(T* extra_options_map,
@@ -140,8 +138,6 @@ inline bool parse_xla_reduce_precision_option(
   return true;
 }
 
-}  // namespace impl
-}  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc b/tensorflow/compiler/xla/debug_options_parsers_test.cc
similarity index 88%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
rename to tensorflow/compiler/xla/debug_options_parsers_test.cc
index 6f197aec53c7596e84437a03affa9118f22f5a1d..8003c3496d5df9be2ff8a99bc171972c8e090c43 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
+++ b/tensorflow/compiler/xla/debug_options_parsers_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Test for parse_flags_from_env.cc
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h"
+#include "tensorflow/compiler/xla/debug_options_parsers.h"
 
 #include <unordered_map>
 #include <vector>
@@ -23,13 +23,12 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
-namespace legacy_flags {
 
 // Test that the xla_backend_extra_options flag is parsed correctly.
 TEST(DebugOptionsFlags, ParseXlaBackendExtraOptions) {
   std::unordered_map<string, string> test_map;
   string test_string = "aa=bb,cc,dd=,ee=ff=gg";
-  impl::parse_xla_backend_extra_options(&test_map, test_string);
+  parse_xla_backend_extra_options(&test_map, test_string);
   EXPECT_EQ(test_map.size(), 4);
   EXPECT_EQ(test_map.at("aa"), "bb");
   EXPECT_EQ(test_map.at("cc"), "");
@@ -41,7 +40,7 @@ TEST(DebugOptionsFlags, ParseXlaBackendExtraOptions) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStrings) {
   HloReducePrecisionOptions proto;
   string test_string = "OP_OUTPUTS=5,10:add,dot";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -56,7 +55,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStrings) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStringsSemicolon) {
   HloReducePrecisionOptions proto;
   string test_string = "OP_OUTPUTS=5,10:add,dot;";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -71,7 +70,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStringsSemicolon) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoOpcodes) {
   HloReducePrecisionOptions proto;
   string test_string = "UNFUSED_OP_OUTPUTS=5,10:;foo,bar/baz";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -84,7 +83,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoOpcodes) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionBoth) {
   HloReducePrecisionOptions proto;
   string test_string = "UNFUSED_OP_OUTPUTS=5,10:subtract;foo,bar/baz";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -96,7 +95,6 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionBoth) {
   EXPECT_EQ(proto.opname_substrings_to_suffix(1), "bar/baz");
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
 
 int main(int argc, char* argv[]) {
diff --git a/tensorflow/compiler/xla/execution_options_util.cc b/tensorflow/compiler/xla/execution_options_util.cc
index e83ff7cddd675197c7f6d7018257edb4c25b6228..cf569863bbe1c92bdcafb133d49dcf5ae8890ffe 100644
--- a/tensorflow/compiler/xla/execution_options_util.cc
+++ b/tensorflow/compiler/xla/execution_options_util.cc
@@ -13,14 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 
 namespace xla {
 
 ExecutionOptions CreateDefaultExecutionOptions() {
   ExecutionOptions execution_options;
-  *(execution_options.mutable_debug_options()) =
-      legacy_flags::GetDebugOptionsFromFlags();
+  *(execution_options.mutable_debug_options()) = GetDebugOptionsFromFlags();
   return execution_options;
 }
 
diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
index fb135f5ceda67ce6c001de15b8f3f084ca164826..1fea816a803bfb75b9721393cef8c4dfc249268d 100644
--- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
+++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as _np  # Avoids becoming a part of public Tensorflow API.
 
 from tensorflow.compiler.xla import xla_data_pb2
-from tensorflow.compiler.xla.python_api import xla_shape
 from tensorflow.core.framework import attr_value_pb2
 
 
@@ -64,22 +61,18 @@ class Sharding(object):
             tile_assignment_devices=[core]))
 
   @classmethod
-  def tile(cls, tile_shape, tile_assignment):
+  def tile(cls, tile_assignment):
     """Returns a Tiled sharding attribute.
 
     This causes an op to be partially computed on multiple cores in the
     XLA device.
 
     Args:
-      tile_shape: A xla_shape.Shape describing the tile shape that each core
-        will compute.
-        The tile shape does not need to be divisible by the tile assignment.
       tile_assignment: An np.ndarray describing the topology of the tiling and
         which device will compute which part of the topology.
 
     Raises:
-      TypeError: tile_assignment was not of np.array type or tile_shape was
-         not of xla_shape.Shape type.
+      TypeError: tile_assignment was not of np.array type.
 
     TODO(jmolloy): This concept is nefarious and is not
     something we really want to expose to users (especially as the
@@ -87,14 +80,11 @@ class Sharding(object):
     """
     if not isinstance(tile_assignment, _np.ndarray):
       raise TypeError('Tile assignment must be of type np.ndarray')
-    if not isinstance(tile_shape, xla_shape.Shape):
-      raise TypeError('Tile shape must be of type xla_shape.Shape')
     dims = list(tile_assignment.shape)
     flattened_devices = tile_assignment.reshape(-1, order='C')
     return Sharding(
         proto=xla_data_pb2.OpSharding(
             type=xla_data_pb2.OpSharding.OTHER,
-            tile_shape=tile_shape.message,
             tile_assignment_dimensions=dims,
             tile_assignment_devices=list(flattened_devices)))
 
@@ -118,14 +108,8 @@ class Sharding(object):
     shape = tensor.shape.as_list()
     if shape[split_dimension] < num_devices:
       raise ValueError('Split dimension was smaller than the required number '
-                       'of splits: shape=%r, dimension=%r, num_devices=%r',
-                       shape, split_dimension, num_devices)
-
-    tile_shape = shape
-    tile_shape[split_dimension] = int(
-        math.ceil(tile_shape[split_dimension] / num_devices))
-    tile_shape_proto = xla_data_pb2.Shape(
-        element_type=xla_data_pb2.F32, dimensions=tile_shape)
+                       'of splits: shape=%r, dimension=%r, num_devices=%r' %
+                       (shape, split_dimension, num_devices))
 
     tile_assignment_dims = [1] * len(shape)
     tile_assignment_dims[split_dimension] = num_devices
@@ -133,7 +117,6 @@ class Sharding(object):
     return Sharding(
         proto=xla_data_pb2.OpSharding(
             type=xla_data_pb2.OpSharding.OTHER,
-            tile_shape=tile_shape_proto,
             tile_assignment_dimensions=tile_assignment_dims,
             tile_assignment_devices=range(num_devices)))
 
@@ -149,7 +132,6 @@ class Sharding(object):
           type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=tuple_shardings)
     else:
       proto = self._proto
-
     attr_value = attr_value_pb2.AttrValue(s=proto.SerializeToString())
     # TODO(jmolloy): This need to be seriously revisited before declaring this
     # API available for public use.
@@ -194,8 +176,8 @@ def assign_device(tensor, device):
   return tensor
 
 
-def tile(tensor, tile_shape, tile_assignment):
-  Sharding.tile(tile_shape, tile_assignment).apply_to_tensor(tensor)
+def tile(tensor, tile_assignment):
+  Sharding.tile(tile_assignment).apply_to_tensor(tensor)
   return tensor
 
 
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index bcfbcc3a22f50c748c388d17fbcd7defd27846d0..12b7094705e75305dc43a013576f4549dd5f4185 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -3,15 +3,15 @@ upper_tabs:
 - include: /_upper_tabs_left.yaml
 - include: /api_docs/_upper_tabs_api.yaml
 # Dropdown menu
-- name: Ecosystem
-  path: /ecosystem
+- name: Resources
+  path: /resources
   is_default: true
   menu:
-  - include: /ecosystem/_menu_toc.yaml
+  - include: /resources/_menu_toc.yaml
   lower_tabs:
     # Subsite tabs
     other:
-    - name: Guide
+    - name: Guide & Tutorials
       contents:
       - title: XLA overview
         path: /xla/overview
@@ -27,3 +27,7 @@ upper_tabs:
         path: /xla/shapes
       - title: Using AOT compilation
         path: /xla/tfcompile
+      - heading: Tutorials
+      - title: XLA compile API
+        path: /xla/tutorials/xla_compile
+        status: experimental
diff --git a/tensorflow/compiler/xla/g3doc/_index.yaml b/tensorflow/compiler/xla/g3doc/_index.yaml
index 7934cd11ba22d3f47e172726f54ce51d15eb2cad..858de427119bfcfa82d0b1158776bf269129fd92 100644
--- a/tensorflow/compiler/xla/g3doc/_index.yaml
+++ b/tensorflow/compiler/xla/g3doc/_index.yaml
@@ -17,7 +17,7 @@ landing_page:
   - classname: devsite-landing-row-cards
     items:
     - heading: XLA - TensorFlow, compiled
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
+      image_path: /resources/images/tf-logo-card-16x9.png
       path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html
       buttons:
       - label: Read on Google Developers blog
@@ -28,7 +28,7 @@ landing_page:
       - label: Watch the video
         path: https://www.youtube.com/watch?v=kAOanJczHA0
     - heading: XLA on GitHub
-      image_path: /ecosystem/images/github-card-16x9.png
+      image_path: /resources/images/github-card-16x9.png
       path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla
       buttons:
       - label: View on GitHub
diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md
index 5376a04669d7c17a2fed8cdab46e21277049bf72..85fa16ccc7f48a3dce840564e79097c9e136767f 100644
--- a/tensorflow/compiler/xla/g3doc/jit.md
+++ b/tensorflow/compiler/xla/g3doc/jit.md
@@ -58,7 +58,7 @@ sess = tf.Session(config=config)
 > compiled for the CPU. JIT compilation for CPU operations must be done via
 > the manual method documented below.
 
-#### Manual
+#### Manual with experimental_jit_scope()
 
 JIT compilation can also be turned on manually for one or more operators. This
 is done by tagging the operators to compile with the attribute
@@ -79,6 +79,16 @@ The `_XlaCompile` attribute is currently supported on a best-effort basis. If an
 operator cannot be compiled, TensorFlow will silently fall back to the normal
 implementation.
 
+#### Manual with xla.compile()
+
+Unlike experimental_jit_scope() which silently falls back to normal Tensorflow
+on uncompilable operator, xla.compile() returns an explicit error. This is
+useful if you want more predictable behaviors from XLA compilation.
+
+Please see
+[xla.compile() tutorial Colab](./tutorials/xla_compile.ipynb)
+for how to use it.
+
 ### Placing operators on XLA devices
 
 Another way to run computations via XLA is to place an operator on a specific
@@ -134,7 +144,7 @@ Execute the python script to train the model with XLA and turn on a debugging
 feature of XLA via an environmental variable that outputs the XLA graph.
 
 ```shell
-TF_XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
+XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
 ```
 
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index a3cdfe19b2e3470bb903cce6cbc79d8d13cc8349..73a9db75f6bf090bba5c3534f14d8ebfa421b5bb 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -1339,6 +1339,22 @@ the semantics for `tf.gather_nd`.
 index `X` in the gather indices array picks an entire row and the result is the
 concatenation of all these rows.
 
+## GetDimensionSize
+
+See also
+[`XlaBuilder::GetDimensionSize`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Returns the size of the given dimension of the operand. The operand must be
+array shaped.
+
+<b> `GetDimensionSize(operand, dimension)` </b>
+
+| Arguments   | Type    | Semantics                                           |
+| ----------- | ------- | --------------------------------------------------- |
+| `operand`   | `XlaOp` | n dimensional input array                           |
+| `dimension` | `int64` | A value in the interval `[0, n)` that specifies the |
+:             :         : dimension                                           :
+
 ## GetTupleElement
 
 See also
diff --git a/tensorflow/compiler/xla/g3doc/overview.md b/tensorflow/compiler/xla/g3doc/overview.md
index 6a172c3ae159974fb4a34ec422a9a96079b0814a..d3428b7276131e8f406f60cfea9a9346c5478433 100644
--- a/tensorflow/compiler/xla/g3doc/overview.md
+++ b/tensorflow/compiler/xla/g3doc/overview.md
@@ -4,11 +4,8 @@
 <img style="width:50%" src="./images/xlalogo.png">
 </div>
 
-> Note: XLA is experimental and considered alpha.  Most use cases will not
-> see improvements in performance (speed or decreased memory usage). We have
-> released XLA early so the Open Source Community can contribute to its
-> development, as well as create a path for integration with hardware
-> accelerators.
+> Note: XLA is still under development.  Some use cases will not
+> see improvements in speed or decreased memory usage.
 
 XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
 algebra that optimizes TensorFlow computations. The results are improvements in
diff --git a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2a83092805be5efdd7b9ab54449b2bcc6a2ec481
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
@@ -0,0 +1,373 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "The XLA compile API",
+      "version": "0.3.2",
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "f4TSNCvpENrW"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors."
+      ]
+    },
+    {
+      "metadata": {
+        "cellView": "form",
+        "colab_type": "code",
+        "id": "vamNSA0vEP-m",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "e1oSi4lHFt3z"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# The XLA compile API"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "b7noD9NjFRL-"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/xla/tutorials/xla_compile\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+        "  </td>\n",
+        "</table>"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "v9YbsuLZaBXy"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "\n",
+        "\n",
+        "Import TensorFlow and the XLA library. XLA contains `xla.compile()`, an experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/)."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "45kUPj5ZFrRa",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "import tensorflow as tf\n",
+        "\n",
+        "from tensorflow.contrib.compiler import xla"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "GZVNiRmTDV-5"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Define some necessary constants and prepare the MNIST dataset."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "f37TSEGvGX4_",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Size of each input image, 28 x 28 pixels\n",
+        "IMAGE_SIZE = 28 * 28\n",
+        "# Number of distinct number labels, [0..9]\n",
+        "NUM_CLASSES = 10\n",
+        "# Number of examples in each training batch (step)\n",
+        "TRAIN_BATCH_SIZE = 100\n",
+        "# Number of training steps to run\n",
+        "TRAIN_STEPS = 1000"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "TiVXchblG5hK",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Loads MNIST dataset.\n",
+        "train, test = tf.keras.datasets.mnist.load_data()\n",
+        "train_ds = tf.data.Dataset.from_tensor_slices(train).batch(TRAIN_BATCH_SIZE).repeat()\n",
+        "test_ds = tf.data.Dataset.from_tensor_slices(test).batch(TRAIN_BATCH_SIZE)\n",
+        "\n",
+        "iterator = tf.data.Iterator.from_structure(train_ds.output_types, train_ds.output_shapes)\n",
+        "images, labels = iterator.get_next()\n",
+        "images = tf.reshape(images, [-1, IMAGE_SIZE])\n",
+        "images, labels = tf.cast(images, tf.float32), tf.cast(labels, tf.int64)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "x_ZehpZP-SfS"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Define the model constructing function\n",
+        "\n",
+        "Following code block contains a function that constructs a simple model with one dense layer, including both forward and backward propagation.\n",
+        "\n",
+        "When called, it returns two values. `y` is a `tf.Tensor` representing predicted probability of each target class, `train_step` is a `tf.Operation` that increments `global_step` and applies variable update."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "ZbhJl_WvGa3g",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "def build_mnist_model(x, y_):\n",
+        "  y = tf.keras.layers.Dense(NUM_CLASSES).apply(x)\n",
+        "\n",
+        "  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)\n",
+        "  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)\n",
+        "\n",
+        "  return y, train_step"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "7Jh3lyQHDfM9"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Enable XLA\n",
+        "\n",
+        "Use `xla.compile` with the `build_mnist_model` function to enable XLA. Following code block wraps the model with `xla.compile()`, which allows the target function with provided inputs to be executed by XLA."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "kYpCXCdRHNuN",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "[y] = xla.compile(build_mnist_model, inputs=[images, labels])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "4giQh62IrZGF"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "When compiling the graph, XLA replaces all the graph nodes constructed in the target function with a few XLA ops.\n",
+        "\n",
+        "xla.compile does not return any\n",
+        "`tf.Operation` nodes that can be executed independently from the generated XLA ops. Instead, returned `tf.Operation` nodes from the target function are added as control dependencies of all returned `tf.Tensor` values. This triggers execution of the `tf.Operation` nodes when the returned tensors are evaluated.\n",
+        "\n",
+        "In pseudo-code, xla.compile's implementation looks as follows:\n",
+        "\n",
+        "---\n",
+        "```\n",
+        "# Ask Tensorflow to execute code in XLA-friendly manner\n",
+        "\n",
+        "y, train_step = build_mnist_model(images, labels)\n",
+        "with tf.control_dependencies([train_step]):\n",
+        "  y = tf.identity(y)\n",
+        "\n",
+        "# Ask Tensorflow to STOP executing code in XLA-friendly manner\n",
+        "```\n",
+        "---\n",
+        "\n",
+        "xla.compile() always returns a list of `tf.Tensor`'s (even if there is only one-element)."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "TPGas4jjFLZl"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "If you were to print the constructed graph now, you will see that it is not much different from a normal Tensorflow graph and you won't be able to find XLA ops mentioned before. This is because the actual compilation happens later when you try to execute the graph with `sess.run()`.  At that time, Tensorflow triggers a series of graph rewrite passes that actually generate XLA ops, which compiles and executes computation when all inputs are ready."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "EZD1m_n1DxAF"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Train and test the model"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "qe28bAHNHUG2",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Creates session and initialize all variables.\n",
+        "# xla.compile() doesn't work with Keras model.fit() API or TF eager mode yet.\n",
+        "sess = tf.Session()\n",
+        "sess.run(tf.global_variables_initializer())"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "qgsKmz3n2UiW"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Following code block trains model. Evaluating `y` also triggers its control dependency node `train_step`, which updates model variables."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "_GxF6jTRHVuA",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "fbf299ca-02d5-4e95-f9fe-8f3c0432d132"
+      },
+      "cell_type": "code",
+      "source": [
+        "# Feeds training dataset\n",
+        "sess.run(iterator.make_initializer(train_ds))\n",
+        "\n",
+        "# Runs TRAIN_STEPS steps\n",
+        "for i in range(TRAIN_STEPS):\n",
+        "  sess.run(y)\n",
+        "\n",
+        "print(\"Model trained for %s steps.\" % TRAIN_STEPS)"
+      ],
+      "execution_count": 21,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Model trained for 1000 steps.\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "dHlQlRSRHXD1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "9c3677a2-ec84-406f-9d2c-d722844f3093"
+      },
+      "cell_type": "code",
+      "source": [
+        "# Tests trained model\n",
+        "\n",
+        "# Feeds testing dataset\n",
+        "sess.run(iterator.make_initializer(test_ds))\n",
+        "\n",
+        "# Calculates accuracy\n",
+        "correct_prediction = tf.equal(tf.argmax(y, 1), labels)\n",
+        "accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))\n",
+        "print(\"Prediction accuracy after training: %s\" % sess.run(accuracy))"
+      ],
+      "execution_count": 22,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Prediction accuracy after training: 0.91\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "ynJQIuzjHYOb",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Cleans up session\n",
+        "sess.close()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/xla/index_util_test.cc b/tensorflow/compiler/xla/index_util_test.cc
index 93522d2ca87a7eba8d3c7533785c54e63ce507b0..fa94d0afb4c9280b8f8fa9642c1b0ab7285ee6f3 100644
--- a/tensorflow/compiler/xla/index_util_test.cc
+++ b/tensorflow/compiler/xla/index_util_test.cc
@@ -24,8 +24,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-void SetMinorToMajorLayout(Shape* shape,
-                           std::initializer_list<int64> dimensions) {
+void SetMinorToMajorLayout(Shape* shape, std::vector<int64> dimensions) {
   shape->mutable_layout()->clear_minor_to_major();
   for (auto dimension : dimensions) {
     shape->mutable_layout()->add_minor_to_major(dimension);
@@ -122,7 +121,7 @@ TEST(IndexUtilTest, LinearToMultiToLinear) {
   std::vector<int64> linear_indexes = {0,        1439999999, 1145567336,
                                        43883404, 617295214,  1117613654};
 
-  std::vector<std::initializer_list<int64>> minor_to_major_orders;
+  std::vector<std::vector<int64>> minor_to_major_orders;
   minor_to_major_orders.push_back({6, 5, 4, 3, 2, 1, 0});
   minor_to_major_orders.push_back({0, 1, 2, 3, 4, 5, 6});
   minor_to_major_orders.push_back({4, 5, 1, 2, 6, 0, 3});
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
deleted file mode 100644
index 3e79129aafd234e5eab05d205f2017b54057795e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ /dev/null
@@ -1,82 +0,0 @@
-# Legacy command-line flags for the XLA libraries.
-
-# Please do not add more flags to this package.
-
-# The XLA libraries were written in an environment that allowed command-line
-# flags to be scattered freely throughout the libraries.  This model, while
-# initially convenient, leads to a proliferation in unused command-line flags
-# in tests and binaries, and serious problems in servers, where one might wish
-# parameters to be different in independent RPC calls to the same routine.
-#
-# Please don't add more flags.  If you're a library author, pass options and
-# parameters explicitly through the library's interface.
-
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "parse_flags_from_env",
-    srcs = ["parse_flags_from_env.cc"],
-    hdrs = ["parse_flags_from_env.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla:types",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "@com_google_absl//absl/strings",
-        ],
-)
-
-tf_cc_test(
-    name = "parse_flags_from_env_test",
-    srcs = ["parse_flags_from_env_test.cc"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:types",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:test",
-            "@com_google_absl//absl/strings:str_format",
-        ],
-)
-
-cc_library(
-    name = "debug_options_flags",
-    srcs = [
-        "debug_options_flags.cc",
-        "debug_options_parsers.h",
-    ],
-    hdrs = ["debug_options_flags.h"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:xla_proto",
-            "//tensorflow/compiler/xla/service:hlo",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "@com_google_absl//absl/strings",
-        ],
-)
-
-tf_cc_test(
-    name = "debug_options_parsers_test",
-    size = "small",
-    srcs = [
-        "debug_options_parsers.h",
-        "debug_options_parsers_test.cc",
-    ],
-    deps =
-        [
-            "//tensorflow/compiler/xla:xla_proto",
-            "//tensorflow/compiler/xla/service:hlo",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:test",
-            "@com_google_absl//absl/strings",
-            "@com_google_absl//absl/strings:str_format",
-        ],
-)
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h
deleted file mode 100644
index b54482ad2ba2224c781861341a80ceb878ffd343..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_
-
-// This module exports ParseFlagsFromEnv(), which allows other modules to parse
-// flags from the environtment variable TF_XLA_FLAGS, or (if the first
-// non-whitespace in the variable value is not '-'), a file named by that
-// environment variable.  The accepted syntax is that flags arguments are of
-// the form --flag=value or (for boolean flags) --flag, and are whitespace
-// separated.  The <value> may be one of:
-// - <non-whitespace, non-nul not starting with single-quote or double-quote>
-//   in which case the effective value is the string itself
-// - <single-quote><characters string not containing nul or
-//   single-quote><single_quote> in which case the effective value is the
-//   string with the single-quotes removed
-// - <double-quote><character string not containing nul or unesecaped
-//   double-quote><double_quote> in which case the effective value if the
-//   string with the double-quotes removed, and escaped sequences of
-//   <backslash><char> replaced by <char>.
-//
-// Flags values inconsistent with the type of the flag will be rejected by the
-// flag parser.
-//
-// Examples:
-//    TF_XLA_FLAGS="--foo=bar  --wombat='value with a space'"
-//
-//    TF_XLA_FLAGS=/tmp/flagfile
-// where /tmp/flagfile might contain
-//    --some_flag="This is a string containing a \" and a '."
-//    --another_flag=wombats
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Call tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet
-// unrecognized flags passed in from the environment, and return its
-// return value.
-bool ParseFlagsFromEnv(const std::vector<tensorflow::Flag>& flag_list);
-
-// Used only for testing.  Not to be used by clients.
-void ResetFlagsFromEnvForTesting(int** pargc, std::vector<char*>** pargv);
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 510aa39b4503111ec558c050f0c332c93de10517..36ad7c64866e77187d40f22b364d80230651696b 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -1012,167 +1013,143 @@ void LiteralBase::Piece::SortSparseElementsInternal() {
 
 namespace {
 
-void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces) {
-  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  CHECK(LayoutUtil::HasLayout(literal.shape()));
-  CHECK(LayoutUtil::HasLayout(subshape));
+string ShapeToString(bool print_layout, const Shape& shape) {
+  return print_layout ? ShapeUtil::HumanStringWithLayout(shape)
+                      : ShapeUtil::HumanString(shape);
+}
 
-  auto shape_to_string = [print_layout](const Shape& shape) {
-    if (print_layout) {
-      return ShapeUtil::HumanStringWithLayout(shape);
-    } else {
-      return ShapeUtil::HumanString(shape);
-    }
-  };
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces);
 
-  // TODO(b/32894291): refactor this code to reduce code duplication.
-  if (ShapeUtil::IsTuple(subshape)) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" (\n");
-    std::vector<string> tuple_pieces;
-    for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
-      ShapeIndex element_index = shape_index;
-      element_index.push_back(i);
-      std::vector<string> element_pieces;
-      ToStringHelper(literal, element_index, print_layout, &element_pieces);
-      tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
+void TupleToStringHelper(const LiteralBase& literal,
+                         const ShapeIndex& shape_index, bool print_layout,
+                         std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  pieces->push_back(ShapeToString(print_layout, subshape));
+  pieces->push_back(" (\n");
+  std::vector<string> tuple_pieces;
+  for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
+    ShapeIndex element_index = shape_index;
+    element_index.push_back(i);
+    std::vector<string> element_pieces;
+    ToStringHelper(literal, element_index, print_layout, &element_pieces);
+    tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
+  }
+  pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
+  pieces->push_back("\n)");
+}
+
+void SparseArrayToStringHelper(const LiteralBase& literal,
+                               const Shape& subshape, bool print_layout,
+                               std::vector<string>* pieces) {
+  pieces->push_back(ShapeToString(print_layout, subshape));
+  pieces->push_back("{");
+  int64 rank = ShapeUtil::Rank(subshape);
+  int64 num_elements = literal.sparse_element_count();
+  for (int64 i = 0; i < num_elements; ++i) {
+    if (i > 0) {
+      pieces->push_back(", ");
     }
-    pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
-    pieces->push_back("\n)");
-    return;
-  }
-
-  if (ShapeUtil::IsToken(subshape)) {
-    pieces->push_back("token");
-    return;
-  }
-
-  if (LayoutUtil::IsSparseArray(subshape)) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back("{");
-    int64 rank = ShapeUtil::Rank(subshape);
-    int64 num_elements = literal.sparse_element_count();
-    for (int64 i = 0; i < num_elements; ++i) {
-      if (i > 0) {
-        pieces->push_back(", ");
-      }
-      if (rank == 1) {
-        pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
-        pieces->push_back(": ");
-      } else {
-        pieces->push_back("[");
-        pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
-        pieces->push_back("]: ");
-      }
-      pieces->push_back(literal.GetSparseElementAsString(i));
+    if (rank == 1) {
+      pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
+      pieces->push_back(": ");
+    } else {
+      pieces->push_back("[");
+      pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
+      pieces->push_back("]: ");
     }
-    pieces->push_back("}");
-    return;
+    pieces->push_back(literal.GetSparseElementAsString(i));
   }
+  pieces->push_back("}");
+}
 
-  CHECK(LayoutUtil::IsDenseArray(subshape));
-
-  auto element_to_string = [&](absl::Span<const int64> indices) -> string {
-    PrimitiveType element_type = subshape.element_type();
-    if (element_type == PRED) {
-      // We display predicates in a densely packed form.
-      return literal.Get<bool>(indices, shape_index) ? "1" : "0";
-    }
-    return ((!indices.empty() && indices.back() > 0) ? ", " : "") +
-           literal.GetAsString(indices, shape_index);
-  };
+void DenseArrayToStringHelper(const LiteralBase& literal,
+                              const ShapeIndex& shape_index, bool print_layout,
+                              std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  int64 rank = ShapeUtil::Rank(subshape);
+
+  std::function<void(absl::Span<const int64> dimensions, std::vector<int64>*)>
+      to_string_recursive = [&](absl::Span<const int64> dimensions,
+                                std::vector<int64>* accum_indices) {
+        // dimensions.size() decreases by 1 at each recursive call,
+        // and accum_indices->size() increases by 1.
+        // Their sum is equal to the rank of the tensor.
+        CHECK_EQ(rank, dimensions.size() + accum_indices->size());
+
+        auto brace_to_string = [&](string brace) -> string {
+          // Handle 1D tensor
+          if (rank == 1) {
+            return brace;
+          }
+          // Handle the innermost tensor of a 2D+ tensor.
+          if (dimensions.size() == 1 && brace == "{") {
+            return StrCat("  ", brace, dimensions[0] <= 1 ? "" : " ");
+          }
+          if (dimensions.size() == 1 && brace == "}") {
+            return StrCat(dimensions[0] <= 1 ? "" : " ", brace);
+          }
+          // Handle the non-innermost tensors of a 2D+ tensor.
+          if (brace == "{") {
+            if (rank > 3 && !accum_indices->empty() &&
+                accum_indices->size() < rank) {
+              int index = accum_indices->size() - 1;
+              int value = accum_indices->back();
+              return StrCat(brace, " /*i", index, "=", value, "*/\n");
+            }
+            return StrCat(brace, "\n");
+          }
+          return StrCat("\n", brace);
+        };
 
-  if (ShapeUtil::Rank(subshape) == 0) {
-    pieces->push_back(literal.GetAsString({}, shape_index));
-  } else if (ShapeUtil::Rank(subshape) == 1) {
-    pieces->push_back("{");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(element_to_string({i0}));
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 2) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back("  { ");
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(element_to_string({i0, i1}));
-      }
-      pieces->push_back(" ");
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n");
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 3) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(i0 > 0 ? ",\n{" : "{");
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(i1 > 0 ? ",\n  { " : " { ");
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back(element_to_string({i0, i1, i2}));
-        }
-        pieces->push_back(" }");
-      }
-      pieces->push_back(" }");
-    }
-    pieces->push_back("\n}");
-  } else if (ShapeUtil::Rank(subshape) == 4) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(StrFormat("  {  /*i0=%d*/\n", i0));
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(StrFormat("    {  /*i1=%d*/\n", i1));
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back("      {");
-          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
-            pieces->push_back(element_to_string({i0, i1, i2, i3}));
+        if (dimensions.empty()) {
+          // Display predicates as 0s and 1s so that the string is more dense.
+          string elem;
+          if (subshape.element_type() == PRED && rank > 0) {
+            elem = literal.Get<bool>(*accum_indices, shape_index) ? "1" : "0";
+          } else {
+            elem = literal.GetAsString(*accum_indices, shape_index);
           }
-          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "}\n" : "},\n");
-        }
-        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
-                                                           : "    },\n");
-      }
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 5) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(StrFormat("  {  /*i0=%d*/\n", i0));
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(StrFormat("    {  /*i1=%d*/\n", i1));
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back(StrFormat("      {  /*i2=%d*/\n", i2));
-          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
-            pieces->push_back("        {");
-            for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) {
-              pieces->push_back(element_to_string({i0, i1, i2, i3, i4}));
+          pieces->push_back(elem);
+        } else {
+          pieces->push_back(brace_to_string("{"));
+          for (int i = 0; i < dimensions[0]; ++i) {
+            std::vector<int64> cloned_indices(*accum_indices);
+            cloned_indices.push_back(i);
+            to_string_recursive(dimensions.subspan(1), &cloned_indices);
+            if (i < dimensions[0] - 1) {
+              pieces->push_back(",");
+              pieces->push_back(dimensions.size() > 1 ? "\n" : " ");
             }
-            pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n"
-                                                               : "},\n");
           }
-          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "      }\n"
-                                                             : "      },\n");
+          pieces->push_back(brace_to_string("}"));
         }
-        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
-                                                           : "    },\n");
-      }
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
-    }
-    pieces->push_back("}");
+      };
+
+  if (rank > 1) {
+    pieces->push_back(ShapeToString(print_layout, subshape));
+    pieces->push_back(" ");
+  }
+  std::vector<int64> indices = {};
+  std::vector<int64> dimensions(subshape.dimensions().begin(),
+                                subshape.dimensions().end());
+  to_string_recursive(dimensions, &indices);
+}
+
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  CHECK(LayoutUtil::HasLayout(literal.shape()));
+  CHECK(LayoutUtil::HasLayout(subshape));
+  if (ShapeUtil::IsTuple(subshape)) {
+    TupleToStringHelper(literal, shape_index, print_layout, pieces);
+  } else if (ShapeUtil::IsToken(subshape)) {
+    pieces->push_back("token");
+  } else if (LayoutUtil::IsSparseArray(subshape)) {
+    SparseArrayToStringHelper(literal, subshape, print_layout, pieces);
   } else {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {");
-    literal.EachCellAsString(
-        [&](absl::Span<const int64> indices, const string& value) {
-          pieces->push_back(" ");
-          pieces->push_back(value);
-        });
-    pieces->push_back("}");
+    CHECK(LayoutUtil::IsDenseArray(subshape));
+    DenseArrayToStringHelper(literal, shape_index, print_layout, pieces);
   }
 }
 
@@ -1435,10 +1412,14 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const {
       return EqualElementsInternal<bool>(other, &multi_index);
     case U8:
       return EqualElementsInternal<uint8>(other, &multi_index);
+    case S16:
+      return EqualElementsInternal<int16>(other, &multi_index);
     case S32:
       return EqualElementsInternal<int32>(other, &multi_index);
     case S64:
       return EqualElementsInternal<int64>(other, &multi_index);
+    case U16:
+      return EqualElementsInternal<uint16>(other, &multi_index);
     case U32:
       return EqualElementsInternal<uint32>(other, &multi_index);
     case U64:
@@ -1507,6 +1488,11 @@ bool LiteralBase::IsAll(int8 value) const {
             return AllElementsEqualValue<uint8>(piece.data<uint8>(), value);
           }
           return false;
+        case U16:
+          if (value >= 0) {
+            return AllElementsEqualValue<uint16>(piece.data<uint16>(), value);
+          }
+          return false;
         case U32:
           if (value >= 0) {
             return AllElementsEqualValue<uint32>(piece.data<uint32>(), value);
@@ -1519,6 +1505,8 @@ bool LiteralBase::IsAll(int8 value) const {
           return false;
         case S8:
           return AllElementsEqualValue<int8>(piece.data<int8>(), value);
+        case S16:
+          return AllElementsEqualValue<int16>(piece.data<int16>(), value);
         case S32:
           return AllElementsEqualValue<int32>(piece.data<int32>(), value);
         case S64:
@@ -1740,12 +1728,16 @@ bool LiteralBase::IsZero(absl::Span<const int64> indices) const {
   switch (shape().element_type()) {
     case U8:
       return Get<uint8>(indices) == 0;
+    case U16:
+      return Get<uint16>(indices) == 0;
     case U32:
       return Get<uint32>(indices) == 0;
     case U64:
       return Get<uint64>(indices) == 0;
     case S8:
       return Get<int8>(indices) == 0;
+    case S16:
+      return Get<int16>(indices) == 0;
     case S32:
       return Get<int32>(indices) == 0;
     case S64:
@@ -1803,6 +1795,20 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
     case S64:
       CopyToRepeatedField(proto->mutable_s64s(), data<int64>());
       break;
+    case U16:
+      *proto->mutable_u16s() = string(
+          reinterpret_cast<const char*>(data<uint16_t>().data()), size_bytes());
+      if (!kLittleEndian) {
+        ConvertEndianShort(proto->mutable_u16s());
+      }
+      break;
+    case S16:
+      *proto->mutable_s16s() = string(
+          reinterpret_cast<const char*>(data<int16_t>().data()), size_bytes());
+      if (!kLittleEndian) {
+        ConvertEndianShort(proto->mutable_s16s());
+      }
+      break;
     case F16:
       *proto->mutable_f16s() = string(
           reinterpret_cast<const char*>(data<half>().data()), size_bytes());
@@ -1917,6 +1923,22 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
     case U64:
       TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<uint64>(), proto.u64s()));
       break;
+    case S16: {
+      const string& s(proto.s16s());
+      TF_RET_CHECK(data<int16_t>().size() * sizeof(int16_t) == s.size());
+      memcpy(untyped_data(), s.data(), s.size());
+      if (!kLittleEndian) {
+        ConvertEndianShort(reinterpret_cast<char*>(untyped_data()), s.size());
+      }
+    } break;
+    case U16: {
+      const string& s(proto.u16s());
+      TF_RET_CHECK(data<uint16_t>().size() * sizeof(uint16_t) == s.size());
+      memcpy(untyped_data(), s.data(), s.size());
+      if (!kLittleEndian) {
+        ConvertEndianShort(reinterpret_cast<char*>(untyped_data()), s.size());
+      }
+    } break;
     case F16: {
       const string& s(proto.f16s());
       TF_RET_CHECK(data<half>().size() * sizeof(half) == s.size());
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index e791048b4d9f5dcf877e05e3b5cf16eb37c07dbc..fa9a71af4ceb998a7a289443cbef70eb52cb1a11 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -301,7 +301,7 @@ class LiteralBase {
   //
   // Note: It's an antipattern to use this method then immediately call
   // MutableLiteralBase::Populate on the result (since that results in zero
-  // initialization, then reinitialization. Conside if a call to
+  // initialization, then reinitialization. Consider if a call to
   // absl::make_unique<Literal>(shape), followed by the call to
   // MutableLiteralBase::Populate can be used instead.
   static Literal CreateFromShape(const Shape& shape);
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index 9d34d9d504156c4b9e645ccfa7cdbd346e51390b..b044f0ad73f13a0599e77f1f43888bc974e31f73 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -141,8 +141,10 @@ int64 RecursiveElementCount(const Shape& shape) {
       total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
     }
     return total;
-  } else {
+  } else if (ShapeUtil::IsArray(shape)) {
     return ShapeUtil::ElementsIn(shape);
+  } else {
+    return 0;
   }
 }
 
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index 4ae5ddbfdb8444ac778f82d01b1066aad8c0aa78..bd93517728b052aed854df0f9d9c5447bc3b156f 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -133,7 +133,7 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
   auto pred_vec = LiteralUtil::CreateR1<bool>({true, false, true});
-  EXPECT_EQ("{101}", pred_vec.ToString());
+  EXPECT_EQ("{1, 0, 1}", pred_vec.ToString());
 }
 
 TEST_F(LiteralUtilTest, R2ToString) {
@@ -150,12 +150,58 @@ TEST_F(LiteralUtilTest, R3ToString) {
   const auto literal =
       LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}});
   const string expected = R"(s32[3,2,1] {
-{ { 1 },
-  { 2 } },
-{ { 3 },
-  { 4 } },
-{ { 5 },
-  { 6 } }
+{
+  {1},
+  {2}
+},
+{
+  {3},
+  {4}
+},
+{
+  {5},
+  {6}
+}
+})";
+  EXPECT_EQ(expected, literal.ToString());
+}
+
+TEST_F(LiteralUtilTest, R6ToString) {
+  const auto literal =
+      LiteralUtil::CreateFromDimensions(S32, {2, 2, 1, 1, 1, 2});
+  const string expected = R"(s32[2,2,1,1,1,2] {
+{ /*i0=0*/
+{ /*i1=0*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+},
+{ /*i1=1*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+}
+},
+{ /*i0=1*/
+{ /*i1=0*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+},
+{ /*i1=1*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+}
+}
 })";
   EXPECT_EQ(expected, literal.ToString());
 }
@@ -190,12 +236,16 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
   EXPECT_THAT(literal.shape().dimensions(), ElementsAre(2, 3, 2));
   string result = literal.ToString();
   const string expected = R"(f32[2,3,2] {
-{ { 1, 2 },
+{
+  { 1, 2 },
   { 3, 4 },
-  { 5, 6 } },
-{ { 7, 8 },
+  { 5, 6 }
+},
+{
+  { 7, 8 },
   { 9, 10 },
-  { 11, 12 } }
+  { 11, 12 }
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -247,18 +297,18 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
   EXPECT_THAT(literal.shape().dimensions(), ElementsAre(1, 2, 3, 2));
   string result = literal.ToString();
   const string expected = R"(f32[1,2,3,2] {
-  {  /*i0=0*/
-    {  /*i1=0*/
-      {1, 2},
-      {1001, 1002},
-      {2001, 2002}
-    },
-    {  /*i1=1*/
-      {1, 2},
-      {1001, 1002},
-      {2001, 2002}
-    }
-  }
+{ /*i0=0*/
+{ /*i1=0*/
+  { 1, 2 },
+  { 1001, 1002 },
+  { 2001, 2002 }
+},
+{ /*i1=1*/
+  { 1, 2 },
+  { 1001, 1002 },
+  { 2001, 2002 }
+}
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -268,30 +318,30 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
               ElementsAre(2, 2, 3, 3));
   string result = literal_r4_2x2x3x3_dim0major_.ToString();
   const string expected = R"(f32[2,2,3,3] {
-  {  /*i0=0*/
-    {  /*i1=0*/
-      {1, 2, 3},
-      {4, 5, 6},
-      {7, 8, 9}
-    },
-    {  /*i1=1*/
-      {11, 12, 13},
-      {14, 15, 16},
-      {17, 18, 19}
-    }
-  },
-  {  /*i0=1*/
-    {  /*i1=0*/
-      {101, 102, 103},
-      {104, 105, 106},
-      {107, 108, 109}
-    },
-    {  /*i1=1*/
-      {201, 202, 203},
-      {204, 205, 206},
-      {207, 208, 209}
-    }
-  }
+{ /*i0=0*/
+{ /*i1=0*/
+  { 1, 2, 3 },
+  { 4, 5, 6 },
+  { 7, 8, 9 }
+},
+{ /*i1=1*/
+  { 11, 12, 13 },
+  { 14, 15, 16 },
+  { 17, 18, 19 }
+}
+},
+{ /*i0=1*/
+{ /*i1=0*/
+  { 101, 102, 103 },
+  { 104, 105, 106 },
+  { 107, 108, 109 }
+},
+{ /*i1=1*/
+  { 201, 202, 203 },
+  { 204, 205, 206 },
+  { 207, 208, 209 }
+}
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -1394,6 +1444,28 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   EXPECT_EQ(h1, r[3]);
 }
 
+TEST_F(LiteralUtilTest, CopyFromProto_u16) {
+  uint16 u1(0xabcd);
+  uint16 u2(0x1234);
+
+  const unsigned char uint16_vals[8] = {0xcd, 0xab, 0x34, 0x12,
+                                        0x34, 0x12, 0xcd, 0xab};
+  LiteralProto p;
+  p.mutable_shape()->set_element_type(U16);
+  p.mutable_shape()->clear_dimensions();
+  p.mutable_shape()->add_dimensions(4);
+  LayoutUtil::SetToDefaultLayout(p.mutable_shape());
+  p.clear_u16s();
+  p.set_u16s(uint16_vals, 8);
+  TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p));
+  auto r = literal.data<uint16>();
+  ASSERT_EQ(4, r.size());
+  EXPECT_EQ(u1, r[0]);
+  EXPECT_EQ(u2, r[1]);
+  EXPECT_EQ(u2, r[2]);
+  EXPECT_EQ(u1, r[3]);
+}
+
 TEST_F(LiteralUtilTest, LiteralSliceTest) {
   auto scalar = LiteralUtil::CreateR0<float>(1.0);
   auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
@@ -1515,9 +1587,9 @@ TEST_F(LiteralUtilTest, DecomposeTuple) {
   Literal nested_tuple = LiteralUtil::MakeTuple(
       {&tuple_elements[0], &tuple_elements[1], &nil_literal});
 
-  EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple.shape()));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(nested_tuple.shape()));
   std::vector<Literal> elements = nested_tuple.DecomposeTuple();
-  EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(nested_tuple.shape()));
 
   ASSERT_EQ(elements.size(), 3);
 
@@ -1568,7 +1640,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
   EXPECT_EQ(literal.Get<double>({1}, /*shape_index=*/{2, 1}), 44.0);
 
   for (const Literal& element : elements) {
-    EXPECT_TRUE(ShapeUtil::IsNil(element.shape()));
+    EXPECT_TRUE(ShapeUtil::IsEmptyTuple(element.shape()));
   }
 }
 
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
similarity index 65%
rename from tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc
rename to tensorflow/compiler/xla/parse_flags_from_env.cc
index 2a4e49b05aa0d1eed2197095694cfc6aa8814983..5b568888d14f21c1330556d017eafba6c8dd2228 100644
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -13,16 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This module exports ParseFlagsFromEnv(), which allows other modules to parse
-// flags from an environtment variable, or a file named by the environment
-// variable.
+// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other
+// modules to parse flags from an environtment variable, or a file named by the
+// environment variable.
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <memory>
+#include <unordered_map>
 #include <vector>
 
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -31,9 +36,7 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
-namespace legacy_flags {
 
-static const char kEnvVar[] = "TF_XLA_FLAGS";  // environment variable queried
 static const char kWS[] = " \t\r\n";           // whitespace
 
 // The following struct represents an argv[]-style array, parsed
@@ -43,12 +46,20 @@ static const char kWS[] = " \t\r\n";           // whitespace
 // constructor/destructor collisions with other "private" types
 // in the same named namespace.
 namespace {
+
+// Functor which deletes objects by calling `free`.  Necessary to free strdup'ed
+// strings created by AppendToEnvArgv.
+struct FreeDeleter {
+  void operator()(char* ptr) { free(ptr); }
+};
+
 struct EnvArgv {
   EnvArgv() : initialized(false), argc(0) {}
   bool initialized;         // whether the other fields have been set.
   int argc;                 // elements used in argv[]
   std::vector<char*> argv;  // flag arguments parsed from environment string.
-  std::vector<char*> argv_save;  // saved values from argv[] to avoid leaks
+  // saved values from argv[] to avoid leaks
+  std::vector<std::unique_ptr<char, FreeDeleter>> argv_save;
 };
 }  // anonymous namespace
 
@@ -64,7 +75,7 @@ static void AppendToEnvArgv(const char* s0, size_t s0len, const char* s1,
     string s = string(s0, s0len) + string(s1, s1len);
     char* str = strdup(s.c_str());
     a->argv.push_back(str);
-    a->argv_save.push_back(str);
+    a->argv_save.emplace_back(str);
     a->argc++;
   }
 }
@@ -128,14 +139,14 @@ static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
   }
 }
 
-// Call ParseArgvFromString(..., a) on a string derived from the setting of an
-// environment variable kEnvVar, or a file it points to.
-static void SetArgvFromEnv(EnvArgv* a) {
+// Call ParseArgvFromString(..., a) on a string derived from the setting of the
+// environment variable `envvar`, or a file it points to.
+static void SetArgvFromEnv(absl::string_view envvar, EnvArgv* a) {
   if (!a->initialized) {
     static const char kDummyArgv[] = "<argv[0]>";
     AppendToEnvArgv(kDummyArgv, strlen(kDummyArgv), nullptr, 0,
                     a);  // dummy argv[0]
-    const char* env = getenv(kEnvVar);
+    const char* env = getenv(string(envvar).c_str());
     if (env == nullptr || env[0] == '\0') {
       // nothing
     } else if (env[strspn(env, kWS)] == '-') {  // flags in env var value
@@ -158,49 +169,66 @@ static void SetArgvFromEnv(EnvArgv* a) {
   }
 }
 
-// The simulated argv[] parsed from the environment.
-static EnvArgv* env_argv;
+// The simulated argv[] parsed from the environment, one for each different
+// environment variable we've seen.
+static std::unordered_map<string, EnvArgv>& EnvArgvs() {
+  static auto* env_argvs = new std::unordered_map<string, EnvArgv>();
+  return *env_argvs;
+}
 
-// Used to protect accesses to env_argv.
+// Used to protect accesses to env_argvs.
 static tensorflow::mutex env_argv_mu(tensorflow::LINKER_INITIALIZED);
 
-// Call Flags::Parse(argc, argv, flag_list) against any as yet unrecognized
-// flags passed in from the environment.
-bool ParseFlagsFromEnv(const std::vector<tensorflow::Flag>& flag_list) {
-  env_argv_mu.lock();
-  if (env_argv == nullptr) {
-    env_argv = new EnvArgv;
-  }
-  SetArgvFromEnv(env_argv);  // a no-op if already initialized
+bool ParseFlagsFromEnvAndDieIfUnknown(
+    absl::string_view envvar, const std::vector<tensorflow::Flag>& flag_list) {
+  tensorflow::mutex_lock lock(env_argv_mu);
+  auto* env_argv = &EnvArgvs()[string(envvar)];
+  SetArgvFromEnv(envvar, env_argv);  // a no-op if already initialized
   bool result =
       tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
-  env_argv_mu.unlock();
+
+  // There's always at least one unparsed argc, namely the fake argv[0].
+  if (result && env_argv->argc != 1) {
+    // Skip the first argv, which is the fake argv[0].
+    auto unknown_flags = absl::MakeSpan(env_argv->argv);
+    unknown_flags.remove_prefix(1);
+
+    // Some flags are set on XLA_FLAGS, others on TF_XLA_FLAGS.  If we find an
+    // unrecognized flag, suggest the alternative.
+    string alternate_envvar;
+    if (envvar == "TF_XLA_FLAGS") {
+      alternate_envvar = "XLA_FLAGS";
+    } else if (envvar == "XLA_FLAGS") {
+      alternate_envvar = "TF_XLA_FLAGS";
+    }
+    string did_you_mean;
+    if (!alternate_envvar.empty()) {
+      did_you_mean = absl::StrFormat(
+          "\nPerhaps you meant to specify these on the %s envvar?",
+          alternate_envvar);
+    }
+
+    LOG(FATAL) << "Unknown flag" << (unknown_flags.size() > 1 ? "s" : "")
+               << " in " << envvar << ": " << absl::StrJoin(unknown_flags, " ")
+               << did_you_mean;
+    return false;
+  }
   return result;
 }
 
 // Testing only.
-// Reset the env_argv struct so that subsequent calls to ParseFlagsFromEnv()
-// will parse the environment variable (or the file it points to) anew, and set
-// *pargc, and *pargv to point to the internal locations of the argc and argv
-// constructed from the environment.
-void ResetFlagsFromEnvForTesting(int** pargc, std::vector<char*>** pargv) {
-  env_argv_mu.lock();
-  if (env_argv == nullptr) {
-    env_argv = new EnvArgv;
-  }
-  if (!env_argv->argv_save.empty()) {
-    for (int i = 0; env_argv->argv_save[i] != nullptr; i++) {
-      free(env_argv->argv_save[i]);
-    }
-  }
-  env_argv->initialized = false;
-  env_argv->argc = 0;
-  env_argv->argv.clear();
-  env_argv->argv_save.clear();
-  env_argv_mu.unlock();
-  *pargc = &env_argv->argc;
-  *pargv = &env_argv->argv;
+//
+// Resets the env_argv struct so that subsequent calls to
+// ParseFlagsFromEnvAndDieIfUnknown() will parse the environment variable (or
+// the file it points to) anew, and set *pargc, and *pargv to point to the
+// internal locations of the argc and argv constructed from the environment.
+void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc,
+                                 std::vector<char*>** pargv) {
+  tensorflow::mutex_lock lock(env_argv_mu);
+  EnvArgvs().erase(string(envvar));
+  auto& env_argv = EnvArgvs()[string(envvar)];
+  *pargc = &env_argv.argc;
+  *pargv = &env_argv.argv;
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.h b/tensorflow/compiler/xla/parse_flags_from_env.h
new file mode 100644
index 0000000000000000000000000000000000000000..76940a4299ac50138222333ff250a264cc941288
--- /dev/null
+++ b/tensorflow/compiler/xla/parse_flags_from_env.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
+#define TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
+
+// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other
+// modules to parse flags from an environtment variable, or (if the first
+// non-whitespace in the variable value is not '-'), a file named by that
+// environment variable.
+//
+// The accepted syntax is that flags arguments are of the form --flag=value or
+// (for boolean flags) --flag, and are whitespace separated.  The <value> may be
+// one of:
+//
+//  - <non-whitespace, non-nul not starting with single-quote or double-quote>
+//    in which case the effective value is the string itself
+//  - <single-quote><characters string not containing nul or
+//    single-quote><single_quote> in which case the effective value is the
+//    string with the single-quotes removed
+//  - <double-quote><character string not containing nul or unesecaped
+//    double-quote><double_quote> in which case the effective value if the
+//    string with the double-quotes removed, and escaped sequences of
+//    <backslash><char> replaced by <char>.
+//
+// Flags values inconsistent with the type of the flag will be rejected by the
+// flag parser.
+//
+// Examples:
+//
+//  - TF_XLA_FLAGS="--foo=bar  --wombat='value with a space'"
+//  - TF_XLA_FLAGS=/tmp/flagfile
+//
+// where /tmp/flagfile might contain
+//
+//  --some_flag="This is a string containing a \" and a '."
+//  --another_flag=wombats
+
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+
+// Calls tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet
+// unrecognized flags passed in the environment variable `envvar`, and returns
+// its return value.
+//
+// Raises a fatal error if any flags in `envvar` were not recognized.
+bool ParseFlagsFromEnvAndDieIfUnknown(
+    absl::string_view envvar, const std::vector<tensorflow::Flag>& flag_list);
+
+// Used only for testing.  Not to be used by clients.
+void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc,
+                                 std::vector<char*>** pargv);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/parse_flags_from_env_test.cc
similarity index 89%
rename from tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
rename to tensorflow/compiler/xla/parse_flags_from_env_test.cc
index 138c0c852e2bb0527d171f25b4d96cedc5671516..3465552ebbf52140fb954b247d99d3c6afe7fcde 100644
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Test for parse_flags_from_env.cc
 
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
-namespace legacy_flags {
 
 // Test that XLA flags can be set from the environment.
 // Failure messages are accompanied by the text in msg[].
@@ -38,20 +37,7 @@ static void TestParseFlagsFromEnv(const char* msg) {
   // Initialize module under test.
   int* pargc;
   std::vector<char*>* pargv;
-  ResetFlagsFromEnvForTesting(&pargc, &pargv);
-
-  // Ensure that environment variable can be parsed when
-  // no flags are expected.
-  std::vector<tensorflow::Flag> empty_flag_list;
-  bool parsed_ok = ParseFlagsFromEnv(empty_flag_list);
-  CHECK(parsed_ok) << msg;
-  const std::vector<char*>& argv_first = *pargv;
-  CHECK_NE(argv_first[0], nullptr) << msg;
-  int i = 0;
-  while (argv_first[i] != nullptr) {
-    i++;
-  }
-  CHECK_EQ(i, *pargc) << msg;
+  ResetFlagsFromEnvForTesting("TF_XLA_FLAGS", &pargc, &pargv);
 
   // Check that actual flags can be parsed.
   bool simple = false;
@@ -66,7 +52,7 @@ static void TestParseFlagsFromEnv(const char* msg) {
       tensorflow::Flag("single_quoted", &single_quoted, ""),
       tensorflow::Flag("double_quoted", &double_quoted, ""),
   };
-  parsed_ok = ParseFlagsFromEnv(flag_list);
+  bool parsed_ok = ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list);
   CHECK_EQ(*pargc, 1) << msg;
   const std::vector<char*>& argv_second = *pargv;
   CHECK_NE(argv_second[0], nullptr) << msg;
@@ -159,12 +145,11 @@ TEST(ParseFlagsFromEnv, EnvAndFlag) {
   }
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
 
 int main(int argc, char* argv[]) {
   // Save name of binary so that it may invoke itself.
-  xla::legacy_flags::binary_name = argv[0];
+  xla::binary_name = argv[0];
   bool recursing = false;
   xla::int32 int_flag = 1;
   const std::vector<tensorflow::Flag> flag_list = {
@@ -173,7 +158,8 @@ int main(int argc, char* argv[]) {
       tensorflow::Flag("int_flag", &int_flag, "An integer flag to test with"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  bool parse_ok = xla::legacy_flags::ParseFlagsFromEnv(flag_list);
+  bool parse_ok =
+      xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list);
   if (!parse_ok) {
     LOG(QFATAL) << "can't parse from environment\n" << usage;
   }
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 21685c4a5b90f76440e4cf10cce004b6cf925cc8..63ac1c6649210cbae9e238a74e0a45fb8ee4da63 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -3,6 +3,7 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 
 py_library(
     name = "xla_client",
@@ -66,6 +67,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt/cc:xrt_ops",
@@ -81,6 +83,7 @@ tf_py_wrap_cc(
     srcs = ["xla.i"],
     swig_includes = [
         "local_computation_builder.i",
+        "//tensorflow/python:platform/base.i",
     ],
     deps = [
         ":local_computation_builder",
@@ -89,5 +92,7 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:cpu_plugin",
-    ],
+    ] + if_cuda_is_configured([
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+    ]),
 )
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index b1fae826ab1903fb73541a7ae32b5cc57b3b92a7..4d2a37cfac3e0e89d189f168031e5db44ca5d410 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -56,6 +57,12 @@ tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED);
 int g_replica_count GUARDED_BY(g_local_client_mutex) = 1;
 LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr;
 
+string* GetPlatformNameString() {
+  static string* platform_name_string PT_GUARDED_BY(g_local_client_mutex) =
+      new string("Host");
+  return platform_name_string;
+}
+
 Status InitializeReplicaCount(int replica_count) {
   if (replica_count < 1) {
     return InvalidArgument("Replica count must be >= 1; got %d.",
@@ -72,17 +79,33 @@ Status InitializeReplicaCount(int replica_count) {
   return Status::OK();
 }
 
+Status InitializePlatformName(const string& platform_name) {
+  string* g_platform_name = GetPlatformNameString();
+  tensorflow::mutex_lock lock(g_local_client_mutex);
+  if (g_local_client != nullptr) {
+    return FailedPrecondition(
+        "Attempted to set the platform name to %s, but a local XLA service was "
+        "previously created with a platform name of %s.",
+        platform_name, *g_platform_name);
+  }
+  TF_RETURN_IF_ERROR(PlatformUtil::GetPlatform(platform_name).status());
+  *g_platform_name = platform_name;
+  return Status::OK();
+}
+
 int GetReplicaCount() {
   tensorflow::mutex_lock lock(g_local_client_mutex);
   return g_replica_count;
 }
 
 LocalClient* GetOrCreateLocalClient() {
+  string* platform_name = GetPlatformNameString();
   tensorflow::mutex_lock lock(g_local_client_mutex);
   if (g_local_client != nullptr) {
     return g_local_client;
   }
   LocalClientOptions options;
+  options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie());
   options.set_number_of_replicas(g_replica_count);
   g_local_client = ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie();
   CHECK(g_local_client != nullptr);
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 82f84ddb35bd4455fd3607509c6329457cca47f3..9e617c48bdc5ae4b37c1a1db9a1876bb4c0a6f0d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -39,6 +39,12 @@ namespace swig {
 // returned.
 Status InitializeReplicaCount(int replica_count);
 
+// Initializes the platform name that XLA will be initialized with (when
+// first obtaining a handle to the local XLA service). If this is called after
+// the handle to the local XLA service has been established, then an error is
+// returned.
+Status InitializePlatformName(const string& platform_name);
+
 // Returns the replica count that is currently set, regardless of whether the
 // local XLA service has been instantiated yet or not.
 int GetReplicaCount();
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index c13d00d2530c7e9321d483a70e4a12361159362d..feabfdb889ca055550c5d1e1c05ca47c1b0bd166 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -977,6 +977,7 @@ tensorflow::ImportNumpy();
 %unignore xla;
 %unignore xla::swig;
 %unignore xla::swig::InitializeReplicaCount;
+%unignore xla::swig::InitializePlatformName;
 %unignore xla::swig::GetReplicaCount;
 %unignore xla::swig::TransferToInfeedLocal;
 %unignore xla::swig::TransferToInfeedLocalReplica;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 07e0e093255b2baf3412852821fe62fa060f6cad..92b0685dbba195405d78867776fe43b5f6c60f4c 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -1371,6 +1371,18 @@ def initialize_replica_count(replica_count):
   c_api.InitializeReplicaCount(replica_count)
 
 
+def initialize_platform_name(platform_name):
+  """Initializes the desired platform name to use on XLA service init.
+
+  Args:
+    platform_name: string name of platform.
+
+  Raises:
+    A runtime exception if the XLA service has already been initialized.
+  """
+  c_api.InitializePlatformName(platform_name)
+
+
 def get_replica_count():
   """Returns the current replica count used for the XLA service.
 
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index 4e1435fa30a24c320ddbedb84d37b369a3158a54..d8123a6de28ca532819ece4a75cd0b725f8c1bbd 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -47,11 +47,18 @@ namespace xla {
   });
 }
 
-::grpc::Status GRPCService::ExecuteGraph(::grpc::ServerContext* /*context*/,
-                                         const ExecuteGraphRequest* arg,
-                                         ExecuteResponse* result) {
+::grpc::Status GRPCService::Compile(::grpc::ServerContext* /*context*/,
+                                    const CompileRequest* arg,
+                                    CompileResponse* result) {
   return DelegateRPC(
-      [this, arg, result]() { return service_->ExecuteGraph(arg, result); });
+      [this, arg, result]() { return service_->Compile(arg, result); });
+}
+
+::grpc::Status GRPCService::Execute(::grpc::ServerContext* /*context*/,
+                                    const ExecuteRequest* arg,
+                                    ExecuteResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Execute(arg, result); });
 }
 
 ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index ca1b09b648013ad45d806040c5ddcf11d9e5604e..3e586b288a56a22573d0c3b9ae7b2f25fdbf851a 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -39,9 +39,13 @@ class GRPCService : public grpc::XlaService::Service {
                                   const DeconstructTupleRequest* arg,
                                   DeconstructTupleResponse* result) override;
 
-  ::grpc::Status ExecuteGraph(::grpc::ServerContext* context,
-                              const ExecuteGraphRequest* arg,
-                              ExecuteResponse* result) override;
+  ::grpc::Status Compile(::grpc::ServerContext* context,
+                         const CompileRequest* arg,
+                         CompileResponse* result) override;
+
+  ::grpc::Status Execute(::grpc::ServerContext* context,
+                         const ExecuteRequest* arg,
+                         ExecuteResponse* result) override;
 
   ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
                                   const WaitForExecutionRequest* arg,
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
index 7b8ab158e1396d7087a407be180ab44d2e16e121..66abf66cfd6c2f753c5507aa373452ac880e9a29 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -62,10 +62,17 @@ Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
   });
 }
 
-Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
-                              ExecuteResponse* response) {
+Status GRPCStub::Compile(const CompileRequest* request,
+                         CompileResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->ExecuteGraph(context, *request, response);
+    return grpc_stub_->Compile(context, *request, response);
+  });
+}
+
+Status GRPCStub::Execute(const ExecuteRequest* request,
+                         ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Execute(context, *request, response);
   });
 }
 
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
index 8dfcb761387d608abbb1f62974f49b976a7ff7ff..f02b401399f3e895153f0b08e325bc9c2c2336ec 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.h
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -43,8 +43,11 @@ class GRPCStub : public ServiceInterface {
   Status ResetDevice(const ResetDeviceRequest* arg,
                      ResetDeviceResponse* result) override;
 
-  Status ExecuteGraph(const ExecuteGraphRequest* request,
-                      ExecuteResponse* response) override;
+  Status Compile(const CompileRequest* request,
+                 CompileResponse* response) override;
+
+  Status Execute(const ExecuteRequest* request,
+                 ExecuteResponse* response) override;
 
   Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request,
                               ExecuteParallelResponse* response) override;
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
index 551ae895e05586daec0ffcd425f4950f76bdd50d..e4f332cda22cc5b889bf73f06913b96d6091dc81 100644
--- a/tensorflow/compiler/xla/rpc/xla_service.proto
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -128,11 +128,14 @@ service XlaService {
       returns (CreateChannelHandleResponse) {
   }
 
-  // Invokes the provided computation with the provided global data passed as
-  // immutable arguments. The request contains the whole computation graph.
+  // Compiles the provided computation into executable. Returns the handle of
+  // the executable.
+  rpc Compile(CompileRequest) returns (CompileResponse) {}
+
+  // Invokes the provided executable with the provided global data passed as
+  // immutable arguments. The request contains the handle to the executable.
   // Returns global data output and execution timing.
-  rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) {
-  }
+  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {}
 
   // Invokes the provided list of computations in parallel with the provided
   // global data for each computation. Returns a list of global data output and
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 0097e917c869b19908ea11b3a647ecc9bad12dc7..1bd04d2785913c59929478974883b9669e1c1185 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -87,7 +87,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
@@ -124,7 +123,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
@@ -158,12 +156,12 @@ tf_cc_test(
         ":bfloat16_propagation",
         ":bfloat16_support",
         ":hlo",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
     ],
@@ -281,7 +279,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
@@ -294,6 +292,7 @@ cc_library(
     name = "hlo",
     srcs = [
         "dfs_hlo_visitor.cc",
+        "dynamic_parameter_binding.cc",
         "hlo_computation.cc",
         "hlo_input_output_alias_config.cc",
         "hlo_instruction.cc",
@@ -307,6 +306,7 @@ cc_library(
     hdrs = [
         "dfs_hlo_visitor.h",
         "dfs_hlo_visitor_with_default.h",
+        "dynamic_parameter_binding.h",
         "hlo_clone_context.h",
         "hlo_computation.h",
         "hlo_domain_metadata.h",
@@ -323,7 +323,6 @@ cc_library(
         ":hlo_casting_utils",
         ":hlo_module_config",
         ":hlo_proto",
-        ":hlo_reachability",
         ":name_uniquer",
         "//tensorflow/compiler/xla:array",
         "//tensorflow/compiler/xla:literal",
@@ -353,6 +352,25 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dynamic_parameter_binding_test",
+    srcs = ["dynamic_parameter_binding_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_memory_scheduler",
+        ":hlo_ordering",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 tf_cc_test(
     name = "dfs_hlo_visitor_with_default_test",
     srcs = ["dfs_hlo_visitor_with_default_test.cc"],
@@ -365,7 +383,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -402,10 +419,12 @@ cc_library(
     srcs = ["hlo_reachability.cc"],
     hdrs = ["hlo_reachability.h"],
     deps = [
+        ":hlo",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:span",
     ],
@@ -420,7 +439,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -466,7 +484,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -519,7 +536,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -568,7 +584,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -591,7 +606,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -603,11 +617,11 @@ cc_library(
     hdrs = ["platform_util.h"],
     deps = [
         ":compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/strings",
@@ -647,6 +661,7 @@ cc_library(
         ":allocation_tracker",
         ":backend",
         ":channel_tracker",
+        ":compilation_cache",
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
@@ -662,6 +677,7 @@ cc_library(
         ":source_map_util",
         ":stream_pool",
         ":transfer_manager",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:service_interface",
@@ -673,7 +689,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -730,12 +745,12 @@ cc_library(
         ":computation_layout",
         ":platform_util",
         ":service",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -811,6 +826,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:ptr_util",
+        "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
     ],
@@ -833,6 +849,7 @@ cc_library(
         ":maybe_owning_device_memory",
         ":shaped_buffer",
         ":stream_pool",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:status",
@@ -840,7 +857,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -1086,7 +1102,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1103,6 +1118,7 @@ cc_library(
         ":hlo",
         ":hlo_dataflow_analysis",
         ":hlo_proto",
+        ":hlo_reachability",
         ":hlo_value",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1168,7 +1184,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1343,6 +1358,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -1362,6 +1378,7 @@ cc_library(
         ":fusion_queue",
         ":hlo",
         ":hlo_pass",
+        ":hlo_reachability",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
@@ -1387,6 +1404,7 @@ cc_library(
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
     deps = [
+        ":hlo_reachability",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/service:hlo",
@@ -1427,7 +1445,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
@@ -1503,7 +1520,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
@@ -1555,7 +1571,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1592,7 +1607,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1642,7 +1656,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -1694,6 +1708,19 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "while_loop_analysis_test",
+    srcs = ["while_loop_analysis_test.cc"],
+    deps = [
+        ":while_loop_analysis",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "while_loop_simplifier",
     srcs = ["while_loop_simplifier.cc"],
@@ -1702,9 +1729,11 @@ cc_library(
         ":call_inliner",
         ":hlo",
         ":hlo_pass",
+        ":hlo_query",
+        ":pattern_matcher",
         ":while_loop_analysis",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -1716,10 +1745,17 @@ tf_cc_test(
     name = "while_loop_simplifier_test",
     srcs = ["while_loop_simplifier_test.cc"],
     deps = [
+        ":algebraic_simplifier",
+        ":hlo",
+        ":hlo_cse",
+        ":hlo_dce",
         ":hlo_matchers",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":tuple_simplifier",
         ":while_loop_simplifier",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -1750,7 +1786,7 @@ tf_cc_test(
         ":hlo_matchers",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
 
@@ -1778,7 +1814,7 @@ tf_cc_test(
         ":implicit_broadcast_remover",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
 
@@ -1823,7 +1859,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -1857,7 +1892,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
@@ -2263,7 +2298,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -2326,13 +2360,27 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
 
+cc_library(
+    name = "compilation_cache",
+    srcs = ["compilation_cache.cc"],
+    hdrs = ["compilation_cache.h"],
+    deps = [
+        ":executable",
+        ":hlo_module_config",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
 cc_library(
     name = "layout_assignment",
     srcs = [
@@ -2402,14 +2450,13 @@ tf_cc_test(
         ":hlo_graph_dumper",
         ":hlo_matchers",
         ":hlo_runner",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -2527,7 +2574,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -2594,7 +2640,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -2656,7 +2701,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2697,7 +2742,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -2736,7 +2780,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -2808,10 +2851,9 @@ tf_cc_test(
         ":hlo_domain_isolator",
         ":hlo_domain_remover",
         ":hlo_parser",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
@@ -2844,6 +2886,46 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_get_dimension_size_rewriter",
+    srcs = ["hlo_get_dimension_size_rewriter.cc"],
+    hdrs = ["hlo_get_dimension_size_rewriter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":shape_inference",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_get_dimension_size_rewriter_test",
+    srcs = ["hlo_get_dimension_size_rewriter_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_get_dimension_size_rewriter",
+        ":hlo_matchers",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = [
@@ -2902,6 +2984,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:transform_utils",
@@ -2999,7 +3082,6 @@ tf_cc_test(
     deps = [
         ":hlo_tfgraph_builder",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -3278,6 +3360,8 @@ cc_library(
         ":tuple_util",
         "//tensorflow/compiler/xla:literal_util",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -3304,10 +3388,11 @@ cc_library(
         ":hlo",
         ":hlo_pass",
         ":tuple_util",
+        ":while_loop_analysis",
         ":while_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -3323,7 +3408,7 @@ tf_cc_test(
         ":while_loop_invariant_code_motion",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -3353,7 +3438,7 @@ tf_cc_test(
         ":while_loop_constant_sinking",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -3366,6 +3451,7 @@ cc_library(
         ":bfloat16_normalization",
         ":defuser",
         ":hlo",
+        ":hlo_memory_scheduler",
         ":hlo_pass",
         ":hlo_pass_pipeline",
         ":implicit_broadcast_remover",
@@ -3413,7 +3499,7 @@ tf_cc_test(
         ":indexed_array_analysis",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:test",
     ],
@@ -3499,6 +3585,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "ar_crs_combiner",
+    srcs = ["ar_crs_combiner.cc"],
+    hdrs = ["ar_crs_combiner.h"],
+    deps = [
+        ":call_graph",
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "ar_crs_combiner_test",
+    srcs = ["ar_crs_combiner_test.cc"],
+    deps = [
+        ":ar_crs_combiner",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "map_inliner_test",
     srcs = ["map_inliner_test.cc"],
@@ -3510,7 +3631,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "@com_google_absl//absl/memory",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 85fc42f74756458ee677e8b53448ceb02f08e834..56bf3a9f69d718db1b2845c6901a893a2fe1660b 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 
 #include <algorithm>
+#include <iterator>
 #include <memory>
 #include <numeric>
 #include <string>
@@ -83,7 +84,8 @@ bool TransposeIsBitcast(const HloInstruction* transpose) {
 // reshape may still be a bitcast. For example, a reshape from [28x28] to [784].
 bool ReshapeOrCopyIsBitcast(
     const HloInstruction* instr,
-    const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) {
+    const AlgebraicSimplifierOptions::ValidBitcastCallback&
+        valid_bitcast_callback) {
   CHECK(HloOpcode::kReshape == instr->opcode() ||
         HloOpcode::kCopy == instr->opcode());
 
@@ -107,6 +109,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleAdd(HloInstruction* add) override;
 
+  Status HandleAnd(HloInstruction* logical_and) override;
+
   Status HandleBitcast(HloInstruction* bitcast) override;
 
   Status HandleBitcastConvert(HloInstruction* bitcast) override;
@@ -141,6 +145,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleMultiply(HloInstruction* multiply) override;
 
+  Status HandleNegate(HloInstruction* negate) override;
+
+  Status HandleNot(HloInstruction* logical_not) override;
+
+  Status HandleOr(HloInstruction* logical_or) override;
+
   Status HandlePad(HloInstruction* pad) override;
 
   Status HandlePower(HloInstruction* power) override;
@@ -171,21 +181,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   const bool changed() const { return changed_; }
 
   // Runs the visitor on a computation.
-  static bool Run(
-      HloComputation* computation, bool is_layout_sensitive,
-      AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_strength_reduction, bool enable_conv_simplification);
+  static bool Run(HloComputation* computation,
+                  const AlgebraicSimplifierOptions& options);
 
  private:
-  explicit AlgebraicSimplifierVisitor(
-      HloComputation* computation, bool is_layout_sensitive,
-      AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_strength_reduction, bool enable_conv_simplification)
-      : computation_(computation),
-        is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_strength_reduction_(enable_dot_strength_reduction),
-        enable_conv_simplification_(enable_conv_simplification) {}
+  explicit AlgebraicSimplifierVisitor(HloComputation* computation,
+                                      const AlgebraicSimplifierOptions& options)
+      : computation_(computation), options_(options) {}
 
   // Transforms Dots where at least one input is a vector or has a degenerate
   // dimension and converts it into a multiply and reduce. This should enable
@@ -224,10 +226,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                                      HloInstruction* new_instruction);
 
   // Returns whether the shape of the output of the given instructions are the
-  // same for the purposes of simplification. If is_layout_sensitive_ is true,
-  // then this tests shape equality including layout (ShapeUtil::Equal). If
-  // is_layout_sensitive_ is false, then the tests shape compatibility
-  // (ShapeUtil::Compatible).
+  // same for the purposes of simplification. If options_.is_layout_sensitive()
+  // is true, then this tests shape equality including layout
+  // (ShapeUtil::Equal). If options_.is_layout_sensitive() is false, then the
+  // tests shape compatibility (ShapeUtil::Compatible).
   bool SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const;
 
   // Returns whether it was possible to transform `root` to a clamp instruction.
@@ -306,30 +308,22 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Tries to use a kDot in place of the given convolution.
   StatusOr<bool> SimplifyConvToDot(HloInstruction* convolution);
 
-  // Tries to simplify a slice(pad(...)) where the result of the slice is a
-  // scalar.
-  StatusOr<bool> TrySimplifySliceOfPad(HloInstruction* slice);
+  // Tries to simplify a slice where the result of the slice is a scalar.
+  StatusOr<bool> TrySimplifyScalarSlice(HloInstruction* slice);
+
+  // Tries to convert slice(reshape(X)) into reshape(slice(X))
+  StatusOr<bool> TryToReorderSliceAndReshape(HloInstruction* slice);
 
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
 
+  // The backend-specific options selected for the algebraic simplifier.
+  const AlgebraicSimplifierOptions& options_;
+
   // Whether algebraic simplification has occurred.
   bool changed_ = false;
 
-  // Whether layout is considered during transformation.
-  bool is_layout_sensitive_;
-
-  // Callback used to determine if a bitcast is possible.
-  AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
-
-  // Disable dot strength reduction on platforms where it causes a slowdown.
-  bool enable_dot_strength_reduction_;
-
-  // Disable convolution -> dot simplification on platforms where it causes a
-  // slowdown.
-  bool enable_conv_simplification_;
-
   // Cached computation for adding two scalar F32.
   HloComputation* scalar_add_computation_ = nullptr;
 };
@@ -337,19 +331,15 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 }  // namespace
 
 bool AlgebraicSimplifierVisitor::Run(
-    HloComputation* computation, bool is_layout_sensitive,
-    AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-    bool enable_dot_strength_reduction, bool enable_conv_simplification) {
-  AlgebraicSimplifierVisitor visitor(
-      computation, is_layout_sensitive, std::move(valid_bitcast_callback),
-      enable_dot_strength_reduction, enable_conv_simplification);
+    HloComputation* computation, const AlgebraicSimplifierOptions& options) {
+  AlgebraicSimplifierVisitor visitor(computation, options);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
 
 bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs,
                                            const HloInstruction* rhs) const {
-  if (is_layout_sensitive_) {
+  if (options_.is_layout_sensitive()) {
     return ShapeUtil::Equal(lhs->shape(), rhs->shape());
   } else {
     return ShapeUtil::Compatible(lhs->shape(), rhs->shape());
@@ -423,6 +413,43 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) {
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs))));
+  // Simplify logical and
+  if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) &&
+      ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) {
+    // A && True => A
+    VLOG(10) << "trying transform [A && True => A]: "
+             << logical_and->ToString();
+    if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_and, lhs)) {
+      return Status::OK();
+    }
+    // True && A => A
+    VLOG(10) << "trying transform [True && A => A]: "
+             << logical_and->ToString();
+    if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_and, rhs)) {
+      return Status::OK();
+    }
+
+    // A && False => False
+    VLOG(10) << "trying transform [A && False => False]: "
+             << logical_and->ToString();
+    if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_and, rhs)) {
+      return Status::OK();
+    }
+
+    // False && A => False
+    VLOG(10) << "trying transform [False && A => False]: "
+             << logical_and->ToString();
+    if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) {
+      return Status::OK();
+    }
+  }
+
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) {
   // If a bitcast feeds a bitcast, make it a single bitcast.
   HloInstruction* op;
@@ -456,8 +483,8 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
     return Status::OK();
   }
 
-  if (is_layout_sensitive_ &&
-      ReshapeOrCopyIsBitcast(copy, valid_bitcast_callback_)) {
+  if (options_.is_layout_sensitive() &&
+      ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) {
     ReplaceWithBitcast(copy);
   }
 
@@ -1167,7 +1194,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
     return ReplaceInstruction(dot, dot_of_gather_optimized);
   }
 
-  if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
+  if (options_.enable_dot_strength_reduction() &&
+      !options_.is_layout_sensitive()) {
     TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
                         HandleDotStrengthReduction(dot));
     if (did_strength_reduction) {
@@ -1229,6 +1257,64 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleNegate(HloInstruction* negate) {
+  // negate(negate(x)) => x
+  HloInstruction* x;
+  if (Match(negate, m::Negate(m::Negate(m::Op(&x)))) &&
+      ReplaceInstructionIfSameShape(negate, x)) {
+    return Status::OK();
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleNot(HloInstruction* logical_not) {
+  // not(not(x)) => x
+  HloInstruction* x;
+  if (Match(logical_not, m::Not(m::Not(m::Op(&x)))) &&
+      ReplaceInstructionIfSameShape(logical_not, x)) {
+    return Status::OK();
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleOr(HloInstruction* logical_or) {
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(logical_or, m::Or(m::Op(&lhs), m::Op(&rhs))));
+
+  // Simplify logical or
+  if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) &&
+      ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) {
+    // A || True => True
+    VLOG(10) << "trying transform [A || True => True]: "
+             << logical_or->ToString();
+    if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_or, rhs)) {
+      return Status::OK();
+    }
+    // True || A => True
+    VLOG(10) << "trying transform [True || A => True]: "
+             << logical_or->ToString();
+    if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_or, lhs)) {
+      return Status::OK();
+    }
+
+    // A || False => A
+    VLOG(10) << "trying transform [A || False => A]: "
+             << logical_or->ToString();
+    if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_or, lhs)) {
+      return Status::OK();
+    }
+
+    // False || A => A
+    VLOG(10) << "trying transform [False || A => A]: "
+             << logical_or->ToString();
+    if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) {
+      return Status::OK();
+    }
+  }
+
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) {
   // ln(exp(A)) => A
   VLOG(10) << "trying transform [ln(exp(A)) => A]: " << log->ToString();
@@ -1804,8 +1890,8 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   }
 
   // Make this a bitcast if possible.
-  if (is_layout_sensitive_ &&
-      ReshapeOrCopyIsBitcast(reshape, valid_bitcast_callback_)) {
+  if (options_.is_layout_sensitive() &&
+      ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) {
     ReplaceWithBitcast(reshape);
     return Status::OK();
   }
@@ -1826,60 +1912,160 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) {
   return Status::OK();
 }
 
-StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifySliceOfPad(
+StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
     HloInstruction* slice) {
   // Only try to do this for effective scalars. We could do the same for slicing
   // out larger pieces of padding (replacing with a broadcast of the padding
   // value), but this is probably not worth it.
-  if (!ShapeUtil::IsEffectiveScalar(slice->shape()) ||
-      slice->operand(0)->opcode() != HloOpcode::kPad) {
+  if (!ShapeUtil::IsEffectiveScalar(slice->shape())) {
     return false;
   }
 
-  VLOG(10) << "Trying to simplify scalar slice of pad";
-  // Check there's no internal padding. Again, we could handle that too, since
-  // everything is statically known, but it's not worth it.
-  auto pad = Cast<HloPadInstruction>(slice->mutable_operand(0));
-  auto padding_config = pad->padding_config();
-  int64 rank = padding_config.dimensions_size();
-  if (HasInteriorPadding(padding_config)) {
-    VLOG(10) << "Not folding scalar slice of pad, pad has interior padding";
-    return false;
+  if (slice->operand(0)->opcode() == HloOpcode::kPad) {
+    VLOG(10) << "Trying to simplify scalar slice of pad";
+    // Check there's no internal padding. Again, we could handle that too, since
+    // everything is statically known, but it's not worth it.
+    auto pad = Cast<HloPadInstruction>(slice->mutable_operand(0));
+    auto padding_config = pad->padding_config();
+    int64 rank = padding_config.dimensions_size();
+    if (HasInteriorPadding(padding_config)) {
+      VLOG(10) << "Not folding scalar slice of pad, pad has interior padding";
+      return false;
+    }
+
+    // Check whether the scalar we're slicing out falls into the padding.
+    bool in_padding = [&]() {
+      for (int64 i = 0; i < rank; ++i) {
+        int64 start = slice->slice_starts(i);
+        int64 low = padding_config.dimensions(i).edge_padding_low();
+        int64 data = pad->operand(0)->shape().dimensions(i);
+        if (start >= low && start < low + data) {
+          return false;
+        }
+      }
+      return true;
+    }();
+
+    if (in_padding) {
+      VLOG(10) << "Folding scalar slice of pad into padding value";
+      TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+          slice, HloInstruction::CreateReshape(slice->shape(),
+                                               pad->mutable_padding_value())));
+      return true;
+    } else {
+      // We already know the output of the slice is scalar. If the padded
+      // value is scalar, and it's not in the padding, then it's exactly the
+      // output value.
+      bool replaced =
+          ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0));
+      if (replaced) {
+        VLOG(10) << "Folding scalar slice of pad into padded value";
+      } else {
+        VLOG(10) << "Not folding scalar slice of pad into padded value as they "
+                    "have different shapes.";
+      }
+      return replaced;
+    }
   }
 
-  // Check whether the scalar we're slicing out falls into the padding.
-  bool in_padding = [&]() {
-    for (int64 i = 0; i < rank; ++i) {
-      int64 start = slice->slice_starts(i);
-      int64 low = padding_config.dimensions(i).edge_padding_low();
-      int64 data = pad->operand(0)->shape().dimensions(i);
-      if (start >= low && start < low + data) {
-        return false;
+  if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) {
+    VLOG(10) << "Trying to simplify scalar slice of concat";
+    // Only do this for R1, there's no chance of this being useful otherwise.
+    if (ShapeUtil::Rank(slice->shape()) != 1) {
+      VLOG(10) << "Not folding, slice is not rank 1";
+      return false;
+    }
+    HloConcatenateInstruction* concat =
+        Cast<HloConcatenateInstruction>(slice->mutable_operand(0));
+    int64 operand_start = 0;
+    int64 operand_num = 0;
+    // Weird loop structure to avoid annoying off-by-one errors.
+    while (true) {
+      TF_RET_CHECK(operand_num < concat->operand_count());
+      const HloInstruction* operand = concat->operand(operand_num);
+      int64 next_operand_start = operand_start + operand->shape().dimensions(0);
+      if (next_operand_start > slice->slice_starts(0)) {
+        break;
       }
+      operand_start = next_operand_start;
+      operand_num++;
     }
-    return true;
-  }();
 
-  if (in_padding) {
-    VLOG(10) << "Folding scalar slice of pad into padding value";
-    TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
-        slice, HloInstruction::CreateReshape(slice->shape(),
-                                             pad->mutable_padding_value())));
-    return true;
-  } else {
-    // We already know the output of the slice is scalar. If the padded
-    // value is scalar, and it's not in the padding, then it's exactly the
-    // output value.
-    bool replaced =
-        ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0));
+    bool replaced = ReplaceInstructionIfSameShape(
+        slice, concat->mutable_operand(operand_num));
     if (replaced) {
-      VLOG(10) << "Folding scalar slice of pad into padded value";
+      VLOG(10) << "Folding scalar slice of concat into concat operand";
     } else {
-      VLOG(10) << "Not folding scalar slice of pad into padded value as they "
-                  "have different shapes.";
+      VLOG(10) << "Folding scalar slice of concat into slice of concat operand";
+      TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+          slice, HloInstruction::CreateSlice(
+                     slice->shape(), concat->mutable_operand(operand_num),
+                     {slice->slice_starts(0) - operand_start},
+                     {slice->slice_starts(0) - operand_start + 1},
+                     slice->slice_strides())));
+    }
+    return true;
+  }
+
+  return false;
+}
+
+bool IsUnstridedSlice(const HloInstruction* hlo) {
+  return absl::c_all_of(hlo->slice_strides(),
+                        [](int64 stride) { return stride == 1; });
+}
+
+StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
+    HloInstruction* slice) {
+  CHECK_EQ(slice->opcode(), HloOpcode::kSlice);
+  if (!IsUnstridedSlice(slice)) {
+    return false;
+  }
+  HloInstruction* reshape = slice->mutable_operand(0);
+  if (reshape->opcode() != HloOpcode::kReshape) {
+    return false;
+  }
+  HloInstruction* new_slice_operand = reshape->mutable_operand(0);
+  int64 slice_rank = ShapeUtil::Rank(slice->shape());
+  std::vector<int64> sliced_dims;
+  for (int64 i = 0; i < slice_rank; ++i) {
+    if (slice->slice_starts(i) != 0 ||
+        slice->slice_limits(i) != reshape->shape().dimensions(i)) {
+      sliced_dims.push_back(i);
+    }
+  }
+
+  if (sliced_dims.size() == 1 && sliced_dims[0] == 0 &&
+      slice->slice_starts(0) == 0) {
+    const Shape& new_slice_shape = new_slice_operand->shape();
+    const int64 rank = ShapeUtil::Rank(new_slice_shape);
+    std::vector<int64> new_slice_starts(rank, 0);
+    std::vector<int64> new_slice_stides(rank, 1);
+    std::vector<int64> new_slice_limits(new_slice_shape.dimensions().begin(),
+                                        new_slice_shape.dimensions().end());
+    int64 slice_elements = ShapeUtil::ElementsIn(slice->shape());
+    for (int64 i = rank - 1; i >= 0; --i) {
+      if (slice_elements >= new_slice_limits[i]) {
+        if (slice_elements % new_slice_limits[i] != 0) {
+          return false;
+        }
+        slice_elements /= new_slice_limits[i];
+      } else {
+        new_slice_limits[i] = slice_elements;
+        slice_elements = 1;
+      }
     }
-    return replaced;
+    HloInstruction* new_slice =
+        computation_->AddInstruction(HloInstruction::CreateSlice(
+            ShapeUtil::MakeShape(new_slice_shape.element_type(),
+                                 new_slice_limits),
+            new_slice_operand, new_slice_starts, new_slice_limits,
+            new_slice_stides));
+    TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+        slice, HloInstruction::CreateReshape(slice->shape(), new_slice)));
+    return true;
   }
+  return false;
 }
 
 Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
@@ -1888,12 +2074,8 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
     return Status::OK();
   }
 
-  auto is_unstrided_slice = [](const HloInstruction* hlo) {
-    return absl::c_all_of(hlo->slice_strides(),
-                          [](int64 stride) { return stride == 1; });
-  };
   if (slice->operand(0)->opcode() == HloOpcode::kSlice &&
-      is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) {
+      IsUnstridedSlice(slice) && IsUnstridedSlice(slice->operand(0))) {
     HloInstruction* operand_slice = slice->mutable_operand(0);
     std::vector<int64> new_slice_starts = slice->slice_starts();
     std::vector<int64> new_slice_limits = slice->slice_limits();
@@ -1907,11 +2089,15 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
                    new_slice_starts, new_slice_limits, slice->slice_strides()));
   }
 
-  TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifySliceOfPad(slice));
+  TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifyScalarSlice(slice));
   if (replaced) {
     return Status::OK();
   }
 
+  TF_ASSIGN_OR_RETURN(replaced, TryToReorderSliceAndReshape(slice));
+  if (replaced) {
+    return Status::OK();
+  }
   return Status::OK();
 }
 
@@ -2295,6 +2481,108 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     return ReplaceWithNewInstruction(
         sort, HloInstruction::CreateTuple(sort->operands()));
   }
+  if (!options_.enable_permutation_sort_replacement()) {
+    return Status::OK();
+  }
+  // Check if we are sorting a permutation. In that case, we know that the keys
+  // will be sorted to the identity permutation, and we can represent the
+  // changes to the 'values' parameter as a scatter.
+  if (sort->operand_count() == 2 &&
+      operand->opcode() == HloOpcode::kGetTupleElement) {
+    const HloInstruction* other_sort = operand->operand(0);
+    // Check whether the 'values' parameter is the result of another sort with
+    // the same sort dimension.
+    if (other_sort->opcode() == HloOpcode::kSort &&
+        other_sort->operand_count() >= 2 &&
+        other_sort->dimensions(0) == dimension_to_sort &&
+        other_sort->operand(operand->tuple_index())->opcode() ==
+            HloOpcode::kIota) {
+      auto* iota =
+          Cast<HloIotaInstruction>(other_sort->operand(operand->tuple_index()));
+      // The sort operand needs to be an integral iota, and the iota dimension
+      // needs to be the dimension that was sorted.
+      if (iota->iota_dimension() == dimension_to_sort &&
+          ShapeUtil::ElementIsIntegral(iota->shape())) {
+        // We use the following construction method for a Scatter that applies
+        // the permutation from 'keys' to the 'values' parameter.
+        // - Take the "keys" parameter of the second sort and reshape it to have
+        //   another "1" dimension at the end.
+        // - Concatenate it with iotas of the same extended shape with all
+        //   different iota_dimensions except the dimension_to_sort in the order
+        //   of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and
+        //   dimension_to_sort = 1, we would have concatenate of (iota with
+        //   iota_dimension=0, keys, iota with iota_dimension = 2)
+        // - Use this as the indices parameter of scatter, and set updates
+        //   of the scatter to be a reshaped 'values' parameter of sort (adding
+        //   'rank' many 1 dimensions at the end).
+        int64 rank = ShapeUtil::Rank(operand->shape());
+        Shape extended_shape = operand->shape();
+        extended_shape.add_dimensions(1);
+        extended_shape.mutable_layout()->add_minor_to_major(rank);
+        auto reshaped_permutation = computation_->AddInstruction(
+            HloInstruction::CreateReshape(extended_shape, operand));
+        std::vector<HloInstruction*> concat_operands;
+        for (int64 i = 0; i < rank; ++i) {
+          if (i == dimension_to_sort) {
+            concat_operands.push_back(reshaped_permutation);
+          } else {
+            concat_operands.push_back(computation_->AddInstruction(
+                HloInstruction::CreateIota(extended_shape, i)));
+          }
+        }
+        Shape concat_shape = operand->shape();
+        concat_shape.add_dimensions(rank);
+        concat_shape.mutable_layout()->add_minor_to_major(rank);
+        auto scatter_indices =
+            rank > 1 ? computation_->AddInstruction(
+                           HloInstruction::CreateConcatenate(
+                               concat_shape, concat_operands, rank))
+                     : reshaped_permutation;
+
+        // We don't care about the operand, it will be completely overridden by
+        // the updates.
+        auto scatter_operand = computation_->AddInstruction(
+            HloInstruction::CreateIota(sort->operand(1)->shape(), 0));
+
+        // Construct the updates operand of scatter.
+        Shape update_shape = sort->operand(1)->shape();
+        for (int64 i = 0; i < rank; ++i) {
+          update_shape.add_dimensions(1);
+          update_shape.mutable_layout()->add_minor_to_major(rank + i);
+        }
+        auto scatter_updates =
+            computation_->AddInstruction(HloInstruction::CreateReshape(
+                update_shape, sort->mutable_operand(1)));
+
+        // Construct the updates computation, which simply replaces the operand
+        // values with the update values.
+        HloComputation::Builder b("update_replace_computation");
+        Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
+        b.AddInstruction(
+            HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs"));
+        auto scalar_rhs = b.AddInstruction(
+            HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs"));
+        auto update_replace_computation =
+            computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs));
+
+        ScatterDimensionNumbers dim_numbers;
+        dim_numbers.set_index_vector_dim(rank);
+        for (int64 i = 0; i < rank; ++i) {
+          dim_numbers.add_update_window_dims(rank + i);
+          dim_numbers.add_scatter_dims_to_operand_dims(i);
+        }
+        auto scatter =
+            computation_->AddInstruction(HloInstruction::CreateScatter(
+                sort->operand(1)->shape(), scatter_operand, scatter_indices,
+                scatter_updates, update_replace_computation, dim_numbers));
+        return ReplaceWithNewInstruction(
+            sort, HloInstruction::CreateTuple(
+                      {computation_->AddInstruction(HloInstruction::CreateIota(
+                           operand->shape(), dimension_to_sort)),
+                       scatter}));
+      }
+    }
+  }
   return Status::OK();
 }
 
@@ -2319,7 +2607,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
     return ReplaceInstruction(transpose, operand);
   }
 
-  if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) {
+  if (options_.is_layout_sensitive() && TransposeIsBitcast(transpose)) {
     ReplaceWithBitcast(transpose);
     return Status::OK();
   }
@@ -2468,13 +2756,13 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   const ConvolutionDimensionNumbers& dnums =
       convolution->convolution_dimension_numbers();
 
-  if (!enable_conv_simplification_) {
+  if (!options_.enable_conv_simplification()) {
     return false;
   }
 
   // TODO(b/31337498): For now, we cowardly refuse to do this optimization in
   // layout-insensitive mode, for fear of adding nontrivial reshapes.
-  if (!is_layout_sensitive_) {
+  if (!options_.is_layout_sensitive()) {
     return false;
   }
 
@@ -2564,9 +2852,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   // We cannot insert bitcasts if the layouts will not be compatible.
   // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
   // invalid.
-  if (!valid_bitcast_callback_(input_shape, new_input_shape) ||
-      !valid_bitcast_callback_(filter_shape, new_filter_shape) ||
-      !valid_bitcast_callback_(dot_output_shape, convolution_shape)) {
+  if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) ||
+      !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) ||
+      !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) {
     return false;
   }
 
@@ -2672,9 +2960,7 @@ StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
                  "AlgebraicSimplifier::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (AlgebraicSimplifierVisitor::Run(
-            comp, is_layout_sensitive_, valid_bitcast_callback_,
-            enable_dot_strength_reduction_, enable_conv_simplification_)) {
+    if (AlgebraicSimplifierVisitor::Run(comp, options_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index 9f8d0ee88bdebcf17310cd0407b1b99e4b0a7b5f..d2775b9fafa7e4c625f5d181114e80e7369f9c78 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -23,8 +23,7 @@ limitations under the License.
 
 namespace xla {
 
-// A pass which performs algebraic simplifications.
-class AlgebraicSimplifier : public HloModulePass {
+class AlgebraicSimplifierOptions {
  public:
   // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
   // bitcast from 'from_shape' to 'to_shape' after considering platform
@@ -34,18 +33,63 @@ class AlgebraicSimplifier : public HloModulePass {
   using ValidBitcastCallback =
       std::function<bool(const Shape& from_shape, const Shape& to_shape)>;
 
+  explicit AlgebraicSimplifierOptions(
+      ValidBitcastCallback valid_bitcast_callback)
+      : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
+  // If valid_bitcast_callback returns true, then the pass will replace reshapes
+  // and transposes with bitcasts.
+  const ValidBitcastCallback& valid_bitcast_callback() const {
+    return valid_bitcast_callback_;
+  }
+
+  // If is_layout_sensitive is true, then the simplifier preserves layout during
+  // transformation. Otherwise, layout is ignored.
+  void set_is_layout_sensitive(bool is_layout_sensitive) {
+    is_layout_sensitive_ = is_layout_sensitive;
+  }
+  bool is_layout_sensitive() const { return is_layout_sensitive_; }
+
+  // Enable dot simplification on platforms where it is profitable.
+  void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) {
+    enable_dot_strength_reduction_ = enable_dot_strength_reduction;
+  }
+  bool enable_dot_strength_reduction() const {
+    return enable_dot_strength_reduction_;
+  }
+
+  // Enable convolution simplification on platforms where it is profitable.
+  void set_enable_conv_simplification(bool enable_conv_simplification) {
+    enable_conv_simplification_ = enable_conv_simplification;
+  }
+  bool enable_conv_simplification() const {
+    return enable_conv_simplification_;
+  }
+
+  // If enable_permutation_sort_replacement is true, a sort op that is known to
+  // sort a permutation will be replaced with a scatter op.
+  void set_enable_permutation_sort_replacement(
+      bool enable_permutation_sort_replacement) {
+    enable_permutation_sort_replacement_ = enable_permutation_sort_replacement;
+  }
+  bool enable_permutation_sort_replacement() const {
+    return enable_permutation_sort_replacement_;
+  }
+
+ private:
+  ValidBitcastCallback valid_bitcast_callback_;
+  bool is_layout_sensitive_{false};
+  bool enable_dot_strength_reduction_{true};
+  bool enable_conv_simplification_{true};
+  bool enable_permutation_sort_replacement_{false};
+};
+
+// A pass which performs algebraic simplifications.
+class AlgebraicSimplifier : public HloModulePass {
+ public:
   // If is_layout_sensitive is true, then the simplifier preserves layout during
-  // transformation. Otherwise, layout is ignored. If valid_bitcast_callback
-  // returns true, then the pass will replace reshapes and transposes with
-  // bitcasts.
-  AlgebraicSimplifier(bool is_layout_sensitive,
-                      ValidBitcastCallback valid_bitcast_callback,
-                      bool enable_dot_strength_reduction = true,
-                      bool enable_conv_simplification = true)
-      : is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_strength_reduction_(enable_dot_strength_reduction),
-        enable_conv_simplification_(enable_conv_simplification) {}
+  // transformation. Otherwise, layout is ignored.
+  explicit AlgebraicSimplifier(const AlgebraicSimplifierOptions& options)
+      : options_(options) {}
   ~AlgebraicSimplifier() override = default;
   absl::string_view name() const override { return "algsimp"; }
 
@@ -54,14 +98,7 @@ class AlgebraicSimplifier : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  bool is_layout_sensitive_;
-  ValidBitcastCallback valid_bitcast_callback_;
-
-  // Enable dot simplification on platforms where it is profitable.
-  bool enable_dot_strength_reduction_;
-
-  // Enable convolution simplification on platforms where it is profitable.
-  bool enable_conv_simplification_;
+  AlgebraicSimplifierOptions options_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 7b3e957fbcf9f4628c4aeb0c323d50d3ed36a4f2..8b8ba2a77d9bec7a6baf6929a0566906727be319 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -46,18 +45,22 @@ using ::testing::ElementsAre;
 
 namespace op = xla::testing::opcode_matchers;
 
-AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() {
+AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() {
   return [](const Shape&, const Shape&) { return true; };
 }
 
-AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
+AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
 
-class AlgebraicSimplifierTest : public HloVerifiedTestBase {};
+class AlgebraicSimplifierTest : public HloTestBase {
+ protected:
+  AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()};
+};
 
 // Test that A + 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, AddZero) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -67,18 +70,18 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
+  auto m = CreateNewVerifiedModule();
   Shape r0s32 = ShapeUtil::MakeShape(S32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -88,12 +91,11 @@ TEST_F(AlgebraicSimplifierTest, MulZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0s32, HloOpcode::kMultiply, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kMultiply);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), zero);
 }
 
@@ -114,8 +116,7 @@ TEST_F(AlgebraicSimplifierTest, SelectTrue) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), param0);
 }
@@ -137,8 +138,7 @@ TEST_F(AlgebraicSimplifierTest, SelectFalse) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), param1);
 }
@@ -158,14 +158,14 @@ TEST_F(AlgebraicSimplifierTest, SelectIdentical) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), param1);
 }
 
 // Test that Reduce(Reduce(A)) -> Reduce(A)
 TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   // Create add computation.
   HloInstruction* zero = builder.AddInstruction(
@@ -180,7 +180,7 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
   HloInstruction* param = builder.AddInstruction(
@@ -193,17 +193,17 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
   Shape r1f32 = ShapeUtil::MakeShape(F32, {5});
   builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero,
                                                       dims1, add_computation));
-  module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Reduce(param, zero));
   EXPECT_EQ(root->dimensions(), std::vector<int64>({0, 2, 3}));
 }
 
 // Test that Const + A is canonicalized to A + Const.
 TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -213,18 +213,18 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, constant, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Constant()));
 }
 
 // Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2.
 TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -239,17 +239,17 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, add1, constant2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2)));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -261,17 +261,17 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   // Create add computation.
   HloComputation* add_computation = nullptr;
@@ -284,7 +284,7 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   Shape r2f32 = ShapeUtil::MakeShape(F32, {32, 1});
   HloInstruction* param0 = builder.AddInstruction(
@@ -297,17 +297,17 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
                    HloInstruction::CreateBroadcast(r2f32, zero, {}))},
       add_computation));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kMap);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero)));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -319,64 +319,64 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({3.14f, 3.14f, 3.14f})));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Constant()));
   EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement<float>());
 }
 
 TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({3.14, 3.14, 4})));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({0.0f, 1.0f, 2.0f})));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Iota());
 }
 
 // Test that A - 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, SubZero) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -386,18 +386,18 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that A - Const is canonicalized to A + (-Const).
 TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -407,18 +407,18 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kSubtract, param0, constant));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Negate(constant)));
 }
 
 // Test that (A/B)/C is simplified to A/(B*C).
 TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -432,14 +432,13 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div, param2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Divide(param0, param1), param2));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Multiply(param1, param2)));
@@ -447,6 +446,7 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
 
 // Test that A/(B/C) is simplified to (A*C)/B.
 TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -460,14 +460,13 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Divide(param1, param2)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Multiply(param0, param2), param1));
@@ -475,6 +474,7 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
 
 // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C).
 TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {42, 123});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -492,15 +492,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, div0, div1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
       op::Divide(op::Divide(param0, param1), op::Divide(param2, param3)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -509,6 +508,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
 
 // Test that A/exp(B) is simplified to A*exp(-B).
 TEST_F(AlgebraicSimplifierTest, DivOfExp) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -520,14 +520,13 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, exp));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Exp(param1)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Exp(op::Negate(param1))));
@@ -535,6 +534,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
 
 // Test that A/pow(B,C) is simplified to A*pow(B,-C).
 TEST_F(AlgebraicSimplifierTest, DivOfPower) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -548,14 +548,13 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, power));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Power(param1, param2)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Power(param1, op::Negate(param2))));
@@ -564,6 +563,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
 // Test that broadcasting is done on the right step when simplifying A/pow(B,C)
 // to A*pow(B,-C).
 TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -577,14 +577,13 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, power));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Power(param1, param2)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   ASSERT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Power(param1, op::Negate(param2))));
@@ -592,6 +591,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
 
 // A / Const => A * InvertedConst
 TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {3});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -602,11 +602,10 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide,
                                                       param0, constant));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Constant()));
@@ -614,6 +613,7 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
 
 // pow(pow(A, X), Y) => pow(A, X*Y)
 TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* base = builder.AddInstruction(
@@ -627,10 +627,9 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
                                                       inner_power, exp2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Power(base, op::Multiply(exp1, exp2)));
 }
@@ -638,6 +637,7 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
 // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex
 // numbers.
 TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
+  auto m = CreateNewVerifiedModule();
   Shape r1c64 = ShapeUtil::MakeShape(C64, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* base = builder.AddInstruction(
@@ -651,14 +651,14 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1c64, HloOpcode::kPower,
                                                       inner_power, exp2));
 
-  module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 // Test that A/1 is simplified to A for a scalar.
 TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -668,18 +668,18 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that A/1 is simplified to A for an array.
 TEST_F(AlgebraicSimplifierTest, DivOneArray) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -689,18 +689,18 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that complex(real(c), imag(c)) is simplified to c.
 TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   Shape r2c64 = ShapeUtil::MakeShape(C64, {2, 2});
   HloComputation::Builder builder(TestName());
@@ -713,18 +713,18 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
   HloInstruction* cplx = builder.AddInstruction(
       HloInstruction::CreateBinary(r2c64, HloOpcode::kComplex, real, imag));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, cplx);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that real(complex(r,i)) is simplified to r.
 TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -737,18 +737,18 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
   HloInstruction* real = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kReal, cplx));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, real);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that imag(complex(r,i)) is simplified to i.
 TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -761,18 +761,18 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
   HloInstruction* imag = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kImag, cplx));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, imag);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param1);
 }
 
 // Test that get_element(make_tuple({A,B}),1) is simplified to B
 TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -788,18 +788,18 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, get, param2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, add);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param1, param2));
 }
 
 // Test that exp(A)/exp(B) is simplified to exp(A-B)
 TEST_F(AlgebraicSimplifierTest, ExpDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -813,14 +813,13 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, exp0, exp1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Exp(param0), op::Exp(param1)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Subtract(param0, param1)));
@@ -828,6 +827,7 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
 
 // Test that exp(A)*exp(B) is simplified to exp(A+B)
 TEST_F(AlgebraicSimplifierTest, ExpMul) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -841,14 +841,13 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, exp0, exp1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(op::Exp(param0), op::Exp(param1)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Add(param0, param1)));
@@ -856,6 +855,7 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
 
 // Test that pow(exp(A), B) is simplified to exp(A*B)
 TEST_F(AlgebraicSimplifierTest, PowExp) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -867,14 +867,13 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, exp0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Power(op::Exp(param0), param1));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Multiply(param0, param1)));
@@ -882,6 +881,7 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
 
 // Test that ln(pow(A, B)) is simplified to ln(A)*B
 TEST_F(AlgebraicSimplifierTest, LnPow) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -893,14 +893,13 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, pow));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Log(op::Power(param0, param1)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(op::Log(param0), param1));
@@ -908,6 +907,7 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
 
 // Test that ln(exp(A)) is simplified to A
 TEST_F(AlgebraicSimplifierTest, LnExp) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -917,19 +917,19 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, exp0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
 
 // Test that ln(exp(A)/exp(B)) is simplified to A-B
 TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -945,14 +945,13 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
 }
@@ -960,6 +959,7 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
 // Test that pow(A, 0) where A is a scalar is simplified to the scalar
 // constant 1.
 TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -969,13 +969,12 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
@@ -984,6 +983,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
 
 // Test that pow(A, 0) where A is not a scalar is simplified to broadcast(1).
 TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {42});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -993,13 +993,12 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast());
@@ -1012,6 +1011,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
 
 // Test that pow(A, 1) is simplified to A.
 TEST_F(AlgebraicSimplifierTest, Pow1) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1021,19 +1021,19 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
 
 // Test that pow(A, 2) is simplified to A*A.
 TEST_F(AlgebraicSimplifierTest, Pow2) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1043,19 +1043,19 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
 }
 
 // Test that pow(A, -1) is simplified to 1/A.
 TEST_F(AlgebraicSimplifierTest, PowNegative1) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1065,13 +1065,12 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower,
                                                       param0, negative_one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
@@ -1081,6 +1080,7 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* lhs = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {3, 3, 0}), "lhs"));
@@ -1113,17 +1113,17 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(builder.Build());
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  m->AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Convolution(lhs, rhs));
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Broadcast(op::Constant()));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1148,24 +1148,24 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   builder.AddInstruction(HloInstruction::CreateReduceWindow(
       ShapeUtil::MakeShape(F32, {5, 2}), param,
       builder.AddInstruction(
           HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f))),
       window, add_computation));
-  module().AddEntryComputation(builder.Build());
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  m->AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::ReduceWindow(param, op::Constant()));
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Broadcast(op::Constant()));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1182,17 +1182,17 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
       builder.AddInstruction(
           HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))),
       padding));
-  module().AddEntryComputation(builder.Build());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Pad(param, op::Constant()));
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Broadcast(op::Constant()));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
   auto builder = HloComputation::Builder(TestName());
@@ -1206,39 +1206,39 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
       ShapeUtil::MakeShape(F32, {3, 2}), broadcast));
 
   auto computation = builder.Build();
-  module().AddEntryComputation(std::move(computation));
+  m->AddEntryComputation(std::move(computation));
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Reshape(op::Broadcast(op::Reshape(op))));
 
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(), op);
+  EXPECT_THAT(m->entry_computation()->root_instruction(), op);
 }
 
 // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE.
 TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* input = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), input);
 }
 
 // Test that copies are removed.
 TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1246,18 +1246,18 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1268,24 +1268,27 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
       ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), HloOpcode::kCopy, param));
   *copy->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({1, 2, 0, 3});
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->root_instruction(), op::Copy(param));
 
-  AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true,
-                                  non_bitcasting_callback());
-  ASSERT_FALSE(simplifier1.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier1(options);
+  ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie());
   // Verify that the copy is not replaced.
   EXPECT_THAT(computation->root_instruction(), op::Copy(param));
 
-  AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true,
-                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier2.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options2(bitcasting_callback());
+  options2.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier2(options2);
+  ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
   // Verify that the copy is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 // Test that unary concatenates are removed.
 TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1293,19 +1296,19 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
 
 // Test that empty operands of concatenates are removed.
 TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
+  auto m = CreateNewVerifiedModule();
   const int kParamLength = 100;
   Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
@@ -1322,15 +1325,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
       op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Concatenate(param0, param0, param1));
@@ -1338,6 +1340,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
 
 // Test that reduce of concat is simplified.
 TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
+  auto m = CreateNewVerifiedModule();
   const int kParamLength = 100;
   Shape r3f32 =
       ShapeUtil::MakeShape(F32, {kParamLength, kParamLength, kParamLength});
@@ -1363,7 +1366,7 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
   Shape reduce_shape = ShapeUtil::MakeShape(F32, {kParamLength});
@@ -1373,11 +1376,10 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
   builder.AddInstruction(HloInstruction::CreateReduce(
       reduce_shape, Concatenate, zero, {1, 2}, add_computation));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -1387,6 +1389,7 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
 
 // Test a concatenate with only empty operands is removed.
 TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
+  auto m = CreateNewVerifiedModule();
   const int kParamLength = 100;
   Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
@@ -1401,20 +1404,20 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, empty_slice}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Concatenate(empty_literal, empty_slice));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), empty_literal);
 }
 
 // Test that concat with a scalar broadcast becomes a pad.
 TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
@@ -1427,17 +1430,17 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       ShapeUtil::MakeShape(F32, {200}), {broadcast, param0}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
 }
 
 // Test that a simplification which changes layouts is not performed if layout
 // sensitive is true.
 TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1445,7 +1448,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   // Set to different layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -1453,9 +1456,10 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has not been removed.
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
@@ -1464,6 +1468,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
 // Test that a simplification which preserves layouts is performed if layout
 // sensitive is true.
 TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1471,7 +1476,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   // Set to same layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -1479,9 +1484,10 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has been removed.
   EXPECT_THAT(computation->root_instruction(), param0);
@@ -1490,6 +1496,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
 // Test that a reshape which could be replaced with a bitcast is not if
 // add_bitcasts is false.
 TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1502,13 +1509,14 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   *reshape->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3, 4, 5});
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
@@ -1516,6 +1524,7 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
 
 // Test transforming reshapes and transposes of rng.
 TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* zero = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
@@ -1532,11 +1541,11 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
                                 ShapeUtil::MakeShape(F32, {4}), transpose))
                             ->shape();
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that that reshape(transpose(rng)) is replace by a single rng of the
   // same shape as the reshape.
@@ -1547,6 +1556,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
 
 // Test transforming reshapes to bitcasts under various conditions.
 TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1578,15 +1588,16 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   builder.AddInstruction(HloInstruction::CreateTuple(
       {transformable_reshape, dimensions_wrong_reshape, layout_wrong_reshape}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(transformable_reshape, dimensions_wrong_reshape,
                         layout_wrong_reshape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
-  simplifier.Run(&module()).ValueOrDie();
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  simplifier.Run(m.get()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
   EXPECT_THAT(
@@ -1597,6 +1608,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
 // 'changed' bit in AlgebraicSimplifier to false.
 TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // This add (param0 + 0) can be simplified.
@@ -1611,15 +1623,16 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-  module().AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  m->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
 // 'changed' bit in AlgebraicSimplifier to false.
 TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // This add (param0 + 0) can be simplified.
@@ -1635,13 +1648,14 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
                                       /*broadcast_dimensions=*/{0, 1}));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-  module().AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  m->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1655,19 +1669,21 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3});
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1681,19 +1697,21 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({3, 1, 2, 0});
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1706,19 +1724,19 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Reshape(param0)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1733,18 +1751,20 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 2, 1}),
       HloOpcode::kCopy, copy1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1757,13 +1777,12 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
   builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
@@ -1772,6 +1791,7 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
 
 // Test merging reshape and broadcast.
 TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {5}), "param0"));
@@ -1780,20 +1800,20 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 3, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Broadcast(op::Reshape(param0)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
 
 // Test merging broadcast and reshape.
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {2, 3}), "param0"));
@@ -1802,19 +1822,19 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}), broadcast1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param0)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {1}), "param"));
@@ -1823,20 +1843,20 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {4}), "param"));
@@ -1845,14 +1865,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), broadcast));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   EXPECT_THAT(computation->root_instruction()->dimensions(),
@@ -1860,6 +1879,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {1}), "param"));
@@ -1868,14 +1888,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 1}), broadcast));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   const std::vector<int64> broadcast_dims =
@@ -1885,6 +1904,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {4}), "param"));
@@ -1893,33 +1913,32 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 8}), broadcast));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(HloInstruction::CreateIota(
       ShapeUtil::MakeShape(F32, {1, 2, 3, 7, 12, 1}), 2));
   Shape result_shape = ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2});
   builder.AddInstruction(HloInstruction::CreateReshape(result_shape, iota));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Iota());
   EXPECT_TRUE(
@@ -1927,18 +1946,18 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {1, 1}), 0));
   auto result_shape = iota->shape();
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Iota());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   auto root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Constant()));
@@ -1948,37 +1967,37 @@ TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) {
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2}), 1));
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6}), iota));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4}), 2));
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), iota));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Iota());
   EXPECT_EQ(Cast<HloIotaInstruction>(computation->root_instruction())
@@ -1987,19 +2006,19 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 2}), 2));
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 2}), iota));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Iota());
   const int64 iota_dim =
@@ -2009,19 +2028,19 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) {
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4, 2}), 2));
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6, 8}), iota));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
 }
@@ -2043,14 +2062,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
   builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {2, 2}), param, zero, no_padding));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -2076,11 +2094,10 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {11, 5}), param, zero, padding));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
 
   auto has_negative_padding = [](const HloInstruction* pad) {
     for (auto& padding_dimension : pad->padding_config().dimensions()) {
@@ -2095,7 +2112,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
   EXPECT_FALSE(
@@ -2110,14 +2127,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {2, 3}), param));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -2133,14 +2149,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
       /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(param));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -2162,14 +2177,13 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
       ShapeUtil::MakeShape(F32, {dim0 - 5, dim1 - 9}), original_slice,
       /*start_indices=*/{2, 3},
       /*limit_indices=*/{dim0 - 3, dim1 - 6}, /*strides=*/{1, 1}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param)));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(param));
   EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3);
@@ -2178,6 +2192,55 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
   EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4);
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) {
+  HloComputation::Builder builder(TestName());
+  const int64 dim0 = 11;
+  const int64 dim1 = 12;
+  const int64 dim2 = 13;
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {dim0 * dim1, dim2}), "param"));
+  HloInstruction* original_reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(F32, {dim0, dim1, dim2}), param));
+
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {dim0 - 2, dim1, dim2}), original_reshape,
+      /*start_indices=*/{0, 0, 0},
+      /*limit_indices=*/{dim0 - 2, dim1, dim2}, /*strides=*/{1, 1, 1}));
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param)));
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Slice(param)));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 144, 25, 1, 512}), "param"));
+  HloInstruction* original_reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(F32, {3600, 512}), param));
+
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {960, 512}), original_reshape,
+      /*start_indices=*/{0, 0},
+      /*limit_indices=*/{960, 512}, /*strides=*/{1, 1}));
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param)));
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2185,14 +2248,86 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), keys);
 }
 
+TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
+  const char* hlo_string = R"(
+    HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Tuple(op::Iota(),
+                        op::Scatter(op::Iota(),
+                                    op::Concatenate(op::Iota(), op::Reshape()),
+                                    op::Reshape())));
+}
+
+TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
+  // Same as ReplacePermutationSortWithScatter except that the iota has F32
+  // type.
+  const char* hlo_string = R"(
+    HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = f32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
+  // Same as ReplacePermutationSortWithScatter except that the sort dimensions
+  // don't match.
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2207,15 +2342,182 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   builder.AddInstruction(HloInstruction::CreateSort(
       ShapeUtil::MakeTupleShape({keys_shape, values_shape, values_shape}), 0,
       keys, {values0, values1}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(keys, values0, values1));
 }
 
+// Test that A && True is simplified to A
+TEST_F(AlgebraicSimplifierTest, AndTrue) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      param0, const_true));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+// Test that True && A is simplified to A
+TEST_F(AlgebraicSimplifierTest, AndTrue2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      const_true, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+// Test that A && False is simplified to False
+TEST_F(AlgebraicSimplifierTest, AndFalse) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      param0, const_false));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_false);
+}
+
+// Test that False && A is simplified to False
+TEST_F(AlgebraicSimplifierTest, AndFalse2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      const_false, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_false);
+}
+
+// Test that A || True is simplified to True
+TEST_F(AlgebraicSimplifierTest, OrTrue) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, param0, const_true));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_true);
+}
+
+// Test that True || A is simplified to True
+TEST_F(AlgebraicSimplifierTest, OrTrue2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, const_true, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_true);
+}
+
+// Test that A || False is simplified to A
+TEST_F(AlgebraicSimplifierTest, OrFalse) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr,
+                                                      param0, const_false));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+// Test that False || A is simplified to A
+TEST_F(AlgebraicSimplifierTest, OrFalse2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr,
+                                                      const_false, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
 // Used for TEST_Ps that test merging (or not) of a kPad instruction into a
 // convolution's Window.
 struct ConvPaddingTestcase {
@@ -2337,15 +2639,14 @@ TEST_P(ConvInputPaddingTest, DoTest) {
           .ValueOrDie(),
       lhs_pad, filter, /*feature_group_count=*/1, window, dnums,
       DefaultPrecisionConfig(2)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   if (testcase.expected_conv_window.empty()) {
-    ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
+    ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
   } else {
-    ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+    ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
     ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
@@ -2455,15 +2756,14 @@ TEST_P(ConvFilterPaddingTest, DoIt) {
       input, rhs_pad, /*feature_group_count=*/1, window, dnums,
       precision_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   if (testcase.expected_conv_window.empty()) {
-    ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
+    ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
   } else {
-    ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+    ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
     ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
@@ -2604,11 +2904,12 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
         /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
     // TODO(b/80488902): verify this module.
-    auto module = HloTestBase::CreateNewModule();
+    auto module = CreateNewUnverifiedModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
-    AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                   bitcasting_callback());
+    AlgebraicSimplifierOptions simplifier_options(bitcasting_callback());
+    simplifier_options.set_is_layout_sensitive(true);
+    AlgebraicSimplifier simplifier(simplifier_options);
     if (!simplifier.Run(module.get()).ValueOrDie()) {
       return "NO_CHANGE";
     }
@@ -2724,20 +3025,19 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
       slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, slice);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
 
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(scalar_param));
@@ -2763,16 +3063,15 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   HloInstruction* reshape = builder.AddInstruction(
       HloInstruction::CreateReshape(reshape_shape, transpose));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reshape);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(forty_two));
@@ -2782,7 +3081,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
 TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   // TODO(b/80488902): verify this module.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2837,8 +3136,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
@@ -2864,7 +3162,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 // ReduceWindow(Convert(op), x).
 TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
   // TODO(b/80488902): verify this module.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2923,8 +3221,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
@@ -2954,12 +3251,11 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   builder.AddInstruction(
       HloInstruction::CreateReverse(shape, a, /*dimensions=*/{2, 3}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(a, root);
@@ -2970,6 +3266,7 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   // Dots add computations to the parent module. Test that, when the HloModule's
   // computations are updated, then iterator invalidation doesn't occur
   // when running on subsequent computations.
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {1});
   HloComputation::Builder builder(TestName() + ".Dot");
   HloInstruction* x =
@@ -2991,15 +3288,15 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   call_builder.AddInstruction(
       HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
 
-  module().AddEmbeddedComputation(std::move(dot_computation));
-  module().AddEntryComputation(call_builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  m->AddEmbeddedComputation(std::move(dot_computation));
+  m->AddEntryComputation(call_builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 // Test that a constant with tuple shape becomes a tuple of constants.
 TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -3008,11 +3305,10 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
   Literal value = LiteralUtil::MakeTuple({&elements[0], &elements[1]});
   builder.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(op::Constant(), op::Constant()));
 }
@@ -3021,6 +3317,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
 // of its input equals the size of its output.  In this case, the dynamic slice
 // is equal to its input.
 TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -3032,10 +3329,9 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
           1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
       /*slice_sizes=*/{10, 100, 1000}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), op::Parameter());
 }
 
@@ -3043,6 +3339,7 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
 // size of its "update" equals the size of its output.  In this case, the
 // dynamic-update-slice is equal to its update.
 TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -3065,16 +3362,16 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
       builder.AddInstruction(HloInstruction::CreateParameter(
           3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloInstruction* input_array = builder.AddInstruction(
@@ -3085,12 +3382,11 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
   builder.AddInstruction(
       HloInstruction::CreateBroadcast(r3f32, inner_bcast, {0, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Constant()));
   EXPECT_THAT(root->dimensions(), ElementsAre(2));
@@ -3098,6 +3394,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
 
 // Test that two consecutive broadcasts can be merged to one.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 3});
   Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3});
@@ -3111,12 +3408,11 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
   builder.AddInstruction(
       HloInstruction::CreateBroadcast(r4f32, inner_bcast, {1, 2, 3}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Parameter(0)));
   EXPECT_THAT(root->dimensions(), ElementsAre(1, 3));
@@ -3124,6 +3420,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
 
 // Test that a broadcast of an iota can be merged to one iota.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloInstruction* iota =
@@ -3131,12 +3428,11 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
   Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2});
   builder.AddInstruction(HloInstruction::CreateBroadcast(r3f32, iota, {0, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Iota());
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
@@ -3144,6 +3440,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
 
 // Test that a broadcast of an iota can be merged to one iota.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3});
   HloInstruction* iota =
@@ -3152,12 +3449,11 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) {
   builder.AddInstruction(
       HloInstruction::CreateBroadcast(r4f32, iota, {1, 2, 3}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Iota());
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
@@ -3174,12 +3470,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) {
       ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[2:3],[0:1]}
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Reshape(op::Constant()));
@@ -3196,12 +3491,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
       ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[6:7],[9:10]}
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Reshape(op::Constant()));
@@ -3218,12 +3512,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
       ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
@@ -3238,17 +3531,102 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
       ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[3:4],[4:5]}
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Parameter());
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = f32[2] parameter(0)
+      param.1 = f32[1] parameter(1)
+      param.2 = f32[3] parameter(2)
+      concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0}
+      ROOT slice = f32[1] slice(concat), slice={[2:3]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Parameter(1));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = f32[2] parameter(0)
+      param.1 = f32[1] parameter(1)
+      param.2 = f32[3] parameter(2)
+      concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0}
+      ROOT slice = f32[1] slice(concat), slice={[4:5]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Slice(op::Parameter(2)));
+  EXPECT_EQ(root->slice_starts(0), 1);
+  EXPECT_EQ(root->slice_limits(0), 2);
+}
+
+TEST_F(AlgebraicSimplifierTest, NegateNegate) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = f32[2] parameter(0)
+      neg.0 = f32[2] negate(param.0)
+      ROOT neg.1 = f32[2] negate(neg.0)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Parameter(0));
+}
+
+TEST_F(AlgebraicSimplifierTest, NotNot) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = pred[2] parameter(0)
+      not.0 = pred[2] not(param.0)
+      ROOT not.1 = pred[2] not(not.0)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Parameter(0));
+}
+
 struct PadReduceWindowEffectiveBroadcastCase {
   std::vector<int64> input_spatials;
   std::vector<int64> symmetric_pad_spatials;
@@ -3278,6 +3656,7 @@ class PadReduceWindowEffectiveBroadcastTest
           PadReduceWindowEffectiveBroadcastCase> {};
 
 TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
+  auto m = CreateNewVerifiedModule();
   const auto& param = GetParam();
 
   // a and b are parallel bounds we can either turn into a B F S0 S1 or
@@ -3326,7 +3705,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
 
   Window window = window_util::MakeWindow(
@@ -3340,10 +3719,9 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
   builder.AddInstruction(HloInstruction::CreateReduceWindow(
       output_shape, pad, zero, window, add_computation));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
@@ -3392,6 +3770,7 @@ class DotStrengthReductionTest
       public ::testing::WithParamInterface<
           ::testing::tuple<int, int, int, bool, bool, PrimitiveType>> {};
 TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
+  auto module = CreateNewVerifiedModule();
   int m, k, n;
   bool transpose_lhs, transpose_rhs;
   PrimitiveType element_type;
@@ -3421,10 +3800,9 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(&module()));
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
   const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
   const bool computation_should_be_modified =
       dot_should_be_transformed || (transpose_lhs && transpose_rhs);
@@ -3452,7 +3830,7 @@ struct DotOfConcatTestSpec {
 };
 
 class DotOfConcatSimplificationTest
-    : public HloVerifiedTestBase,
+    : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
 
 // Test that we transform
@@ -3460,6 +3838,7 @@ class DotOfConcatSimplificationTest
 // to
 //  add(dot(const_0, A), dot(const_1, B),  dot(const_2, C))
 TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfConcatTestSpec spec = GetParam();
@@ -3498,10 +3877,9 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
@@ -3519,6 +3897,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
 // to
 //  add(dot(A, const_0), dot(B, const_1),  dot(C, const_2))
 TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfConcatTestSpec spec = GetParam();
@@ -3562,10 +3941,9 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
@@ -3590,6 +3968,7 @@ DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
 // Test that DynamicUpdateSlice update param with any dimension equal to zero
 // gets removed.
 TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   const Shape dslice_shape = ShapeUtil::MakeShape(F32, {10});
   HloInstruction* const operand = builder.AddInstruction(
@@ -3602,11 +3981,10 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       dslice_shape, operand, update, start_indices));
   const HloComputation* const computation =
-      module().AddEntryComputation(builder.Build());
+      m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), operand);
 }
 
@@ -3625,7 +4003,7 @@ struct DotOfGatherTestSpec {
 };
 
 class DotOfGatherSimplificationTest
-    : public HloVerifiedTestBase,
+    : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<DotOfGatherTestSpec> {};
 
 // input: dot(DS(ctA), ctB))
@@ -3634,6 +4012,7 @@ class DotOfGatherSimplificationTest
 // output: DS(dot(ctA, ctB))
 // => output dimensions: DS ({M x N}, {s, 0}, {1, N}) => {1 x N}.
 TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfGatherTestSpec spec = GetParam();
@@ -3680,10 +4059,9 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, ds, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
@@ -3704,6 +4082,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
 // output: DS(dot(ctA, ctB))
 // => output dimensions: DS ({M x N}, {0, s}, {M, 1}) => {M x 1}.
 TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfGatherTestSpec spec = GetParam();
@@ -3750,10 +4129,9 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, ds, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c11452a6fbd49a1fc382d11d24a7d7b7eeab0bcc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -0,0 +1,286 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/ar_crs_combiner.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+namespace {
+
+namespace m = match;
+
+// If the argument instruction is a CRS in the sequence
+// AR -> Convert -> Add -> CRS
+// then return the AR in the sequence.
+// TODO(b/117554291): Rewrite this to recognize more general patterns,
+// not just the specific one of AR -> Add -> Convert -> CRS.
+absl::optional<HloInstruction*> MatchesArCrsPattern(
+    HloInstruction* instruction) {
+  HloInstruction *ar, *convert, *add, *crs;
+  if (Match(instruction,
+            m::CrossReplicaSum(
+                &crs, m::Add(&add, m::Op(),
+                             m::Convert(&convert,
+                                        m::CrossReplicaSum(&ar, m::Op()))))) &&
+      ar->users().size() == 1 && ar->shape().element_type() == BF16 &&
+      convert->shape().element_type() == F32 && !crs->all_reduce_id()) {
+    return ar;
+  }
+  return absl::optional<HloInstruction*>();
+}
+
+}  // namespace
+
+absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
+    HloInstruction* instruction) {
+  CHECK(HloOpcode::kParameter == instruction->opcode());
+  HloComputation* computation = instruction->parent();
+  auto caller_instructions = call_graph_->GetComputationCallers(computation);
+  if (caller_instructions.size() == 1) {
+    auto caller_instruction = caller_instructions[0];
+    if (caller_instruction->opcode() == HloOpcode::kWhile) {
+      return caller_instruction;
+    }
+  }
+  return absl::optional<HloInstruction*>();
+}
+
+std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
+    HloInstruction* instruction) {
+  if (instruction->opcode() == HloOpcode::kTuple) {
+    return {instruction};
+  }
+  if (instruction->opcode() == HloOpcode::kDomain) {
+    return GetAllTuples(instruction->operands()[0]);
+  }
+  if (instruction->opcode() == HloOpcode::kParameter) {
+    auto maybe_while = WhileFromBodyParameter(instruction);
+    if (!maybe_while) {
+      return {};
+    }
+    auto while_instr = *maybe_while;
+    auto init_tuples = GetAllTuples(while_instr->while_init());
+    auto body_tuples =
+        GetAllTuples(while_instr->while_body()->root_instruction());
+    if (init_tuples.empty() || body_tuples.empty()) {
+      return {};
+    }
+    init_tuples.insert(init_tuples.end(), body_tuples.begin(),
+                       body_tuples.end());
+    return init_tuples;
+  }
+  if (instruction->opcode() == HloOpcode::kGetTupleElement) {
+    std::vector<HloInstruction*> result_tuples;
+    for (auto tuple : GetAllTuples(instruction->operands()[0])) {
+      auto tmp_tuples =
+          GetAllTuples(tuple->mutable_operand(instruction->tuple_index()));
+      if (tmp_tuples.empty()) {
+        return {};
+      }
+      result_tuples.insert(result_tuples.end(), tmp_tuples.begin(),
+                           tmp_tuples.end());
+    }
+    return result_tuples;
+  }
+  return {};
+}
+
+bool ArCrsCombiner::TupleElementsComputeSameValue(
+    HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2,
+    absl::flat_hash_map<int64, int64>* visited_pairs) {
+  auto tuples = GetAllTuples(tuple_shaped_instruction);
+  if (tuples.empty()) {
+    return false;
+  }
+  for (auto tuple : tuples) {
+    CHECK(tuple->opcode() == HloOpcode::kTuple);
+    if (!InstructionsComputeSameValue(tuple->mutable_operand(i1),
+                                      tuple->mutable_operand(i2),
+                                      visited_pairs)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/* static */
+bool ArCrsCombiner::TestInstructionsComputeSameValue(HloInstruction* i1,
+                                                     HloInstruction* i2) {
+  ArCrsCombiner combiner(/*num_spatial_partitions=*/2);
+  auto module = i1->parent()->parent();
+  CHECK_EQ(module, i2->parent()->parent());
+  combiner.call_graph_ = CallGraph::Build(module);
+  absl::flat_hash_map<int64, int64> visited_pairs;
+  return combiner.InstructionsComputeSameValue(i1, i2, &visited_pairs);
+}
+
+bool ArCrsCombiner::InstructionsComputeSameValue(
+    HloInstruction* i1, HloInstruction* i2,
+    absl::flat_hash_map<int64, int64>* visited_pairs) {
+  if (i1 == i2) {
+    return true;
+  }
+  auto uid1 = i1->unique_id();
+  auto uid2 = i2->unique_id();
+  auto min_uid = std::min(uid1, uid2);
+  auto max_uid = std::max(uid1, uid2);
+  auto it = visited_pairs->find(min_uid);
+  if (it != visited_pairs->end() && max_uid == it->second) {
+    return true;
+  }
+  auto opcode1 = i1->opcode();
+  auto operands1 = i1->operands();
+  if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
+    return false;
+  }
+  if (opcode1 == HloOpcode::kConstant || i1->IsCrossModuleAllReduce()) {
+    return i1->Identical(
+        *i2,
+        /*eq_operands=*/std::equal_to<const HloInstruction*>(),
+        /*eq_computations=*/std::equal_to<const HloComputation*>(),
+        /*layout_sensitive=*/false);
+  }
+  visited_pairs->emplace(min_uid, max_uid);
+  for (int i = 0; i < operands1.size(); ++i) {
+    auto operand1 = operands1[i];
+    auto operand2 = i2->operands()[i];
+    if (!InstructionsComputeSameValue(operand1, operand2, visited_pairs)) {
+      return false;
+    }
+  }
+  if (opcode1 == HloOpcode::kGetTupleElement) {
+    if (i1->tuple_index() == i2->tuple_index()) {
+      return true;
+    }
+    return TupleElementsComputeSameValue(operands1[0], i1->tuple_index(),
+                                         i2->tuple_index(), visited_pairs);
+  }
+  return true;
+}
+
+void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      auto ar = MatchesArCrsPattern(instruction);
+      if (ar) {
+        all_reduce_map_[*((*ar)->all_reduce_id())].push_back(*ar);
+      }
+    }
+  }
+}
+
+void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
+  for (auto it : all_reduce_map_) {
+    auto instruction_vec = it.second;
+    CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
+
+    auto instr_0 = instruction_vec[0];
+    auto add_0 = instr_0->users()[0]->users()[0];
+    CHECK(HloOpcode::kAdd == add_0->opcode());
+
+    for (int i = 1; i < instruction_vec.size(); ++i) {
+      auto instr_i = instruction_vec[i];
+      auto add_i = instr_i->users()[0]->users()[0];
+      CHECK(HloOpcode::kAdd == add_i->opcode());
+      absl::flat_hash_map<int64, int64> visited_pairs;
+      if (!InstructionsComputeSameValue(add_0, add_i, &visited_pairs)) {
+        all_reduce_map_.erase(it.first);
+      }
+    }
+  }
+}
+
+StatusOr<bool> ArCrsCombiner::RewriteGraph() {
+  if (all_reduce_map_.empty()) {
+    return false;
+  }
+
+  auto computation_is_addition = [](HloComputation* c) {
+    return c->instruction_count() == 3 &&
+           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
+  };
+
+  for (auto it : all_reduce_map_) {
+    auto instruction_vec = it.second;
+    for (auto all_reduce : instruction_vec) {
+      auto parent_computation = all_reduce->parent();
+      auto convert = all_reduce->users()[0];
+      auto add = convert->users()[0];
+      auto crs = add->users()[0];
+
+      if (!computation_is_addition(all_reduce->called_computations()[0]) ||
+          !computation_is_addition(crs->called_computations()[0])) {
+        continue;
+      }
+      HloInstruction* other_summand = (add->operands()[0] == convert)
+                                          ? add->operands()[1]
+                                          : add->operands()[0];
+      // Remove the AllReduce and replace the CRS with:
+      // AllReduce - (other_summand * (num_spatial_partitions_ - 1))
+      TF_CHECK_OK(
+          all_reduce->ReplaceAllUsesWith(all_reduce->mutable_operand(0)));
+      crs->set_all_reduce_id(all_reduce->all_reduce_id());
+      auto new_shape = crs->shape();
+      HloInstruction* to_subtract;
+      if (num_spatial_partitions_ == 2) {
+        to_subtract = other_summand;
+      } else {
+        Literal partitions_minus_1_lit = Literal(new_shape);
+        partitions_minus_1_lit.PopulateWithValue<float>(
+            num_spatial_partitions_ - 1);
+        auto partitions_minus_1_const = parent_computation->AddInstruction(
+            HloInstruction::CreateConstant(partitions_minus_1_lit.Clone()));
+        to_subtract =
+            parent_computation->AddInstruction(HloInstruction::CreateBinary(
+                new_shape, HloOpcode::kMultiply, other_summand,
+                partitions_minus_1_const));
+      }
+      auto sub =
+          parent_computation->AddInstruction(HloInstruction::CreateBinary(
+              new_shape, HloOpcode::kSubtract, crs, to_subtract));
+      TF_CHECK_OK(crs->ReplaceAllUsesWith(sub));
+      TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce));
+    }
+  }
+
+  return true;
+}
+
+StatusOr<bool> ArCrsCombiner::Run(HloModule* module) {
+  call_graph_ = CallGraph::Build(module);
+
+  GroupAllReducesById(module);
+
+  KeepProvablyEqualInstructionGroups();
+
+  return RewriteGraph();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6a7ef76ec3b76972d1b2c7fb548cecfb9423160
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Combine an AllReduce and a CrossReplicaSum when they are close to each other
+// in the graph, to use an efficient CrossReplicaSum implementation that
+// fully utilizes the interconnect bandwidth.
+class ArCrsCombiner : public HloModulePass {
+ public:
+  ArCrsCombiner(int num_spatial_partitions)
+      : num_spatial_partitions_(num_spatial_partitions) {}
+  absl::string_view name() const override { return "ar-crs-combiner"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Helper method to allow testing of InstructionsComputeSameValue.
+  static bool TestInstructionsComputeSameValue(HloInstruction* i1,
+                                               HloInstruction* i2);
+
+ private:
+  // If the passed instruction is a while parameter, and the while body is only
+  // called by a single while instruction, return the while instruction.
+  absl::optional<HloInstruction*> WhileFromBodyParameter(
+      HloInstruction* instruction);
+
+  // Returns a vector of tuple instructions.
+  // If all instructions that flow to "instruction" are tuples, return them.
+  // Otherwise, return an empty vector.
+  std::vector<HloInstruction*> GetAllTuples(HloInstruction* instruction);
+
+  // Checks whether two different elements in the same tuple compute the same
+  // value.
+  bool TupleElementsComputeSameValue(
+      HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2,
+      absl::flat_hash_map<int64, int64>* visited_pairs);
+
+  // Returns whether the instructions i1 and i2 can be shown to evaluate to the
+  // same value. Handling WHILE requires recursion, which may cause us to visit
+  // the same instruction again. To avoid infinite loops, we pass a cache of
+  // visited instruction pairs.
+  bool InstructionsComputeSameValue(
+      HloInstruction* i1, HloInstruction* i2,
+      absl::flat_hash_map<int64, int64>* visited_pairs);
+
+  // Populates all_reduce_map_.
+  void GroupAllReducesById(HloModule* module);
+
+  // Looks at each AllReduce group in all_reduce_map_, and keeps only the
+  // groups for which it's safe to move the AllReduce later in the HLO graph.
+  void KeepProvablyEqualInstructionGroups();
+
+  // Performs the graph rewrite that eliminates the early AllReduce and turns
+  // the later CRS into an AllReduce.
+  StatusOr<bool> RewriteGraph();
+
+  int num_spatial_partitions_;
+
+  // Map from all-reduce ids to the all reduce instructions.
+  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
+
+  std::unique_ptr<CallGraph> call_graph_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d5eaf63ccf32cd78b8c11f12f9bccdfd1fec3e0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -0,0 +1,415 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/ar_crs_combiner.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ArCrsCombinerTest : public HloTestBase {};
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(
+      i1, module->entry_computation()->parameter_instruction(0)));
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestNumOperands) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple1 = (f32[2,2]) tuple(%constant.f32)
+  %tuple2 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %tuple = ((f32[2,2]), (f32[2,2], f32[2,2])) tuple(%tuple1, %tuple2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementSameIndex) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex1) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex2) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{2, 3}, {4, 5}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile1) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0];
+  auto i2 = body_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile2) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {7, 8}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0];
+  auto i2 = body_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile3) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {1, 2}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32.1)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32.2)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0]->operands()[0];  // %get-tuple-element.1
+  auto i2 = body_tuple->operands()[1]->operands()[0];  // %get-tuple-element.2
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, RewritePatternArConvertAddCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+
+  %cross-replica-sum.ar.1 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=0}
+  %convert.1 = f32[2,2]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[2,2]
+      add(%constant.f32, %convert.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[2,2]
+      cross-replica-sum(%add.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=1}
+  %convert.2 = f32[2,2]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %add.2 = f32[2,2]
+      add(%constant.f32, %convert.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[2,2]
+      cross-replica-sum(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[2,2], f32[2,2])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Subtract(op::CrossReplicaSum(), op::Constant()),
+                        op::Subtract(op::CrossReplicaSum(), op::Constant())));
+  auto sub = module->entry_computation()->root_instruction()->operands()[0];
+  auto crs_after = sub->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  ASSERT_EQ(replica_groups_before.size(), replica_groups_after.size());
+  for (int i = 0; i < replica_groups_before.size(); ++i) {
+    // Somewhat verbose way to compare the replica_ids, because EqualsProto
+    // is not available in the open-source build.
+    auto group_before = replica_groups_before[i];
+    std::vector<int64> ids_before(group_before.replica_ids().begin(),
+                                  group_before.replica_ids().end());
+    auto group_after = replica_groups_after[i];
+    std::vector<int64> ids_after(group_after.replica_ids().begin(),
+                                 group_after.replica_ids().end());
+    EXPECT_EQ(ids_before, ids_after);
+  }
+}
+
+TEST_F(ArCrsCombinerTest, OtherSummandNotTheSameDontRewrite) {
+  const char* module_str = R"(
+HloModule foobar
+
+%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+
+  %cross-replica-sum.ar.1 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=0}
+  %convert.1 = f32[2,2]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[2,2]
+      add(%constant.f32.1, %convert.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[2,2]
+      cross-replica-sum(%add.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=1}
+  %convert.2 = f32[2,2]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %add.2 = f32[2,2]
+      add(%constant.f32.2, %convert.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[2,2]
+      cross-replica-sum(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[2,2], f32[2,2])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
index 38f1a5d3a645f98220ec445bb9bbdf2b9b842109..52ec1a794c5e9f4452a4bf2b648f453d8acfe976 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -17,14 +17,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 
 namespace xla {
 namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class BatchDotSimplificationTest : public HloVerifiedTestBase {};
+class BatchDotSimplificationTest : public HloTestBase {};
 
 TEST_F(BatchDotSimplificationTest,
        ElideSingleDegenerateBatchDotDim_VectorVector) {
@@ -38,11 +37,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -61,11 +61,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -84,11 +85,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -107,11 +109,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -130,11 +133,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -153,11 +157,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index f7ac8f5482908af104554a1cf812370b9098cda7..08cf8026177d77ff98cca5e5d168ac3194936b35 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -29,14 +29,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
 
-using BatchNormExpanderTest = HloVerifiedTestBase;
+using BatchNormExpanderTest = HloTestBase;
 
 // Test that we expand BatchNormTraining.
 TEST_F(BatchNormExpanderTest, BatchNormTraining) {
@@ -59,14 +59,14 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) {
       param0, param1, param2,
       /*epsilon=*/0.001, /*feature_index=*/3));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining);
   BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
-  ASSERT_TRUE(rewriter.Run(module).ValueOrDie());
+  ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
@@ -101,14 +101,14 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) {
       param1, param2, param3, param4,
       /*epsilon=*/0.001, /*feature_index=*/3));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormGrad);
   BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
-  ASSERT_TRUE(rewriter.Run(module).ValueOrDie());
+  ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
@@ -126,13 +126,13 @@ ENTRY entry {
     epsilon=0.001, feature_index=1, sharding={maximal device=1}
 })";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
   BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
-  ASSERT_TRUE(rewriter.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(rewriter.Run(m.get()).ValueOrDie());
 
-  for (auto* instruction : module().entry_computation()->instructions()) {
+  for (auto* instruction : m->entry_computation()->instructions()) {
     if (instruction->opcode() == HloOpcode::kParameter) {
       continue;
     }
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index 5f93740887aa7e61458990992fe0573883ff056d..4ce351acc2c359773e618da70360c96faf5ca379 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -65,11 +65,11 @@ class TestBFloat16Support : public BFloat16Support {
   }
 };
 
-class BFloat16ConversionFoldingTest : public HloVerifiedTestBase {
+class BFloat16ConversionFoldingTest : public HloTestBase {
  protected:
   BFloat16ConversionFoldingTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/false,
-                            /*allow_mixed_precision=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 
   bool FoldConversions(HloModule* module) {
     TestBFloat16Support bfloat16_support_;
@@ -103,10 +103,10 @@ TEST_F(BFloat16ConversionFoldingTest, FoldIfSupported) {
       HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, convert1, c));
   builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, add1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(FoldConversions(module));
+  EXPECT_TRUE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add1);
   EXPECT_EQ(add0->shape().element_type(), BF16);
@@ -138,10 +138,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldIfUnsupported) {
   HloInstruction* convert2 =
       builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, mul1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(FoldConversions(module));
+  EXPECT_FALSE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), convert2);
   EXPECT_EQ(mul0->shape().element_type(), F32);
@@ -173,10 +173,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldUnsupportedMixedPrecision) {
   HloInstruction* convert2 =
       builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, sub1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(FoldConversions(module));
+  EXPECT_FALSE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), convert2);
   EXPECT_EQ(sub0->shape().element_type(), F32);
@@ -203,10 +203,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
   HloInstruction* convert1 =
       builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(FoldConversions(module));
+  EXPECT_FALSE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), convert1);
   EXPECT_EQ(gte->shape().element_type(), F32);
@@ -216,7 +216,7 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
 TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
   auto builder = HloComputation::Builder(TestName());
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder sum_builder("add");
   auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -252,7 +252,7 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(FoldConversions(module));
+  EXPECT_TRUE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), tuple);
   EXPECT_EQ(tuple->operand(0), gte_a);
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index cb075a5e38a5ea9db2ceb432b2b59f8db5e2e640..9f97d18c565c7915b9f9346f0c6330cdc3c707e9 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -68,11 +68,11 @@ class TestBFloat16Support : public BFloat16Support {
   }
 };
 
-class BFloat16NormalizationTest : public HloVerifiedTestBase {
+class BFloat16NormalizationTest : public HloTestBase {
  protected:
   BFloat16NormalizationTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/false,
-                            /*allow_mixed_precision=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 
   bool Normalize(HloModule* module) {
     TestBFloat16Support bfloat16_support_;
@@ -106,10 +106,10 @@ TEST_F(BFloat16NormalizationTest, NoopIfSupported) {
   HloInstruction* add1 = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, add0, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(Normalize(module));
+  EXPECT_FALSE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add1);
   EXPECT_EQ(add0->shape().element_type(), BF16);
@@ -134,10 +134,10 @@ TEST_F(BFloat16NormalizationTest, ResolveIfUnsupportedBF16) {
   HloInstruction* mul1 = builder.AddInstruction(
       HloInstruction::CreateBinary(bf16_shape, HloOpcode::kMultiply, mul0, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
   EXPECT_EQ(computation->root_instruction()->operand(0), mul1);
@@ -164,10 +164,10 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionSubtraction) {
   HloInstruction* sub1 = builder.AddInstruction(
       HloInstruction::CreateBinary(bf16_shape, HloOpcode::kSubtract, sub0, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
   EXPECT_EQ(computation->root_instruction()->operand(0), sub1);
@@ -191,7 +191,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
       HloInstruction::CreateBinary(bf16_scalar_shape, HloOpcode::kAdd,
                                    reduce_comp_param0, reduce_comp_param1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto reduce_computation =
       module->AddEmbeddedComputation(reduce_comp_builder.Build());
 
@@ -205,7 +205,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), reduce);
   EXPECT_EQ(reduce->called_computations().size(), 1);
@@ -233,7 +233,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
 }
 
 TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder sum_builder("sum");
   auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -263,7 +263,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), gte);
   EXPECT_EQ(gte->shape().element_type(), BF16);
@@ -272,7 +272,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
 }
 
 TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape f32_shape = ShapeUtil::MakeShape(F32, {1024});
   Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024});
@@ -290,7 +290,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), gte);
   EXPECT_EQ(gte->shape().element_type(), BF16);
@@ -299,7 +299,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
 }
 
 TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape f32_shape = ShapeUtil::MakeShape(F32, {1024});
   Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024});
@@ -314,7 +314,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(sort->operand(0)->shape().element_type(), F32);
   EXPECT_EQ(ShapeUtil::GetSubshape(sort->shape(), {0}).element_type(), F32);
@@ -342,10 +342,10 @@ TEST_F(BFloat16NormalizationTest, DoNotAddUnsupportedMixedPrecision) {
   HloInstruction* dot = builder.AddInstruction(
       HloInstruction::CreateDot(bf16_shape, a, b, dot_dnums, precision_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
   EXPECT_EQ(dot->shape().element_type(), F32);
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 0af71eaac96fca366e45430788e769c618f86bb5..5be7141aae423adb4fe2f39262e463ff25ae8234 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -55,11 +55,11 @@ class TestBFloat16Support : public BFloat16Support {
   }
 };
 
-class BFloat16PropagationTest : public HloVerifiedTestBase {
+class BFloat16PropagationTest : public HloTestBase {
  protected:
   BFloat16PropagationTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/false,
-                            /*allow_mixed_precision=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 
   // Runs the propagation pass on the given module, and returns whether the
   // module is changed after this pass.
@@ -121,10 +121,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
   HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), root);
   EXPECT_TRUE(OutputsBF16(xpose));
@@ -136,6 +136,62 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
   EXPECT_FALSE(OutputsBF16(c));
 }
 
+TEST_F(BFloat16PropagationTest, PropagateThroughMaxPoolReduceWindow) {
+  auto module = CreateNewVerifiedModule();
+
+  auto sub_builder = HloComputation::Builder("max");
+  HloInstruction* p0 = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "a"));
+  HloInstruction* p1 = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "b"));
+  sub_builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {}), HloOpcode::kMaximum, p0, p1));
+  auto max_computation = module->AddEmbeddedComputation(sub_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* c =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "c"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+  HloInstruction* rw =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          shape, add,
+          builder.AddInstruction(
+              HloInstruction::CreateConstant(LiteralUtil::Zero(F32))),
+          window, max_computation));
+  HloInstruction* xpose =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {4, 2}), c, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(
+      CreateDot(ShapeUtil::MakeShape(F32, {4, 4}), xpose, rw));
+  HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kAdd, dot, dot));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), root);
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(xpose));
+  EXPECT_TRUE(OutputsBF16(rw));
+}
+
 // Tests that side-effecting all-reduce should not be changed.
 TEST_F(BFloat16PropagationTest, DoNotChangeAllReduce) {
   auto module = CreateNewVerifiedModule();
@@ -186,10 +242,10 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) {
       HloInstruction::CreateConstant(LiteralUtil::CreateFromArray(array_b)));
   HloInstruction* dot = builder.AddInstruction(CreateDot(shape, a, b));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(dot->operand(0)));
@@ -242,10 +298,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTuples) {
   HloInstruction* output_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({dot, add2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), output_tuple);
   EXPECT_TRUE(OutputsBF16(xpose));
@@ -281,10 +337,10 @@ TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) {
   HloInstruction* dot = builder.AddInstruction(
       CreateDot(ShapeUtil::MakeShape(F32, {4, 4}), lhs, rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(add1));
@@ -310,10 +366,10 @@ TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({add, dot}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(PropagatePrecision(module));
+  EXPECT_FALSE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), tuple);
   EXPECT_FALSE(OutputsBF16(add));
@@ -321,7 +377,7 @@ TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) {
 
 // Tests that BF16 is propagated properly through fused computations.
 TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -356,7 +412,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), fusion1);
   EXPECT_TRUE(OutputsBF16(add));
@@ -369,7 +425,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
 // Tests that changes to BF16 that cannot be propagated outside a fusion are
 // discarded.
 TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -393,7 +449,7 @@ TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(PropagatePrecision(module));
+  EXPECT_FALSE(PropagatePrecision(module.get()));
   EXPECT_EQ(computation->root_instruction(), fusion);
 }
 
@@ -408,7 +464,7 @@ TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) {
 //   (BF16, BF16) fusion_computation(F32 a, F32 b)
 //     = tuple(BF16 convert(a), BF16 add(F32 a, F32 b))
 TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -439,7 +495,7 @@ TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(gte0));
@@ -458,7 +514,7 @@ TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
 // on_true and on_false must match, so that as long as one of them is F32, the
 // other must be F32 as well.
 TEST_F(BFloat16PropagationTest, SelectOverTuples) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
 
@@ -489,7 +545,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_FALSE(OutputsBF16(add0));
@@ -502,7 +558,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) {
 // Tests that BF16 is propagated properly through a while computation with
 // non-tuple input/output.
 TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -545,7 +601,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
   auto dot = builder.AddInstruction(CreateDot(shape, while_hlo, while_hlo));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(
@@ -561,7 +617,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
 // made to the while body and thus the fusion node inside it.
 TEST_F(BFloat16PropagationTest,
        ConditionPreventsPropagationForFusionInsideWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -610,7 +666,7 @@ TEST_F(BFloat16PropagationTest,
   auto dot = builder.AddInstruction(CreateDot(shape, while_hlo, while_hlo));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(PropagatePrecision(module));
+  EXPECT_FALSE(PropagatePrecision(module.get()));
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_FALSE(OutputsBF16(add));
   EXPECT_FALSE(OutputsBF16(body_fusion));
@@ -622,7 +678,7 @@ TEST_F(BFloat16PropagationTest,
 // Tests that BF16 is propagated properly through while computations with
 // tuple-shaped input/output.
 TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -690,7 +746,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
   auto dot = builder.AddInstruction(CreateDot(shape, lhs, rhs));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(lhs));
@@ -709,7 +765,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
 // Tests that BF16 is not propagated through multiple whiles that invoke the
 // same computation as long as one while prevents the propagation.
 TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -820,7 +876,7 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
   auto dot = builder.AddInstruction(CreateDot(shape, lhs, rhs));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
   EXPECT_FALSE(OutputsBF16(body_dot));
   EXPECT_FALSE(OutputsBF16(body_rhs));
   EXPECT_FALSE(OutputsBF16(body_lhs));
@@ -859,10 +915,10 @@ TEST_F(BFloat16PropagationTest, NoopConversionRemoved) {
   HloInstruction* add2 = builder.AddInstruction(HloInstruction::CreateBinary(
       bf16_shape, HloOpcode::kAdd, convert0, convert1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add2);
   EXPECT_EQ(add2->operand(0), add0);
@@ -895,10 +951,10 @@ TEST_F(BFloat16PropagationTest, TupleDomain) {
   HloInstruction* root = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
   EXPECT_EQ(computation->root_instruction(), root);
 
   // test BF16 propagated through domain
@@ -941,10 +997,10 @@ TEST_F(BFloat16PropagationTest, TupleDomainNoPropagation) {
   HloInstruction* root = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), root);
   EXPECT_TRUE(OutputsBF16(a_trans));
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc
index 5b48f10505e78c035608d4c575501e4623218987..2b9502f63a821f3675ddfb506f41bb2390cf4136 100644
--- a/tensorflow/compiler/xla/service/bfloat16_support.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_support.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 
@@ -107,6 +108,21 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision(
     case HloOpcode::kSelect:
     case HloOpcode::kTupleSelect:
       return operand_index == 1 || operand_index == 2;
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow: {
+      HloComputation* reduce_comp = hlo.called_computations()[0];
+      for (HloInstruction* inst : reduce_comp->instructions()) {
+        if (inst->opcode() == HloOpcode::kParameter) {
+          continue;
+        }
+        for (int64 i = 0; i < inst->operand_count(); ++i) {
+          if (!EffectiveOperandPrecisionIsOutputPrecision(*inst, i)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    }
     default:
       break;
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index ee4e5942731110e16c8396a824e6dbd19c9df607..8d7c62447852fd946440c41389300a92377c471f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -641,7 +641,7 @@ Status BufferAssignment::ComputeSummaryStats() {
   bool schedule_complete = true;
   for (const auto& computation : module_->computations()) {
     if (!computation->IsFusionComputation()) {
-      const std::vector<const HloInstruction*>* sequence =
+      const HloInstructionSequence* sequence =
           liveness_->hlo_ordering().SequentialOrder(*computation);
       if (sequence == nullptr) {
         schedule_complete = false;
@@ -746,8 +746,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     LogicalBuffer::AlignmentFunction color_alignment,
     bool allow_input_output_aliasing, bool allocate_buffers_for_constants,
     BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) {
-  BufferAssigner assigner(allow_input_output_aliasing,
-                          allocate_buffers_for_constants, std::move(colorer),
+  BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer),
                           std::move(reuse_checker));
   return assigner.CreateAssignment(module, std::move(hlo_ordering),
                                    std::move(buffer_size),
@@ -1180,7 +1179,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       const HloComputation* computation = pair.first;
       const flat_hash_set<const LogicalBuffer*>& buffers_to_assign =
           pair.second;
-      const std::vector<const HloInstruction*>* instruction_sequence =
+      const HloInstructionSequence* instruction_sequence =
           hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
       schedule.set_sequence(computation, *instruction_sequence);
@@ -1215,7 +1214,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       const HloComputation* computation = pair.first;
       const flat_hash_set<const LogicalBuffer*>& buffers_to_assign =
           pair.second;
-      const std::vector<const HloInstruction*>* instruction_sequence =
+      const HloInstructionSequence* instruction_sequence =
           hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
       auto color_map = SplitBuffersByColor(buffers_to_assign);
@@ -1230,7 +1229,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(get_heap_algorithm(alignment), *computation,
-                               HloInstructionSequence(*instruction_sequence),
+                               *instruction_sequence,
                                assignment->points_to_analysis(),
                                assignment->buffer_size_, options));
         AssignBuffersFromHeapSimulator(result, assignment,
@@ -1434,33 +1433,40 @@ BufferAssigner::MergeColocatedBufferSets(
            computation == module->entry_computation();
   };
 
+  std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
+
+  // Do not merge if one of the sets includes live outs, entry parameters or
+  // constants.
+  //
+  // Buffer liveness does not report the correct live range for entry
+  // parameter and live out buffers so we have to special case them here.  On
+  // backends that support constant buffer allocations, constant buffers are
+  // assigned globals in readonly storage so we can't merge colocated buffer
+  // sets containing constants with colocated buffer sets containing writing
+  // instructions or other constants.
+  //
+  // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
+  // the caller of the executable so we can't write to entry parameters
+  // either, and the argument for not merging constants also applies to entry
+  // parameters.
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    for (auto& buffer : colocated_buffer_sets[i]) {
+      if (buffer_liveness.MaybeLiveOut(*buffer) ||
+          is_entry_parameter(*buffer) ||
+          buffer->instruction()->opcode() == HloOpcode::kConstant) {
+        set_can_be_merged[i] = false;
+        break;
+      }
+    }
+  }
+
   // Returns true if the two colocated buffer sets (specified by their indices
   // into the colocated_buffer_sets) can be merged into a single set.
   auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
                                    &buffer_size,
-                                   &is_entry_parameter](int64 i, int64 j) {
-    // Do not merge if one of the sets includes live outs, entry parameters or
-    // constants.
-    //
-    // Buffer liveness does not report the correct live range for entry
-    // parameter and live out buffers so we have to special case them here.  On
-    // backends that support constant buffer allocations, constant buffers are
-    // assigned globals in readonly storage so we can't merge colocated buffer
-    // sets containing constants with colocated buffer sets containing writing
-    // instructions or other constants.
-    //
-    // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
-    // the caller of the executable so we can't write to entry parameters
-    // either, and the argument for not merging constants also applies to entry
-    // parameters.
-    for (int64 key : {i, j}) {
-      for (auto& buffer : colocated_buffer_sets[key]) {
-        if (buffer_liveness.MaybeLiveOut(*buffer) ||
-            is_entry_parameter(*buffer) ||
-            buffer->instruction()->opcode() == HloOpcode::kConstant) {
-          return true;
-        }
-      }
+                                   &set_can_be_merged](int64 i, int64 j) {
+    if (!set_can_be_merged[i] || !set_can_be_merged[j]) {
+      return true;
     }
 
     // Colocated sets satisfy the invariant that all buffers within a set have
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index d8e1612b899f10a5793f9c65c59a41024dfdddd1..0a9fdede803e84ca42472259084615c031b206eb 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -545,12 +545,10 @@ class BufferAssigner {
       ReuseAllocationFunction reuse_checker = nullptr);
 
  private:
-  BufferAssigner(bool allow_input_output_aliasing,
-                 bool allocate_buffers_for_constants,
+  BufferAssigner(bool allocate_buffers_for_constants,
                  BufferLiveness::Colorer colorer,
                  ReuseAllocationFunction reuse_checker)
-      : allow_input_output_aliasing_(allow_input_output_aliasing),
-        allocate_buffers_for_constants_(allocate_buffers_for_constants),
+      : allocate_buffers_for_constants_(allocate_buffers_for_constants),
         colorer_(colorer),
         reuse_checker_(reuse_checker) {}
   virtual ~BufferAssigner() = default;
@@ -640,10 +638,6 @@ class BufferAssigner {
                       LogicalBuffer::Color::Hasher>
   SplitBuffersByColor(const absl::flat_hash_set<const LogicalBuffer*>& buffers);
 
-  // If true, buffer assignments assumes that input parameter buffers and output
-  // buffers can be shared if their sizes match.
-  bool allow_input_output_aliasing_;
-
   // If true, allocate buffers for constant instructions.
   bool allocate_buffers_for_constants_;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 327211d3efd24177a28cc2d08dc3c4fbf2fbaff9..8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -81,7 +81,7 @@ const std::vector<const HloInstruction*> GetInstructions(HloInstruction* root) {
   return main_list.GetInstructions();
 }
 
-class BufferAssignmentTest : public HloVerifiedTestBase {
+class BufferAssignmentTest : public HloTestBase {
  protected:
   ~BufferAssignmentTest() override {}
 
@@ -137,8 +137,7 @@ class BufferAssignmentTest : public HloVerifiedTestBase {
   }
 
   std::unique_ptr<BufferAssignment> RunBufferAssignmentWithInstructionSequence(
-      HloModule* module,
-      absl::Span<const HloInstruction* const> instruction_sequence,
+      HloModule* module, absl::Span<HloInstruction* const> instruction_sequence,
       int64 alignment = 1) {
     HloSchedule schedule(module);
     schedule.set_sequence(module->entry_computation(), instruction_sequence);
@@ -334,16 +333,16 @@ TEST_F(BufferAssignmentTest, ScalarConstant) {
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   {
-    auto buffers = RunBufferAssignment(module);
+    auto buffers = RunBufferAssignment(module.get());
     EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
   }
 
   {
-    auto buffers = RunBufferAssignmentNoBuffersForConstants(module);
+    auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
     EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
   }
 }
@@ -358,17 +357,17 @@ TEST_F(BufferAssignmentTest, BufferForConst) {
       LiteralUtil::CreateR1<float>({4.1f, 4.2f, 4.3f, 4.4f})));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, const0, const1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   {
-    auto buffers = RunBufferAssignment(module);
+    auto buffers = RunBufferAssignment(module.get());
     EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
     EXPECT_TRUE(buffers->HasTopLevelAllocation(const1));
     GetAssignedOutputAllocation(*buffers, add);
   }
   {
-    auto buffers = RunBufferAssignmentNoBuffersForConstants(module);
+    auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
     EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
     EXPECT_FALSE(buffers->HasTopLevelAllocation(const1));
     GetAssignedOutputAllocation(*buffers, add);
@@ -387,10 +386,10 @@ TEST_F(BufferAssignmentTest, HasAllocationAt) {
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({negate, param0, constant}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   // Make sure that HasAllocationAt() agrees with what HasTopLevelAllocation()
   // reports for the instruction directly.
   EXPECT_EQ(buffers->HasTopLevelAllocation(tuple),
@@ -410,10 +409,10 @@ TEST_F(BufferAssignmentTest, BufferForOutputConst) {
       LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
   auto copy = builder.AddInstruction(
       HloInstruction::CreateUnary(const0->shape(), HloOpcode::kCopy, const0));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   // The copy node now has an output buffer.
   GetAssignedOutputAllocation(*buffers, copy);
 }
@@ -439,10 +438,10 @@ TEST_F(BufferAssignmentTest, Basic) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -538,7 +537,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto colorer = [](const BufferLiveness& buffer_liveness) {
@@ -553,7 +552,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
     return Status::OK();
   };
 
-  auto buffers = RunColoredBufferAssignment(module, colorer);
+  auto buffers = RunColoredBufferAssignment(module.get(), colorer);
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -599,7 +598,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto colorer = [](const BufferLiveness& buffer_liveness) {
@@ -622,7 +621,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
     return Status::OK();
   };
 
-  auto buffers = RunColoredBufferAssignment(module, colorer);
+  auto buffers = RunColoredBufferAssignment(module.get(), colorer);
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -671,10 +670,10 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kSubtract, add, mul));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
 
   // Input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -706,7 +705,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   // param0[100x10] ---> (map x+1)
   //
   // Builds the map function.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto map_computation =
       module->AddEmbeddedComputation(BuildMapComputationPlus1("f32+1"));
   auto inner_last = map_computation->root_instruction();
@@ -725,7 +724,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   EXPECT_EQ(3, level1.size()) << "Invalid nested add+1 size";
 
   // Assigns buffers and fetches sizes.
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   int64 size0 = ValidateBuffers(level0, *buffers);
   int64 size1 = ValidateBuffers(level1, *buffers);
 
@@ -761,7 +760,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) {
   // out-of-order reductions could overwrite an element before a use.)
   //
   // param0[100] --- (exp1) --- (exp2) --- (reduce x+y) --- (exp3)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto reduce_computation =
       module->AddEmbeddedComputation(BuildReduceComputation("f32+f32"));
 
@@ -784,7 +783,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) {
 
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   const std::vector<const HloInstruction*> instrs = GetInstructions(exp3);
   ValidateBuffers(instrs, *buffers);
 
@@ -812,7 +811,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
   // const4[f32[4]] --- tuple --- while[condition, body]
   //
   // Builds the nested condition and body.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto condition_computation =
       module->AddEmbeddedComputation(BuildWhileConditionComputation("if<4"));
   auto body_computation =
@@ -840,7 +839,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
   EXPECT_EQ(8, levelb.size()) << "Invalid nested body size";
 
   // Assigns buffers and fetches sizes.
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   int64 size0 = ValidateBuffers(level0, *buffers);
   int64 sizec = ValidateBuffers(levelc, *buffers);
   int64 sizeb = ValidateBuffers(levelb, *buffers);
@@ -878,7 +877,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
 }
 
 TEST_F(BufferAssignmentTest, ExampleConditional) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto true_computation = module->AddEmbeddedComputation(
       BuildR0F32UnaryOpComputation(HloOpcode::kCeil, "Ceil"));
   auto false_computation = module->AddEmbeddedComputation(
@@ -905,7 +904,7 @@ TEST_F(BufferAssignmentTest, ExampleConditional) {
   EXPECT_EQ(2, true_instrs.size());
   EXPECT_EQ(2, false_instrs.size());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   ValidateBuffers(conditional_instrs, *buffers);
   ValidateBuffers(true_instrs, *buffers);
   ValidateBuffers(false_instrs, *buffers);
@@ -941,9 +940,9 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) {
   auto neg = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, exp2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // tanh and exp2 can reuse exp1's buffer
   EXPECT_TRUE(assignment->HasTopLevelAllocation(exp1));
@@ -970,9 +969,9 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) {
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // negate and broadcast should share a buffer.
   EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast));
@@ -1003,9 +1002,9 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) {
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The instructions should not share buffers.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1040,9 +1039,9 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) {
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The instructions should not share buffers.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1075,9 +1074,9 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) {
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The broadcast output buffer cannot be shared.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1107,9 +1106,9 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) {
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 10}), slice, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // negate and broadcast should share a buffer.
   EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast));
@@ -1145,9 +1144,9 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) {
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
   builder.AddInstruction(HloInstruction::CreateTuple({broadcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The broadcast output buffer cannot be shared.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1160,7 +1159,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
   // Verify that buffers for embedded computations are properly marked as
   // thread-local and that embedded parameters are not marked as
   // is_entry_computation_parameter.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto vec_shape = ShapeUtil::MakeShape(F32, {42});
   auto scalar_shape = ShapeUtil::MakeShape(F32, {});
 
@@ -1191,7 +1190,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
       HloInstruction::CreateMap(vec_shape, {call}, map_computation));
   module->AddEntryComputation(builder.Build());
 
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Allocations for the map computation should be thread-local and not
   // live-out.
@@ -1238,9 +1237,9 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) {
                                  ShapeUtil::MakeShape(S32, {42})}),
       "param0"));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // There should be four allocations: one for vector of pointers, and one for
   // each tuple element.
@@ -1274,9 +1273,9 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           ShapeUtil::GetSubshape(tuple_param->shape(), {1}), tuple_param, 1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Only some of the elements of the input param are liveout.
   EXPECT_FALSE(
@@ -1318,9 +1317,9 @@ TEST_F(BufferAssignmentTest, TupleConstantAsOutput) {
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::MakeTuple({&elements[0], &elements[1]})));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_EQ(3, assignment->Allocations().size());
 }
@@ -1332,9 +1331,9 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) {
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}),
                                  ShapeUtil::MakeShape(S32, {101})}),
       /*operands=*/{}, /*custom_call_target=*/"foo_function"));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_EQ(3, assignment->Allocations().size());
   EXPECT_TRUE(
@@ -1347,7 +1346,7 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) {
 
 TEST_F(BufferAssignmentTest, TupleCallAsOutput) {
   // Test a computation which returns a tuple call value.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto elem_shape = f32vec4_;
   auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape});
 
@@ -1365,7 +1364,7 @@ TEST_F(BufferAssignmentTest, TupleCallAsOutput) {
       HloInstruction::CreateCall(tuple_shape, {param}, sub_computation));
   module->AddEntryComputation(builder.Build());
 
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_EQ(2, assignment->Allocations().size());
   // Buffers for call are colocated with the sub-computation.
@@ -1388,7 +1387,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) {
   // B: call(C, param)
   // C: call(D, param)
   // D: param
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto elem_shape = f32vec4_;
   auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape});
 
@@ -1427,7 +1426,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) {
   module->AddEntryComputation(std::move(a_computation));
   module->AddEmbeddedComputation(std::move(b_computation));
 
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Buffers for call are colocated with the sub-computations.
   EXPECT_EQ(GetAllocation(*assignment, a_call, /*index=*/{}),
@@ -1461,9 +1460,9 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) {
   auto bitcast = builder.AddInstruction(
       HloInstruction::CreateUnary(param->shape(), HloOpcode::kBitcast, param));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Bitcast should get the same allocation as the param.
   EXPECT_EQ(1, assignment->Allocations().size());
@@ -1488,9 +1487,9 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) {
       HloInstruction::CreateTernary(tuple_shape, HloOpcode::kTupleSelect,
                                     pred_param, tuple_param0, tuple_param1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Select shallow copies one of its operands so it defines its own top-level
   // buffer and receives its own allocation.
@@ -1526,9 +1525,9 @@ TEST_F(BufferAssignmentTest, TupleBufferNotReused) {
   auto copy = builder.AddInstruction(HloInstruction::CreateUnary(
       scalar_shape, HloOpcode::kCopy, tuple_element));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // There should be no buffer reuse. The copy should not reuse the tuple
   // buffer.
@@ -1568,9 +1567,9 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
       HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 0));
 
   // Run buffer assignment with alignment=1.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module, /*alignment=*/1);
+  auto assignment = RunBufferAssignment(module.get(), /*alignment=*/1);
 
   // There are 5 allocations: 3 parameters, 1 output, and 1 temp.
   EXPECT_EQ(5, assignment->Allocations().size());
@@ -1589,7 +1588,7 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
   EXPECT_EQ(80, slice_bc.allocation()->size());
 
   // Re-run buffer assignment with alignment=64.
-  assignment = RunBufferAssignment(module, /*alignment=*/64);
+  assignment = RunBufferAssignment(module.get(), /*alignment=*/64);
   EXPECT_EQ(5, assignment->Allocations().size());
   slice_ab = assignment->GetUniqueTopLevelSlice(dot_ab).ConsumeValueOrDie();
   slice_bc = assignment->GetUniqueTopLevelSlice(dot_bc).ConsumeValueOrDie();
@@ -1632,10 +1631,10 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
 
   const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul);
   const std::vector<const LogicalBuffer*>& peak_buffers =
@@ -1673,11 +1672,11 @@ TEST_F(BufferAssignmentTest, PeakBuffers) {
 
       ShapeUtil::MakeShape(F32, {1}), concat, {0}, {1}, {1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto buffers = RunBufferAssignmentWithInstructionSequence(
-      module, {param, log, rev, neg, concat, root});
+      module.get(), {param, log, rev, neg, concat, root});
 
   // The temporary buffer should hold the 4 interior instructions.
   const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, concat);
@@ -1698,7 +1697,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) {
 }
 
 TEST_F(BufferAssignmentTest, PeakBuffersWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape shape = ShapeUtil::MakeShape(F32, {123, 123});
   HloComputation* condition;
   {
@@ -1733,7 +1732,7 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) {
       ShapeUtil::MakeShape(F32, {123, 123, 123}), bcast, {0}));
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, bcast);
   const std::vector<const LogicalBuffer*>& peak_buffers =
       buffer.PeakMemoryLogicalBuffers();
@@ -1783,13 +1782,13 @@ ENTRY main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_text));
   HloInstruction* constant_1 =
-      module().entry_computation()->GetInstructionWithName("constant.1.1");
+      m->entry_computation()->GetInstructionWithName("constant.1.1");
   HloInstruction* constant_2 =
-      module().entry_computation()->GetInstructionWithName("constant.1.2");
+      m->entry_computation()->GetInstructionWithName("constant.1.2");
 
-  auto buffers = RunBufferAssignment(&module());
+  auto buffers = RunBufferAssignment(m.get());
 
   {
     const BufferAllocation& allocation_for_const_1 =
@@ -1818,7 +1817,7 @@ ENTRY main {
   }
 }
 
-class WhileBufferAssignmentTest : public HloVerifiedTestBase {
+class WhileBufferAssignmentTest : public HloTestBase {
  protected:
   std::unique_ptr<HloComputation> BuildWhileConditionComputation(
       const string& name) {
@@ -1853,7 +1852,7 @@ class WhileBufferAssignmentTest : public HloVerifiedTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
                                                         int64 alignment = 1) {
     HloSchedule schedule =
-        ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie();
+        ScheduleModule(module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
                module, absl::make_unique<SequentialHloOrdering>(schedule),
                ByteSizeOf,
@@ -1878,7 +1877,7 @@ static void RunCopyInsertion(HloModule* module) {
 }
 
 TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1917,8 +1916,8 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
       HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
 
   module->AddEntryComputation(builder.Build());
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
 
   // Verify 'input0' and read-only use while0{0} alias.
   EXPECT_EQ(assignment->GetUniqueSlice(input0, {}).ConsumeValueOrDie(),
@@ -1974,20 +1973,19 @@ ENTRY %test_module {
   ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
 })";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
 
   // Run CopyInsertion and check if the graph constructed above doesn't need
   // any copies inserted for BufferAssignment to run.
-  int64 instruction_count = module().instruction_count();
+  int64 instruction_count = m->instruction_count();
   CopyInsertion copy_insertion;
-  ASSERT_IS_OK(copy_insertion.Run(&module()).status());
-  ASSERT_EQ(instruction_count, module().instruction_count());
+  ASSERT_IS_OK(copy_insertion.Run(m.get()).status());
+  ASSERT_EQ(instruction_count, m->instruction_count());
 
   // Get the instructions in the module.
-  const HloInstruction* bcast =
-      module().entry_computation()->root_instruction();
+  const HloInstruction* bcast = m->entry_computation()->root_instruction();
   const HloInstruction* param =
-      module().entry_computation()->parameter_instruction(0);
+      m->entry_computation()->parameter_instruction(0);
   ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
   const HloInstruction* while1 = bcast->operand(0);
   ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
@@ -1995,7 +1993,7 @@ ENTRY %test_module {
   ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
 
   // Run buffer assignment.
-  auto assignment = RunBufferAssignment(&module());
+  auto assignment = RunBufferAssignment(m.get());
   TF_ASSERT_OK_AND_ASSIGN(auto slice_param,
                           assignment->GetUniqueSlice(param, {}));
   TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
@@ -2042,20 +2040,19 @@ ENTRY %test_module {
   ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
 })";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
 
   // Run CopyInsertion and check if the graph constructed above doesn't need
   // any copies inserted for BufferAssignment to run.
-  int64 instruction_count = module().instruction_count();
+  int64 instruction_count = m->instruction_count();
   CopyInsertion copy_insertion;
-  ASSERT_IS_OK(copy_insertion.Run(&module()).status());
-  ASSERT_EQ(instruction_count, module().instruction_count());
+  ASSERT_IS_OK(copy_insertion.Run(m.get()).status());
+  ASSERT_EQ(instruction_count, m->instruction_count());
 
   // Get the instructions in the module.
-  const HloInstruction* bcast =
-      module().entry_computation()->root_instruction();
+  const HloInstruction* bcast = m->entry_computation()->root_instruction();
   const HloInstruction* constant =
-      module().entry_computation()->GetInstructionWithName("constant.42");
+      m->entry_computation()->GetInstructionWithName("constant.42");
   ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
   const HloInstruction* while1 = bcast->operand(0);
   ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
@@ -2063,7 +2060,7 @@ ENTRY %test_module {
   ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
 
   // Run buffer assignment.
-  auto assignment = RunBufferAssignment(&module());
+  auto assignment = RunBufferAssignment(m.get());
   TF_ASSERT_OK_AND_ASSIGN(auto slice_constant,
                           assignment->GetUniqueSlice(constant, {}));
   TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
@@ -2121,7 +2118,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   };
 
   // Build the entry computation as described in the comment above.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto token = builder.AddInstruction(HloInstruction::CreateToken());
@@ -2156,7 +2153,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   // any copies inserted for BufferAssignment to run.
   int64 instruction_count = module->instruction_count();
   CopyInsertion copy_insertion;
-  ASSERT_IS_OK(copy_insertion.Run(module).status());
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
   ASSERT_EQ(instruction_count, module->instruction_count());
 
   // Create a sequential order among all the instructions in the entry
@@ -2164,7 +2161,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   // nodes are traversed during BufferAssignment.
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
@@ -2175,12 +2172,12 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto assignment,
-      BufferAssigner::Run(module,
-                          absl::make_unique<SequentialHloOrdering>(schedule),
-                          backend().compiler()->BufferSizeBytesFunction(),
-                          [](LogicalBuffer::Color) { return 1; },
-                          /*allow_input_output_aliasing=*/false,
-                          /*allocate_buffers_for_constants=*/true));
+      BufferAssigner::Run(
+          module.get(), absl::make_unique<SequentialHloOrdering>(schedule),
+          backend().compiler()->BufferSizeBytesFunction(),
+          [](LogicalBuffer::Color) { return 1; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true));
 
   // The result tuple elements must be assigned with different buffers.
   TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0}));
@@ -2202,7 +2199,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
 }
 
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -2234,8 +2231,8 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
       HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, while0));
 
   module->AddEntryComputation(builder.Build());
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
 
   // while0 and while1 buffers should be completely aligned.
   EXPECT_EQ(assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie(),
@@ -2247,7 +2244,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
 }
 
 TEST_F(BufferAssignmentTest, TwoCalls) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
   HloComputation* sub_computation;
   {
@@ -2277,13 +2274,13 @@ TEST_F(BufferAssignmentTest, TwoCalls) {
 
   {
     FlattenCallGraph flatten;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
-    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   }
 
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment));
 }
@@ -2308,13 +2305,14 @@ ENTRY Main {
 )";
 
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-  ParseAndVerifyModule(hlo_text, config);
+  config.set_debug_options(GetDebugOptionsFromFlags());
+  TF_ASSERT_OK_AND_ASSIGN(auto m,
+                          ParseAndReturnVerifiedModule(hlo_text, config));
 
-  auto buffers = RunBufferAssignment(&module());
+  auto buffers = RunBufferAssignment(m.get());
 
-  HloComputation* main = module().entry_computation();
-  HloComputation* callee = module().GetComputationWithName("Callee");
+  HloComputation* main = m->entry_computation();
+  HloComputation* callee = m->GetComputationWithName("Callee");
   EXPECT_NE(callee, nullptr);
 
   HloInstruction* param0 = callee->parameter_instruction(0);
@@ -2338,7 +2336,7 @@ ENTRY Main {
 }
 
 TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto zero = builder.AddInstruction(
@@ -2385,40 +2383,41 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
 
   {
     FlattenCallGraph flatten;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
   }
 
-  RunCopyInsertion(module);
+  RunCopyInsertion(module.get());
 
   HloSchedule schedule =
-      ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie();
+      ScheduleModule(module.get(), ByteSizeOf).ConsumeValueOrDie();
 
   // To trigger b/38494731, we want a specific Hlo schedule for the
   // root computation, so we overwrite that entry with a manually
   // crafted sequence.
-  schedule.set_sequence(module->entry_computation(),
-                        {input1, weights1, one, output1, while1->operand(0),
-                         while1, input0, weights0, zero, output0,
-                         while0->operand(0), while0, gte0, gte1, root_add});
+  schedule.set_sequence(
+      module->entry_computation(),
+      {input1, weights1, one, output1, while1->mutable_operand(0), while1,
+       input0, weights0, zero, output0, while0->mutable_operand(0), while0,
+       gte0, gte1, root_add});
 
   // If this ASSERT fails, we constructed a bogus sequence above and this test
   // itself is buggy.
   TF_ASSERT_OK(schedule.Verify());
 
   auto assignment =
-      BufferAssigner::Run(module,
-                          absl::make_unique<SequentialHloOrdering>(schedule),
-                          ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
-                          /*allow_input_output_aliasing=*/false,
-                          /*allocate_buffers_for_constants=*/true)
+      BufferAssigner::Run(
+          module.get(), absl::make_unique<SequentialHloOrdering>(schedule),
+          ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true)
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
 }
 
 TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -2462,8 +2461,8 @@ TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
       HloInstruction::CreateGetTupleElement(data_shape_, while1, 2));
 
   module->AddEntryComputation(builder.Build());
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
   // Get BufferAllocation for root instruction.
   auto* root_alloc = assignment->GetUniqueTopLevelSlice(while1_out)
                          .ConsumeValueOrDie()
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 17e50905059ad2c92784d14132c1cb1f46f35ade..40825a78716b1c0b9fb0121787977d275891c0f8 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -117,7 +117,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(vec_, HloOpcode::kLog, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -164,7 +164,7 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry = module->AddEntryComputation(builder.Build());
 
   HloSchedule schedule(module.get());
@@ -213,7 +213,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
   auto reverse =
       builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -247,7 +247,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -289,7 +289,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloSchedule schedule(module.get());
@@ -336,7 +336,7 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) {
       HloInstruction::CreateSend(recv_done, token, /*channel_id=*/1));
   auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build(add));
 
   HloSchedule schedule(module.get());
@@ -373,7 +373,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
   auto outer_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple, exp}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -393,7 +393,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
 
 TEST_F(BufferLivenessTest, EmbeddedComputation) {
   // Test MaybeLiveOut and MayInterfere for embedded computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto embedded_builder = HloComputation::Builder(TestName() + "_embedded");
   auto embedded_param = embedded_builder.AddInstruction(
@@ -450,7 +450,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
   builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       inner_tuple0.shape(), tuple_constant, 0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -514,7 +514,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(BuildDummyComputation());
   module->AddEmbeddedComputation(builder.Build());
 
@@ -576,7 +576,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(BuildDummyComputation());
   module->AddEmbeddedComputation(builder.Build());
 
@@ -611,8 +611,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
 class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
  protected:
   // Builds and runs a computation (see test case computation graphs below).
-  std::unique_ptr<HloModule> BuildModule(const bool update_uses_tuple_element1,
-                                         const bool fuse_gte0) {
+  std::unique_ptr<VerifiedHloModule> BuildModule(
+      const bool update_uses_tuple_element1, const bool fuse_gte0) {
     auto builder = HloComputation::Builder(TestName());
     // Create param0 Tuple.
     Shape data_shape = ShapeUtil::MakeShape(F32, {8});
@@ -646,7 +646,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     module->AddEntryComputation(builder.Build());
     auto* computation = module->entry_computation();
     // Create fusion instruction based on number of tuple element 1 users.
@@ -802,7 +802,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     auto tuple_root = builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     module->AddEntryComputation(BuildDummyComputation());
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index bdd5069632e84fe6c67ca129f726432479ac1b35..7987343bfaf1069fd550909d127e4b11f2124701 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -325,6 +325,15 @@ bool CallGraph::IsFlattened() const {
   return true;
 }
 
+std::vector<HloInstruction*> CallGraph::GetComputationCallers(
+    HloComputation* c) {
+  std::vector<HloInstruction*> callers;
+  for (auto callsite : GetNode(c).caller_callsites()) {
+    callers.push_back(callsite.instruction());
+  }
+  return callers;
+}
+
 std::pair<HloInstruction*, HloInstruction*>
 CallGraph::NearestAncestorsInSameComputation(HloInstruction* a,
                                              HloInstruction* b) const {
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index cb56f4789d06ac33acdaadc8b619b9e37f683d58..05c7c998738f861ee804d1ec87bfa5fb17ddfb74 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -236,6 +236,10 @@ class CallGraph {
   // FlattenCallGraph.
   bool IsFlattened() const;
 
+  // Returns a vector of instructions calling the passed computation.
+  // (Often a vector of size 1.)
+  std::vector<HloInstruction*> GetComputationCallers(HloComputation* c);
+
   string ToString() const;
 
  private:
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 34f3f914d593bc603c4964663f9cafb70a136fd3..a3ac2568b0f3eec8556a42dbe3c2c64bd8564468 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -31,7 +31,7 @@ namespace {
 
 using ::testing::UnorderedElementsAre;
 
-class CallGraphTest : public HloVerifiedTestBase {
+class CallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
   std::unique_ptr<HloComputation> MakeScalarComputation(
@@ -93,10 +93,10 @@ class CallGraphTest : public HloVerifiedTestBase {
 
 TEST_F(CallGraphTest, SingletonComputation) {
   // Test the call graph of a module with a single computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(1, call_graph->nodes().size());
   EXPECT_TRUE(call_graph->IsFlattened());
 
@@ -112,13 +112,13 @@ TEST_F(CallGraphTest, SingletonComputation) {
 TEST_F(CallGraphTest, UnreachableComputation) {
   // Test the call graph of a module with an entry computation and an
   // unreachable computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(MakeScalarComputation());
   HloComputation* unreachable_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
@@ -134,13 +134,13 @@ TEST_F(CallGraphTest, UnreachableComputation) {
 TEST_F(CallGraphTest, ParallelComputation) {
   // Test a call graph of a module with an entry computation which calls another
   // computation in a parallel context via kMap.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* map_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
   HloComputation* entry_computation = module->AddEntryComputation(
       MakeMappingComputation(map_computation, /*callsites=*/5));
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
@@ -163,13 +163,13 @@ TEST_F(CallGraphTest, ParallelComputation) {
 TEST_F(CallGraphTest, SequentialComputations) {
   // Test a call graph of a module with an entry computation which calls another
   // computation in a sequential context via kCall.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* called_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
   HloComputation* entry_computation = module->AddEntryComputation(
       MakeCallingComputation(called_computation, /*callsites=*/3));
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   // The called computation is only called from one other computation, but there
@@ -196,7 +196,7 @@ TEST_F(CallGraphTest, SequentialComputations) {
 TEST_F(CallGraphTest, ContextBothComputations) {
   // Test a call graph of a module with an entry computation which calls another
   // computation in both a parallel and sequential context.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* subcomputation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
@@ -210,7 +210,7 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   EXPECT_FALSE(call_graph->IsFlattened());
@@ -239,7 +239,7 @@ TEST_F(CallGraphTest, ContextBothComputations) {
 
 TEST_F(CallGraphTest, ComputationWithConditional) {
   // Test a call graph of a module with a conditional.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* true_computation =
       module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kCeil));
   HloComputation* false_computation =
@@ -259,7 +259,7 @@ TEST_F(CallGraphTest, ComputationWithConditional) {
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   EXPECT_EQ(3, call_graph->nodes().size());
 
@@ -298,7 +298,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
   //    c
   //
   // Calls are made via kCall, kWhile, and kMap instructions.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(MakeConditionComputation());
   HloComputation* c_computation =
@@ -328,7 +328,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
     entry_computation = module->AddEntryComputation(builder.Build());
   }
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(5, call_graph->nodes().size());
   EXPECT_FALSE(call_graph->IsFlattened());
 
@@ -418,7 +418,7 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) {
   //    c
   //
   // Calls are made via kCall, kWhile, and kMap instructions.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(MakeConditionComputation());
   HloComputation* c_computation =
@@ -452,7 +452,7 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) {
     entry_computation = module->AddEntryComputation(builder.Build());
   }
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(5, call_graph->nodes().size());
 
   // Verify NearestAncestorsInSameComputation for various instructions in the
@@ -479,10 +479,10 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) {
 
 TEST_F(CallGraphTest, VisitSingletonComputation) {
   // Test the call graph visitor with a call graph with a single node.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   std::vector<HloComputation*> visited;
   TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) {
@@ -494,12 +494,12 @@ TEST_F(CallGraphTest, VisitSingletonComputation) {
 
 TEST_F(CallGraphTest, VisitUnreachableComputation) {
   // Test the call graph visitor with a call graph with an unreachable node.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(MakeScalarComputation());
   HloComputation* unreachable_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   // Test visitation of only reachable nodes.
   {
@@ -531,9 +531,9 @@ TEST_F(CallGraphTest, VisitUnreachableComputation) {
 
 TEST_F(CallGraphTest, VisitWithError) {
   // Test that the call graph visitor properly propagates errors.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   Status status = call_graph->VisitNodes(
       [](const CallGraphNode&) { return InternalError("Visitation failed"); });
diff --git a/tensorflow/compiler/xla/service/call_inliner_test.cc b/tensorflow/compiler/xla/service/call_inliner_test.cc
index e6b566543594a86eb5369ee9b7440f62618f6c5a..0b6e323f75c7a5dae127e20d2a4b92a83a72df3b 100644
--- a/tensorflow/compiler/xla/service/call_inliner_test.cc
+++ b/tensorflow/compiler/xla/service/call_inliner_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -40,7 +40,7 @@ namespace {
 
 // Tests for call inlining that are most tractable at the HLO level (vs
 // ComputationBuilder API in call_test.cc).
-using CallInlinerTest = HloVerifiedTestBase;
+using CallInlinerTest = HloTestBase;
 
 TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
   // "inner" computation just has a control dependency from the "zero" value to
@@ -51,7 +51,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
   HloInstruction* one = inner.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   TF_ASSERT_OK(zero->AddControlDependencyTo(one));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* inner_computation =
       module->AddEmbeddedComputation(inner.Build());
 
@@ -64,7 +64,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
   auto computation = module->AddEntryComputation(outer.Build());
 
   CallInliner call_inliner;
-  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get()));
   ASSERT_TRUE(mutated);
   EXPECT_THAT(computation->root_instruction(), op::Constant());
   EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<float>(),
@@ -79,7 +79,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
 // returns false should be identical to just returning false).
 TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) {
   const Shape pred = ShapeUtil::MakeShape(PRED, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // Create a lambda that calls a function that returns the false predicate.
   // Note we also use this lambda twice by reference, just to make the test a
@@ -107,7 +107,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) {
   auto computation = module->AddEntryComputation(outer.Build());
 
   CallInliner call_inliner;
-  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get()));
   ASSERT_TRUE(mutated);
   EXPECT_THAT(
       computation->root_instruction()->while_condition()->root_instruction(),
@@ -120,7 +120,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) {
 // whole pass.
 TEST_F(CallInlinerTest, InlineWithoutRunningPass) {
   const Shape pred = ShapeUtil::MakeShape(PRED, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder just_false(TestName() + ".false");
   auto* true_constant = just_false.AddInstruction(
@@ -144,7 +144,7 @@ TEST_F(CallInlinerTest, InlineWithoutRunningPass) {
 
 TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) {
   const Shape f32 = ShapeUtil::MakeShape(F32, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder outfeeder(TestName() + ".outfeeder");
   auto value = outfeeder.AddInstruction(
@@ -163,7 +163,7 @@ TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) {
   module->AddEntryComputation(outer.Build());
 
   CallInliner call_inliner;
-  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get()));
   ASSERT_TRUE(mutated);
 }
 
diff --git a/tensorflow/compiler/xla/service/compilation_cache.cc b/tensorflow/compiler/xla/service/compilation_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2662fe46705f4936ce0d654df0943e7d30890ebe
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compilation_cache.cc
@@ -0,0 +1,70 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/compilation_cache.h"
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+int64 GetUniqueId() {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static int64 counter = 0;
+  tensorflow::mutex_lock loc(mu);
+  const int64 id = counter++;
+  return id;
+}
+
+}  // namespace
+
+ExecutionHandle CompilationCache::Insert(
+    std::unique_ptr<Executable> executable) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  CacheKey key = GetUniqueId();
+  VLOG(2) << "inserting cache key: " << key;
+  CHECK_EQ(cache_.count(key), 0);
+  cache_.emplace(key, std::move(executable));
+
+  ExecutionHandle handle;
+  handle.set_handle(key);
+  return handle;
+}
+
+StatusOr<std::shared_ptr<Executable>> CompilationCache::LookUp(
+    const ExecutionHandle& handle) const {
+  tensorflow::mutex_lock lock(mutex_);
+
+  CacheKey key = handle.handle();
+  VLOG(2) << "looking up cache key: " << key;
+  if (cache_.count(key) == 0) {
+    VLOG(2) << "cache key not found: " << key;
+    return InvalidArgumentStrCat("can not find executable with handle ", key);
+  } else {
+    auto& result = cache_.at(key);
+    VLOG(2) << "hit executable: " << result->module().name();
+    return result;
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compilation_cache.h b/tensorflow/compiler/xla/service/compilation_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f94def509d4d4a8950272cb498af5056a698ce0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compilation_cache.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace xla {
+
+// A cache which stores Executables indexed by computation handle and version.
+//
+// TODO(b/119042872): Provide mechanism for removing computations from the
+// compilation cache.
+class CompilationCache {
+ public:
+  CompilationCache() {}
+
+  ExecutionHandle Insert(std::unique_ptr<Executable> executable);
+
+  // Lookup the Executable for the specified handle in the cache. Return a
+  // shared_ptr to the Executable if it exists in the cache.
+  StatusOr<std::shared_ptr<Executable>> LookUp(
+      const ExecutionHandle& handle) const;
+
+ protected:
+  mutable tensorflow::mutex mutex_;
+
+  using CacheKey = int64;
+
+  absl::flat_hash_map<CacheKey, std::shared_ptr<Executable>> cache_
+      GUARDED_BY(mutex_);
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 6d67f970020d278cc7bf61b56350200d3e5cb926..67132274c0dcbfda831c79836d052bb51b753ec7 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 80c630c6201503d88a690f04a88f6fca6f3a438a..8f08c244908efb823b3870c19bdc3491fa87d44f 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -110,6 +110,6 @@ Compiler::GetPlatformCompilers() {
 }
 
 AotCompilationOptions::AotCompilationOptions()
-    : debug_options_(legacy_flags::GetDebugOptionsFromFlags()) {}
+    : debug_options_(GetDebugOptionsFromFlags()) {}
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index c899ffb9dc562426ef14c0d414469c04debeec70..844b42a38d7539cccd5c4e30071c0ea6693e3bba 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -105,8 +105,6 @@ class ComputationPlacer {
   // Map from platform kind to computation placer singleton.
   static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
 
-  se::Platform::Id platform_id_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
 };
 
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
index c43a31b167d47af3c92ed35fa52594fa5da1e4af..289eb6d90239a72ecc0f3312a7e0e8453f946858 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -37,7 +37,7 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class ConditionalSimplifierTest : public HloVerifiedTestBase {
+class ConditionalSimplifierTest : public HloTestBase {
  public:
   // Makes a computation that contains a conditional with constant predicate.
   HloComputation* MakeConditional(HloModule* module);
@@ -96,25 +96,28 @@ HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) {
 }
 
 TEST_F(ConditionalSimplifierTest, ConditionalGetsInlined) {
-  HloComputation* computation = MakeConditional(&module());
-  ASSERT_TRUE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
+  ASSERT_TRUE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Parameter(), op::Constant()));
 }
 
 TEST_F(ConditionalSimplifierTest, ConditionalWithControlDependency) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
 
   auto* true_op = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
   TF_ASSERT_OK(
       true_op->AddControlDependencyTo(computation->root_instruction()));
 
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
   auto* conditional = computation->root_instruction();
   ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
 
@@ -125,11 +128,12 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) {
           HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true))),
       token, /*channel_id=*/0));
   true_computation->AddInstruction(HloInstruction::CreateSendDone(send));
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
   auto* conditional = computation->root_instruction();
   ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
 
@@ -138,18 +142,19 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) {
   auto* recv = true_computation->AddInstruction(HloInstruction::CreateRecv(
       ShapeUtil::MakeShape(F32, {1}), token, /*channel_id=*/0));
   true_computation->AddInstruction(HloInstruction::CreateRecvDone(recv));
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
   auto* conditional = computation->root_instruction();
   ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
   auto* false_computation = conditional->false_computation();
   auto token = false_computation->AddInstruction(HloInstruction::CreateToken());
   false_computation->AddInstruction(HloInstruction::CreateInfeed(
       ShapeUtil::MakeShape(F32, {1}), token, "config"));
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
index 0ac4a65ec6ae55fabd2b48ea2982b94f9551c8d2..7f7f1503a099b3a67ed22cb5978c01da6cf8ba88 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
@@ -51,7 +51,8 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
   Status HandleConvolution(HloInstruction* convolution) override;
 
   // Runs the visitor on a computation.
-  static bool Run(HloComputation* computation);
+  static bool Run(HloComputation* computation,
+                  bool canonicalize_depthwise_filter);
 
   // Returns whether any convolution ops were rewritten.
   const bool changed() const { return changed_; }
@@ -59,18 +60,24 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
   ~ConvolutionVisitor() override = default;
 
  private:
-  explicit ConvolutionVisitor(HloComputation* computation)
-      : computation_(computation) {}
+  explicit ConvolutionVisitor(HloComputation* computation,
+                              bool canonicalize_depthwise_filter = false)
+      : computation_(computation),
+        filter_expansion_(!canonicalize_depthwise_filter) {}
 
   // Current HloComputation instance the ConvolutionVisitor is traversing.
   HloComputation* computation_;
 
   // Whether rewrite has occurred.
   bool changed_ = false;
+
+  // Whether filter expansion is required.
+  bool filter_expansion_;
 };
 
-bool ConvolutionVisitor::Run(HloComputation* computation) {
-  ConvolutionVisitor visitor(computation);
+bool ConvolutionVisitor::Run(HloComputation* computation,
+                             bool canonicalize_depthwise_filter) {
+  ConvolutionVisitor visitor(computation, canonicalize_depthwise_filter);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -190,9 +197,49 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
   HloInstruction* filter_mask = GetExpandedFilterMask(
       filter->shape(), input_feature_dim, output_feature_dim, group_count, add);
   HloInstruction* expanded_filter;
-  // We want to repeat 'filter' in the 'input_feature_dim' dimension
-  // 'group_count' times.
+
   if (group_size == 1) {
+    bool depthwise_separable =
+        (group_count == filter->shape().dimensions(output_feature_dim));
+    // If the code generator handles depthwise separable convolutions
+    // inherently, then no filter expansion is needed.
+    if (!filter_expansion_ && depthwise_separable) {
+      const int64 old_kernel_input_feature_dimension =
+          dim_numbers.kernel_input_feature_dimension();
+      const int64 old_kernel_output_feature_dimension =
+          dim_numbers.kernel_output_feature_dimension();
+
+      // For depthwise convolutions, we want the kernel input feature dimension
+      // to be smaller than the output feature dimension. If that's not the
+      // case, we swap the dimensions.
+      if (old_kernel_input_feature_dimension >
+          old_kernel_output_feature_dimension) {
+        Shape reshaped_filter_shape = filter->shape();
+        auto& dimensions = *reshaped_filter_shape.mutable_dimensions();
+        std::swap(dimensions[old_kernel_input_feature_dimension],
+                  dimensions[old_kernel_output_feature_dimension]);
+
+        auto reshaped_filter =
+            add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+
+        dim_numbers.set_kernel_input_feature_dimension(
+            old_kernel_output_feature_dimension);
+
+        dim_numbers.set_kernel_output_feature_dimension(
+            old_kernel_input_feature_dimension);
+
+        auto new_convolution = HloInstruction::CreateConvolve(
+            convolution->shape(), convolution->mutable_operand(0),
+            reshaped_filter, group_count, convolution->window(), dim_numbers,
+            convolution->precision_config());
+
+        TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+            convolution, std::move(new_convolution)));
+      }
+      return Status::OK();
+    }
+    // We want to repeat 'filter' in the 'input_feature_dim' dimension
+    // 'group_count' times.
     Shape reshaped_filter_shape =
         ShapeUtil::DeleteDimension(input_feature_dim, filter->shape());
     auto reshaped_filter =
@@ -237,7 +284,7 @@ StatusOr<bool> ConvolutionFeatureGroupConverter::Run(HloModule* module) {
                         module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (ConvolutionVisitor::Run(comp)) {
+    if (ConvolutionVisitor::Run(comp, filter_expansion_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
index ce0138e56fbd51daaf5d3ac329ccbe31a9fdbde7..cb6bc04c00a2ff10f970da2a07fb540a561dad5a 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
@@ -27,7 +27,8 @@ namespace xla {
 // convolutions with feature_group_count = 1.
 class ConvolutionFeatureGroupConverter : public HloModulePass {
  public:
-  ConvolutionFeatureGroupConverter() {}
+  ConvolutionFeatureGroupConverter(bool canonicalize_depthwise_filter = false)
+      : filter_expansion_(canonicalize_depthwise_filter) {}
 
   absl::string_view name() const override {
     return "convolution-feature-group-converter";
@@ -36,6 +37,9 @@ class ConvolutionFeatureGroupConverter : public HloModulePass {
   // Run convolution rewriting on the given computation. Returns whether the
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
+
+  // Tells whether filter expansion is required.
+  bool filter_expansion_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 4e547d925f62dce1d2dd23a39a28ca8c23ba9f2f..df6059663876dfde71f4c75d3931b3d2de72c1df 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -442,7 +442,6 @@ class CopyRemover {
               const HloOrdering& ordering, HloModule* module)
       : module_(module),
         alias_analysis_(alias_analysis),
-        ordering_(ordering),
         buffer_value_tracker_(*module, alias_analysis, ordering) {}
 
   // Try to elide the given copy. The copy is elided if the instruction is not
@@ -1003,7 +1002,6 @@ class CopyRemover {
 
   HloModule* module_;
   const HloAliasAnalysis& alias_analysis_;
-  const HloOrdering& ordering_;
 
   // Object tracking the HLO values contained in each HLO buffer.
   BufferValueTracker buffer_value_tracker_;
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 4533ebb99bbba854a029fb8a9a1e31b023be720d..e4e9d7ba05c115be9dd0eb53ebd7de208d514efb 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <set>
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -94,7 +94,7 @@ TEST_F(CopyInsertionTest, SingleParameter) {
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(tuple));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -114,7 +114,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
 
   EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -127,7 +127,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
 TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) {
   // Verify that kCopy instructions which change layout and exist before
   // copy-insertion remain in the graph after copy-insertion.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant =
@@ -181,7 +181,7 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
 
   builder.AddInstruction(HloInstruction::CreateTuple({constant2, x, add}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -217,7 +217,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
   EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2));
   EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
@@ -238,7 +238,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
@@ -261,7 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast));
@@ -283,7 +283,7 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x));
   builder.AddInstruction(HloInstruction::CreateTuple({bitcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
@@ -310,7 +310,7 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(HloOpcode::kParameter,
@@ -351,7 +351,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
@@ -388,7 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           ShapeUtil::GetSubshape(select->shape(), {0}), select, 0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
@@ -403,7 +403,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
 
 class WhileCopyInsertionTest : public CopyInsertionTest {
  protected:
-  WhileCopyInsertionTest() : module_(CreateNewModule()) {}
+  WhileCopyInsertionTest() : module_(CreateNewUnverifiedModule()) {}
 
   // Builds a While condition computation which reads the induction variable
   // from the tuple parameter, and returns a predicate indicating whether this
@@ -1295,7 +1295,7 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
 TEST_F(CopyInsertionTest, SwizzlingWhile) {
   // Test a while instruction with a body which permutes its tuple parameter
   // elements.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1362,7 +1362,7 @@ TEST_F(CopyInsertionTest, CrossingParameters) {
   //   |  / \ |
   //   | /   \|
   //  (p1  ,  p0)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1395,7 +1395,7 @@ TEST_F(CopyInsertionTest, ParametersAliasing) {
   //   |      |
   //   |      |
   //  (p0 ,  p1)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1428,7 +1428,7 @@ TEST_F(CopyInsertionTest, ParameterWithNoAliasing) {
   //   |      |
   //   |      |
   //  (p0 ,  p1)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1461,7 +1461,7 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
   //   |      |
   //   |      |
   //  (p0 ,  p1)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1496,7 +1496,7 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
   //   |    |      |
   //   |    |      |
   //   +-- (p0 ,  p1)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1534,7 +1534,7 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
   //   |    Add----+
   //   |    |      |
   //   +-- (p0 ,  p1)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1569,7 +1569,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) {
   // the operation (instruction) on the element makes the live range of the
   // respective input and output elements different than if the instruction were
   // not there (as in the SwizzlingWhile test above).
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1632,7 +1632,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) {
   // the while body is a single constant (both loop state elements are the same
   // constant). This means no copies are necessary because both loop state
   // elements are the same so interchanging them is a no-op.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1693,7 +1693,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) {
   const Shape loop_state_shape = ShapeUtil::MakeTupleShape(
       {element_shape, element_shape, element_shape, element_shape});
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, element_shape, "param_0"));
@@ -1783,7 +1783,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) {
 TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) {
   // Test a while body and condition which are each simply a constant (root of
   // computation is a constant). The body constant should be copied.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
@@ -1896,7 +1896,7 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) {
   tensorflow::testing::StopTiming();
   for (int i = 0; i < num_iters; ++i) {
     HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
 
     auto builder = HloComputation::Builder("BM_SequentialWhiles");
@@ -1936,7 +1936,7 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
   tensorflow::testing::StopTiming();
   for (int i = 0; i < num_iters; ++i) {
     HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
 
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
@@ -2003,7 +2003,7 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody(
 void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
   tensorflow::testing::StopTiming();
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  config.set_debug_options(GetDebugOptionsFromFlags());
   CopyInsertion copy_insertion;
   const Shape element_shape = ShapeUtil::MakeShape(F32, {});
   std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 36e25cbe678e03f511934eb00af8c3834de2c63e..ce4c2a9cc69240b9565b35a3f2504d7fc9373917 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -96,6 +96,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
         "//tensorflow/compiler/xla/service:map_inliner",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -824,7 +825,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -846,7 +846,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -887,7 +886,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -961,17 +959,16 @@ tf_cc_test(
     srcs = ["cpu_copy_insertion_test.cc"],
     deps = [
         ":cpu_copy_insertion",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -997,7 +994,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 73b03440cbb936017257b8a92f16dcc25d41e21c..796a7cf94d02b0ad42366387a9d3f8d589b8840a 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -61,19 +61,6 @@ Disabling these as a starting point.
 // TODO(b/64227304) Creating a custom pass pipeline will replace this.
 
 namespace {
-class FilteredFunctionPassManager : public llvm::legacy::FunctionPassManager {
- public:
-  FilteredFunctionPassManager(llvm::Module* m, bool disable_expensive_passes)
-      : llvm::legacy::FunctionPassManager(m),
-        disable_expensive_passes_(disable_expensive_passes) {}
-  void add(llvm::Pass* p) override {
-    llvm::legacy::FunctionPassManager::add(p);
-  }
-
- private:
-  bool disable_expensive_passes_;
-};
-
 class FilteredPassManager : public llvm::legacy::PassManager {
  public:
   explicit FilteredPassManager(bool disable_expensive_passes)
@@ -96,8 +83,7 @@ class FilteredPassManager : public llvm::legacy::PassManager {
 std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
     llvm::Module& module) const {
   FilteredPassManager module_passes(disable_expensive_passes_);
-  FilteredFunctionPassManager function_passes(&module,
-                                              disable_expensive_passes_);
+  llvm::legacy::FunctionPassManager function_passes(&module);
 
   VLOG(2) << "IR before optimizations";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index 2083f440fdd971db1b675d005664d25e6de53dbe..c58175428fea6a2d38253c35de598b99a4281bf1 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -32,7 +32,7 @@ namespace cpu {
 
 using ::testing::ElementsAre;
 
-class ConvCanonicalizationTest : public HloVerifiedTestBase {
+class ConvCanonicalizationTest : public HloTestBase {
  public:
   ConvCanonicalizationTest() {
     for (int i = 0; i < 2; ++i) {
@@ -87,7 +87,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
       input, kernel, /*feature_group_count=*/1, conv_window_, dnums,
       DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
@@ -96,7 +96,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
         return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
       });
   ConvCanonicalization conv_canonicalization(&target_machine_features);
-  EXPECT_TRUE(conv_canonicalization.Run(module).ValueOrDie());
+  EXPECT_TRUE(conv_canonicalization.Run(module.get()).ValueOrDie());
 
   const HloInstruction* output_reshape = entry_computation->root_instruction();
   EXPECT_EQ(HloOpcode::kTranspose, output_reshape->opcode());
@@ -150,7 +150,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
       input, kernel, /*feature_group_count=*/1, conv_window_, dnums,
       DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
@@ -158,7 +158,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
         return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
       });
   ConvCanonicalization conv_canonicalization(&target_machine_features);
-  EXPECT_FALSE(conv_canonicalization.Run(module).ValueOrDie());
+  EXPECT_FALSE(conv_canonicalization.Run(module.get()).ValueOrDie());
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 4ce5a8a29255a763c83941efb6de9b7c652cedb4..2bf24c15c1f050b200b1d9af2d95286f9a9dbe4c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -76,6 +76,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -249,6 +250,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
 
+  pipeline.AddPass<HloGetDimensionSizeRewriter>();
   pipeline.AddPass<MapInliner>();
 
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
@@ -268,10 +270,10 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true);
-    pass.AddPass<AlgebraicSimplifier>(
-        /*is_layout_sensitive=*/false,
-        [](const Shape&, const Shape&) { return false; },
-        /*enable_dot_strength_reduction=*/false);
+    AlgebraicSimplifierOptions options(
+        [](const Shape&, const Shape&) { return false; });
+    options.set_enable_dot_strength_reduction(false);
+    pass.AddPass<AlgebraicSimplifier>(options);
     pass.AddPass<HloDCE>();
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
@@ -334,10 +336,11 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
     pass.AddInvariantChecker<HloVerifier>(
         /*layout_sensitive=*/true,
         /*allow_mixed_precision=*/false);
-    pass.AddPass<HloPassFix<AlgebraicSimplifier>>(
-        /*is_layout_sensitive=*/true,
-        [](const Shape&, const Shape&) { return true; },
-        /*enable_dot_strength_reduction=*/false);
+    AlgebraicSimplifierOptions options(
+        [](const Shape&, const Shape&) { return true; });
+    options.set_is_layout_sensitive(true);
+    options.set_enable_dot_strength_reduction(false);
+    pass.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
     pass.AddPass<HloDCE>();
     pass.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
   }
@@ -587,9 +590,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // Select an order for emitting the HLO instructions for each
   // computation. Using this sequence enables tighter buffer liveness analysis
   // and reduced memory usage (as compared to using DependencyHloOrdering).
-  TF_ASSIGN_OR_RETURN(
-      HloSchedule schedule,
-      ScheduleModule(*module, BufferSizeBytesFunction(), DFSMemoryScheduler));
+  TF_ASSIGN_OR_RETURN(HloSchedule schedule,
+                      ScheduleModule(module.get(), BufferSizeBytesFunction(),
+                                     DFSMemoryScheduler));
 
   // Run buffer allocation on the HLO graph.
   TF_ASSIGN_OR_RETURN(
@@ -779,7 +782,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
     XLA_VLOG_LINES(2, module->ToString());
 
     TF_ASSIGN_OR_RETURN(HloSchedule schedule,
-                        ScheduleModule(*module, BufferSizeBytesFunction()));
+                        ScheduleModule(module, BufferSizeBytesFunction()));
 
     // Run buffer analysis on the HLO graph. This analysis figures out which
     // temporary buffers are required to run the computation.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
index c9fb34be1cd582c71618c770c892058c233c571a..c085f85fb73e98e4c7ba15af8db8bb19c2499f5f 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -52,7 +52,7 @@ int64 CountCopies(const HloModule& module) {
   return count;
 }
 
-class CpuCopyInsertionTest : public HloVerifiedTestBase {
+class CpuCopyInsertionTest : public HloTestBase {
  protected:
   void InsertCopies(HloModule* module) {
     CpuCopyInsertion copy_insertion;
@@ -65,7 +65,7 @@ class CpuCopyInsertionTest : public HloVerifiedTestBase {
 TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) {
   // Test a while body and condition which are each simply a constant (root of
   // computation is a constant). Each constant should be copied.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
@@ -90,7 +90,7 @@ TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) {
 
   module->AddEntryComputation(builder.Build());
 
-  InsertCopies(module);
+  InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 3);
 
@@ -103,7 +103,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) {
   // Test a kCall instruction which calls a computation which produces a three
   // element tuple: one is a constant, one is a parameter, and one is produced
   // in the computation. The constant and parameter should be copied.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
@@ -127,7 +127,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) {
 
   module->AddEntryComputation(builder.Build());
 
-  InsertCopies(module);
+  InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*subcomputation), 2);
   EXPECT_THAT(subcomputation->root_instruction(),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 29abf38e439d919ff93629ed992cb3ff93a929bd..818b2b0d0db2893e11fa46c7867e6c74bbbb6905 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -51,8 +51,7 @@ namespace cpu {
 CpuExecutable::CpuExecutable(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<const HloModule> hlo_module,
-    const string& entry_function_name,
+    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
     : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 3c3c047bfe8ee0d1ad90ede2432a86264f47870b..3b91b15ba9b5603b50f78f489e9a3fdad354c083 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -49,7 +49,7 @@ class CpuExecutable : public Executable {
  public:
   CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
                 std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<const HloModule> hlo_module,
+                std::unique_ptr<HloModule> hlo_module,
                 const string& entry_function_name,
                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
index e6b6fcdf684eadb3702e490bbe24dbb7b3b52ec7..9cbfb88834bf51f4df54e97efe6cd7bf88b12334 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -25,7 +25,7 @@ namespace {
 
 using ::testing::HasSubstr;
 
-class CpuHloSupportCheckerTest : public HloVerifiedTestBase {
+class CpuHloSupportCheckerTest : public HloTestBase {
  protected:
   CpuHloSupportChecker& checker() { return checker_; }
 
@@ -42,10 +42,10 @@ TEST_F(CpuHloSupportCheckerTest, Add) {
       HloInstruction::CreateParameter(1, scalar_shape, "param1"));
   builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  TF_ASSERT_OK(checker().Run(module).status());
+  TF_ASSERT_OK(checker().Run(module.get()).status());
 }
 
 TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) {
@@ -60,7 +60,7 @@ TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) {
   // Since verifier is reporting sparse layouts as errors, we should
   // use a regular HloModule instead of VerifiedHloModule to avoid
   // verifier errors being triggered in the destructor.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   Status status = checker().Run(module.get()).status();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 7d99b914d4f5e5d27722bcd098d2ae0c54a36a23..c77d5988ba3d204a6e9da2ff1337d68c44c19e62 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -58,7 +58,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), exp0, arg1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -77,7 +77,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -98,7 +98,7 @@ TEST_F(InstructionFusionTest, DotOperationNoFusion_Bitcast) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), bitcast0, arg1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -119,7 +119,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), reshape0, arg1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -138,7 +138,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TooLarge) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1, 32 * 1024}), arg0, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -157,7 +157,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {2, 1024}), arg0, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -321,7 +321,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -350,7 +350,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -370,7 +370,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, broadcast1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -392,7 +392,7 @@ TEST_F(OpcodeFusionTest, DynamicSlice_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, dynamic_slice2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -410,7 +410,7 @@ TEST_F(OpcodeFusionTest, Exponential_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -429,7 +429,7 @@ TEST_F(OpcodeFusionTest, Reshape_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -447,7 +447,7 @@ TEST_F(OpcodeFusionTest, Reverse_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, reverse1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -466,7 +466,7 @@ TEST_F(OpcodeFusionTest, Slice_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, slice1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -489,7 +489,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, transpose2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -498,7 +498,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) {
 }
 
 TEST_F(OpcodeFusionTest, UnaryMapOfExp) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {3, 4});
@@ -517,7 +517,7 @@ TEST_F(OpcodeFusionTest, UnaryMapOfExp) {
 }
 
 TEST_F(OpcodeFusionTest, BinaryMapOfExps) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {3, 4});
@@ -542,7 +542,7 @@ TEST_F(OpcodeFusionTest, BinaryMapOfExps) {
 }
 
 TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -573,7 +573,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
 }
 
 TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50});
@@ -641,7 +641,7 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastUnary) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(large_shape, HloOpcode::kExp, small_exp));
 
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto did_fusion = CpuInstructionFusion().Run(module.get());
@@ -670,7 +670,7 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastBinary) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       large_shape, HloOpcode::kAdd, small_exp, large_param));
 
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto did_fusion = CpuInstructionFusion().Run(module.get());
@@ -712,7 +712,7 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name,
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1,
                                              /*k=*/50, /*n=*/19,
                                              /*add_extra_use_for_dot=*/false);
@@ -725,7 +725,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/1,
                                              /*add_extra_use_for_dot=*/false);
@@ -738,7 +738,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/19,
                                              /*add_extra_use_for_dot=*/false);
@@ -751,7 +751,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/1,
                                              /*add_extra_use_for_dot=*/true);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 97659b88a7974d7caf91ab0d4741f3635e4dae4a..6c61b64758ede160e2d50e4429590a789ec253c3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -73,7 +73,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) {
   auto result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -114,7 +114,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -158,7 +158,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) {
   auto tuple_result = builder.AddInstruction(
       HloInstruction::CreateTuple({dot_a_result, dot_b_result}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -192,7 +192,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) {
   auto dot_result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -232,7 +232,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
   auto dot_result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -353,7 +353,7 @@ static void AssertCorrectLayoutForDotOutputFusion(
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
@@ -365,7 +365,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
@@ -377,7 +377,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
@@ -389,7 +389,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
@@ -401,7 +401,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
@@ -413,7 +413,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index b8ace5702688096822573c7afae234cbcbe77b28..92debb83e33b1400a59e5eef0f90971392ab7b22 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -22,7 +22,6 @@ limitations under the License.
 namespace {
 
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
-const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
 const char* const kXlaEnableExperimentalLlvmIrGemm =
     "xla_enable_experimental_llvm_ir_gemm";
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index d6968323f337d83e41b5e031cc49fab5b6a17b21..cf97a8bde0757b67bdea62c30ea0e8e63161c573 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -111,7 +111,7 @@ IrEmitter::IrEmitter(
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
     HloComputation* computation, const string& function_name_prefix,
     bool is_top_level_computation,
-    const std::vector<const HloInstruction*>* instruction_order) {
+    const std::vector<HloInstruction*>* instruction_order) {
   string function_name = name_uniquer_.GetUniqueName(function_name_prefix);
   VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix
           << "]; ordered? " << (instruction_order != nullptr);
@@ -140,7 +140,7 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   // readcyclecounter if it is unavailable.
   bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 ||
                     arch_type_ == llvm::Triple::ArchType::x86_64;
-  profiling_state_ = ProfilingState(use_rdtscp, GetProfileCountersArgument());
+  profiling_state_ = ProfilingState(use_rdtscp);
   if (instruction_order == nullptr) {
     TF_RETURN_IF_ERROR(computation->Accept(this));
   } else {
@@ -1379,33 +1379,6 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   return Status::OK();
 }
 
-// Fills up the free variables in 'index_with_free_var' with values from
-// 'filler_index'. The size of free variables must be the same as the
-// size of 'filler_index'.
-//
-// This is often used after dimension reduction, where
-// 'index_with_free_var' has one or more dimensions reduced, which serves as
-// free variables (represented as nullptr). For example, if we have a 4
-// dimensional input and index for the dimension being reduced is
-// 2 (third dimension), we will have an index like [i, j, NULL, k]
-// after reduced dimension.
-//
-// Here we fill up that free variable by 'filler_index', which contains
-// the value in the reduced dimension.
-static llvm_ir::IrArray::Index FillReducedDimensionIndex(
-    llvm_ir::IrArray::Index index_with_free_var,
-    llvm_ir::IrArray::Index filler_index) {
-  llvm_ir::IrArray::Index::const_iterator it = filler_index.begin();
-
-  for (size_t i = 0; i < index_with_free_var.size(); ++i) {
-    if (index_with_free_var[i] == nullptr) {
-      index_with_free_var[i] = *it++;
-    }
-  }
-  CHECK(filler_index.end() == it);
-  return index_with_free_var;
-}
-
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   return EmitTargetAddressForOp(parameter);
@@ -1536,7 +1509,8 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
 
     case HloOpcode::kMaximum:
       return [root_is_floating_point, root_is_signed](
-                 llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+                 llvm::IRBuilder<>* b, llvm::Value* lhs,
+                 llvm::Value* rhs) -> llvm::Value* {
         if (root_is_floating_point) {
           return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum,
                                               {lhs, rhs}, {lhs->getType()}, b);
@@ -1551,7 +1525,8 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
 
     case HloOpcode::kMinimum:
       return [root_is_floating_point, root_is_signed](
-                 llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+                 llvm::IRBuilder<>* b, llvm::Value* lhs,
+                 llvm::Value* rhs) -> llvm::Value* {
         if (root_is_floating_point) {
           return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum,
                                               {lhs, rhs}, {lhs->getType()}, b);
@@ -2192,14 +2167,6 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   return Status::OK();
 }
 
-// If `hlo` is a Transpose, returns its operand; otherwise returns `hlo` itself.
-static const HloInstruction* StripTranspose(const HloInstruction& hlo) {
-  if (hlo.IsRank2Transpose()) {
-    return hlo.operand(0);
-  }
-  return &hlo;
-}
-
 Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   auto* root = fusion->fused_expression_root();
   if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 136b88ff75ea8a5f48b42d3476219f18f5ecb39a..f529c613a3de62996feeca854213155df5943e7b 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -101,7 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   StatusOr<llvm::Function*> EmitComputation(
       HloComputation* computation, const string& function_name_prefix,
       bool is_top_level_computation,
-      const std::vector<const HloInstruction*>* instruction_order);
+      const std::vector<HloInstruction*>* instruction_order);
 
   llvm::IRBuilder<>* b() { return &b_; }
 
@@ -467,9 +467,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // profiling a computation.
   class ProfilingState {
    public:
-    ProfilingState() : use_rdtscp_(false), prof_counters_(nullptr) {}
-    ProfilingState(bool use_rdtscp, llvm::Value* prof_counters)
-        : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {}
+    ProfilingState() : use_rdtscp_(false) {}
+    explicit ProfilingState(bool use_rdtscp) : use_rdtscp_(use_rdtscp) {}
 
     // Record the cycle counter before an HLO executes.
     void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo);
@@ -494,9 +493,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
     // intrinsic?
     bool use_rdtscp_;
 
-    // The argument which corresponds to the profile counter buffer.
-    llvm::Value* prof_counters_;
-
     // The first read cycle counter in the program.
     llvm::Value* first_read_cycle_start_ = nullptr;
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index fad76338a57cd9eb21d9469ca8552efa8ea0129b..f0b65046c14ccec5336abf7c4d05d1d755f783bd 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -17,13 +17,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
+class ParallelTaskAssignmentTest : public HloTestBase {
  protected:
   const HloCostAnalysis::ShapeSizeFunction shape_size_func_ =
       cpu::CpuExecutable::ShapeSizeBytes;
@@ -35,7 +35,7 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
   cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_;
 
   ParallelTaskAssignmentTest()
-      : HloVerifiedTestBase(), target_machine_features_([](int64 shape_size) {
+      : HloTestBase(), target_machine_features_([](int64 shape_size) {
           return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
         }) {}
 
@@ -57,8 +57,9 @@ TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -84,8 +85,9 @@ TEST_F(ParallelTaskAssignmentTest,
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -100,8 +102,9 @@ TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -116,8 +119,9 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index 669eeb95f3299623a7556bfbb8045fd77f5d0745..722aa3120ef4d8c957873ac58c361f19632dde1f 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <limits>
 #include <memory>
 #include <string>
 #include <utility>
@@ -41,61 +42,60 @@ void KeyValueSort(std::pair<KeyType, int64>* row_to_sort, int64 num_elements) {
   std::sort(row_to_sort, row_to_sort + num_elements);
 }
 
-// For floating point numbers, we want a total order comparator. -NaN and NaN
-// should appear at the beginning and end of the ordering, and -0.0 should
-// appear before 0.0. Also we want to have a stable sort, so if the keys are the
-// same, we compare the index values.
-template <typename KeyType>
-bool LessThan(KeyType lhs, int64 lhs_index, KeyType rhs, int64 rhs_index) {
-  bool lhs_is_negative = std::signbit(lhs);
-  bool rhs_is_negative = std::signbit(rhs);
-  // If the signs are different, we can just compare the signs.
-  if (lhs_is_negative != rhs_is_negative) {
-    return lhs_is_negative && !rhs_is_negative;
-  }
-  bool lhs_nan = std::isnan(lhs);
-  bool rhs_nan = std::isnan(rhs);
-  // Exactly one number is nan?
-  if (lhs_nan != rhs_nan) {
-    if (lhs_nan) {
-      return lhs_is_negative;
-    }
-    return !rhs_is_negative;
+// We would like a total order of floating point numbers so that the
+// sort has a predictable behavior in the presence of NaNs. Rather
+// than using floating point comparison, we use the following trick:
+// If f is a float, and
+// x = bit_cast<int32>(f);
+// y = x < 0 ? 0x7FFFFFFF - x : x;
+// then y is ordered as an int32 such that finite values have the
+// obvious order, -0 is ordered before 0, and -NaN and NaN appear at
+// the beginning and end of the ordering.
+template <typename CastType, typename UnsignedCastType, typename KeyType>
+CastType Convert(KeyType value) {
+  CastType casted_value;
+  memcpy(&casted_value, &value, sizeof(CastType));
+  if (casted_value < 0) {
+    return static_cast<UnsignedCastType>(std::numeric_limits<CastType>::max()) -
+           casted_value;
   }
-  if (lhs != rhs) {
-    return lhs < rhs;
-  }
-  return lhs_index < rhs_index;
+  return casted_value;
+}
+
+template <typename CastType, typename UnsignedCastType, typename KeyType>
+bool LessThan(KeyType lhs, KeyType rhs) {
+  return Convert<CastType, UnsignedCastType>(lhs) <
+         Convert<CastType, UnsignedCastType>(rhs);
 }
 
 template <>
 void KeyValueSort(std::pair<double, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<double, int64>& lhs,
-               const std::pair<double, int64>& rhs) -> bool {
-              return LessThan(lhs.first, lhs.second, rhs.first, rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<double, int64>& lhs,
+                      const std::pair<double, int64>& rhs) -> bool {
+                     return LessThan<int64, uint64>(lhs.first, rhs.first);
+                   });
 }
 
 template <>
 void KeyValueSort(std::pair<float, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<float, int64>& lhs,
-               const std::pair<float, int64>& rhs) -> bool {
-              return LessThan(lhs.first, lhs.second, rhs.first, rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<float, int64>& lhs,
+                      const std::pair<float, int64>& rhs) -> bool {
+                     return LessThan<int32, uint32>(lhs.first, rhs.first);
+                   });
 }
 
 template <>
 void KeyValueSort(std::pair<Eigen::half, int64>* row_to_sort,
                   int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<Eigen::half, int64>& lhs,
-               const std::pair<Eigen::half, int64>& rhs) -> bool {
-              return LessThan(
-                  Eigen::half_impl::half_to_float(lhs.first), lhs.second,
-                  Eigen::half_impl::half_to_float(rhs.first), rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<Eigen::half, int64>& lhs,
+                      const std::pair<Eigen::half, int64>& rhs) -> bool {
+                     return LessThan<int32, uint32>(
+                         Eigen::half_impl::half_to_float(lhs.first),
+                         Eigen::half_impl::half_to_float(rhs.first));
+                   });
 }
 
 template <typename KeyType>
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
index 1a3d82de954318368d61e3feeb0345dc592dcd8b..7d8e51f909e3db699b745f94a6c625407bc4a6e3 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -19,14 +19,14 @@ limitations under the License.
 #include <random>
 
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 namespace cpu {
 namespace {
 
-class ShapePartitionAssignerTest : public HloVerifiedTestBase {
+class ShapePartitionAssignerTest : public HloTestBase {
  protected:
   typedef std::vector<int64> Vec;
 
@@ -91,7 +91,7 @@ TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
             expected_partitions);
 }
 
-class ShapePartitionIteratorTest : public HloVerifiedTestBase {
+class ShapePartitionIteratorTest : public HloTestBase {
  protected:
   typedef std::vector<std::pair<int64, int64>> Partition;
 };
@@ -145,7 +145,7 @@ TEST_F(ShapePartitionIteratorTest, Shape532WithLayout210) {
   }
 }
 
-class RandomShapePartitionIteratorTest : public HloVerifiedTestBase {
+class RandomShapePartitionIteratorTest : public HloTestBase {
  protected:
   typedef std::vector<std::pair<int64, int64>> Partition;
   RandomShapePartitionIteratorTest()
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index 4b129c95d46d8b5a119e5d23eef387daf7863cce..382dfd0d99df87bbadfe541ddaa32cd6da8e8068 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -48,7 +48,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service/cpu:cpu_instruction_fusion",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index 18ee25ba9158c28baaf01492c290638b9673f1ec..f8f5f392da8ab3348e63185aecf7b639daacaa42 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -50,7 +50,7 @@ class CpuEigenDotOperationTest
         /*entry_point_name=*/"entry",
         /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
 
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewVerifiedModule();
     hlo_module->AddEntryComputation(std::move(entry_computation));
 
     CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
index 00a7aa2ad2f6bac4877302296ccb76222557535c..e30f95311fce229f9c559d3bb40142151e8bf3e3 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
@@ -46,7 +46,7 @@ class CpuExternalConstantsTest : public CpuCodegenTest {
     builder.AddInstruction(
         HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant));
 
-    std::unique_ptr<HloModule> module = CreateNewModule();
+    std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
     module->AddEntryComputation(builder.Build());
 
     CompileAndVerifyIr(std::move(module), filecheck_pattern,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
index 1deb412064b02988a8d4a6d726969c948d354d47..04a81dfd35f459ff1fdb3181dc8fc65c62a37d4f 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
@@ -34,7 +34,7 @@ namespace xla {
 namespace cpu {
 namespace {
 
-class CpuFusionTest : public HloVerifiedTestBase {
+class CpuFusionTest : public HloTestBase {
  protected:
   CpuFusionTest() {}
 
@@ -57,11 +57,11 @@ TEST_F(CpuFusionTest, FuseTwoElementwiseOps) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, add1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The computation root instruction was fused. Verify the fusion instruction
   // is now the root.
@@ -104,11 +104,11 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, two, floor));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The computation root instruction was fused. Verify the fusion instruction
   // is now the root.
@@ -131,7 +131,7 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) {
 TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) {
   // Test a chain of fusible ops with a non-fusible op (a reduce) thrown in the
   // middle.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto input_literal = LiteralUtil::CreateR1<float>({-1.5, -2.5, -3.0});
   Shape vshape = input_literal.shape();
@@ -183,7 +183,7 @@ TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) {
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The computation root instruction was fused. Verify the fusion instruction
   // is now the root.
@@ -250,12 +250,12 @@ TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) {
       builder.AddInstruction(HloInstruction::CreateTuple({add1, add2}));
 
   // Create computation and module.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Run fusion.
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   auto fusion1 = result->operand(0);
   auto fusion2 = result->operand(1);
@@ -310,11 +310,11 @@ TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) {
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({negate1, negate2, exp2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The only fusion instruction should be operand 0 of the tuple (formerly
   // negate1).
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index a434c04a980b9b3cd849792b97a0d9e965ba09f2..9b10c49f4f547edfb2164f98c49cceb031148bdc 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -91,7 +91,7 @@ TEST_P(CpuUnaryIntrinsicTest, DoIt) {
       /*entry_point_name=*/"entry",
       /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   string check_lines{spec.check_lines.data(), spec.check_lines.size()};
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 3b87683ffffefd2aa24dd234cc072425bef00a24..fa0e09ff6b5694c0e97963b83c6e541b858a1376 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -63,7 +63,7 @@ CHECK-NOT: private constant [48 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text));
+                          ParseAndReturnVerifiedModule(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
@@ -104,14 +104,14 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [4 x i8]
-CHECK: private constant [8 x i8]
+CHECK-DAG: private constant [4 x i8]
+CHECK-DAG: private constant [8 x i8]
 CHECK-NOT: private constant [4 x i8]
 CHECK-NOT: private constant [8 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text));
+                          ParseAndReturnVerifiedModule(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index b35fd9dad877c319c3d0110c96a00aeefa78769e..a7702c2aeeaff8a46a2c4f2785ccb873ea2c08e5 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -56,7 +56,7 @@ TEST_F(CpuNoAliasTest, Concat) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it.
diff --git a/tensorflow/compiler/xla/service/defuser_test.cc b/tensorflow/compiler/xla/service/defuser_test.cc
index e727ba49cb6321e499b5d50d5f45e7f7f6bb6fef..64fb50318394918b277fd717994f5366d762ac36 100644
--- a/tensorflow/compiler/xla/service/defuser_test.cc
+++ b/tensorflow/compiler/xla/service/defuser_test.cc
@@ -18,19 +18,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace {
 
-class DefuserTest : public HloVerifiedTestBase {
+class DefuserTest : public HloTestBase {
  protected:
   // Returns the number of fusion instructions in the module.
-  int FusionCount() {
+  int FusionCount(const HloModule* m) {
     int count = 0;
-    for (HloComputation* computation : module().computations()) {
+    for (HloComputation* computation : m->computations()) {
       if (computation->IsFusionComputation()) {
         count++;
       }
@@ -43,6 +43,7 @@ class DefuserTest : public HloVerifiedTestBase {
 };
 
 TEST_F(DefuserTest, NoFusionInstruction) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -51,13 +52,14 @@ TEST_F(DefuserTest, NoFusionInstruction) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, param0, param1));
 
-  module().AddEntryComputation(builder.Build());
-  EXPECT_EQ(0, FusionCount());
+  m->AddEntryComputation(builder.Build());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
-  EXPECT_FALSE(defuser_.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(defuser_.Run(m.get()).ValueOrDie());
 }
 
 TEST_F(DefuserTest, TrivialFusionInstructionAsRoot) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -66,21 +68,22 @@ TEST_F(DefuserTest, TrivialFusionInstructionAsRoot) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, param0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add},
                                        HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(1, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(1, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Parameter(), op::Parameter()));
 }
 
 TEST_F(DefuserTest, TrivialFusionInstructionNotAsRoot) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -91,21 +94,22 @@ TEST_F(DefuserTest, TrivialFusionInstructionNotAsRoot) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(shape_, HloOpcode::kNegate, add));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add},
                                        HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(), op::Negate(op::Fusion()));
 
-  EXPECT_EQ(1, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(1, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Negate(op::Add(op::Parameter(), op::Parameter())));
 }
 
 TEST_F(DefuserTest, NonTrivialFusionInstruction) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -128,22 +132,23 @@ TEST_F(DefuserTest, NonTrivialFusionInstruction) {
   auto add2 = builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction(
       {add2, constant, div, mul, sub, negate, add},
       HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(1, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(1, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Constant(), op::Divide()));
 }
 
 TEST_F(DefuserTest, MultipleFusionInstructions) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -166,7 +171,7 @@ TEST_F(DefuserTest, MultipleFusionInstructions) {
   auto add2 = builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add2, constant, div, mul},
                                        HloInstruction::FusionKind::kLoop);
   computation->CreateFusionInstruction({sub, negate, add},
@@ -174,15 +179,16 @@ TEST_F(DefuserTest, MultipleFusionInstructions) {
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(2, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(2, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Constant(), op::Divide()));
 }
 
 TEST_F(DefuserTest, NestedFusionInstructions) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -193,7 +199,7 @@ TEST_F(DefuserTest, NestedFusionInstructions) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(shape_, HloOpcode::kNegate, add));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   auto outer_fusion = computation->CreateFusionInstruction(
       {negate, add}, HloInstruction::FusionKind::kLoop);
   HloInstruction* fused_negate = outer_fusion->fused_expression_root();
@@ -203,9 +209,9 @@ TEST_F(DefuserTest, NestedFusionInstructions) {
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(2, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(2, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(), op::Negate(op::Add()));
 }
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
index b3549acfc291a54b2345b006310613c3a45a4b47..ed37099a5428075928ec98b134632867d58bbfe7 100644
--- a/tensorflow/compiler/xla/service/despecializer.cc
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
 #include "tensorflow/compiler/xla/service/defuser.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
 #include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
 
 namespace xla {
@@ -45,6 +46,7 @@ class ControlDepRemover : public HloModulePass {
 
 Despecializer::Despecializer() : pipeline_("despecializer") {
   // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<HloDescheduler>();
   pipeline_.AddPass<ControlDepRemover>();
   pipeline_.AddPass<Defuser>();
   pipeline_.AddPass<ImplicitBroadcastRemover>();
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 4159aa281fa2b66d310d7c135f123a5a3bb83270..d6371283221b63b30f968929fe2807eae3f22df0 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -108,6 +108,7 @@ class DfsHloVisitorBase {
   virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0;
+  virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 4cd10ab06cd3b804406607212d3f3c316d6cff95..e57184f639f4f2c618b980a5082381f4b9c28b19 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -203,6 +203,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleAfterAll(HloInstructionPtr token) override {
     return DefaultAction(token);
   }
+  Status HandleGetDimensionSize(HloInstructionPtr get_size) override {
+    return DefaultAction(get_size);
+  }
 
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8bfc8905064bcd7b68fe259fbcc1546ff083dbd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+Status DynamicParameterBinding::Bind(
+    const DynamicParameter& dynamic_parameter,
+    const DynamicDimension& dynamic_dimension) {
+  auto result = bindings_.emplace(dynamic_dimension, dynamic_parameter);
+  TF_RET_CHECK(result.second);
+  return Status::OK();
+}
+
+absl::optional<DynamicParameterBinding::DynamicParameter>
+DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) {
+  auto param_iter = bindings_.find(dynamic_dimension);
+  if (param_iter == bindings_.end()) {
+    return absl::nullopt;
+  }
+  return param_iter->second;
+}
+
+DynamicParameterBindingProto DynamicParameterBinding::ToProto() const {
+  DynamicParameterBindingProto result;
+  for (const auto& binding : bindings_) {
+    const DynamicDimension& dynamic_dimension = binding.first;
+    const DynamicParameter& dynamic_param = binding.second;
+    DynamicParameterBindingProto::Binding binding_proto;
+    binding_proto.set_dynamic_param_num(dynamic_param.parameter_num);
+    for (int64 i : dynamic_param.parameter_index) {
+      binding_proto.add_dynamic_param_index(i);
+    }
+
+    binding_proto.set_target_param_num(dynamic_dimension.parameter_num);
+
+    for (int64 i : dynamic_dimension.parameter_index) {
+      binding_proto.add_target_param_index(i);
+    }
+
+    binding_proto.set_target_param_dim_num(dynamic_dimension.dimension);
+    result.add_entries()->Swap(&binding_proto);
+  }
+  return result;
+}
+
+StatusOr<DynamicParameterBinding> DynamicParameterBinding::CreateFromProto(
+    const DynamicParameterBindingProto& proto) {
+  DynamicParameterBinding result;
+  for (const DynamicParameterBindingProto::Binding& binding : proto.entries()) {
+    int64 dynamic_param_num = binding.dynamic_param_num();
+    ShapeIndex dynamic_param_index(binding.dynamic_param_index().begin(),
+                                   binding.dynamic_param_index().end());
+    int64 target_param_num = binding.target_param_num();
+    ShapeIndex target_param_index(binding.target_param_index().begin(),
+                                  binding.target_param_index().end());
+    int64 target_dim_num = binding.target_param_num();
+
+    TF_RETURN_IF_ERROR(
+        result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index},
+                    DynamicDimension{target_param_num, target_param_index,
+                                     target_dim_num}));
+  }
+
+  return result;
+}
+
+string DynamicParameterBinding::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("DynamicParameterBinding: ");
+  for (const auto& binding : bindings_) {
+    const DynamicDimension& dynamic_dimension = binding.first;
+    const DynamicParameter& dynamic_param = binding.second;
+    pieces.push_back(absl::StrFormat(
+        " -- Input param number %lld at %s has dim %lld as dynamic"
+        " dimension, which is represented by param number %lld at "
+        "%s",
+        dynamic_dimension.parameter_num,
+        dynamic_dimension.parameter_index.ToString(),
+        dynamic_dimension.dimension, dynamic_param.parameter_num,
+        dynamic_param.parameter_index.ToString()));
+  }
+  return absl::StrJoin(pieces, "\n");
+}
+
+Status DynamicParameterBinding::ForEachBinding(BindingFn fn) const {
+  for (const auto& binding : bindings_) {
+    TF_RETURN_IF_ERROR(fn(binding.second, binding.first));
+  }
+  return Status::OK();
+}
+
+Status DynamicParameterBinding::Verify(const HloModule& module) const {
+  const HloComputation* entry = module.entry_computation();
+  return ForEachBinding([&](const DynamicParameter& dynamic_parameter,
+                            const DynamicDimension& dynamic_dimension)
+                            -> Status {
+    TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(
+        entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(),
+        dynamic_parameter.parameter_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(
+        entry->parameter_instruction(dynamic_dimension.parameter_num)->shape(),
+        dynamic_dimension.parameter_index));
+    TF_RET_CHECK(
+        dynamic_dimension.dimension <
+        ShapeUtil::Rank(ShapeUtil::GetSubshape(
+            entry->parameter_instruction(dynamic_dimension.parameter_num)
+                ->shape(),
+            dynamic_dimension.parameter_index)));
+    return Status::OK();
+  });
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const DynamicParameterBinding& binding) {
+  out << binding.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd474d8eed1b2c30ddb8f624a864198c74eacaba
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
+
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+class HloModule;
+// We currently use an explicit API that takes an extra parameter to indicate
+// the runtime size of a dynamic dimension. DynamicParameterBinding indicates
+// the relationship between parameter: We can have a dynamic parameter that
+// points to another target parameter to indicate that the target parameter is
+// dynamic.
+//
+//
+// TODO(b/119520625): Remove this API once we have more dynamic shape infra
+// ready.
+class DynamicParameterBinding {
+ public:
+  // DynamicParameter represents a special parameter that is used to represent
+  // the runtime size of a dimension of another parameter. A dynamic parameter
+  // has to be a scalar value.
+  struct DynamicParameter {
+    // The parameter number of dynamic parameter.
+    int64 parameter_num;
+    // The index of the parameter.
+    ShapeIndex parameter_index;
+  };
+
+  // DynamicDimension represents a dimension whose size is determined at
+  // runtime. A DynamicDimension's runtime size is determined by the binded
+  // DynamicParameter using `DynamicParameterBinding::Bind` method.
+  struct DynamicDimension {
+    // The parameter number of dynamic dimension.
+    int64 parameter_num;
+    // The subshape index of the parameter.
+    ShapeIndex parameter_index;
+    // The dimension number in the subshape.
+    int64 dimension;
+
+    // "friend" keyword are added so these functions can be found by ADL.
+    template <typename H>
+    friend H AbslHashValue(H h, const DynamicDimension& m) {
+      return H::combine(std::move(h), m.parameter_num, m.parameter_index,
+                        m.dimension);
+    }
+
+    friend bool operator==(const DynamicDimension& lhs,
+                           const DynamicDimension& rhs) {
+      return lhs.parameter_num == rhs.parameter_num &&
+             lhs.parameter_index == rhs.parameter_index &&
+             lhs.dimension == rhs.dimension;
+    }
+  };
+
+  DynamicParameterBinding() = default;
+
+  virtual ~DynamicParameterBinding() = default;
+
+  // Adds binding which indicates that the dimension indicated by
+  // `dynamic_dimension` is dynamic, and its runtime size is represented by
+  // `dynamic_parameter`.
+  Status Bind(const DynamicParameter& dynamic_parameter,
+              const DynamicDimension& dynamic_dimension);
+
+  // Returns the parameter and the index representing the runtime size of
+  // dimension `dim_num` of parameter `param_num` at `param_index`.
+  //
+  // Returns nullopt if the binding is not set.
+  absl::optional<DynamicParameter> GetBinding(
+      const DynamicDimension& dynamic_dimension);
+
+  using BindingFn =
+      std::function<Status(const DynamicParameter& dynamic_parameter,
+                           const DynamicDimension& dynamic_dimension)>;
+
+  // Iterate through each binding.
+  Status ForEachBinding(BindingFn fn) const;
+
+  DynamicParameterBindingProto ToProto() const;
+
+  static StatusOr<DynamicParameterBinding> CreateFromProto(
+      const DynamicParameterBindingProto& proto);
+
+  string ToString() const;
+
+  // Verifies that the given binding is valid for the given module.
+  // Specifically, the binding's parameter and parameter size should be valid.
+  Status Verify(const HloModule& module) const;
+
+ private:
+  // Keeps track of mappings from DynamicDimension to DynamicParameter. The
+  // direction of is chosen so that we can easily query if a dimension is
+  // dynamic and which dynamic parameter represents the real size of that
+  // dimension.
+  absl::flat_hash_map<DynamicDimension, DynamicParameter> bindings_;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const DynamicParameterBinding& binding);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..83a6d83dffde7995bd8e43917d13c5fd2705ba6f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
+
+#include <memory>
+#include <string>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+class DynamicParameterBindingTest : public HloTestBase {};
+
+TEST_F(DynamicParameterBindingTest, SimpleBinding) {
+  // 'b' is a dynamic shape; 'a' represents the real size of b's first
+  // dimension.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[10] parameter(1)
+  ROOT root = (f32[], f32[10]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}},
+                   DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
+                                                    /*parameter_index=*/{},
+                                                    /*dimension=*/0});
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({}));
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+TEST_F(DynamicParameterBindingTest, TupleBinding) {
+  // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's first
+  // dimension.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[10]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[10] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[10]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) {
+  // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's both
+  // dimensions.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[10, 10]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[10, 10] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[10, 10]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 1}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+  EXPECT_TRUE(param2);
+  EXPECT_EQ(param2->parameter_num, 0);
+  EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
+
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 515267edd7caf42e04ebe638b99006db8967ea30..00bb430206afdb81f9d101c0a5b2b4cf907b447a 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instructions.h"
@@ -1671,26 +1672,66 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
 
   b_->SetInsertPoint(init_block);
 
+  // Assign a unique id for each *different* operand, and count how often each
+  // operand is used. If all operands are different, the usage count will be 1
+  // for each operand.
+  absl::flat_hash_map<const HloInstruction*, int64> to_unique_operand_id;
+  std::vector<int64> operand_usage_count;
+  for (const auto* operand : hlo->operands()) {
+    if (to_unique_operand_id.contains(operand)) {
+      ++operand_usage_count[to_unique_operand_id[operand]];
+    } else {
+      int64 unique_operand_id = to_unique_operand_id.size();
+      to_unique_operand_id[operand] = unique_operand_id;
+      operand_usage_count.push_back(1);
+    }
+  }
+
+  // To avoid that we emit the same operand more than once, we create one basic
+  // block for each *different* operand with a PHI node for the different source
+  // index inputs.
+  std::vector<llvm::BasicBlock*> emit_operand_blocks(
+      to_unique_operand_id.size(), nullptr);
+  std::vector<llvm::PHINode*> source_index_phis(to_unique_operand_id.size(),
+                                                nullptr);
+  for (const auto* operand : hlo->operands()) {
+    int64 operand_id = to_unique_operand_id[operand];
+    if (emit_operand_blocks[operand_id] != nullptr) {
+      continue;
+    }
+
+    emit_operand_blocks[operand_id] = llvm_ir::CreateBasicBlock(
+        exit_block, StrCat("concat_index_from_operand_id", operand_id), b_);
+    auto saved_insert_point = b_->GetInsertPoint();
+    llvm_ir::SetToFirstInsertPoint(emit_operand_blocks[operand_id], b_);
+    source_index_phis[operand_id] =
+        PHI(source_index.GetType(), operand_usage_count[operand_id]);
+    auto operand_index = source_index;
+    operand_index[concat_dim] = source_index_phis[operand_id];
+
+    // Create the terminator of the block before calling operand generators,
+    // because they require non-degenerate basic blocks.
+    b_->SetInsertPoint(llvm::BranchInst::Create(
+        exit_block, /*InsertAtEnd=*/emit_operand_blocks[operand_id]));
+    TF_ASSIGN_OR_RETURN(llvm::Value * value,
+                        operand_to_generator.at(operand)(operand_index));
+    output->addIncoming(value, b_->GetInsertBlock());
+    b_->SetInsertPoint(init_block, saved_insert_point);
+  }
+
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
        ++operand_idx) {
     const HloInstruction* operand = hlo->operand(operand_idx);
-    auto true_block = llvm_ir::CreateBasicBlock(
-        exit_block, StrCat("concat_index_from_operand", operand_idx), b_);
     auto false_block = llvm_ir::CreateBasicBlock(
         exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_);
     auto concat_dim_size =
         llvm::ConstantInt::get(source_index[concat_dim]->getType(),
                                operand->shape().dimensions(concat_dim));
-    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block,
-           false_block);
-
-    // Create the terminator of the true block before calling operand
-    // generators, because they require non-degenerate basic blocks.
-    b_->SetInsertPoint(
-        llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
-    TF_ASSIGN_OR_RETURN(llvm::Value * value,
-                        operand_to_generator.at(operand)(source_index));
-    output->addIncoming(value, b_->GetInsertBlock());
+    int64 operand_id = to_unique_operand_id[operand];
+    source_index_phis[operand_id]->addIncoming(source_index[concat_dim],
+                                               b_->GetInsertBlock());
+    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size),
+           emit_operand_blocks[operand_id], false_block);
 
     // Subtract the size of the concat dimension of the current operand
     // from the source index.
@@ -1815,8 +1856,6 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
     // Clamp the gather index so that the gather region fits in the operand.
     // gather_dim_component_extended_inbound =
     //     clamp(gather_dim_component_extended, 0, largest_valid_start_index);
-
-    // TODO(b/111078873): This is implementation defined behavior.
     bool is_signed = ShapeUtil::ElementIsSigned(indices_shape);
     auto gather_dim_component_extended_inbound = EmitIntegralMin(
         index.GetConstantWithIndexType(largest_valid_start_index),
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 47c56e2f7fbd9f53be6a2b189c5c36cf4fdcdccb..10b8c01ff1383658fcfb2271c177ba54347f985a 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/str_format.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 3a6780f2a67f230cae626ea00cfbf93b4e60d968..b34bca55a48b113c325dbf28c03f7a0f5b71f658 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "absl/types/variant.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -61,7 +61,7 @@ struct ExecutionOutput {
 class Executable {
  public:
   explicit Executable(
-      std::unique_ptr<const HloModule> hlo_module,
+      std::unique_ptr<HloModule> hlo_module,
       std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
       : hlo_module_(std::move(hlo_module)),
@@ -162,7 +162,7 @@ class Executable {
     return hlo_profile_printer_data_ != nullptr;
   }
 
-  const HloModule& module() const { return *hlo_module_; }
+  HloModule& module() const { return *hlo_module_; }
 
   const bool has_module() const { return hlo_module_ != nullptr; }
 
@@ -199,7 +199,7 @@ class Executable {
   // HloModule this was compiled from. BufferAssignment keeps pointers to
   // HloInstructions owned by the HloModule so we need to keep the HloModule
   // around.
-  const std::unique_ptr<const HloModule> hlo_module_;
+  const std::unique_ptr<HloModule> hlo_module_;
 
   // HloSnapshot this was compiled from. Null if not dumping executions.
   std::unique_ptr<HloSnapshot> hlo_snapshot_;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index 5fbd73a5363b4cdbcaafedbe6f4e7bd6bb2a92d8..8eeb930b48165a2e3c622581e05cb5f7063fa1fa 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -30,7 +30,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class FlattenCallGraphTest : public HloVerifiedTestBase {
+class FlattenCallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
   std::unique_ptr<HloComputation> MakeScalarComputation() {
@@ -108,7 +108,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
   //    c
   //
   // Calls are made via kCall, kWhile, and kMap instructions.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(MakeConditionComputation());
   HloComputation* c_computation =
@@ -139,9 +139,9 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
   }
 
   {
-    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
     EXPECT_TRUE(result);
-    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(module.get());
     const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation);
     EXPECT_EQ(1, c_node.caller_callsites().size());
   }
@@ -149,7 +149,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
 
 // Test corner case of a computation used as a body and a loop condition.
 TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation;
   {
     HloComputation::Builder builder(TestName() + ".cond");
@@ -176,15 +176,15 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
   }
 
   {
-    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
     const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
     EXPECT_EQ(2, cond_node.caller_callsites().size());
   }
 
   {
-    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
     EXPECT_TRUE(result);
-    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
     const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
     EXPECT_EQ(1, cond_node.caller_callsites().size());
   }
@@ -201,7 +201,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
 //     C
 //
 TEST_F(FlattenCallGraphTest, FlattenCalls) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* c_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
@@ -211,9 +211,9 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
   module->AddEntryComputation(
       MakeCallingComputation(b_computation, /*callsites=*/2, ".Entry"));
 
-  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
   EXPECT_TRUE(result);
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(7, module->computation_count());
 
   const CallGraphNode& c_node = call_graph->GetNode(c_computation);
@@ -224,7 +224,7 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
 }
 
 TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* sub_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
@@ -243,9 +243,9 @@ TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
   module->AddEntryComputation(builder.Build());
   EXPECT_EQ(2, module->computation_count());
 
-  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
   EXPECT_TRUE(result);
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   // The true and false computations must now be different.
   EXPECT_EQ(3, module->computation_count());
 
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 1e8435fe542f2b65a11e256453cf911c5e6e833b..bfd1b6cb1492f5cb709e2ecefe73782094e26f5e 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -111,7 +111,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -463,7 +462,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:shape_inference",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:test",
     ],
@@ -627,7 +626,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # build_cleaner: keep
     ],
 )
@@ -702,6 +701,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -849,7 +849,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/memory",
@@ -909,7 +908,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -1036,6 +1034,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:pattern_matcher",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc
index fa3afa6a5d318c399dc38e8934199b5a1393669e..af9303a5b761b99705945f1c02303156e3f874de 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -29,7 +29,7 @@ namespace {
 namespace op = xla::testing::opcode_matchers;
 using ::testing::_;
 
-class CudnnConvPadForTensorCoresTest : public HloVerifiedTestBase {};
+class CudnnConvPadForTensorCoresTest : public HloTestBase {};
 
 TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvInputChannels) {
   auto module = ParseAndReturnVerifiedModule(R"(
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index c46672c598b27670c56b3efa4775be8fea1fc6ac..e81850db69edced29ea31bb2a526b0503bf8a453 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -77,7 +77,11 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
     return false;
   }
 
-  if (window_util::HasWindowReversal(conv->window())) {
+  // CuDNN can perform either cross correlation (no reversal),
+  // or convolution (all dimensions reversed).
+  if (dnums.input_spatial_dimensions_size() == 2
+          ? !window_util::AllOrNoneReversed(conv->window())
+          : window_util::HasWindowReversal(conv->window())) {
     return false;
   }
   return true;
@@ -254,7 +258,7 @@ MatchBackwardInput(HloInstruction* conv) {
   const auto no_match_result =
       std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr);
 
-  // TODO(b/31709653): Theoretically cuDNN supports grouped convolutions also
+  // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also
   // for the backward input convolution, but at least for now with version 7.1.4
   // it is slower. This needs to be re-evaluated for future cuDNN versions.
   // Note that we already have the necessary code down below, the only thing to
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
index 87a835f2504068548159ef32b276201c936fa385..443883a89f66a747def1049bc5afb53fec3c2409 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -34,11 +34,11 @@ namespace {
 namespace op = xla::testing::opcode_matchers;
 using ::testing::_;
 
-class CudnnConvRewriterTest : public HloVerifiedTestBase {
+class CudnnConvRewriterTest : public HloTestBase {
  public:
   CudnnConvRewriterTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/true,
-                            /*allow_mixed_precision=*/false) {
+      : HloTestBase(/*layout_sensitive=*/true,
+                    /*allow_mixed_precision=*/false) {
     for (int i = 0; i < 2; ++i) {
       WindowDimension* window_dim = default_conv_window_.add_dimensions();
       window_dim->set_size(1);
@@ -118,10 +118,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) {
   metadata.set_op_name("foo");
   conv->set_metadata(metadata);
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   ASSERT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
@@ -152,10 +152,10 @@ TEST_F(CudnnConvRewriterTest,
       activations, gradients, /*feature_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
@@ -182,10 +182,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedActivations) {
       /*feature_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
@@ -212,10 +212,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedGradients) {
       /*feature_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
@@ -241,10 +241,10 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithUnevenPadding) {
       /*feature_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
@@ -292,10 +292,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveEvenPadding) {
                          /*feature_group_count=*/1, conv_window, conv_dnums)
                          .ValueOrDie()));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
 
   ASSERT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
@@ -338,10 +338,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolve1x1Filter) {
       /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
@@ -371,10 +371,10 @@ TEST_F(CudnnConvRewriterTest,
       default_conv_window_, tf_default_dnums_for_backward_input_,
       DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(
       entry_computation->root_instruction(),
       op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
@@ -425,10 +425,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnGradients) {
           conv_window, tf_default_dnums_for_backward_input_)
           .ValueOrDie()));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   ASSERT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
@@ -475,10 +475,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveLowPaddingTooLarge) {
           conv_window, tf_default_dnums_for_backward_input_)
           .ValueOrDie()));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(
       entry_computation->root_instruction(),
       op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
@@ -529,10 +529,10 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnActivations) {
           conv_window, tf_default_dnums_for_backward_input_)
           .ValueOrDie()));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   ASSERT_THAT(entry_computation->root_instruction(),
               op::GetTupleElement(
                   op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
@@ -584,10 +584,10 @@ TEST_F(CudnnConvRewriterTest,
           conv_window, tf_default_dnums_for_backward_input_)
           .ValueOrDie()));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
+  EXPECT_TRUE(RunPass(module.get()));
   EXPECT_THAT(
       entry_computation->root_instruction(),
       op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
@@ -600,7 +600,8 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) {
   constant_arr.FillIota(0);
   string constant_str =
       LiteralUtil::CreateR4FromArray4D(constant_arr).ToString();
-  ParseAndVerifyModule(absl::StrFormat(R"(
+
+  const string module_str = absl::StrFormat(R"(
     HloModule test
 
     ENTRY entry_computation {
@@ -610,10 +611,12 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) {
           window={size=4x4 pad=2_2x2_2 lhs_dilate=2x2},
           dim_labels=bf01_01oi->bf01, feature_group_count=1
     })",
-                                       constant_str));
-  EXPECT_TRUE(RunPass(&module()));
+                                            constant_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
+
+  EXPECT_TRUE(RunPass(m.get()));
   EXPECT_THAT(
-      module().entry_computation()->root_instruction(),
+      m->entry_computation()->root_instruction(),
       op::GetTupleElement(op::CustomCall(kCudnnConvBackwardInputCallTarget, _,
                                          op::Reverse(op::Constant())),
                           0));
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
index 492d290bf4a27a91fa14dea95ac62d90bc1fa28a..3425e1b4942aaf1011ba1bf1c50dd7e79c1f9807 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
@@ -138,6 +138,7 @@ Status RunCudnnConvImpl(CudnnConvParams params,
 
   const int num_dimensions = window.dimensions_size();
   CHECK_LE(num_dimensions, 3);
+  CHECK_GE(num_dimensions, 1);
   // cuDNN does not support 1D convolutions. We therefore express 1D
   // convolutions as 2D convolutions where the first spatial dimension is 1.
   // This matches the behavior of TF (see definition of conv1d in
@@ -148,10 +149,15 @@ Status RunCudnnConvImpl(CudnnConvParams params,
            output_shape.element_type())
       << ShapeUtil::HumanString(output_shape);
 
+  // If one dimension is reversed, we need to have all dimensions reversed (so
+  // we're doing convolution not cross correlation).
+  const bool dims_reversed = window.dimensions()[0].window_reversal();
+
   CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size());
   for (const WindowDimension& dim : window.dimensions()) {
+    CHECK_EQ(dims_reversed, dim.window_reversal());
     CHECK_EQ(dim.padding_low(), dim.padding_high());
     CHECK_EQ(dim.base_dilation(), 1)
         << "cudnn does not support base dilation; it "
@@ -198,6 +204,7 @@ Status RunCudnnConvImpl(CudnnConvParams params,
 
   ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
   convolution_descriptor.set_group_count(feature_group_count);
+  convolution_descriptor.set_convolution_not_crosscorr(dims_reversed);
   for (int dim = 0; dim < num_dimensions; ++dim) {
     convolution_descriptor
         .set_zero_padding(
@@ -363,14 +370,12 @@ StatusOr<CudnnConvParams> GetCudnnConvParams(
       params.output_shape = &conv_result_shape;
       params.fusion.emplace();
       auto& fusion = *params.fusion;
-      if (backend_config.activation_mode() <
-          static_cast<int64>(se::dnn::ActivationMode::kNumActivationModes)) {
-        fusion.mode = static_cast<se::dnn::ActivationMode>(
-            backend_config.activation_mode());
-      } else {
+      if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) {
         return InternalError("Bad activation mode: %s",
                              backend_config.ShortDebugString());
       }
+      fusion.mode = static_cast<se::dnn::ActivationMode>(
+          backend_config.activation_mode());
       fusion.side_input_scale = backend_config.side_input_scale();
       params.input_buf = operand_buffers[0];
       params.filter_buf = operand_buffers[1];
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 30c1f9088968305ad0207164ecb07ba13cc89ee6..470457935acacb8940af241dadb393d770786939 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -229,7 +229,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
         return user->opcode() == HloOpcode::kFusion &&
                (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
-                (user->fusion_kind() == HloInstruction::FusionKind::kInput &&
+                (IsReduceInputFusion(*user) &&
                  LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
       })) {
     VLOG(3) << "Not merging " << fusion->name()
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 57426327822d95a42f407ed7488f35acfd3623d2..ae2e718db29803a085401969a7d9b09abf690a6c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -51,7 +51,7 @@ GpuExecutable::GpuExecutable(
     const string& ptx, const std::vector<uint8>& cubin,
     std::pair<int, int> compute_capability,
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
-    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 0e276282e40fba0ae4881a51dad0c7c9e8d1c081..2b3c77f5b82aa94f44d8de56caf0f4d31c05e0cb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,7 @@ class GpuExecutable : public Executable {
   GpuExecutable(const string& ptx, const std::vector<uint8>& cubin,
                 std::pair<int, int> compute_capability,
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
-                std::unique_ptr<const HloModule> hlo_module,
+                std::unique_ptr<HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 2d31fd5570c468b0c42fa308535fd335f3588a79..392b149abdfb5bf2ce76e8f9f7c4f2cba898ac8c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -55,7 +55,7 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
   });
 }
 
-bool IsInputFusibleReduction(const HloInstruction& instr) {
+bool IsReduceInputFusion(const HloInstruction& instr) {
   if (instr.IsMultiOutputFusion()) {
     for (const HloInstruction* operand :
          instr.fused_expression_root()->operands()) {
@@ -67,17 +67,18 @@ bool IsInputFusibleReduction(const HloInstruction& instr) {
         return true;
       }
     }
-    return false;
-  } else if (instr.opcode() == HloOpcode::kFusion) {
-    if (IsReductionToVector(*instr.fused_expression_root())) {
-      CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput)
-          << " Fusion rooted at reduction-to-vector op must be of kind kInput: "
-          << instr.ToString();
-      return true;
-    }
-    return false;
+  } else if (instr.opcode() == HloOpcode::kFusion &&
+             IsReductionToVector(*instr.fused_expression_root())) {
+    CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput)
+        << " Fusion rooted at reduction-to-vector op must be of kind kInput: "
+        << instr.ToString();
+    return true;
   }
-  return IsReductionToVector(instr);
+  return false;
+}
+
+bool IsInputFusibleReduction(const HloInstruction& instr) {
+  return IsReduceInputFusion(instr) || IsReductionToVector(instr);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index f7c24a0d5bbfcc61389ea19ae7f769671e4e974d..c0be354730d22fb76754a60a1c9c58781d0d452a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -33,14 +33,17 @@ namespace gpu {
 bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
                                          const HloInstruction& reduce);
 
-// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr`
-// is either an unfused reduction-to-vector op, an input fusion rooted at a
-// reduction-to-vector op, or a multi-output input fusion with at least one
-// reduction-to-vector op root.
 // Note that reduction ops are lowered in different ways. Reduce input fusions
 // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at
 // reduction-to-vector ops. Other reduction ops are lowered by
 // GpuElementalIrEmitter and fused like elementwise ops.
+
+// Whether `instr` is an input fusion rooted at a reduction-to-vector op or a
+// multi-output input fusion with at least one reduction-to-vector op root.
+bool IsReduceInputFusion(const HloInstruction& instr);
+
+// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr`
+// is either an unfused reduction-to-vector op or a reduce input fusion.
 bool IsInputFusibleReduction(const HloInstruction& instr);
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index d91b7bc61fda5a07c163a07ec0e1644d2ad9db49..12222500ea732a4ca8ea6b3a37033f7e8d4ee927 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -178,7 +178,7 @@ TEST_F(GpuFusibleTest,
   EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       c0 = f32[] parameter(0)
@@ -191,10 +191,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_ElementalReduction) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       c0 = f32[] parameter(0)
@@ -207,10 +208,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputInputReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -225,10 +227,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputLoopReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -243,10 +246,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputInputReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -263,11 +267,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
 TEST_F(GpuFusibleTest,
-       IsInputFusibleReduction_MultiOutputInputReduceFusionWithExtraOutputs) {
+       IsReduceInputFusion_MultiOutputInputReduceFusionWithExtraOutputs) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -284,10 +289,11 @@ TEST_F(GpuFusibleTest,
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputLoopReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -304,11 +310,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
 TEST_F(GpuFusibleTest,
-       IsInputFusibleReduction_MultiOutputLoopFusionReduceAndElementwiseOp) {
+       IsReduceInputFusion_MultiOutputLoopFusionReduceAndElementwiseOp) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -325,6 +332,7 @@ TEST_F(GpuFusibleTest,
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
index 02a0d028c118aba23996f9b97d05443bb4a00c88..1126943624a3771433ecac591545d335c1890115 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
@@ -37,12 +37,12 @@ class GpuHloOrdering : public PredecessorHloOrdering {
  public:
   GpuHloOrdering(const HloModule* module,
                  const StreamAssignment& stream_assignment,
-                 const std::vector<const HloInstruction*>& thunk_launch_order);
+                 const std::vector<HloInstruction*>& thunk_launch_order);
   ~GpuHloOrdering() override = default;
 
   // Only the entry computation can possibly be sequentially ordered, and only
   // if we've assigned all instructions to a single stream.
-  const std::vector<const HloInstruction*>* SequentialOrder(
+  const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const override {
     return &computation == module_->entry_computation() ? entry_sequence_.get()
                                                         : nullptr;
@@ -51,17 +51,17 @@ class GpuHloOrdering : public PredecessorHloOrdering {
   string ToString() const override { return ToStringHelper("GpuHloOrdering"); }
 
  private:
-  std::unique_ptr<std::vector<const HloInstruction*>> entry_sequence_;
+  std::unique_ptr<HloInstructionSequence> entry_sequence_;
 };
 
 GpuHloOrdering::GpuHloOrdering(
     const HloModule* module, const StreamAssignment& stream_assignment,
-    const std::vector<const HloInstruction*>& thunk_launch_order)
+    const std::vector<HloInstruction*>& thunk_launch_order)
     : PredecessorHloOrdering(module) {
   // The entry computation has a total order when there's only one stream.
   if (stream_assignment.StreamCount() == 1) {
-    entry_sequence_ = absl::make_unique<std::vector<const HloInstruction*>>(
-        thunk_launch_order);
+    entry_sequence_ =
+        absl::make_unique<HloInstructionSequence>(thunk_launch_order);
   }
 
   // The ordering of instructions for the entry computation is determined by the
@@ -124,7 +124,8 @@ GpuHloOrdering::GpuHloOrdering(
   for (auto* computation : module->computations()) {
     if (computation != module->entry_computation() &&
         !computation->IsFusionComputation()) {
-      predecessors_.emplace(computation, computation->ComputeReachability());
+      predecessors_.emplace(computation,
+                            HloReachabilityMap::Build(computation));
     }
   }
 }
@@ -149,7 +150,7 @@ GpuHloOrdering::GpuHloOrdering(
 // However, if the total order is A,B,D,C,E, then C and E can run
 // concurrently.
 void BFSLaunchOrder(const HloComputation* computation,
-                    std::vector<const HloInstruction*>* launch_order) {
+                    std::vector<HloInstruction*>* launch_order) {
   // This topological sort uses two data structures:
   // 1. `incoming_edge_count` which keeps track of the number of incoming
   // edges to each HLO;
@@ -157,9 +158,9 @@ void BFSLaunchOrder(const HloComputation* computation,
   //
   // The sorting algorithm repeatedly pops the top from the queue and deletes
   // that HLO from the graph, making more HLOs incoming-edge free.
-  std::deque<const HloInstruction*> queue;
+  std::deque<HloInstruction*> queue;
   std::unordered_map<const HloInstruction*, int64> incoming_edge_count;
-  for (const auto& hlo : computation->instructions()) {
+  for (auto* hlo : computation->instructions()) {
     if (hlo->operand_count() == 0) {
       queue.push_back(hlo);
     } else {
@@ -171,10 +172,10 @@ void BFSLaunchOrder(const HloComputation* computation,
   }
 
   while (!queue.empty()) {
-    const HloInstruction* x = queue.front();
+    HloInstruction* x = queue.front();
     queue.pop_front();
     launch_order->push_back(x);
-    for (const HloInstruction* y : x->users()) {
+    for (HloInstruction* y : x->users()) {
       --incoming_edge_count[y];
       if (incoming_edge_count[y] == 0) {
         queue.push_back(y);
@@ -194,14 +195,14 @@ StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
   std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
 
   // Initialize thunk_launch_order_, the total order of thunk launches.
-  const HloComputation* entry_computation = module.entry_computation();
+  HloComputation* entry_computation = module.entry_computation();
   if (stream_assignment.StreamCount() == 1) {
     // All kernels are launched on a single stream, so there's no loss of
     // concurrency by optimizing for minimal memory usage.
     TF_ASSIGN_OR_RETURN(
         HloInstructionSequence sequence,
         ScheduleComputation(
-            *entry_computation, [pointer_size](const BufferValue& buffer) {
+            entry_computation, [pointer_size](const BufferValue& buffer) {
               return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
             }));
     schedule->thunk_launch_order_ = sequence.instructions();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
index 07a7fc67aa555845c3de57e574ab582403ec0490..7f224ffe4f03f8f05b0f1907628d99d9df387770 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
@@ -46,7 +46,7 @@ class GpuHloSchedule {
 
   // Returns the total order of thunk launches, represented in terms of HLO
   // instructions.
-  const std::vector<const HloInstruction*>& ThunkLaunchOrder() const {
+  const std::vector<HloInstruction*>& ThunkLaunchOrder() const {
     return thunk_launch_order_;
   }
 
@@ -60,7 +60,7 @@ class GpuHloSchedule {
  private:
   GpuHloSchedule();
 
-  std::vector<const HloInstruction*> thunk_launch_order_;
+  std::vector<HloInstruction*> thunk_launch_order_;
   std::unique_ptr<HloOrdering> hlo_ordering_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
index b857fa775a76ec999b505a2a64332cc0c54cf00b..91db7151f22fd75b20244878bee86d65acd1d304 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -24,16 +24,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
 namespace gpu {
 
-class GpuHloScheduleTest : public HloVerifiedTestBase {
+class GpuHloScheduleTest : public HloTestBase {
  protected:
-  using HloVec = std::vector<const HloInstruction*>;
+  using HloVec = std::vector<HloInstruction*>;
 
   // Pre-canned shapes.
   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
@@ -44,7 +44,7 @@ class GpuHloScheduleTest : public HloVerifiedTestBase {
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<HloModule> CreateNewModule() {
+  std::unique_ptr<HloModule> CreateNewVerifiedModule() {
     HloModuleConfig config;
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
@@ -79,7 +79,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   HloInstruction* dot2 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(dot2));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -139,7 +139,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   HloInstruction* add3 = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add3));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -209,7 +209,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
   HloInstruction* add =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -288,7 +288,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) {
   HloInstruction* d40 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(d40));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
index 7d01eeb02567d710e9de089c7f29ffcc5f959f9a..b511155f85fb24adc1828cbef7f3fb60778ef7ab 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -25,7 +25,7 @@ namespace {
 
 using ::testing::HasSubstr;
 
-class GpuHloSupportCheckerTest : public HloVerifiedTestBase {
+class GpuHloSupportCheckerTest : public HloTestBase {
  protected:
   GpuHloSupportChecker& checker() { return checker_; }
 
@@ -42,10 +42,10 @@ TEST_F(GpuHloSupportCheckerTest, Add) {
       HloInstruction::CreateParameter(1, scalar_shape, "param1"));
   builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  TF_ASSERT_OK(checker().Run(module).status());
+  TF_ASSERT_OK(checker().Run(module.get()).status());
 }
 
 TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) {
@@ -60,7 +60,7 @@ TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) {
   // Since verifier is reporting sparse layouts as errors, we should
   // use a regular HloModule instead of VerifiedHloModule to avoid
   // verifier errors being triggered in the destructor.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   Status status = checker().Run(module.get()).status();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 4822b820f4e229336e2b26cfbd0097c8c31a50c8..2ffc8bfb49b205dced0d540ba72426e72d95e596 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -61,7 +61,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) {
             HloInstruction::CreateParameter(1, ashape, "y"));
         auto add = builder.AddInstruction(
             HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y));
-        auto module = CreateNewModule();
+        auto module = CreateNewVerifiedModule();
         HloComputation* computation =
             module->AddEntryComputation(builder.Build(add));
 
@@ -148,7 +148,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) {
           {operand, scale, offset, mean, variance, epsilon, feature_index},
           kCudnnBatchNormForwardInferenceCallTarget));
 
-      auto module = CreateNewModule();
+      auto module = CreateNewVerifiedModule();
       HloComputation* computation =
           module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -217,7 +217,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) {
           batchnorm_shape, {operand, scale, offset, epsilon, feature_index},
           kCudnnBatchNormForwardTrainingCallTarget));
 
-      auto module = CreateNewModule();
+      auto module = CreateNewVerifiedModule();
       HloComputation* computation =
           module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -298,7 +298,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) {
                  feature_index},
                 kCudnnBatchNormBackwardCallTarget));
 
-        auto module = CreateNewModule();
+        auto module = CreateNewVerifiedModule();
         HloComputation* computation =
             module->AddEntryComputation(builder.Build(batchnorm));
 
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 7f2b59810f0334b24a50fc83b85ab838002afd23..43f43b50e4a6478f343088194871cc9d380bd2d2 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -47,6 +47,7 @@ bool IsFusible(const HloInstruction& hlo) {
          hlo.opcode() == HloOpcode::kReduce ||
          hlo.opcode() == HloOpcode::kReduceWindow ||
          hlo.opcode() == HloOpcode::kReshape ||
+         hlo.opcode() == HloOpcode::kReverse ||
          hlo.opcode() == HloOpcode::kScatter ||
          hlo.opcode() == HloOpcode::kSlice ||
          hlo.opcode() == HloOpcode::kTranspose;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 57e66f5a12cf54824c3139ce2fb32e7cf762b040..2b060b03ceae9bf6947f896dae2987a50972013b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -41,7 +41,7 @@ TEST_F(InstructionFusionTest,
       builder.AddInstruction(HloInstruction::CreateBroadcast(
           ShapeUtil::MakeShape(S32, {1}), exp1, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(broadcast2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -61,7 +61,7 @@ TEST_F(InstructionFusionTest,
       builder.AddInstruction(HloInstruction::CreateBroadcast(
           ShapeUtil::MakeShape(S32, {1}), negate1, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(broadcast2, computation->root_instruction());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -80,7 +80,7 @@ TEST_F(InstructionFusionTest,
   HloInstruction* reshape2 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape2, computation->root_instruction());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -99,7 +99,7 @@ TEST_F(InstructionFusionTest,
   HloInstruction* transpose2 = builder.AddInstruction(
       HloInstruction::CreateTranspose(ShapeUtil::MakeShape(S32, {}), exp1, {}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose2, computation->root_instruction());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) {
   auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -134,7 +134,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
   auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -723,7 +723,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
     sum = b.AddInstruction(
         HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param));
   }
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(b.Build());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
                   .Run(module.get())
@@ -805,5 +805,26 @@ TEST_F(InstructionFusionTest, NonscalarConstantsNotFused) {
               op::Reduce(op::Broadcast(op::Parameter()), op::Constant()));
 }
 
+TEST_F(InstructionFusionTest, FuseReverse) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    ENTRY Reverse {
+      p0 = f32[50,96,1024]{2,1,0} parameter(0)
+      add = f32[50,96,1024]{2,1,0} add(p0, p0)
+      ROOT reverse = f32[50,96,1024] reverse(add), dimensions={0}
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Reverse(op::Add(op::Parameter(), op::Parameter())));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 21e44e1e7d3fb7818e114b70025bfb85eacf786a..ebd73f3a9124fbbfeabf3d5041d44a3da0ddd2fb 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -65,11 +65,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
@@ -88,6 +88,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+using llvm_ir::KernelMappingScheme;
+
 namespace {
 
 using absl::InlinedVector;
@@ -1188,7 +1190,7 @@ Status IrEmitterUnnested::EmitColumnReduction(
       .EmitLoop(IrName(reduce), index_ty);
 }
 
-static std::pair<int64, int64> ComputeTilingSchemeForReduction(
+static std::pair<int64, int64> ComputeKernelMappingSchemeForReduction(
     int64 depth, int64 width, int64 kWarpSize) {
   constexpr int64 kTargetNumElementsPerThread = 64;
   int64 x_tile_size = kTargetNumElementsPerThread;
@@ -1322,7 +1324,7 @@ Status IrEmitterUnnested::EmitRowReduction(
   int64 x_tile_size;
   int64 z_tile_size;
   std::tie(x_tile_size, z_tile_size) =
-      ComputeTilingSchemeForReduction(depth, width, kWarpSize);
+      ComputeKernelMappingSchemeForReduction(depth, width, kWarpSize);
 
   // Round the width in tiles up to the nearest multiple of kWarpSize, so that
   // the use of shfl_down is valid.
@@ -2171,7 +2173,18 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
 Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
   Shape keys_shape = sort->operand(0)->shape();
+  int64 dimension_to_sort = sort->dimensions(0);
+  // In case there is a 'values' parameter that is a iota, we take note and use
+  // it later to ensure a stable sort. Otherwise, we don't guarantee a stable
+  // sort.
+  int64 iota_values_parameter_index = -1;
   for (int64 i = 0; i < sort->operand_count(); ++i) {
+    if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota &&
+        ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) &&
+        Cast<HloIotaInstruction>(sort->operand(i))->iota_dimension() ==
+            dimension_to_sort) {
+      iota_values_parameter_index = i;
+    }
     ShapeIndex shape_index =
         sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
     // We assume that the layout of all involved operands and outputs is the
@@ -2196,10 +2209,10 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
     }
   }
 
-  int64 dimension_to_sort = sort->dimensions(0);
-  int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
+  uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
-  auto index_type = b_.getInt64Ty();
+  CHECK_GE(1ULL << num_stages, dimension_to_sort_bound);
+  CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound);
 
   // Naive C++ code for the outer loops:
   //
@@ -2213,42 +2226,120 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   //   }
   // }
   //
-  // This follows the algorithm described on Wikipedia:
-  // https://en.wikipedia.org/wiki/Bitonic_sorter
-
+  // This follows the alternative representation of the algorithm described on
+  // Wikipedia: https://en.wikipedia.org/wiki/Bitonic_sorter
+  //
+  // Each mask specifies how to derive from one position in the array the
+  // position with which it should be compared (we calculate the xor of the
+  // position with the mask).
+  // As an optimization, we can move the 'mask' loop to inside the
+  // sorting/comparison loop if the comparisons happen within a small block of
+  // the array. To make this work, we collect all consecutive masks that are
+  // smaller than our chosen power of 2 tile size, and pass them to SortInPlace.
+  // Each thread then processes one tile of data.
+
+  const uint64 kTileSize = std::min(2048ULL, 1ULL << num_stages);
+
+  // If we cannot combine several xor masks together, we don't use tiling, so we
+  // calculate the standard launch dimensions for the shape. However we only
+  // need to iterate through ~half of the dimension to sort (rounded up to the
+  // next highest power of 2), because each iteration compares one pair of
+  // elements.
+  Shape standard_iteration_shape = keys_shape;
+  uint64 standard_num_iterations_in_sort_dim = 1ULL << (num_stages - 1);
+  standard_iteration_shape.set_dimensions(dimension_to_sort,
+                                          standard_num_iterations_in_sort_dim);
+  LaunchDimensions standard_launch_dimensions = CalculateLaunchDimensions(
+      standard_iteration_shape, ir_emitter_context_->device_description());
+
+  // Calculate the launch dimensions for the case where we use tiling. We split
+  // the dimension that should be sorted into tiles of size 'kTileSize'. This
+  // means we first need to round 'dimension_to_sort_bound' up to be a multiple
+  // of the tile size.
+  int64 rounded_bound = RoundUpToNearest(dimension_to_sort_bound, kTileSize);
+  Shape iteration_shape = keys_shape;
+
+  // We iterate through the element pairs that should be compared.
+  uint64 num_iterations_in_sort_dim = rounded_bound / 2;
+  iteration_shape.set_dimensions(dimension_to_sort, num_iterations_in_sort_dim);
+  uint64 num_iterations = ShapeUtil::ElementsIn(iteration_shape);
+
+  // For correctness reasons we need exactly 'kTileSize' / 2 many threads per
+  // block. Each thread is responsible for copying exactly two adjacent elements
+  // into shared memory, and then does a comparison of two possibly different
+  // elements taken from shared memory.
+  const uint64 kThreadsPerBlock = kTileSize / 2;
+
+  // Check whether we should use any tiling. We might not be able to use it if
+  // we have not enough threads, or not enough shared memory. Also it does not
+  // give a speedup if the tile size is < 128.
+  int64 total_shared_memory_needed = 0;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    total_shared_memory_needed +=
+        kTileSize * ShapeUtil::ByteSizeOfPrimitiveType(
+                        sort->operand(i)->shape().element_type());
+  }
+  bool no_tiling =
+      kTileSize < 128 ||
+      kThreadsPerBlock >
+          ir_emitter_context_->device_description().threads_per_block_limit() ||
+      total_shared_memory_needed >
+          ir_emitter_context_->device_description().shared_memory_per_block();
+
+  uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock);
+  LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock);
+
+  auto emit_kernel = [&](absl::Span<const int64> xor_masks) {
+    thunks.push_back(
+        BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+    LaunchDimensions launch_dimensions = xor_masks.size() > 1
+                                             ? tiled_launch_dimensions
+                                             : standard_launch_dimensions;
+    UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
+                           ir_emitter_context_->llvm_module());
+    IrArray keys_array;
+    std::vector<IrArray> values_arrays;
+    values_arrays.reserve(sort->operand_count() - 1);
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      ShapeIndex shape_index =
+          sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
+      if (i == 0) {
+        keys_array = GetIrArray(*sort, *sort, shape_index);
+      } else {
+        values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
+      }
+    }
+    return llvm_ir::EmitSortInPlace(
+        dimension_to_sort, keys_array, values_arrays,
+        iota_values_parameter_index, IrName(sort), xor_masks, &b_,
+        launch_dimensions,
+        xor_masks.size() > 1 ? num_iterations_in_sort_dim
+                             : standard_num_iterations_in_sort_dim,
+        kTileSize);
+  };
+  std::vector<int64> xor_masks;
   for (int64 stage = 0; stage < num_stages; ++stage) {
     for (int64 mask = stage; mask >= 0; --mask) {
-      thunks.push_back(
-          BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
-      LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-          keys_shape, ir_emitter_context_->device_description());
-      UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
-                             ir_emitter_context_->llvm_module());
-
-      llvm::Value* xor_mask;
+      int64 xor_mask;
       if (mask == stage) {
-        xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
+        xor_mask = (1LL << (stage + 1)) - 1;
       } else {
-        xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
+        xor_mask = 1LL << mask;
       }
-
-      IrArray keys_array;
-      std::vector<IrArray> values_arrays;
-      values_arrays.reserve(sort->operand_count() - 1);
-      for (int64 i = 0; i < sort->operand_count(); ++i) {
-        ShapeIndex shape_index =
-            sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
-        if (i == 0) {
-          keys_array = GetIrArray(*sort, *sort, shape_index);
-        } else {
-          values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
+      if (xor_mask >= kTileSize || no_tiling) {
+        if (!xor_masks.empty()) {
+          TF_RETURN_IF_ERROR(emit_kernel(xor_masks));
+          xor_masks.clear();
         }
+        TF_RETURN_IF_ERROR(emit_kernel({xor_mask}));
+      } else {
+        xor_masks.push_back(xor_mask);
       }
-      TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
-          dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_mask,
-          &b_, &launch_dimensions));
     }
   }
+  if (!xor_masks.empty()) {
+    TF_RETURN_IF_ERROR(emit_kernel(xor_masks));
+  }
 
   AddThunkToThunkSequence(
       absl::make_unique<SequentialThunk>(std::move(thunks), sort));
@@ -3068,31 +3159,6 @@ std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
   return param_arrays;
 }
 
-int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-    const HloInstruction& hlo, const std::vector<IrArray>& output_arrays,
-    absl::Span<const int64> reduced_output_dims,
-    std::vector<Shape>* output_reduced_shapes,
-    std::vector<IrArray>* output_in_reduced_shape_arrays) {
-  int64 num_outputs = 1;
-  if (hlo.IsMultiOutputFusion()) {
-    num_outputs = ShapeUtil::TupleElementCount(hlo.shape());
-    output_in_reduced_shape_arrays->reserve(num_outputs);
-    output_reduced_shapes->reserve(num_outputs);
-    for (int64 i = 0; i < num_outputs; ++i) {
-      output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
-          ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(),
-          reduced_output_dims));
-      output_in_reduced_shape_arrays->push_back(
-          output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_));
-    }
-  } else {
-    output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
-        hlo.shape().element_type(), reduced_output_dims));
-    output_in_reduced_shape_arrays->push_back(
-        output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_));
-  }
-  return num_outputs;
-}
 
 int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     const HloInstruction& hlo, const std::vector<IrArray>& param_arrays,
@@ -3152,308 +3218,525 @@ llvm::Value* GetBlockIdx(llvm::IRBuilder<>* builder, llvm::Type* index_ty,
                                 "block.id.x");
 }
 
-// Emits code to process up to (tile_size/num_rows) elements in a tile, given
-// `emit_elem_function` is the function to emit code to process one element, `y`
-// and `x` are the coordinates for the first element to process, and `index` is
-// the index for the origin of the tile. Emits bounds check to ensure that each
-// processed element is within the boundary defined by `tile_width` and
-// `tile_height`.
+void EmitFullTile(const KernelMappingScheme* mapping_scheme,
+                  const IrArray::Index& tile_origin_index,
+                  llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
+                  llvm::Type* index_ty,
+                  const std::function<void(const IrArray::Index&, llvm::Value*,
+                                           llvm::Value*)>& emit_elem_function) {
+  int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
+  int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+  int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
+  for (int64 i = 0; i < tile_size_y; i += num_threads_y) {
+    IrArray::Index source_idx_y =
+        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, i),
+                                         KernelMappingScheme::DimY, builder);
+    llvm::Value* y_loc =
+        builder->CreateAdd(llvm::ConstantInt::get(index_ty, i), y);
+    for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+      IrArray::Index source_idx =
+          source_idx_y.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
+                                      KernelMappingScheme::DimX, builder);
+      llvm::Value* x_loc =
+          builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+      emit_elem_function(source_idx, y_loc, x_loc);
+    }
+  }
+}
+
+void EmitPartialTile(
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
+    llvm::Type* index_ty,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
+  int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
+  int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+
+  for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+    IrArray::Index source_idx =
+        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
+                                         KernelMappingScheme::DimX, builder);
+    llvm::Value* x_loc =
+        builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+
+    ksl->IfReturnVoid(
+        "x_in_tile", builder->CreateICmpULT(x_loc, tile_width), [&] {
+          // tile_height_bound =
+          //   ceil(tile_height / num_threads_y) * num_threads_y
+          llvm::Value* ceiling_of_ratio = builder->CreateUDiv(
+              builder->CreateAdd(tile_height, llvm::ConstantInt::get(
+                                                  index_ty, num_threads_y - 1)),
+              llvm::ConstantInt::get(index_ty, num_threads_y));
+          llvm::Value* tile_height_bound = builder->CreateMul(
+              ceiling_of_ratio,
+              llvm::ConstantInt::get(index_ty, num_threads_y));
+          ksl->ForReturnVoid(
+              loop_name, /*start=*/llvm::ConstantInt::get(index_ty, 0),
+              /*end=*/tile_height_bound,
+              /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
+              [&](llvm::Value* y_indvar) {
+                llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
+                ksl->IfReturnVoid(
+                    "y_in_tile", builder->CreateICmpULT(y_loc, tile_height),
+                    [&] {
+                      emit_elem_function(
+                          source_idx.AddOffsetToDim(
+                              y_indvar, KernelMappingScheme::DimY, builder),
+                          y_loc, x_loc);
+                    });
+              });
+        });
+  }
+}
+
+// Emits code to process up to
+// (tile_size_x/num_threads_x * tile_size_y/num_threads_y) elements in a tile,
+// given `emit_elem_function` is the function to emit code to process one
+// element, `y` and `x` are the intra-tile coordinates for the first element
+// to process, and `index` is the index for the origin of the tile. Information
+// about tile_size_x/y and num_threads_x/y are stored in `mapping_scheme`. Emits
+// bounds check to ensure that each processed element is within the boundary
+// defined by `tile_width` and `tile_height`.
 void EmitTiledElementalCodeWithBoundsCheck(
-    int64 tile_size, int64 num_rows, const IrArray::Index& index,
-    const string& loop_name, KernelSupportLibrary* ksl,
-    llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
-    llvm::Value* tile_width, llvm::Value* tile_height,
-    const std::function<void(const IrArray::Index&, llvm::Value*)>&
-        emit_elem_function) {
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+  int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
-  // Emits a constant value with index type.
-  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
-    return llvm::ConstantInt::get(index_ty, c);
-  };
-  // Adds `addend` to the given `dim` of `index`.
-  auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = builder->CreateAdd(index[dim], addend);
-    return index;
-  };
-
-  auto emit_full_tile = [&] {
-    for (int64 i = 0; i < tile_size; i += num_rows) {
-      auto source_idx = offset_dim(index, index_typed_constant(i), /*dim=*/1);
-      auto y_loc = builder->CreateAdd(index_typed_constant(i), y);
-      emit_elem_function(source_idx, y_loc);
-    }
-  };
 
-  auto emit_last_row = [&] {
-    ksl->IfReturnVoid("x_in_tile", builder->CreateICmpULT(x, tile_width), [&] {
-      // tile_height_upper_bound =
-      //   ceil(tile_height / num_rows) * num_rows
-      auto tile_height_upper_bound = builder->CreateMul(
-          builder->CreateUDiv(
-              builder->CreateAdd(tile_height,
-                                 index_typed_constant(num_rows - 1)),
-              index_typed_constant(num_rows)),
-          index_typed_constant(num_rows));
-      ksl->ForReturnVoid(
-          loop_name, /*start=*/index_typed_constant(0),
-          /*end=*/tile_height_upper_bound,
-          /*step=*/index_typed_constant(num_rows), [&](llvm::Value* y_indvar) {
-            auto y_loc = builder->CreateAdd(y_indvar, y);
-            ksl->IfReturnVoid(
-                "y_in_tile", builder->CreateICmpULT(y_loc, tile_height), [&] {
-                  emit_elem_function(offset_dim(index, y_indvar, /*dim=*/1),
-                                     y_loc);
-                });
-          });
-    });
-  };
   ksl->IfReturnVoid(
       "full_tile",
       builder->CreateAnd(
-          builder->CreateICmpEQ(index_typed_constant(tile_size), tile_width),
-          builder->CreateICmpEQ(index_typed_constant(tile_size), tile_height)),
-      emit_full_tile, emit_last_row);
+          builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x),
+                                tile_width),
+          builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y),
+                                tile_height)),
+      [&] {
+        EmitFullTile(mapping_scheme, tile_origin_index, builder, y, x, index_ty,
+                     emit_elem_function);
+      },
+      [&] {
+        EmitPartialTile(mapping_scheme, tile_origin_index, loop_name, ksl,
+                        builder, y, x, tile_height, tile_width, index_ty,
+                        emit_elem_function);
+      });
 }
 }  // namespace
 
-// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
-// algorithm to improve the memory access patterns for the input parameters
-// which have a shape that is a 0-2-1 transpose of the output tensors.
-//
-// For the purpose of tiling, the output tensors have a logical shape of three
-// components 0-2-1 while the relevant input parameters have a logical shape of
-// three components 0-1-2 in the order major to minor. The x- and y- dimensions
-// of the tensors are tiled in square tiles of edge length `kTileSize`. Each
-// thread block of `kTileSize` x `kNumRows` threads transposes one tile: each
-// thread copies kTileSize/kNumRows elements from the input to a shared memory
-// tile, then the otherwise "regular hlo kernel" reads from the shared memory
-// instead of the original input.
-//
-// This is similar to the following CUDA algorithm in TensorFlow:
-// https://goo.gl/MStRV6.
+// Emits code to process a tensor element in a tile for the given kCopy HLO that
+// performs a 0-2-1 transpose.
 //
-// `kTileSize` should usually be same as warp size. We currently choose 32 for
-// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`.
+// index: The index for the first output element in the normalized tensor. The
+//   normalized tensor is the resulting tensor after collapsing contiguous
+//   dimensions that play the same role in the transpose.
+// y_loc: The y coordinate within a tile.
+// x_loc: The x coordinate within a tile.
+// kernel_info: Other information to support the kernel code generation.
+void IrEmitterUnnested::EmitTileElementForCopy(
+    HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc) {
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
+  // TODO(jlebar): Add AA metadata to this load.
+  llvm::Instruction* load_from_shmem_buffer =
+      Load(GEP(tiled_param_info->GetBufferForParameter(0),
+               {b_.getInt64(0), x_loc, y_loc}),
+           "output_element");
+  llvm_ir::IrArray output_array = GetIrArray(*hlo, *hlo);
+  Shape output_reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      hlo->shape().element_type(),
+      kernel_info->GetKernelMappingScheme()->GetDimensionsInElements());
+  // When the output_reduced_shape is a 0-2-1 transpose of the input shape,
+  // the 0-2-1 transpose is achieved through EmitWriteArrayElement.
+  output_array.CastToShape(output_reduced_shape, &b_)
+      .EmitWriteArrayElement(index, load_from_shmem_buffer, &b_);
+}
+
+// Emits code to process a tensor element in a tile for the given kLoop fusion
+// HLO containing parameters that are 0-2-1 transpose of its outputs.
 //
-// TODO(b/33320379): Here each block transposes 1 tile. It may be more efficient
-// to launch fewer blocks so each transposes many tiles.
-LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
-    HloInstruction* hlo, absl::Span<const int64> reduced_output_dims,
-    absl::Span<const int64> tiled_param_ids) {
-  // Parameters for the tiling algorithm.
-  constexpr int64 kTileSize = 32;
-  constexpr int64 kNumRows = 4;
-  constexpr int64 kThreadsPerTile = kTileSize * kNumRows;
-
-  // Construct IrArrays for the inputs and outputs.
+// index: The index for the first output element in the normalized tensor, that
+//   is the resulting tensor after collapsing contiguous dimensions that play
+//   the same role in the transpose.
+// kernel_info: Other information to support the kernel code generation.
+// y_loc: The y coordinate within a tile.
+// x_loc: The x coordinate within a tile.
+void IrEmitterUnnested::EmitTileElementForFusion(
+    HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc) {
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(*hlo);
-  int64 num_outputs = output_arrays.size();
-  std::vector<IrArray> param_arrays = ConstructIrArrayForInputs(*hlo);
-  int64 num_params = param_arrays.size();
+  GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
+                                     GetNestedComputer());
+  FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo),
+                               &elem_emitter);
+  tiled_param_info->set_y(y_loc);
+  tiled_param_info->set_x(x_loc);
+  fused_emitter.SetTiledParameterInfo(tiled_param_info);
+  TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
+  IrArray::Index untiled_index =
+      kernel_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
+          index, output_arrays[0].GetShape());
+  const llvm_ir::ElementGenerator& output_generator =
+      fused_emitter.GetRootGenerator();
+  llvm::Value* output_value = output_generator(untiled_index).ValueOrDie();
+  if (hlo->IsMultiOutputFusion()) {
+    DCHECK(output_value->getType()->isStructTy());
+    DCHECK_EQ(output_value->getType()->getStructNumElements(),
+              output_arrays.size());
+    for (int64 i = 0; i < output_arrays.size(); ++i) {
+      output_arrays[i].EmitWriteArrayElement(
+          untiled_index, ExtractValue(output_value, i), &b_);
+    }
+  } else {
+    output_arrays[0].EmitWriteArrayElement(untiled_index, output_value, &b_);
+  }
+}
+
+// Emits a block of tiles, given a function object to emit one tile.
+void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
+                                  const KernelCodegenInfo* kernel_info,
+                                  KernelSupportLibrary& ksl,
+                                  llvm::Type* index_ty) {
+  KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
+  absl::Span<const int64> dims_in_tile = mapping_scheme->GetDimensionsInTiles();
+  absl::Span<const int64> dims_in_block =
+      mapping_scheme->GetDimensionsInBlocks();
+  absl::Span<const int64> block_sizes = mapping_scheme->GetBlockSizes();
+  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
+    return llvm::ConstantInt::get(index_ty, c);
+  };
+
+  // Emit all the tiles for a given dimension in a tile block.
+  auto emit_tiles_for_block_dim =
+      [&](const string& loop_name, const IrArray::Index& starting_tile,
+          int dim_id,
+          const std::function<void(const IrArray::Index& tile_index)>
+              emit_next_block_dim) {
+        if (block_sizes[dim_id] == 1) {
+          emit_next_block_dim(starting_tile);
+        } else {
+          llvm::Value* starting_tile_index_for_dim = starting_tile[dim_id];
+          llvm::Value* block_size_for_dim =
+              index_typed_constant(block_sizes[dim_id]);
+          llvm::Value* block_id_for_dim =
+              b_.CreateUDiv(starting_tile_index_for_dim, block_size_for_dim);
+          llvm::Value* last_block_for_dim =
+              index_typed_constant(dims_in_block[dim_id] - 1);
+          llvm::Value* last_block_size_for_dim = index_typed_constant(
+              dims_in_tile[dim_id] -
+              (dims_in_block[dim_id] - 1) * block_sizes[dim_id]);
+          llvm::Value* num_tiles_in_block =
+              Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
+                     last_block_size_for_dim, block_size_for_dim);
+
+          ksl.ForReturnVoid(
+              loop_name,
+              /*start=*/index_typed_constant(0),
+              /*end=*/num_tiles_in_block,
+              /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                    block_dim_induction_var, dim_id, &b_);
+                emit_next_block_dim(tile_index);
+              });
+        }
+      };
+
+  absl::Span<const int64> reduced_dims =
+      mapping_scheme->GetDimensionsInElements();
+  const bool block_contains_multi_tiles =
+      mapping_scheme->GetNumberOfTilesInOneBlock() > 1;
+
+  // Emit the tile with a given tile_index, by calculating the tight bounds for
+  // each dimension of the tile and then calling emit_one_tile.
+  auto emit_one_tile_for_tile_index = [&](const IrArray::Index& tile_index) {
+    std::vector<llvm::Value*> output_tile_bounds(3);
+    for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot;
+         ++i) {
+      int64 tile_size_for_dim = mapping_scheme->GetTileSizeForDimension(i);
+      // Only last row or column may not have full size.
+      llvm::Value* is_last_row =
+          ICmpEQ(tile_index[i], index_typed_constant(dims_in_tile[i] - 1));
+      int64 partial_row_size =
+          reduced_dims[i] - (dims_in_tile[i] - 1) * tile_size_for_dim;
+      output_tile_bounds[i] =
+          Select(is_last_row, index_typed_constant(partial_row_size),
+                 index_typed_constant(tile_size_for_dim), "tile_bound");
+    }
+
+    IrArray::Index tile_origin =
+        mapping_scheme->GetElementIndexForTileOrigin(tile_index);
+    emit_one_tile(tile_origin, output_tile_bounds, block_contains_multi_tiles);
+  };
 
+  const IrArray::Index starting_block =
+      mapping_scheme->EmitBlockIndex(index_ty);
+  const IrArray::Index starting_tile_for_dim_z =
+      mapping_scheme->GetTileIndexForBlockOrigin(starting_block);
+
+  // Emit the three dimensional block of tiles.
+  emit_tiles_for_block_dim(
+      "block_dim_z", starting_tile_for_dim_z, KernelMappingScheme::DimZ,
+      [&](const IrArray::Index& starting_tile_for_dim_y) {
+        emit_tiles_for_block_dim(
+            "block_dim_y", starting_tile_for_dim_y, KernelMappingScheme::DimY,
+            [&](const IrArray::Index& starting_tile_for_dim_x) {
+              emit_tiles_for_block_dim("block_dim_x", starting_tile_for_dim_x,
+                                       KernelMappingScheme::DimX,
+                                       emit_one_tile_for_tile_index);
+            });
+      });
+}
+
+// Emits a kernel for the hlo instruction using the given kernel mapping scheme.
+//
+// unnested_hlo: The unnested hlo instruction for which the kernel is generated.
+//   Currently, these hlo instructions are supported: kLoop fusion, kCopy.
+// tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of
+//   other tensors with the same dimensions and need to be tiled and tranposed.
+// mapping_scheme: The tiling scheme to use.
+// kernel_generator: Contains function objects for code generation, such as
+//   element generator, block prologue and epilogue generators.
+// kernel_info: Represent other information to support the code generation
+//   of the tiled kernel for the hlo.
+LaunchDimensions IrEmitterUnnested::EmitKernel(
+    HloInstruction* unnested_hlo, absl::Span<const int64> tiled_param_ids,
+    const KernelCodeGenerator& kernel_generator,
+    KernelCodegenInfo* kernel_info) {
+  KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
+
+  std::vector<IrArray> param_arrays = ConstructIrArrayForInputs(*unnested_hlo);
+  int64 num_params = param_arrays.size();
   // Allocate shared memory buffers to store the tiled inputs.
   std::vector<llvm::Value*> param_shmem_buffers(num_params, nullptr);
   for (int64 id : tiled_param_ids) {
-    const HloInstruction* param = hlo->operand(id);
-    // Add 1 to the minor dimension to reduce shared memory bank conflicts.
-    llvm::Type* tile_type = llvm::ArrayType::get(
-        llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
-                                 param->shape().element_type(), module_),
-                             kTileSize + 1),
-        kTileSize);
-    const int kNVPTXSharedMemoryAddrSpace = 3;
-    auto* tile_base_ptr = new llvm::GlobalVariable(
-        *b_.GetInsertBlock()->getParent()->getParent(), tile_type,
-        /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage,
-        llvm::UndefValue::get(tile_type),
-        llvm_ir::AsStringRef(IrName(hlo, StrCat("tile", id))), nullptr,
-        llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace);
-    param_shmem_buffers[id] = tile_base_ptr;
+    const HloInstruction* param = unnested_hlo->operand(id);
+    param_shmem_buffers[id] =
+        mapping_scheme->GetSharedMemoryBufferForElementType(
+            llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(),
+                                           module_),
+            IrName(unnested_hlo, StrCat("tile", id)));
     VLOG(3) << "Added shmem buffer for parameter " << id << ": "
-            << llvm_ir::DumpToString(*tile_base_ptr);
-  }
-
-  // The 0-2-1 shape of the tiling scheme is the reduced shape of the HLO result
-  // for the purpose of tiling. Calculate the logical output dimensions in the
-  // tile from the reduced output dimensions.
-  std::vector<int64> output_dims_in_tiles = std::vector<int64>(
-      reduced_output_dims.begin(), reduced_output_dims.end());
-  CHECK_EQ(output_dims_in_tiles.size(), 3);
-  for (int i = 1; i < 3; ++i) {
-    output_dims_in_tiles[i] =
-        CeilOfRatio<int64>(output_dims_in_tiles[i], kTileSize);
+            << llvm_ir::DumpToString(*param_shmem_buffers[id]);
   }
-  const int64 num_tiles =
-      absl::c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
-  LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile);
 
-  llvm::Type* index_ty =
-      GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_);
+  CHECK_EQ(mapping_scheme->GetThreadsPerTile() % kWarpSize, 0);
+  LaunchDimensions launch_dimensions = LaunchDimensions(
+      mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerTile());
+  llvm::Type* index_ty = GetIndexTypeForKernel(
+      unnested_hlo, launch_dimensions.launch_bound(), &b_);
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
 
-  // Cast each output IrArray to its corresponding reduced shape and keep the
-  // reduced shape live during IR emission.
-  std::vector<IrArray> output_in_reduced_shape_arrays;
-  std::vector<Shape> output_reduced_shapes;
-  CHECK_EQ(ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-               *hlo, output_arrays, reduced_output_dims, &output_reduced_shapes,
-               &output_in_reduced_shape_arrays),
-           num_outputs);
-
   // For each tiled parameter, cast its input IrArray to the corresponding
   // reduced shape and keep the reduced shape live during IR emission.
   std::vector<IrArray> param_in_reduced_shape_arrays;
   std::vector<Shape> param_reduced_shapes;
-  CHECK_EQ(ConstructInputReducedShapeAndCastInputIrArrayToShape(
-               *hlo, param_arrays, param_shmem_buffers, reduced_output_dims,
-               &param_reduced_shapes, &param_in_reduced_shape_arrays),
-           num_params);
+  absl::Span<const int64> reduced_dims =
+      mapping_scheme->GetDimensionsInElements();
+  int num_shapes = ConstructInputReducedShapeAndCastInputIrArrayToShape(
+      *unnested_hlo, param_arrays, param_shmem_buffers, reduced_dims,
+      &param_reduced_shapes, &param_in_reduced_shape_arrays);
+  DCHECK_EQ(num_shapes, num_params);
 
   // Calculate the starting element coordinate within a tile for the current
   // thread, (y, x) from thread_id.
   llvm::Value* x;
   llvm::Value* y;
-  std::tie(y, x) = CalculateYXCoordinateWithinTile(
-      &b_, index_typed_constant(kTileSize), kThreadsPerTile);
-
-  // Calculate the index for the current output tile from block_id.
-  const IrArray::Index output_tile_index(
-      GetBlockIdx(&b_, index_ty, num_tiles),
-      ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/,
-                                               output_dims_in_tiles),
-      &b_);
-
-  // Output tile origin is the index for the first element of the current output
-  // tile.
-  const IrArray::Index output_tile_origin = [&] {
-    IrArray::Index index = output_tile_index;
-    for (int i = 1; i < 3; ++i) {
-      index[i] = Mul(output_tile_index[i], index_typed_constant(kTileSize),
-                     "tile_origin." + std::to_string(i));
-    }
-    return index;
-  }();
+  std::tie(y, x) = mapping_scheme->EmitThreadYXCoordinate(index_ty);
 
-  // Calculate the input tile origin from the output tile origin.
-  const IrArray::Index input_tile_origin(
-      Permute({0, 2, 1}, output_tile_origin.multidim()));
-
-  // Calculate the current output tile bounds in each of the logical dimensions.
-  std::vector<llvm::Value*> output_tile_bounds(3);
-  for (int i = 1; i < 3; ++i) {
-    // Only last row or column may not have full size.
-    output_tile_bounds[i] =
-        Select(ICmpEQ(output_tile_index[i],
-                      index_typed_constant(output_dims_in_tiles[i] - 1)),
-               index_typed_constant(reduced_output_dims[i] -
-                                    (output_dims_in_tiles[i] - 1) * kTileSize),
-               index_typed_constant(kTileSize), "kTileSize");
-  }
+  kernel_info->SetLaneId(
+      mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x
+                                                                     : nullptr);
 
   KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
-
   // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck.
   auto emit_tiled_elemental_code_with_bounds_check =
       [&](const IrArray::Index& index, const string& loop_name,
-          llvm::Value* tile_width, llvm::Value* tile_height,
-          const std::function<void(const IrArray::Index&, llvm::Value*)>&
-              emit_elem_function) {
-        EmitTiledElementalCodeWithBoundsCheck(
-            kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width,
-            tile_height, emit_elem_function);
+          llvm::Value* tile_height, llvm::Value* tile_width,
+          const std::function<void(const IrArray::Index&, llvm::Value*,
+                                   llvm::Value*)>& emit_elem_function) {
+        EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name,
+                                              &ksl, &b_, y, x, tile_height,
+                                              tile_width, emit_elem_function);
       };
 
-  // Adds `addend` to the given `dim` of `index`.
-  auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = Add(index[dim], addend);
-    return index;
-  };
-  const IrArray::Index input_index =
-      offset_dim(offset_dim(input_tile_origin, x, /*dim=*/2), y, /*dim=*/1);
-
-  // Copy input parameter values to shared memory buffers:
-  // tile[y, x] = input[index]
-  emit_tiled_elemental_code_with_bounds_check(
-      input_index, "input", output_tile_bounds[1], output_tile_bounds[2],
-      [&](const IrArray::Index& index, llvm::Value* y_loc) {
-        for (int64 id : tiled_param_ids) {
-          IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
-          llvm::Value* shmem_buffer = param_shmem_buffers[id];
-          // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
-          // global variables, so LLVM can't infer much about it.
-          Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
-                                                            "input_element"),
-                GEP(shmem_buffer, {index_typed_constant(0), y_loc, x}));
-        }
-      });
+  auto emit_one_tile = [&](const IrArray::Index& output_tile_origin,
+                           absl::Span<llvm::Value* const> output_tile_bounds,
+                           bool block_contains_multi_tiles) {
+    // Calculate the input tile origin from the output tile origin.
+    const IrArray::Index input_tile_origin(
+        Permute({0, 2, 1}, output_tile_origin.multidim()));
+
+    const IrArray::Index input_index =
+        input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
+            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
+
+    // Copy input parameter values to shared memory buffers:
+    // tile[y, x] = input[index]
+    // Note that tile_width and tile_height are flipped here because we are
+    // reading a transposed tile.
+    emit_tiled_elemental_code_with_bounds_check(
+        input_index, "input", output_tile_bounds[2], output_tile_bounds[1],
+        [&](const IrArray::Index& index, llvm::Value* y_loc,
+            llvm::Value* x_loc) {
+          for (int64 id : tiled_param_ids) {
+            IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
+            llvm::Value* shmem_buffer = param_shmem_buffers[id];
+            // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
+            // global variables, so LLVM can't infer much about it.
+            Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
+                                                              "input_element"),
+                  GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc}));
+          }
+        });
 
-  // Wait for all threads to reach this point, lest we copy a value from tile to
-  // output before the other thread copies it from input to tile.
-  // This is `__syncthreads` in CUDA.
-  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    // If shared memory transpose is needed, wait for all threads to reach this
+    // point, lest we copy a value from tile to output before the other thread
+    // copies it from input to tile. This is `__syncthreads` in CUDA.
+    if (!tiled_param_ids.empty()) {
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    }
 
-  llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
+    llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
+    kernel_info->SetTiledParamInfo(&tiled_param_info);
 
-  const IrArray::Index output_index =
-      offset_dim(offset_dim(output_tile_origin, x, /*dim=*/2), y, /*dim=*/1);
+    const IrArray::Index output_index =
+        output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
+            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
 
-  // Write to output[index] by emitting code like normal, except that values for
-  // the tiled parameters are read from the shmem buffers.
-  if (hlo->opcode() == HloOpcode::kCopy) {
+    // Write to output[index] by emitting code like normal, except that values
+    // for the tiled parameters are read from the shmem buffers.
     emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          // TODO(jlebar): Add AA metadata to this load.
-          llvm::Instruction* load_from_shmem_buffer =
-              Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}),
-                   "output_element");
-          output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-              index, load_from_shmem_buffer, &b_);
-        });
-  } else {
-    CHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
-    emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
-                                             GetNestedComputer());
-          FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo),
-                                       &elem_emitter);
-          tiled_param_info.set_y(y_loc);
-          fused_emitter.SetTiledParameterInfo(&tiled_param_info);
-          TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
-          IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex(
-              index, output_reduced_shapes[0], output_arrays[0].GetShape(),
-              &b_);
-          const llvm_ir::ElementGenerator& output_generator =
-              fused_emitter.GetRootGenerator();
-          llvm::Value* output_value =
-              output_generator(untiled_index).ValueOrDie();
-          if (hlo->IsMultiOutputFusion()) {
-            CHECK(output_value->getType()->isStructTy());
-            CHECK_EQ(output_value->getType()->getStructNumElements(),
-                     output_in_reduced_shape_arrays.size());
-            for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) {
-              output_in_reduced_shape_arrays[i].EmitWriteArrayElement(
-                  index, ExtractValue(output_value, i), &b_);
-            }
-          } else {
-            output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-                index, output_value, &b_);
-          }
+        output_index, "output", output_tile_bounds[1], output_tile_bounds[2],
+        [&](const IrArray::Index& index, llvm::Value* y_loc,
+            llvm::Value* x_loc) {
+          kernel_generator.GetTileElementGenerator()(unnested_hlo, index,
+                                                     kernel_info, y_loc, x_loc);
         });
+    // If a tile block contains multiple tiles and shared memory buffers are
+    // used, we need to wait for all threads to finish using the shared memory
+    // buffer for the current tile before we move on to process the next tile
+    // and overwrite the shared memory buffers.
+    if (block_contains_multi_tiles && !tiled_param_ids.empty()) {
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    }
+  };
+
+  const BlockPrologueGenerator& block_prologue_generator =
+      kernel_generator.GetBlockPrologueGenerator();
+  if (block_prologue_generator) {
+    block_prologue_generator(unnested_hlo, kernel_info);
   }
 
-  // For multioutput fusion, emit a tuple with all the individual outputs.
-  if (hlo->IsMultiOutputFusion()) {
-    llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), output_arrays, &b_, module_);
+  EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty);
+
+  const BlockEpilogueGenerator& block_epilogue_generator =
+      kernel_generator.GetBlockEpilogueGenerator();
+  if (block_epilogue_generator) {
+    block_epilogue_generator(unnested_hlo, kernel_info);
+  }
+
+  // For multioutput fusion, emit a tuple with pointers to all the individual
+  // outputs.
+  if (unnested_hlo->IsMultiOutputFusion()) {
+    std::vector<IrArray> output_arrays =
+        ConstructIrArrayForOutputs(*unnested_hlo);
+    llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo), output_arrays,
+                       &b_, module_);
   }
 
   return launch_dimensions;
 }
 
+// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
+// algorithm to improve the memory access patterns for the input parameters
+// with a shape that is a 0-2-1 transpose of the output tensor shape.
+//
+// For the purpose of tiling, the output tensors have a logical shape of three
+// components 0-2-1 while the relevant input parameters have a logical shape
+// of three components 0-1-2 in the order major to minor. The x- and y-
+// dimensions of the tensors are tiled in square tiles with an edge length
+// `kTileSize`. Each thread block of `kTileSize` x `kNumRows` threads
+// transposes one tile: each thread copies kTileSize/kNumRows elements from
+// the input to a shared memory tile, then the otherwise "regular HLO kernel"
+// reads from the shared memory instead of the original input.
+//
+// This is similar to the following CUDA algorithm in TensorFlow:
+// https://goo.gl/MStRV6.
+//
+// `kTileSize` should usually be same as warp size. We currently choose 32 for
+// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`.
+//
+// TODO(b/33320379): Here each block transposes 1 tile. It may be more
+// efficient to launch fewer blocks so each transposes many tiles.
+LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
+    HloInstruction* hlo, absl::Span<const int64> reduced_output_dims,
+    absl::Span<const int64> tiled_param_ids) {
+  constexpr int kNumRows = 4;
+  KernelMappingScheme mapping_scheme(
+      reduced_output_dims, /*tile_size_y=*/kWarpSize,
+      /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1},
+      /*num_threads_y=*/kNumRows,
+      /*num_threads_x=*/kWarpSize, &b_);
+  TileElementGenerator element_generator;
+  if (hlo->opcode() == HloOpcode::kCopy) {
+    element_generator = [&](HloInstruction* hlo,
+                            const llvm_ir::IrArray::Index& index,
+                            const KernelCodegenInfo* kernel_info,
+                            llvm::Value* y_loc, llvm::Value* x_loc) {
+      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc);
+    };
+  } else {
+    DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
+    element_generator = [&](HloInstruction* hlo,
+                            const llvm_ir::IrArray::Index& index,
+                            const KernelCodegenInfo* kernel_info,
+                            llvm::Value* y_loc, llvm::Value* x_loc) {
+      EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc);
+    };
+  }
+  KernelCodegenInfo kernel_info(&mapping_scheme);
+  KernelCodeGenerator kernel_generator(std::move(element_generator));
+  return EmitKernel(hlo, tiled_param_ids, kernel_generator, &kernel_info);
+}
+
+namespace {
+// Returns true to indicate it is safe to use the tile based shared memory
+// transpose implementation to implement the kernel for the instruction.
+//
+// An instruction is not safe for such an implementation if it can change the
+// element order of a tensor without changing the dimension of the tensor, and
+// the instruction has a corresponding elemental_ir_emitter.
+bool IsInstructionSafeForTileBasedTranspose(const HloInstruction* hlo) {
+  auto is_safe_for_tile_based_transpose = [&](const HloInstruction* instr) {
+    HloOpcode opcode = instr->opcode();
+    CHECK_NE(opcode, HloOpcode::kFusion);
+    return (opcode != HloOpcode::kReverse && opcode != HloOpcode::kGather);
+  };
+
+  if (hlo->opcode() == HloOpcode::kFusion) {
+    return absl::c_all_of(hlo->fused_instructions_computation()->instructions(),
+                          is_safe_for_tile_based_transpose);
+  }
+
+  return is_safe_for_tile_based_transpose(hlo);
+}
+}  // namespace
+
 bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   HloOpcode opcode = hlo->opcode();
   CHECK(opcode == HloOpcode::kFusion || opcode == HloOpcode::kCopy);
@@ -3465,8 +3748,8 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
                                   ? ShapeUtil::GetSubshape(hlo->shape(), {0})
                                   : hlo->shape();
 
-  // If the output_shape is reduced to 021 shape, find all the parameters of the
-  // hlo that are in the corresponding 012 shape.
+  // If the output_shape is reduced to 021 shape, find all the parameters of
+  // the HLO that are in the corresponding 012 shape.
   std::vector<int64> params_012;
   optional<std::vector<int64>> reduced_dims_021;
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
@@ -3498,10 +3781,14 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
     return false;
   }
 
+  if (!IsInstructionSafeForTileBasedTranspose(hlo)) {
+    return false;
+  }
+
   // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
-  // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb
-  // shared memory per SM.  (This is increased to 96kb in Volta, but we don't
-  // use this, in part because it eats into our L1 cache space.)
+  // elements are of size 4 bytes), and CUDA has an architectural limit of
+  // 48kb shared memory per SM.  (This is increased to 96kb in Volta, but we
+  // don't use this, in part because it eats into our L1 cache space.)
   //
   // For correctness we need to ensure that we don't make more than 48kb worth
   // of shmem tiles per block.  And for performance, we'd probably like to use
@@ -3509,9 +3796,9 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   // gpu core.
   //
   // We say without benchmarks that we want at least 3 threads/block,
-  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We choose
-  // which params get the shmem transpose treatment arbitrarily; it's not clear
-  // if there's a Right Choice.
+  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We
+  // choose which params get the shmem transpose treatment arbitrarily; it's
+  // not clear if there's a Right Choice.
   //
   // This is only sound if tiled transposes are the only place where we use
   // shared memory in fusions.  If in the future other fusible ops use shared
@@ -3565,10 +3852,10 @@ Status IrEmitterUnnested::EmitConstantGlobals() {
     }
 
     // These globals will be looked up by name by GpuExecutable so we need to
-    // give them an external linkage.  Not all of their uses are visible in the
-    // LLVM IR (e.g. TupleThunk) so we can't give then a linkage that merely
-    // preserves their names (like available_externally), we also need to ensure
-    // that they stick around even if they're "unused".
+    // give them an external linkage.  Not all of their uses are visible in
+    // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that
+    // merely preserves their names (like available_externally), we also need
+    // to ensure that they stick around even if they're "unused".
     //
     // We may have to be more more clever here in the future if we notice that
     // we're keeping around too many globals because of their linkage.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 334c0b3c20b0888fa9b167a8979221f0184a82e7..97a1e10455336cd4842275b6cf1482614bfbfa60 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h"
 
 namespace xla {
@@ -47,6 +48,94 @@ namespace gpu {
 //
 class IrEmitterUnnested : public IrEmitter {
  public:
+  // Parameter block_contains_multi_tiles indicates whether a tile block
+  // consists of multiple tiles or not. If the tile block contains only one
+  // tile, there is no need to use atomic operation to accumulate a local result
+  // to a global result to implement reduction.
+  using TileGenerator =
+      std::function<void(const llvm_ir::IrArray::Index& output_tile_origin,
+                         absl::Span<llvm::Value* const> output_tile_bounds,
+                         bool block_contains_multi_tiles)>;
+  // KernelCodegenInfo records the common information to support the code
+  // generation for a kernel to process tensor elements by blocks. A block of
+  // tensor elements may contain one or multiple tiles. The code generators that
+  // generate code for tile elements or block prologue/epilogue refer to this
+  // class in their prototypes. If the implementations of such code generators
+  // require other information that are specific to the HLO instructions, the
+  // implementations need to define and use derived classes of this class.
+  class KernelCodegenInfo {
+   public:
+    explicit KernelCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme)
+        : mapping_scheme_(mapping_scheme),
+          tiled_param_info_(nullptr),
+          lane_id_(nullptr) {}
+
+    void SetLaneId(llvm::Value* v) { lane_id_ = v; }
+    void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) {
+      CHECK_EQ(tiled_param_info_, nullptr);
+      tiled_param_info_ = tiled_param_info;
+    }
+
+    llvm::Value* GetLaneId() const { return lane_id_; }
+    llvm_ir::KernelMappingScheme* GetKernelMappingScheme() const {
+      return mapping_scheme_;
+    }
+    llvm_ir::TiledParameterInfo* GetTiledParameterInfo() const {
+      return tiled_param_info_;
+    }
+
+   private:
+    llvm_ir::KernelMappingScheme* mapping_scheme_;
+    llvm_ir::TiledParameterInfo* tiled_param_info_;
+    llvm::Value* lane_id_;
+  };
+
+  // A function object to prepare for the code generation for a tile block.
+  using BlockPrologueGenerator =
+      std::function<void(HloInstruction* hlo, KernelCodegenInfo* kernel_info)>;
+  // A function object to finalize the code generation for a tile block.
+  using BlockEpilogueGenerator =
+      std::function<void(HloInstruction* hlo, KernelCodegenInfo* kernel_info)>;
+  // A function object to generate code to process one element in a tile.
+  //
+  // hlo: the instruction for which the code is generated for.
+  // index: the index for the first output element of the current thread.
+  // y_loc: The y coordinate within a tile.
+  // x_loc: The x coordinate within a tile.
+  // kernel_info: Other information to support the kernel code generation.
+  using TileElementGenerator = std::function<void(
+      HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+      const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+      llvm::Value* x_loc)>;
+
+  // KernelCodeGenerator records the code generator objects that generate code
+  // for tile elements or tile block prologue/epilogue.
+  class KernelCodeGenerator {
+   public:
+    explicit KernelCodeGenerator(
+        TileElementGenerator tile_element_generator,
+        BlockPrologueGenerator block_prologue_generator = {},
+        BlockEpilogueGenerator block_epilogue_generator = {})
+        : tile_element_generator_(std::move(tile_element_generator)),
+          block_prologue_generator_(std::move(block_prologue_generator)),
+          block_epilogue_generator_(std::move(block_epilogue_generator)) {}
+
+    const TileElementGenerator& GetTileElementGenerator() const {
+      return tile_element_generator_;
+    }
+    const BlockPrologueGenerator& GetBlockPrologueGenerator() const {
+      return block_prologue_generator_;
+    }
+    const BlockEpilogueGenerator& GetBlockEpilogueGenerator() const {
+      return block_epilogue_generator_;
+    }
+
+   private:
+    TileElementGenerator tile_element_generator_;
+    BlockPrologueGenerator block_prologue_generator_;
+    BlockEpilogueGenerator block_epilogue_generator_;
+  };
+
   IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
                     const HloComputation* hlo_computation,
                     IrEmitterContext* ir_emitter_context);
@@ -205,22 +294,32 @@ class IrEmitterUnnested : public IrEmitter {
   LaunchDimensions EmitHlo021Tile(HloInstruction* hlo,
                                   absl::Span<const int64> reduced_output_dims,
                                   absl::Span<const int64> tiled_param_ids);
+  // Emits a kernel for an unnested HLO instruction.
+  LaunchDimensions EmitKernel(HloInstruction* unnested_hlo,
+                              absl::Span<const int64> param_ids,
+                              const KernelCodeGenerator& kernel_generator,
+                              KernelCodegenInfo* kernel_info);
+  void EmitBlock(const TileGenerator& emit_one_tile,
+                 const KernelCodegenInfo* kernel_info,
+                 KernelSupportLibrary& ksl, llvm::Type* index_ty);
+  // Emits code to process a tensor element in a tile for the given kCopy HLO
+  // that performs a 0-2-1 transpose.
+  void EmitTileElementForCopy(HloInstruction* hlo,
+                              const llvm_ir::IrArray::Index& index,
+                              const KernelCodegenInfo* kernel_info,
+                              llvm::Value* y_loc, llvm::Value* x_loc);
+  // Emits code to process a tensor element in a tile for the given kLoop fusion
+  // HLO containing parameters that are 0-2-1 transpose of its outputs.
+  void EmitTileElementForFusion(HloInstruction* hlo,
+                                const llvm_ir::IrArray::Index& index,
+                                const KernelCodegenInfo* kernel_info,
+                                llvm::Value* y_loc, llvm::Value* x_loc);
 
   // Generates the IrArray for each input of an hlo and returns a vector that
   // constains such IrArrays.
   std::vector<llvm_ir::IrArray> ConstructIrArrayForInputs(
       const HloInstruction& hlo);
 
-  // For each output of the `hlo` instruction, constructs the reduced shape for
-  // the output with the given `reduced_output_dims` and cast the original
-  // output IrArray element in `output_arrays` to the reduced shape. Returns
-  // the number of outputs.
-  int ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-      const HloInstruction& hlo,
-      const std::vector<llvm_ir::IrArray>& output_arrays,
-      absl::Span<const int64> reduced_output_dims,
-      std::vector<Shape>* output_reduced_shapes,
-      std::vector<llvm_ir::IrArray>* output_in_reduced_shape_arrays);
   // For each input of the `hlo` instruction, checks its value in
   // `param_buffers` to find out whether the input has a reduced shape. If the
   // input has a reduced shape, constructs the reduced shape for the input and
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index 8751e3a9c2a4c8da46d3ecd8437629450d4a2ba2..364f69a69d47644b383af9cf6865c93360b82bab 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -453,12 +453,12 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
   // * 3-6 gives similar results as 2;
   // * >6 start hurting the performance of at least dot product kernels.
   //
-  // TODO(jingyue): The current threshold only considers the numbr of IR
+  // TODO(jingyue): The current threshold only considers the number of IR
   // instructions which do not accurately reflect the true cost. We need a
   // better cost model.
   FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
-  // TODO(b/22073864): Increase limit when scan memory dependency.
-  // This helps to reduce more redundant load instructions.
+  // Increase limit when scanning memory dependencies.  This helps to reduce
+  // more redundant load instructions.
   //
   // The specific value is currently large enough for s3d in shoc benchmark,
   // which contains a lot of load instructions and many arithmetic instructions
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index 9427d3d54addc7d794ddc0a8f4c45b39b248bc5f..d9b06828e2b5d334873c88cb49c2e0d5675bb5fe 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -140,6 +140,18 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
     return false;
   }
 
+  // The emitter only supports in-place DUS for fusions with a single DUS at the
+  // root. Don't sibling fuse DUS for now.
+  // TODO(b/119178699): Multi-output fusing DUS can improve performance if we
+  // share the input and output buffers and add support to the emitter.
+  if (instr1->fused_expression_root()->opcode() ==
+          HloOpcode::kDynamicUpdateSlice ||
+      (instr2->opcode() == HloOpcode::kFusion &&
+       instr2->fused_expression_root()->opcode() ==
+           HloOpcode::kDynamicUpdateSlice)) {
+    return false;
+  }
+
   // Do this check last, as it may be expensive.
   return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2);
 }
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 1d4856e0cae163bbd9ab741917b85792097d8512..d16c87ba5c63aa582753fe949e9e39ee2d8b81e5 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -580,7 +580,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   //   ...
   // where each of the (pi * pj)'s is represented as a fusion node so that
   // multi-output fusion will pay attention to it.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
 
@@ -621,5 +621,39 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   }
 }
 
+TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
+  auto module = ParseHloString(R"(HloModule dus_mof
+    fusion.1 {
+      p.0 = f16[50,96,1024]{2,1,0} parameter(0)
+      p.1 = s32[1]{0} parameter(1)
+      p.2 = f16[1,96,1024]{2,1,0} parameter(2)
+      c.0 = s32[] constant(0)
+      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+    }
+
+    fusion.2 {
+      p.0 = f16[50,96,1024]{2,1,0} parameter(0)
+      p.1 = s32[1]{0} parameter(1)
+      p.2 = f16[1,96,1024]{2,1,0} parameter(2)
+      c.0 = s32[] constant(0)
+      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+    }
+
+    ENTRY entry {
+      p.00 = f16[50,96,1024]{2,1,0} parameter(0)
+      p.01 = f16[50,96,1024]{2,1,0} parameter(1)
+      p.1 = s32[1]{0} parameter(2)
+      p.2 = f16[1,96,1024]{2,1,0} parameter(3)
+
+      f1 = f16[50,96,1024] fusion(p.00, p.1, p.2), kind=kLoop, calls=fusion.1
+      f2 = f16[50,96,1024] fusion(p.01, p.1, p.2), kind=kLoop, calls=fusion.2
+      ROOT tuple = (f16[50,96,1024],f16[50,96,1024]) tuple(f1, f2)
+    })")
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index de04ed85c30717f5be7c5485ff3b68270c8ec188..637b861f70235f17e8e739907a3f262b7004ee7c 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -67,6 +67,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
@@ -142,6 +143,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
                          Compiler* compiler) {
   {
     HloPassPipeline pipeline("optimization");
+    pipeline.AddPass<HloGetDimensionSizeRewriter>();
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
     pipeline.AddPass<GpuHloSupportChecker>();
@@ -177,9 +179,10 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
-      pass.AddPass<AlgebraicSimplifier>(
-          /*is_layout_sensitive=*/false,
+      AlgebraicSimplifierOptions options(
           [](const Shape&, const Shape&) { return false; });
+      options.set_enable_permutation_sort_replacement(true);
+      pass.AddPass<AlgebraicSimplifier>(options);
       pass.AddPass<TupleSimplifier>();
       pass.AddPass<WhileLoopConstantSinking>();
       pass.AddPass<WhileLoopSimplifier>();
@@ -248,11 +251,13 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-        /*is_layout_sensitive=*/true,
+    AlgebraicSimplifierOptions options(
         /*valid_bitcast_callback=*/[](const Shape&, const Shape&) {
           return true;
         });
+    options.set_is_layout_sensitive(true);
+    options.set_enable_permutation_sort_replacement(true);
+    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
 
     // Choose the fastest algorithm for each conv.
     //
@@ -810,7 +815,7 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
             // binaries are not available. We don't want to spam logs with
             // identical warnings in this case.
 
-            // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N
+            // TODO(jlebar): we should implement a LOG_FIRST_N and LOG_EVERY_N
             // for more general usage.
             static std::atomic<bool> warning_done(false);
             log_warning = !warning_done.exchange(true);
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 5b6cf2c04d05378a363232e33a6df6432cd6848e..4775baf44aecfe6adaf2bf0d2791595436635b16 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -122,7 +122,7 @@ std::unique_ptr<StreamAssignment> AssignStreams(const HloModule& module) {
   auto stream_assignment = absl::make_unique<StreamAssignment>();
   const HloComputation& computation = *module.entry_computation();
   std::unique_ptr<HloReachabilityMap> reachability =
-      computation.ComputeReachability();
+      HloReachabilityMap::Build(&computation);
   std::vector<const HloInstruction*> seen_gemms;
   // The execution of different RNG Hlo instructions in the same module updates
   // a common global variable. To avoid a race condition, we simply assign all
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
index c4f43cc9a614283acb376b5f98e4976615b590ad..31a5d7a8c04e9863830e2026fc73cd7ded8c322e 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@@ -21,16 +21,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
 namespace gpu {
 
-class StreamAssignmentTest : public HloVerifiedTestBase {
+class StreamAssignmentTest : public HloTestBase {
  protected:
-  std::unique_ptr<HloModule> CreateNewModule() {
+  std::unique_ptr<HloModule> CreateNewVerifiedModule() {
     HloModuleConfig config;
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
@@ -55,7 +55,7 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) {
   HloInstruction* dot2 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(dot2));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
@@ -76,7 +76,7 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
@@ -120,7 +120,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) {
   HloInstruction* d40 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(d40));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
index ed46f08d5970d479db33a7b9ad416a1480535764..d798b31643782eb25bba08227e29903ec0e7a597 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -37,7 +37,7 @@ cc_library(
     hdrs = ["gpu_codegen_test.h"],
     tags = tf_cuda_tests_tags(),
     deps = [
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla/service:gpu_plugin",
         "//tensorflow/compiler/xla/service/gpu:gpu_executable",
         "//tensorflow/compiler/xla/tests:filecheck",
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
index 79e77d4c4d649020cf52ac25c220c3f90e8469b9..9e3ff8750b88d08bcbc1aae3faead5aecfa19848 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/tests/filecheck.h"
 #include "tensorflow/core/platform/logging.h"
@@ -23,9 +23,10 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-std::unique_ptr<HloModule> GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) {
+std::unique_ptr<HloModule> GpuCodegenTest::CreateNewUnverifiedModuleWithFTZ(
+    bool ftz) {
   HloModuleConfig config;
-  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  auto debug_options = GetDebugOptionsFromFlags();
   debug_options.set_xla_gpu_ftz(ftz);
   debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
index e4a3573babb7ed746504c1466f85b582aa4d044f..d917320e36363c4fa7e4c0055e8f3345cbc610a2 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -26,9 +26,9 @@ namespace gpu {
 // Tests that verify IR or PTX emitted by the GPU backend is as expected.
 class GpuCodegenTest : public LlvmIrGenTestBase {
  protected:
-  // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz
-  // option.
-  std::unique_ptr<HloModule> CreateNewModuleWithFTZ(bool ftz);
+  // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
+  // the ftz option.
+  std::unique_ptr<HloModule> CreateNewUnverifiedModuleWithFTZ(bool ftz);
 
   // Compiles the given HLO module to PTX and verifies the PTX matches the given
   // FileCheck pattern.  (See http://llvm.org/docs/CommandGuide/FileCheck.html).
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index 780539c164277f14c2bd964024f7c3ca179f4ada..a1ed8499040359fe7265a7317b0577a990a2234c 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -46,7 +46,7 @@ TEST_F(GpuCopyTest, UseMemcpy) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   // There should not be any kernel prefixed "copy".
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index 177b94934c7f519172508b5cc6e088f908401193..d0ccd8619bde9ddd560989380b403efed5c5f42c 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -39,7 +39,7 @@ class GpuFtzTest : public GpuCodegenTest {
         /* parameter_number=*/1, param_shape, "y"));
     builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y));
 
-    auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+    auto hlo_module = CreateNewUnverifiedModuleWithFTZ(ftz_);
     hlo_module->AddEntryComputation(builder.Build());
     return hlo_module;
   }
@@ -54,7 +54,7 @@ class GpuFtzTest : public GpuCodegenTest {
         /* parameter_number=*/0, param_shape, "x"));
     builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x));
 
-    auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+    auto hlo_module = CreateNewUnverifiedModuleWithFTZ(ftz_);
     hlo_module->AddEntryComputation(builder.Build());
     return hlo_module;
   }
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index a06576df7b874745236a8d9075355a01ec42e777..6814be779e0b02c38e3bc7008f036b845d88cb6f 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -51,7 +51,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
 
   // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index 15d1e269cc22b88f5269175084f20600f165011c..a302b582ede3723acd118d2e4a4bb3efdf7a4d0b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -193,6 +193,33 @@ TEST_F(GpuKernelTilingTest,
                      /*match_optimized_ir=*/true);
 }
 
+TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
+  const char *const kHloString = R"(
+    HloModule FusionTransposeWithReverseNotTiled
+    fused_computation.1 {
+      arg0 = f32[128,64]{1,0} parameter(0)
+      copy0 = f32[128,64]{0,1} copy(arg0)
+      ROOT reverse0 = f32[128,64]{0,1} reverse(copy0), dimensions={0}
+    }
+
+    ENTRY reverse_break_assumption {
+      param0 = f32[128,64]{1,0} parameter(0)
+      ROOT fusion0 = f32[128,64]{0,1} fusion(param0), kind=kLoop,
+        calls=fused_computation.1
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index 6a9ecd9dae7c9ddde0b56d8615e4a39fb3df0af9..3019215c015a4e0aa094a62424d650ced0de2a0e 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -48,7 +48,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyPtx(std::move(hlo_module), R"(
@@ -73,7 +73,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
   builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyPtx(std::move(hlo_module), R"(
@@ -95,7 +95,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
 // reduce in the foreseeable future.  But if that turns out to be wrong, I give
 // you, future reader, permission to delete this test.
 TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation* reduce_computation;
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
index 15198865bda98f9718342d5a444a20305f923b48..ca0a78034d7dc83d17ad72202914d95f37ac122b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -47,7 +47,7 @@ TEST_F(GpuNoAliasTest, Concat) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyIr(std::move(hlo_module),
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
index 0f2d5568cafc9db0f5f067437fdd5e2e775ad2c8..4636f1d9d20b8c213ffadec427b3820a89c68a7f 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
@@ -85,7 +85,7 @@ TEST_F(GpuUnrollingTest, UnrollFourTimes) {
 TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
   // The default unrolling factor is 4.
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  config.set_debug_options(GetDebugOptionsFromFlags());
   auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
 
   CompileAndVerifyIr(std::move(hlo_module),
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 141f3219387940a08ef22cbcc0be0971a14c2cd6..6b2d76764a077dc6cfa3f9ddc6e525ab330323be 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -45,7 +45,7 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands(
 ThunkSchedule::ThunkSchedule(
     std::unique_ptr<ThunkSequence> thunks,
     std::unique_ptr<StreamAssignment> stream_assignment,
-    const std::vector<const HloInstruction*>& hlo_total_order)
+    const std::vector<HloInstruction*>& hlo_total_order)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
   std::unordered_map<const HloInstruction*, Thunk*> hlo_to_thunk;
@@ -53,7 +53,7 @@ ThunkSchedule::ThunkSchedule(
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
-  for (const HloInstruction* hlo : hlo_total_order) {
+  for (HloInstruction* hlo : hlo_total_order) {
     if (hlo_to_thunk.count(hlo)) {
       thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo));
     }
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index d3352994f845a535233612a17e19107511ce0622..43b628a1baf0e79a3197f3cfad3547991642eaed 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -46,7 +46,7 @@ class ThunkSchedule {
  public:
   ThunkSchedule(std::unique_ptr<ThunkSequence> thunks,
                 std::unique_ptr<StreamAssignment> stream_assignment,
-                const std::vector<const HloInstruction*>& hlo_total_order);
+                const std::vector<HloInstruction*>& hlo_total_order);
 
   // Returns the total order of executing all the thunks.
   const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc
index 5fa9e91050a85b67eb22a48d47e4dd157a53c699..3d00ac4dc7b57664a317157c093d7ffaa01b4fd6 100644
--- a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -32,7 +32,7 @@ namespace gpu {
 namespace {
 using match::Concatenate;
 
-class VariadicOpSplitterTest : public HloVerifiedTestBase {};
+class VariadicOpSplitterTest : public HloTestBase {};
 
 TEST_F(VariadicOpSplitterTest, DontSplit) {
   auto module = ParseAndReturnVerifiedModule(R"(
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index 926b59a1b854bd3d7d2699124e10b70147e52e2a..2dce7749bbd8da2673ae607eee3d731d9917e8fe 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -29,7 +29,7 @@ namespace {
 class WhileTransformerTest : public HloTestBase {
  protected:
   WhileTransformerTest()
-      : module_(CreateNewModule()),
+      : module_(CreateNewVerifiedModule()),
         induction_variable_shape_(ShapeUtil::MakeShape(S32, {})),
         data_shape_(ShapeUtil::MakeShape(F32, {8})),
         condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {}
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index e30e7667f3015bc7bfe67c65147a5016332780f7..dc40b9446ad1bffcb757543e52fc9ab20de6d52e 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -30,16 +30,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class MinimumMemoryForSequenceTest : public HloVerifiedTestBase {};
+class MinimumMemoryForSequenceTest : public HloTestBase {};
 
 TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
@@ -86,7 +86,7 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
     return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
   };
 
-  HloSchedule schedule(module);
+  HloSchedule schedule(module.get());
   schedule.set_sequence(cond_computation,
                         {cond_param, cond_iter, cond_data, cond_lt});
   schedule.set_sequence(body_computation, {body_param});
@@ -258,7 +258,7 @@ class HeapSimulatorTracker {
   // Constructor for testing a single entry computation.
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
-      const std::vector<const HloInstruction*>& instruction_sequence) {
+      const std::vector<HloInstruction*>& instruction_sequence) {
     HloModuleConfig config;
     module_ = absl::make_unique<HloModule>(name, config);
     module_->AddEntryComputation(std::move(computation));
@@ -286,7 +286,7 @@ class HeapSimulatorTracker {
   // Similar to the single entry computation constructor above, but runs the
   // simulation over the entire module.
   void RunWholeModule(
-      const std::vector<const HloInstruction*>& full_module_sequence) {
+      const std::vector<HloInstruction*>& full_module_sequence) {
     points_to_analysis_ =
         TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
 
@@ -294,7 +294,7 @@ class HeapSimulatorTracker {
     HloSchedule schedule(module_.get());
     absl::flat_hash_map<const HloInstruction*, int> reverse_position;
     for (int i = 0; i < full_module_sequence.size(); ++i) {
-      const HloInstruction* instruction = full_module_sequence[i];
+      HloInstruction* instruction = full_module_sequence[i];
       schedule.GetOrCreateSequence(instruction->parent())
           .push_back(instruction);
       reverse_position[instruction] = full_module_sequence.size() - i;
@@ -351,7 +351,7 @@ class HeapSimulatorTracker {
   HeapSimulator::Result result_;
 };
 
-class HeapSimulatorTest : public HloVerifiedTestBase {
+class HeapSimulatorTest : public HloTestBase {
  protected:
   HeapSimulatorTest() {}
   ~HeapSimulatorTest() override {}
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index dbab62f847e8ca5e0b46dfd4162a0f4222640252..913d4c34b43087d322634dbc436f2f7c5666c77a 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -251,6 +251,41 @@ message HloInputOutputAliasProto {
   repeated AliasEntryProto entries = 1;
 }
 
+message DynamicParameterBindingProto {
+  // A list of bindings which indicates that the `target_dim_num` in
+  // the subshape `target_param_index` of parameter `target_param_num`
+  // is a dynamic dimension and its real dynamic size is represented
+  // by `dynamic_param_index` in parameter `dynamic_param_num`.
+  //
+  // As an example, imagine we have a program:
+  //
+  // ENTRY main {
+  //   a = f32[] parameter(0)
+  //   b = f32[10] parameter(1)
+  //   ROOT root = (f32[], f32[10]) tuple(%a, %b)
+  // }
+  //
+  // Let's say 'b' (param index 1) is a dynamic shape whose input has
+  // an upperbound of 10 and real size is determined at runtime.'a'
+  // represents the real size of b's first dimension.
+  //
+  // In this case, the fields are set in the following way:
+  // dynamic_param_num = 1
+  // dynamic_param_index = {}
+  // target_param_num = 0
+  // target_param_index = {}
+  // target_param_dim = 0
+  message Binding {
+    int64 dynamic_param_num = 1;
+    repeated int64 dynamic_param_index = 2;
+    int64 target_param_num = 3;
+    repeated int64 target_param_index = 4;
+    int64 target_param_dim_num = 5;
+  }
+
+  repeated Binding entries = 1;
+}
+
 // Serialization of HloModule.
 message HloModuleProto {
   string name = 1;
@@ -272,6 +307,8 @@ message HloModuleProto {
 
   // Describes alias information between inputs and outputs.
   HloInputOutputAliasProto input_output_alias = 8;
+
+  DynamicParameterBindingProto dynamic_parameter_binding = 9;
 }
 
 // Serialization of LogicalBuffer.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 5c8d97b2d15e15d15cb8014a7d25b37437ce8aec..7e6150e94153cd15463725e862ce1b8593f2c991 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -39,17 +39,17 @@ namespace {
 
 using ::testing::UnorderedElementsAre;
 
-class HloAliasAnalysisTest : public HloVerifiedTestBase {
+class HloAliasAnalysisTest : public HloTestBase {
  protected:
-  HloAliasAnalysisTest() : HloVerifiedTestBase() {
-    module_ = CreateNewModule();
+  HloAliasAnalysisTest() : HloTestBase() {
+    module_ = CreateNewVerifiedModule();
   }
 
   // Run alias analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
   HloAliasAnalysis& RunAnalysis() {
     hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
-    analysis_ = HloAliasAnalysis::Run(module_,
+    analysis_ = HloAliasAnalysis::Run(module_.get(),
                                       /*fusion_can_share_buffer=*/nullptr)
                     .ConsumeValueOrDie();
     return *analysis_;
@@ -93,7 +93,7 @@ class HloAliasAnalysisTest : public HloVerifiedTestBase {
   // never occurs, but HLO graphs with interference can be explicitly
   // constructed.
   bool AnyValuesInSameBufferInterfere() {
-    DependencyHloOrdering ordering(module_);
+    DependencyHloOrdering ordering(module_.get());
     for (const HloBuffer& buffer : analysis_->buffers()) {
       for (const HloValue* value_a : buffer.values()) {
         for (const HloValue* value_b : buffer.values()) {
@@ -110,7 +110,7 @@ class HloAliasAnalysisTest : public HloVerifiedTestBase {
     return false;
   }
 
-  HloModule* module_;
+  std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloAliasAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
@@ -638,7 +638,7 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) {
   module_->AddEntryComputation(builder.Build());
 
   FlattenCallGraph flattener;
-  TF_ASSERT_OK(flattener.Run(module_).status());
+  TF_ASSERT_OK(flattener.Run(module_.get()).status());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -1012,7 +1012,7 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) {
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
-  DependencyHloOrdering ordering(module_);
+  DependencyHloOrdering ordering(module_.get());
   EXPECT_FALSE(analysis.HasLiveRangeInterference(ordering));
 }
 
@@ -1054,13 +1054,13 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) {
   {
     // Dependency ordering should interfere because the negate and while are
     // unordered.
-    DependencyHloOrdering ordering(module_);
+    DependencyHloOrdering ordering(module_.get());
     EXPECT_TRUE(analysis.HasLiveRangeInterference(ordering));
   }
 
   // For a sequential order, if there is interference iff the negate is after
   // the while.
-  HloSchedule schedule(module_);
+  HloSchedule schedule(module_.get());
   schedule.set_sequence(body, {body_param, body_root});
   schedule.set_sequence(condition, {cond_param, cond_root});
   {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index b0f7cd91ad1db0a59c09cfbfc1885813dc57e01e..65bd251dd8642314e62dffc118e30e62de1844e4 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -321,7 +322,7 @@ void HloComputation::ComputeInstructionPostOrder(
 
     // Add the operands to the stack in reverse order so the first operand is
     // processed first. This will produce a more natural ordering and a nicer
-    // result for thigns like HLO stringification.
+    // result for things like HLO stringification.
     const auto& operands = current->operands();
     for (int64 i = operands.size() - 1; i >= 0; --i) {
       dfs_stack.emplace_back(operands[i]);
@@ -739,72 +740,6 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
   return RemoveInstructionAndUnusedOperands(old_instruction);
 }
 
-std::unique_ptr<HloReachabilityMap> HloComputation::ComputeReachability()
-    const {
-  const auto& all = MakeInstructionPostOrder();
-  auto result = absl::make_unique<HloReachabilityMap>(all);
-  auto channel_dependency_map = ComputeChannelDependencies();
-
-  std::vector<HloInstruction*> inputs;
-  for (const HloInstruction* hlo : all) {
-    inputs.assign(hlo->operands().begin(), hlo->operands().end());
-    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
-                  hlo->control_predecessors().end());
-
-    switch (hlo->opcode()) {
-      case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(hlo->channel_id());
-        if (it != channel_dependency_map.end()) {
-          absl::c_copy(it->second, std::back_inserter(inputs));
-        }
-        break;
-      }
-      case HloOpcode::kCrossReplicaSum: {
-        auto all_reduce_id = hlo->all_reduce_id();
-        if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            absl::c_copy(it->second, std::back_inserter(inputs));
-          }
-        }
-        break;
-      }
-      default:
-        break;
-    }
-
-    result->FastSetReachabilityToUnion(inputs, hlo);
-  }
-  return result;
-}
-
-void HloComputation::UpdateReachabilityThroughInstruction(
-    const HloInstruction* instruction, HloReachabilityMap* reachability_map) {
-  std::queue<const HloInstruction*> worklist;
-  worklist.push(instruction);
-
-  std::vector<HloInstruction*> inputs;
-
-  while (!worklist.empty()) {
-    const HloInstruction* item = worklist.front();
-    worklist.pop();
-
-    inputs.assign(item->operands().begin(), item->operands().end());
-    inputs.insert(inputs.end(), item->control_predecessors().begin(),
-                  item->control_predecessors().end());
-
-    if (reachability_map->SetReachabilityToUnion(inputs, item)) {
-      // Add immediate successors to worklist.
-      for (const HloInstruction* user : item->users()) {
-        worklist.push(user);
-      }
-      for (const HloInstruction* succ : item->control_successors()) {
-        worklist.push(succ);
-      }
-    }
-  }
-}
-
 std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
   std::vector<HloInstruction*> unreachable_roots;
   for (auto* instruction : instructions()) {
@@ -860,7 +795,7 @@ Status HloComputation::AcceptWithOperandOrder(
 template <typename HloInstructionPtr>
 Status HloComputation::AcceptOrdered(
     DfsHloVisitorBase<HloInstructionPtr>* visitor,
-    const std::vector<const HloInstruction*>& order) const {
+    const std::vector<HloInstruction*>& order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
     TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
@@ -890,9 +825,9 @@ Status HloComputation::AcceptOrdered(
 
 // Explicit instantiations.
 template Status HloComputation::AcceptOrdered(
-    DfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+    DfsHloVisitor*, const std::vector<HloInstruction*>&) const;
 template Status HloComputation::AcceptOrdered(
-    ConstDfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+    ConstDfsHloVisitor*, const std::vector<HloInstruction*>&) const;
 
 Status HloComputation::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
@@ -911,14 +846,46 @@ std::unique_ptr<HloComputation> HloComputation::Clone(
   return CloneWithReplacements(
       /*replacements=*/std::unordered_map<const HloInstruction*,
                                           std::unique_ptr<HloInstruction>>(),
-      /*extras=*/{}, context, suffix);
+      context, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+    HloCloneContext* context, const string& suffix) {
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(std::move(r1));
+  return CloneWithReplacements(std::move(replacements), context, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+    HloCloneContext* context, const string& suffix) {
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(std::move(r1));
+  replacements.emplace(std::move(r2));
+  return CloneWithReplacements(std::move(replacements), context, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
+    HloCloneContext* context, const string& suffix) {
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(std::move(r1));
+  replacements.emplace(std::move(r2));
+  replacements.emplace(std::move(r3));
+  return CloneWithReplacements(std::move(replacements), context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
-    absl::Span<HloInstruction*> extras, HloCloneContext* context,
-    const string& suffix) {
+    HloCloneContext* context, const string& suffix) {
   std::unique_ptr<HloCloneContext> context_ptr;
   if (context == nullptr) {
     context_ptr = absl::make_unique<HloCloneContext>(parent(), suffix);
@@ -939,18 +906,50 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
   };
 
   VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n";
+
+  // We want to do a postorder walk over [replace(i) for i in instructions_].
+  // We can't reuse MakeInstructionPostOrder() for this, because that will
+  // generate a postorder of plain instructions_, and our replacements may
+  // change the postorder!
+  //
+  // The postorder we want here is simpler than what MakeInstructionPostOrder()
+  // does -- we only care about operand dependencies -- so let's just do it
+  // ourselves.
   std::vector<HloInstruction*> postorder;
-  for (HloInstruction* instr : extras) {
-    postorder.push_back(instr);
-  }
-  for (HloInstruction* instr : MakeInstructionPostOrder()) {
-    if (HloInstruction* replacement = replace(instr)) {
-      postorder.push_back(replacement);
+  absl::flat_hash_map<HloInstruction*, VisitState> visited;
+  for (const auto& instr : instructions_) {
+    std::vector<HloInstruction*> dfs_stack;
+    HloInstruction* new_instr = replace(instr.get());
+    if (!new_instr) {
+      continue;
+    }
+    dfs_stack.push_back(new_instr);
+
+    while (!dfs_stack.empty()) {
+      auto* cur = dfs_stack.back();
+      auto it = visited.find(cur);
+      if (it != visited.end()) {
+        dfs_stack.pop_back();
+        if (it->second == kVisited) {
+          continue;
+        }
+        CHECK_EQ(it->second, kVisiting);
+        postorder.push_back(cur);
+        it->second = kVisited;
+        continue;
+      }
+
+      visited.insert({cur, kVisiting});
+      for (HloInstruction* operand : cur->operands()) {
+        HloInstruction* new_operand = replace(operand);
+        if (new_operand) {
+          dfs_stack.emplace_back(new_operand);
+        }
+      }
     }
   }
 
   std::vector<std::unique_ptr<HloInstruction>> instructions;
-  std::unique_ptr<HloInstruction> new_instr;
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
@@ -960,9 +959,8 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
           << operand->ToString() << ", used by " << instr->ToString();
       new_operands.push_back(context->GetInstruction(replaced_operand));
     }
-    new_instr =
-        instr->CloneWithNewOperands(instr->shape(), new_operands, context);
-    instructions.push_back(std::move(new_instr));
+    instructions.push_back(
+        instr->CloneWithNewOperands(instr->shape(), new_operands, context));
   }
   Builder builder(name() + "." + suffix);
   for (auto& instr : instructions) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index dec96d11a93cf56d3c40a6bb7882ffb7336aeeb0..be1ce336968504b6406c9ef4b879821821c5b187 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -215,19 +214,6 @@ class HloComputation {
   // this order, definitions of values always appear before their uses.
   std::vector<HloInstruction*> MakeInstructionPostOrder() const;
 
-  // Computes and returns the reachability between HLO instructions in the
-  // computation. The returned HloReachabilityMap is constructed such that
-  // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a
-  // directed path (from producer to consumer) from 'a' to 'b'. Both data
-  // dependencies (operands) and control dependencies are considered for
-  // reachability. Trivially an instruction is reachable from itself.
-  std::unique_ptr<HloReachabilityMap> ComputeReachability() const;
-
-  // Updates the given reachability map after the immediate predecessor set
-  // (operands and control predecessors) of 'instruction' has changed.
-  void UpdateReachabilityThroughInstruction(
-      const HloInstruction* instruction, HloReachabilityMap* reachability_map);
-
   int64 instruction_count() const { return instruction_iterators_.size(); }
 
   // Creates and returns a list of the embedded computations called by this
@@ -315,7 +301,7 @@ class HloComputation {
   // be a topological sort of all instructions in the computation.
   template <typename HloInstructionPtr>
   Status AcceptOrdered(DfsHloVisitorBase<HloInstructionPtr>* visitor,
-                       const std::vector<const HloInstruction*>& order) const;
+                       const std::vector<HloInstruction*>& order) const;
 
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
@@ -333,14 +319,38 @@ class HloComputation {
   // the map's value to replace that instruction in the cloned computation.
   //
   // If replacements maps a key to nullptr, we remove that instruction from the
-  // new computation.
-  // If additional instructions are used by instructions in replacement map,
-  // they must be passed in post-order in the extras span.
+  // new computation.  If an element of `replacements` references an instruction
+  // that's not already in the computation, it's cloned and added to the new
+  // computation.
+  //
+  // All relevant instructions are cloned, *including* unique_ptr in the
+  // `replacements` map.
   std::unique_ptr<HloComputation> CloneWithReplacements(
       std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
           replacements,
-      absl::Span<HloInstruction*> extras, HloCloneContext* context = nullptr,
-      const string& suffix = "clone");
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
+
+  // Convenience overloads for CloneWithReplacements.  You want to do
+  //
+  //   CloneWithReplacements({{a, std::move(b)}, {c, std::move(d)}})  // ERROR
+  //
+  // but that doesn't work because std::initializer_list is not movable.  These
+  // overloads let you do
+  //
+  //   CloneWithReplacementPairs({a, std::move(b)}, {c, std::move(d)});   // OK
+  //
+  std::unique_ptr<HloComputation> CloneWithReplacementPairs(
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
+  std::unique_ptr<HloComputation> CloneWithReplacementPairs(
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
+  std::unique_ptr<HloComputation> CloneWithReplacementPairs(
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
 
   // Returns true if the given instruction can be removed from the computation.
   // Parameter instructions cannot be removed without violating invariants of
@@ -355,6 +365,14 @@ class HloComputation {
   // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
+  // Returns a map from channel-id to directed dependencies of the channel
+  // instructions. For send&recv pairs it means the send instruction and for
+  // cross-replica-sum the union of the dependencies for all participating
+  // instructions.
+  using ChannelDependencyMap =
+      absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
+  ChannelDependencyMap ComputeChannelDependencies() const;
+
   // Returns true if this computation has a side effect. A computation has a
   // side effect if it contains one or more instructions with a side effect.
   bool HasSideEffect() const;
@@ -410,14 +428,6 @@ class HloComputation {
   // Internal helper to collect unreachable roots.
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
 
-  // Returns a map from channel-id to directed dependencies of the channel
-  // instructions. For send&recv pairs it means the send instruction and for
-  // cross-replica-sum the union of the dependencies for all participating
-  // instructions.
-  using ChannelDependencyMap =
-      absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
-  ChannelDependencyMap ComputeChannelDependencies() const;
-
   enum VisitState { kVisiting, kVisited };
   void ComputeInstructionPostOrder(
       const HloComputation::ChannelDependencyMap& channel_dependency_map,
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 2aaaef1d36d58bcce18db4aa37ff05ea352e484b..8b50cfa9aed90091cfbedc1df902440ec9bf2a80 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -65,7 +65,7 @@ class HloComputationTest : public HloTestBase {
 };
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEntryComputation(CreateNegateComputation());
   EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty());
@@ -73,7 +73,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) {
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
   // Create computation which calls one other computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEmbeddedComputation(CreateNegateComputation());
   auto map_computation =
@@ -85,7 +85,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) {
   // Create computations with a diamond-shaped callgraph.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEmbeddedComputation(CreateNegateComputation());
   auto map1_computation =
@@ -119,7 +119,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) {
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant));
 }
@@ -134,7 +134,7 @@ TEST_F(HloComputationTest, PostOrderSimple) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto negate2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
               ElementsAre(constant, negate1, negate2));
@@ -151,7 +151,7 @@ TEST_F(HloComputationTest, PostOrderTrace) {
       builder.AddInstruction(HloInstruction::CreateTrace("foobar", negate1));
   auto negate2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1));
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Trace instructions should be at the end of the sort.
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
@@ -170,7 +170,7 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   auto constant4 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
               UnorderedElementsAre(constant1, constant2, constant3, constant4));
@@ -192,7 +192,7 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) {
       r0f32_, HloOpcode::kAdd, constant2, constant3));
   auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant3));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto post_order = computation->MakeInstructionPostOrder();
   EXPECT_EQ(6, post_order.size());
@@ -217,7 +217,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
                                                       constant2, constant3));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       constant1, constant3));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Visitor which keeps track of which instructions have been visited.
   class TestVisitor : public DfsHloVisitorWithDefault {
@@ -257,7 +257,7 @@ TEST_F(HloComputationTest, DeepCopyArray) {
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(constant).ValueOrDie();
 
@@ -274,7 +274,7 @@ TEST_F(HloComputationTest, DeepCopyTuple) {
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
@@ -376,7 +376,7 @@ TEST_F(HloComputationTest, DeepCopyToken) {
   // copied.
   auto builder = HloComputation::Builder(TestName());
   auto token = builder.AddInstruction(HloInstruction::CreateToken());
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(token).ValueOrDie();
 
@@ -393,7 +393,7 @@ TEST_F(HloComputationTest, DeepCopyTokenTuple) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({token, constant}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
@@ -412,7 +412,7 @@ TEST_F(HloComputationTest, CycleDetection) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, negate, negate));
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Add a control dependency to create a cycle.
   ASSERT_IS_OK(add->AddControlDependencyTo(negate));
@@ -440,7 +440,7 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
       r0f32_, HloOpcode::kAdd, dead_negate, dead_negate));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
@@ -466,7 +466,7 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
       HloInstruction::CreateParameter(0, r0f32_, "param0"));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation =
       module->AddEntryComputation(builder.Build(/*root_instruction=*/add));
 
@@ -484,107 +484,6 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
   EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
 }
 
-TEST_F(HloComputationTest, Reachability) {
-  // Test reachability of a non-trivial computation:
-  //
-  // const1    const2
-  //    |         |
-  //    | +-------+
-  //    | |       |
-  //    add ..   negate
-  //     |   .     |
-  //     |   .... exp
-  //     |         |
-  //     +---+   +-+---+
-  //         |   |     |
-  //       multiply   copy
-  //
-  // There is a control dependency from 'add' to 'exp'.
-  auto builder = HloComputation::Builder(TestName());
-  auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
-  auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
-      r0f32_, HloOpcode::kAdd, constant1, constant2));
-  auto negate = builder.AddInstruction(
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant2));
-  auto exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, negate));
-  auto mul = builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, add, exp));
-  auto copy = builder.AddInstruction(
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kCopy, exp));
-
-  auto module = CreateNewModule();
-  auto computation =
-      module->AddEntryComputation(builder.Build(/*root_instruction=*/mul));
-
-  TF_CHECK_OK(add->AddControlDependencyTo(exp));
-  auto reachability = computation->ComputeReachability();
-
-  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
-  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant1, add));
-  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
-  EXPECT_TRUE(reachability->IsReachable(constant1, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
-  EXPECT_TRUE(reachability->IsReachable(constant1, copy));
-
-  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
-  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant2, add));
-  EXPECT_TRUE(reachability->IsReachable(constant2, negate));
-  EXPECT_TRUE(reachability->IsReachable(constant2, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
-  EXPECT_TRUE(reachability->IsReachable(constant2, copy));
-
-  EXPECT_FALSE(reachability->IsReachable(exp, constant1));
-  EXPECT_FALSE(reachability->IsReachable(exp, constant2));
-  EXPECT_FALSE(reachability->IsReachable(exp, add));
-  EXPECT_FALSE(reachability->IsReachable(exp, negate));
-  EXPECT_TRUE(reachability->IsReachable(exp, exp));
-  EXPECT_TRUE(reachability->IsReachable(exp, mul));
-  EXPECT_TRUE(reachability->IsReachable(exp, copy));
-
-  EXPECT_FALSE(reachability->IsReachable(mul, constant1));
-  EXPECT_FALSE(reachability->IsReachable(mul, constant2));
-  EXPECT_FALSE(reachability->IsReachable(mul, add));
-  EXPECT_FALSE(reachability->IsReachable(mul, negate));
-  EXPECT_FALSE(reachability->IsReachable(mul, exp));
-  EXPECT_TRUE(reachability->IsReachable(mul, mul));
-  EXPECT_FALSE(reachability->IsReachable(mul, copy));
-
-  EXPECT_TRUE(reachability->IsConnected(constant1, copy));
-  EXPECT_TRUE(reachability->IsConnected(copy, constant1));
-  EXPECT_FALSE(reachability->IsConnected(negate, add));
-  EXPECT_FALSE(reachability->IsConnected(add, negate));
-
-  // Remove the control dependency then update and verify the reachability map
-  ASSERT_IS_OK(add->RemoveControlDependencyTo(exp));
-  computation->UpdateReachabilityThroughInstruction(exp, reachability.get());
-
-  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
-  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant1, add));
-  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
-  EXPECT_FALSE(reachability->IsReachable(constant1, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
-  EXPECT_FALSE(reachability->IsReachable(constant1, copy));
-
-  // Change a use within the graph then update and verify the reachability map
-  ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1));
-  computation->UpdateReachabilityThroughInstruction(negate, reachability.get());
-
-  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
-  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant2, add));
-  EXPECT_FALSE(reachability->IsReachable(constant2, negate));
-  EXPECT_FALSE(reachability->IsReachable(constant2, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
-  EXPECT_FALSE(reachability->IsReachable(constant2, copy));
-}
-
 TEST_F(HloComputationTest, Stringification) {
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
@@ -606,7 +505,7 @@ TEST_F(HloComputationTest, Stringification) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options = HloPrintOptions().set_print_metadata(false);
@@ -641,7 +540,7 @@ TEST_F(HloComputationTest, StringificationIndent) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options =
@@ -677,7 +576,7 @@ TEST_F(HloComputationTest, StringificationCanonical) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options = HloPrintOptions().set_print_metadata(false);
@@ -700,27 +599,5 @@ TEST_F(HloComputationTest, StringificationCanonical) {
   EXPECT_EQ(computation->ToString(options), expected_computation2);
 }
 
-TEST_F(HloComputationTest, ChannelReachability) {
-  const Shape shape = ShapeUtil::MakeShape(F32, {5, 7});
-  HloComputation::Builder builder("ChannelReachability");
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param"));
-  auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
-  auto send =
-      builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1));
-  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
-  auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
-  auto recv =
-      builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1));
-  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
-
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build(recv_done));
-  auto reachability = computation->ComputeReachability();
-  EXPECT_TRUE(reachability->IsReachable(param, recv_done));
-  EXPECT_FALSE(reachability->IsReachable(send, recv));
-  EXPECT_FALSE(reachability->IsReachable(send_done, recv));
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 4f898ce61c3f36e83e4b13130a404dbb4a2c36c6..5e37883d3d8d5067bab873ac6b5f732e7360c5fa 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -52,8 +52,10 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           computation->root_instruction() != instruction) {
         continue;
       }
-      // Skip Constant, Parameter, and AfterAll operation.
-      // TODO(b/64407269): Enable Tuple once the timeout issue is resolved.
+      // Skip Constant, Parameter, Tuple, AfterAll operation.
+      // Tuple constants are not directly supported by any backends, hence
+      // folding Tuple is not useful and would in fact be expanded back into
+      // kTuple by Algebraic Simplifier.
       // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one
       // operand in which case constant folding will be impossible and this
       // special case is not necessary.
@@ -63,6 +65,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           instruction->opcode() == HloOpcode::kAfterAll) {
         continue;
       }
+
       // Skip instructions with non-constant operands.
       if (!hlo_query::AllOperandsAreConstants(*instruction)) {
         continue;
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index e45f905f7152c37a9ab2b41d407310671310c2a3..d12f920722e20a3390a99f74c8a10c7c9e3fdf6c 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/types.h"
 
@@ -37,7 +37,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-using HloConstantFoldingTest = HloVerifiedTestBase;
+using HloConstantFoldingTest = HloTestBase;
 
 TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   HloComputation::Builder builder(TestName());
@@ -46,13 +46,13 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   EXPECT_THAT(computation->root_instruction(), op::Constant());
@@ -67,13 +67,13 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   EXPECT_THAT(computation->root_instruction(), op::Constant());
@@ -88,13 +88,13 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   EXPECT_THAT(computation->root_instruction(), op::Constant());
@@ -130,11 +130,11 @@ TEST_F(HloConstantFoldingTest, Concatenate) {
     Shape shape = ShapeUtil::MakeShape(F32, dimensions);
     builder.AddInstruction(HloInstruction::CreateConcatenate(
         shape, operands, test_config.concat_dimension));
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     auto computation = module->AddEntryComputation(builder.Build());
 
     HloConstantFolding const_folder;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
     EXPECT_TRUE(result);
 
     HloInstruction* root = computation->root_instruction();
@@ -157,11 +157,11 @@ TEST_F(HloConstantFoldingTest, Slice) {
   Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
   builder.AddInstruction(HloInstruction::CreateSlice(
       shape, literal_instruction, slice_start, slice_limits, slice_strides));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
@@ -182,11 +182,11 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   const int64 permutation[] = {1, 2, 0, 4, 3};
   builder.AddInstruction(
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
@@ -219,27 +219,28 @@ const char* const kConstantFoldReduce = R"(
   })";
 
 TEST_F(HloConstantFoldingTest, ConstantFoldReduce) {
-  ParseAndVerifyModule(kConstantFoldReduce);
+  TF_ASSERT_OK_AND_ASSIGN(auto m,
+                          ParseAndReturnVerifiedModule(kConstantFoldReduce));
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_EQ(6, module()
-                   .entry_computation()
+  EXPECT_EQ(6, m->entry_computation()
                    ->root_instruction()
                    ->literal()
                    .GetFirstElement<int32>());
 }
 
 TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) {
-  ParseAndVerifyModule(kConstantFoldReduce);
-  HloInstruction* add = module().computations().begin()->root_instruction();
+  TF_ASSERT_OK_AND_ASSIGN(auto m,
+                          ParseAndReturnVerifiedModule(kConstantFoldReduce));
+  HloInstruction* add = m->computations().begin()->root_instruction();
   LayoutUtil::ClearLayout(add->mutable_shape());
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get()));
   EXPECT_FALSE(result);
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(), op::Reduce());
+  EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reduce());
 }
 
 const char* const kConstantFoldLargePad = R"(
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 108aeea097d7170d236b988c414b517a1a284640..fdfb38b858c32ba5b092ec2db84d4bac487c3e78 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -269,7 +269,7 @@ Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) {
 Status HloCostAnalysis::HandleMap(const HloInstruction* map) {
   // Compute properties of the mapped function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(map->to_apply()));
+                      ProcessNestedSubcomputation(map->to_apply()));
 
   // Compute the cost of all elements for this Map operation.
   const int64 element_count = ShapeUtil::ElementsIn(map->shape());
@@ -285,7 +285,7 @@ Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) {
   HloComputation* function = reduce->to_apply();
   // Compute the cost of the user function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(function));
+                      ProcessNestedSubcomputation(function));
 
   // Compute the cost of all elements for this Reduce operation.
   // This counts the number of times the reduction function is applied, so it
@@ -311,7 +311,7 @@ Status HloCostAnalysis::HandleReduceWindow(
   auto function = reduce_window->to_apply();
   // Compute the properties of the reduction function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(function));
+                      ProcessNestedSubcomputation(function));
 
   // Compute the cost of all elements for this ReduceWindow operation. For each
   // output element there are window_size - 1 reductions to perform.
@@ -336,9 +336,9 @@ Status HloCostAnalysis::HandleSelectAndScatter(
   // Compute the properties of the select and scatter function.
   // Compute the properties of the reduction function.
   TF_ASSIGN_OR_RETURN(const Properties select_properties,
-                      ProcessSubcomputation(instruction->select()));
+                      ProcessNestedSubcomputation(instruction->select()));
   TF_ASSIGN_OR_RETURN(const Properties scatter_properties,
-                      ProcessSubcomputation(instruction->scatter()));
+                      ProcessNestedSubcomputation(instruction->scatter()));
 
   // Compute the cost of all elements for this operation. For each scatter
   // source element there are window_size - 1 select computations to perform and
@@ -574,7 +574,7 @@ Status HloCostAnalysis::HandleRng(const HloInstruction* random) {
 Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) {
   TF_ASSIGN_OR_RETURN(
       current_properties_,
-      ProcessSubcomputation(fusion->fused_instructions_computation()));
+      ProcessNestedSubcomputation(fusion->fused_instructions_computation()));
 
   // Fusion nodes that produce a tuple also produce the entries in the tuple.
   // Ignore the memory accessed inside fused ops, since fusion is supposed to
@@ -595,7 +595,7 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) {
 
 Status HloCostAnalysis::HandleCall(const HloInstruction* call) {
   TF_ASSIGN_OR_RETURN(current_properties_,
-                      ProcessSubcomputation(call->to_apply()));
+                      ProcessUnnestedSubcomputation(call->to_apply()));
   current_should_compute_bottleneck_time_ = false;
   return Status::OK();
 }
@@ -624,13 +624,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
   // Since the number of iterations of the while node will not always be
   // something that we can statically analyze, we cannot precisely compute the
   // cost of a while node. For now compute the cost of a single iteration.
-  //
-  // TODO(b/26346211): Improve the cost analysis for while nodes.
   TF_ASSIGN_OR_RETURN(const Properties body_properties,
-                      ProcessSubcomputation(xla_while->while_body()));
+                      ProcessUnnestedSubcomputation(xla_while->while_body()));
 
-  TF_ASSIGN_OR_RETURN(const Properties condition_properties,
-                      ProcessSubcomputation(xla_while->while_condition()));
+  TF_ASSIGN_OR_RETURN(
+      const Properties condition_properties,
+      ProcessUnnestedSubcomputation(xla_while->while_condition()));
 
   current_properties_.clear();
   for (const auto& property : body_properties) {
@@ -647,10 +646,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
 Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) {
   // Compute the cost of the true and false computations and take the maximum
   // from those for each property.
-  TF_ASSIGN_OR_RETURN(const Properties true_computation_properties,
-                      ProcessSubcomputation(conditional->true_computation()));
-  TF_ASSIGN_OR_RETURN(const Properties false_computation_properties,
-                      ProcessSubcomputation(conditional->false_computation()));
+  TF_ASSIGN_OR_RETURN(
+      const Properties true_computation_properties,
+      ProcessUnnestedSubcomputation(conditional->true_computation()));
+  TF_ASSIGN_OR_RETURN(
+      const Properties false_computation_properties,
+      ProcessUnnestedSubcomputation(conditional->false_computation()));
   current_properties_ = true_computation_properties;
   for (const auto& property : false_computation_properties) {
     if (!tensorflow::gtl::InsertIfNotPresent(&current_properties_, property)) {
@@ -680,7 +681,7 @@ Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) {
   const int64 element_count =
       ShapeUtil::ElementsIn(scatter->operand(2)->shape());
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(scatter->to_apply()));
+                      ProcessNestedSubcomputation(scatter->to_apply()));
   for (const auto& property : sub_properties) {
     if (property.first != kBytesAccessedKey) {
       current_properties_[property.first] = property.second * element_count;
@@ -689,6 +690,11 @@ Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleGetDimensionSize(
+    const HloInstruction* /*get_size*/) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::FinishVisit(const HloInstruction*) {
   return Status::OK();
 }
@@ -725,10 +731,19 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const {
   return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_);
 }
 
-StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
-    HloComputation* computation) {
+StatusOr<HloCostAnalysis::Properties>
+HloCostAnalysis::ProcessNestedSubcomputation(HloComputation* computation) {
+  HloCostAnalysis visitor(shape_size_, per_second_rates_);
+  TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  return visitor.properties();
+}
+
+StatusOr<HloCostAnalysis::Properties>
+HloCostAnalysis::ProcessUnnestedSubcomputation(HloComputation* computation) {
   HloCostAnalysis visitor(shape_size_, per_second_rates_);
   TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  hlo_properties_.insert(visitor.hlo_properties_.begin(),
+                         visitor.hlo_properties_.end());
   return visitor.properties();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 46b4bbeef222e6de581360fc01b293e812f1dedd..8ced9d776e150ac587e9ac3ed0beffbc38dc5503 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -107,6 +107,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleConditional(const HloInstruction* conditional) override;
   Status HandleGather(const HloInstruction* gather) override;
   Status HandleScatter(const HloInstruction* scatter) override;
+  Status HandleGetDimensionSize(const HloInstruction* get_size) override;
   Status FinishVisit(const HloInstruction* root) override;
 
   Status Preprocess(const HloInstruction* hlo) override;
@@ -153,7 +154,24 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
 
   // Returns the properties computed from visiting the computation rooted at the
   // given hlo.
-  StatusOr<Properties> ProcessSubcomputation(HloComputation* computation);
+  //
+  // The difference between ProcessNestedSubcomputation and
+  // ProcessUnnestedSubcomputation is that we expect to get profile results for
+  // an unnested subcomputation's individual instructions, while we expect that
+  // a nested subcomputation is completely subsumed by its parent.
+  //
+  // For example, subcomputations inside kFusion and kMap are considered nested,
+  // while subcomputations inside kWhile and kConditional are considered
+  // unnested.
+  //
+  // Another way of thinking of this is, kFusion is implemented on the GPU
+  // backend using just one GPU kernel, while kWhile's body is implemented as a
+  // sequence of kernels, one for each HLO therein.  Backends don't necessarily
+  // need to follow this same implementation strategy, but we assume they do for
+  // the purposes of this platform-generic cost analysis.
+  StatusOr<Properties> ProcessNestedSubcomputation(HloComputation* computation);
+  StatusOr<Properties> ProcessUnnestedSubcomputation(
+      HloComputation* computation);
 
   // Utility function to handle all element-wise operations.
   Status HandleElementwiseOp(const HloInstruction* hlo_instruction);
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 9acee892d5993be3498d51ed66d7fa4647d7de88..ff32faf298dd1f04c5b769f2a88f76a7a1e18ae7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -387,7 +387,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
         HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp));
     auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1});
 
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     auto* computation = module->AddEntryComputation(builder.Build());
     auto* fusion = computation->CreateFusionInstruction(
         {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
@@ -429,7 +429,7 @@ TEST_F(FusionCostAnalysis, NoLayout) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       shape_with_layout, HloOpcode::kAdd, c1, broadcast));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add, broadcast}, HloInstruction::FusionKind::kLoop);
@@ -472,7 +472,7 @@ TEST_F(DomainCostAnalysis, DomainCost) {
   auto domain = builder.AddInstruction(
       HloInstruction::CreateDomain(tuple->shape(), tuple, nullptr, nullptr));
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(hlo_module->entry_computation()->root_instruction(), domain);
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index e07a196d1154dc0ea45ccd2f15b0b9b56f7c41f8..aaa9ec60eb3c4e0159ed40b37d772e0973d306ec 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -19,22 +19,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
-class HloCreationUtilsTest : public HloVerifiedTestBase {
+class HloCreationUtilsTest : public HloTestBase {
  protected:
-  HloModule* CreateModuleWithProgramShape(
+  std::unique_ptr<VerifiedHloModule> CreateModuleWithProgramShape(
       PrimitiveType primitive_type, absl::Span<const int64> input_shape_dims,
       absl::Span<const int64> output_shape_dims, HloInstruction** param,
       HloComputation** entry_computation) {
     Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims);
     Shape output_shape =
         ShapeUtil::MakeShape(primitive_type, output_shape_dims);
-    auto module = CreateNewModule("test");
+    auto module = CreateNewVerifiedModule("test");
     *entry_computation = module->AddEntryComputation(
         CreateComputationWithSignature({&input_shape}, output_shape, "entry")
             .ValueOrDie());
@@ -47,10 +47,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{2},
-                                                   /*output_shape_dims=*/{2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{2}, &param,
+                                             &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed,
                           CollapseFirstNDims(param, 1));
@@ -67,9 +66,8 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(
-      S32,
-      /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
+  auto module = CreateModuleWithProgramShape(
+      S32, /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
       &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed,
@@ -92,10 +90,9 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{2},
-                                                   /*output_shape_dims=*/{1, 2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{1, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended,
                           PrependDegenerateDims(param, 1));
@@ -113,10 +110,9 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(
-      S32,
-      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, &param,
-      &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{1, 1, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
                           PrependDegenerateDims(param, 2));
@@ -134,10 +130,9 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{},
-                                                   /*output_shape_dims=*/{1, 1},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{},
+                                             /*output_shape_dims=*/{1, 1},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
                           PrependDegenerateDims(param, 2));
@@ -154,10 +149,9 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(
-      S32,
-      /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, &param,
-      &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{6},
+                                             /*output_shape_dims=*/{3, 1, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded,
                           ExpandFirstDimIntoNDims(param, {3, 1, 2}));
@@ -176,10 +170,9 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{2},
-                                                   /*output_shape_dims=*/{6},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{6}, &param,
+                                             &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(
       HloInstruction * zero_padded_param,
@@ -197,10 +190,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{},
-                                                   /*output_shape_dims=*/{2, 2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{},
+                                             /*output_shape_dims=*/{2, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(
       HloInstruction * zeros,
@@ -218,10 +210,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(F32,
-                                                   /*input_shape_dims=*/{},
-                                                   /*output_shape_dims=*/{2, 2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(F32, /*input_shape_dims=*/{},
+                                             /*output_shape_dims=*/{2, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(
       HloInstruction * zeros,
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 9b18b0284f63c25934c1b7118dc8973caa62cadc..1eb0260468c4560985027947e89c62cc21139e7e 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -44,7 +44,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-class HloCseTest : public HloVerifiedTestBase {
+class HloCseTest : public HloTestBase {
  protected:
   HloCseTest() {}
 };
@@ -59,13 +59,13 @@ TEST_F(HloCseTest, CombineTwoConstants) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
   HloInstruction* constant = *computation->instructions().begin();
@@ -89,14 +89,14 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
   EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
   auto first_operand = add->operand(0);
@@ -121,14 +121,14 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
   EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   EXPECT_THAT(add, op::Add(constant1, constant2));
@@ -171,13 +171,13 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) {
         shape_r0, HloOpcode::kAdd, root, constants[i]));
   }
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(20, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   // CSE will remove both the second float(42.0f) and the corresponding
   // convert/cast.
@@ -201,7 +201,7 @@ TEST_F(HloCseTest, NonscalarConstants) {
   auto tuple = builder.AddInstruction(HloInstruction::CreateTuple(
       {common_constant1, common_constant2, uncommon_constant}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
@@ -209,7 +209,7 @@ TEST_F(HloCseTest, NonscalarConstants) {
               op::Tuple(common_constant1, common_constant2, uncommon_constant));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   auto first_operand = tuple->operand(0);
@@ -233,14 +233,14 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2, exp3}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   auto first_operand = tuple->operand(0);
@@ -250,7 +250,7 @@ TEST_F(HloCseTest, IdenticalInstructions) {
 
 // Test two identical while loops with same inputs
 TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesSameInput) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalConditionsAndBodiesSameInput
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -277,21 +277,21 @@ index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
 f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT
 %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
 condition=%condition.1, body=%body
-    }
-    )");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(5, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_TRUE(cse.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(4, computation->instruction_count());
 }
 
 // Test two while loops with same conditions, same inputs, but different
 // bodies
 TEST_F(HloCseTest, WhileLoopsIdenticalConditionsSameInputAndDifferentBodies) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalConditionsSameInputAndDifferentBodies
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -327,20 +327,20 @@ index=1 %sub = f32[] subtract(f32[] %get-tuple-element.2, f32[]
       %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
 condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[],
 f32[]) %tuple.1), condition=%condition.1, body=%body2
-    }
-    )");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(5, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_FALSE(cse.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(5, computation->instruction_count());
 }
 
 // Test two identical while loops with different inputs
 TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesDifferentInput) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalConditionsAndBodiesDifferentInput
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -369,22 +369,21 @@ condition=%condition, body=%body %constant.4 = f32[] constant(1) %constant.5 =
 f32[] constant(2) %tuple.2 = (f32[], f32[]) tuple(f32[] %constant.4, f32[]
 %constant.5) ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.2),
 condition=%condition.1, body=%body
-    }
-
-    )");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(8, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_FALSE(cse.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(8, computation->instruction_count());
 }
 
 // Test two while loops with identical bodies and same inputs, but different
 // conditions
 TEST_F(HloCseTest, WhileLoopsIdenticalBodiesAndInputDifferntConditions) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalBodiesAndInputDifferntConditions
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -411,13 +410,14 @@ f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2)
       %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
 condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[],
 f32[]) %tuple.1), condition=%condition.1, body=%body
-    })");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(5, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_FALSE(cse.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(5, computation->instruction_count());
 }
 
@@ -439,14 +439,14 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
@@ -470,14 +470,14 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   auto first_operand = tuple->operand(0);
@@ -488,7 +488,7 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
 TEST_F(HloCseTest, FusionInternalCSE) {
   // Test that we can CSE expressions that live within a fusion node
   // computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape_r0 = ShapeUtil::MakeShape(F32, {});
@@ -512,7 +512,7 @@ TEST_F(HloCseTest, FusionInternalCSE) {
 
   EXPECT_EQ(5, fused_computation->instruction_count());
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
   EXPECT_EQ(4, fused_computation->instruction_count());
 
   auto root = fused_computation->root_instruction();
@@ -554,14 +554,14 @@ TEST_F(HloCseTest, IdenticalExpressions) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({add1, add2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(8, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(op::Add(negate1, exp1), op::Add(negate2, exp2)));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(5, computation->instruction_count());
   auto operand = tuple->operand(0);
@@ -586,7 +586,7 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, rng1, rng2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
@@ -595,7 +595,7 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   uint32 count_before = computation->instruction_count();
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   uint32 count_after = computation->instruction_count();
   EXPECT_EQ(count_before, count_after);
@@ -607,7 +607,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   // Test that two calls to an impure function are not commoned. RNG
   // is the source of the impurity.
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // rng_function is an impure function because it does RNG.
   HloComputation* rng_function = nullptr;
@@ -649,7 +649,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   VLOG(3) << "before: " << module->ToString();
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   VLOG(3) << "after: " << module->ToString();
 
@@ -659,7 +659,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
 }
 
 TEST_F(HloCseTest, CompareComputations) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule m
 
     add_computation {
@@ -680,11 +680,12 @@ TEST_F(HloCseTest, CompareComputations) {
       r1 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation
       r2 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation2
       ROOT f2 = (f32[],f32[]) tuple(r1, r2)
-    })");
+    })";
 
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(&module()).ValueOrDie());
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_TRUE(cse.Run(m.get()).ValueOrDie());
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_EQ(root->operand(0), root->operand(1));
 }
 
@@ -697,19 +698,19 @@ TEST_F(HloCseTest, ConstantsSameValueInDifferentDomains) {
   builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(42)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
 }
 
 TEST_F(HloCseTest, Domain) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
 HloModule module
 ENTRY %entry {
   %param = f32[] parameter(0), sharding={maximal device=0}
@@ -730,11 +731,12 @@ ENTRY %entry {
     domain={kind="sharding", entry={maximal device=2}, exit={maximal device=0}}
   %add = f32[] add(%domain.3, %domain.4)
   ROOT %sub = f32[] subtract(%add, %domain.5)
-})");
+})";
 
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(&module()).ValueOrDie());
-  const HloInstruction* sub = module().entry_computation()->root_instruction();
+  EXPECT_TRUE(cse.Run(m.get()).ValueOrDie());
+  const HloInstruction* sub = m->entry_computation()->root_instruction();
   const HloInstruction* add = sub->operand(0);
   EXPECT_EQ(add->operand(0), add->operand(1));
   EXPECT_NE(add->operand(0), sub->operand(1));
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 909853106d57d181e85e3e4134b4039be2b176f5..e8eb7066f96537ff7d5a932434852bc4cf209281 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -43,7 +43,7 @@ using ::testing::UnorderedElementsAre;
 class HloDataflowAnalysisTest : public HloTestBase,
                                 public ::testing::WithParamInterface<bool> {
  protected:
-  HloDataflowAnalysisTest() : module_(CreateNewModule()) {}
+  HloDataflowAnalysisTest() : module_(CreateNewVerifiedModule()) {}
 
   // Run dataflow analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
@@ -1884,7 +1884,7 @@ INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
 class HloDataflowAnalysisTestBase : public HloTestBase {
  protected:
   void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
+    module_ = CreateNewUnverifiedModule();
     computation_ = module_->AddEntryComputation(std::move(computation));
   }
 
@@ -2476,7 +2476,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
     return builder.Build();
   };
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   HloComputation* cond_computation =
       module_->AddEmbeddedComputation(make_cond());
   HloComputation* body_computation =
@@ -2511,7 +2511,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
   auto add = sub_builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
   sub_computation->CreateFusionInstruction({add, ones},
                                            HloInstruction::FusionKind::kLoop);
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 3b5cde2996c4195ef458662cd21de85a832d8d55..1fa4259a3e42286cbc911907eea563e6ca6f8611 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -59,7 +59,7 @@ TEST_F(HloDceTest, NoDeadCode) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
@@ -80,7 +80,7 @@ TEST_F(HloDceTest, InstructionsWithSideEffect) {
       HloInstruction::CreateSend(constant, token, /*channel_id=*/0));
   builder.AddInstruction(HloInstruction::CreateTuple({}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
@@ -110,7 +110,7 @@ TEST_F(HloDceTest, DeadParameters) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       live_param->shape(), HloOpcode::kNegate, live_param));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
@@ -150,7 +150,7 @@ TEST_F(HloDceTest, ControlDependencies) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Add a control dependency between two instructions.
@@ -175,7 +175,7 @@ TEST_F(HloDceTest, ControlDependencies) {
 
 // Tests that a dead call instruction is removed.
 TEST_F(HloDceTest, DeadInstructionWithCalledComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Called computation for the call instruction.
@@ -215,7 +215,7 @@ TEST_F(HloDceTest, DeadInstructionWithCalledComputation) {
 // Tests that a while instruction with an infeed (effectul instruction) in its
 // body is not removed, even its user count is 0.
 TEST_F(HloDceTest, CalledComputationWithSideEffect) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Condition computation of a while instruction.
@@ -270,7 +270,7 @@ TEST_F(HloDceTest, CalledComputationWithSideEffect) {
 
 // Tests that a nested call instruction with a side effect is not removed.
 TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Nested called computation with a side effect.
@@ -323,7 +323,7 @@ TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) {
 }
 
 TEST_F(HloDceTest, RemoveDeadSubcomputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation::Builder subcomp_builder("reduction_subcomp");
@@ -364,7 +364,7 @@ TEST_F(HloDceTest, RemoveDeadSubcomputation) {
 }
 
 TEST_F(HloDceTest, KeepUsedSubcomputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation::Builder subcomp_builder("reduction_subcomp");
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index b90e8db23398d23e886d2d1fe68de8bb187d9c3a..acdb42128e3d9a1fb912a466c9c2c3cbbe3d3f83 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_remover.h"
@@ -22,13 +22,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class HloDomainTest : public HloVerifiedTestBase {
+class HloDomainTest : public HloTestBase {
  protected:
   bool FindUserViaDomainPath(HloInstruction* instruction,
                              HloInstruction* operand) const {
@@ -64,13 +63,6 @@ class HloDomainTest : public HloVerifiedTestBase {
     }
     return false;
   }
-
-  StatusOr<HloModule*> ParseModule(absl::string_view hlo_string) {
-    HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    ParseAndVerifyModule(hlo_string, config);
-    return &module();
-  }
 };
 
 // Dummy DomainMetadata implementation which create kDomain boundaries around
@@ -144,31 +136,32 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 }
 
 TEST_F(HloDomainTest, CheckNoDomainAddedIfNoSharding) {
@@ -186,11 +179,12 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(!isolator_changed);
 }
 
@@ -213,26 +207,27 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "b", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "f", "e_element"));
-  EXPECT_FALSE(HasDomainEdge(module, "a", "p0"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "b", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "f", "e_element"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "a", "p0"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "b", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "f", "e_element"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "b", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "f", "e_element"));
 }
 
 TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) {
@@ -250,11 +245,12 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_FALSE(isolator_changed);
 }
 
@@ -273,15 +269,16 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_FALSE(remover_changed);
 
-  HloInstruction* add = FindInstruction(module, "c");
+  HloInstruction* add = FindInstruction(module.get(), "c");
   ASSERT_NE(add, nullptr);
   auto device = add->sharding_unique_device();
   EXPECT_TRUE(device.has_value());
@@ -304,41 +301,42 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator sharding_isolator([]() { return ShardingDomainCreator{}; });
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed,
-                          sharding_isolator.Run(module));
+                          sharding_isolator.Run(module.get()));
   EXPECT_TRUE(sharding_isolator_changed);
 
   HloDomainIsolator opname_isolator([]() { return OpNameDomainCreator{}; });
   TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed,
-                          opname_isolator.Run(module));
+                          opname_isolator.Run(module.get()));
   EXPECT_TRUE(opname_isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover sharding_remover(ShardingMetadata::KindName(),
                                     ShardingMetadata::NormalizeShardingDomain);
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed,
-                          sharding_remover.Run(module));
+                          sharding_remover.Run(module.get()));
   EXPECT_TRUE(sharding_remover_changed);
 
   HloDomainRemover opname_remover(OpNameMetadata::KindName(),
                                   OpNameDomainNormalizer);
   TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed,
-                          opname_remover.Run(module));
+                          opname_remover.Run(module.get()));
   EXPECT_TRUE(opname_remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c"));
 }
 
 TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) {
@@ -359,16 +357,17 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "infeed.data", "infeed"));
-  EXPECT_FALSE(HasDomainEdge(module, "copy0", "gte0"));
-  EXPECT_FALSE(HasDomainEdge(module, "copy1", "gte1"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "infeed.data", "infeed"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "copy0", "gte0"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "copy1", "gte1"));
 
   // Inject unassigned tuple/gte within the infeed domain, to simulate the
   // HLO passes adding unexpected instructions.
@@ -384,7 +383,7 @@ ENTRY entry {
   //           \       /
   //             TUPLE
   //               |
-  HloInstruction* infeed_data = FindInstruction(module, "infeed.data");
+  HloInstruction* infeed_data = FindInstruction(module.get(), "infeed.data");
   ASSERT_NE(infeed_data, nullptr);
 
   auto infeed_data_users = infeed_data->users();
@@ -410,7 +409,7 @@ ENTRY entry {
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   struct Assignment {
@@ -446,25 +445,26 @@ ENTRY entry {
     sharding={maximal device=1}
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "tuple", "param"));
-  EXPECT_FALSE(HasDomainEdge(module, "gte", "tuple"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "tuple", "param"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "gte", "tuple"));
 
   // Remove %tuple and %gte (tuple simplification)
-  HloInstruction* gte = FindInstruction(module, "gte");
-  HloInstruction* tuple = FindInstruction(module, "tuple");
+  HloInstruction* gte = FindInstruction(module.get(), "gte");
+  HloInstruction* tuple = FindInstruction(module.get(), "tuple");
   module->entry_computation()->set_root_instruction(tuple->mutable_operand(0));
   TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(gte));
   TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(tuple));
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   const HloInstruction* root = module->entry_computation()->root_instruction();
@@ -486,11 +486,11 @@ TEST_F(HloDomainTest, DumpParseNullSharding) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, domain, domain));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto hlo_string = module->ToString();
-  ASSERT_TRUE(ParseModule(hlo_string).status().ok());
+  ASSERT_TRUE(ParseAndReturnVerifiedModule(hlo_string).status().ok());
 }
 
 // Tuple inputs are domain instructions.
@@ -507,20 +507,21 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
   // Clear sharding of tpl instruction, in order to test domain sharding
   // application.
-  auto tpl = FindInstruction(module, "tpl");
+  auto tpl = FindInstruction(module.get(), "tpl");
   tpl->clear_sharding();
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   EXPECT_EQ(HloSharding::Tuple(tpl->shape(), {HloSharding::AssignDevice(1),
@@ -555,36 +556,37 @@ ENTRY %entry (p0: (f32[4], f32[4])) -> (f32[4], f32[4], f32[4]) {
   ROOT %g = (f32[4], f32[4], f32[4]) tuple(%domain.2, %domain.3, %domain.4)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainIsolator opname_isolator([]() { return OpNameDomainCreator{}; });
   TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed,
-                          opname_isolator.Run(module));
+                          opname_isolator.Run(module.get()));
   EXPECT_TRUE(opname_isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover sharding_remover(ShardingMetadata::KindName(),
                                     ShardingMetadata::NormalizeShardingDomain);
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed,
-                          sharding_remover.Run(module));
+                          sharding_remover.Run(module.get()));
   EXPECT_TRUE(sharding_remover_changed);
 
   HloDomainRemover opname_remover(OpNameMetadata::KindName(),
                                   OpNameDomainNormalizer);
   TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed,
-                          opname_remover.Run(module));
+                          opname_remover.Run(module.get()));
   EXPECT_TRUE(opname_remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c"));
 }
 
 // Emulate instructions inserted at top and bottom within nested tuple domain.
@@ -603,15 +605,16 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
   // Clear sharding of tuple.0 instruction, in order to test domain sharding
   // application.
-  auto tuple0 = FindInstruction(module, "tuple.0");
+  auto tuple0 = FindInstruction(module.get(), "tuple.0");
   tuple0->clear_sharding();
 
   // Insert the following instructons above and below tuple.0, to emulate other
@@ -655,7 +658,7 @@ ENTRY entry {
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   EXPECT_TRUE(tuple0->has_sharding());
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 7fcafafc097a623686ca98a7cb3c6256c7904f6d..9783f0574f50ba5542b82d36da899f968ce0e45c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1279,10 +1279,10 @@ StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
           key_value_vector.push_back(
               std::make_pair(keys_data[i], values_data[i]));
         }
-        std::sort(key_value_vector.begin(), key_value_vector.end(),
-                  [](const kv_pair& a, const kv_pair& b) {
-                    return SafeLess<KeyType>(a.first, b.first);
-                  });
+        std::stable_sort(key_value_vector.begin(), key_value_vector.end(),
+                         [](const kv_pair& a, const kv_pair& b) {
+                           return SafeLess<KeyType>(a.first, b.first);
+                         });
         std::vector<KeyType> result_keys;
         // We use a InlinedVector here because we need to convert it to an
         // absl::Span later, and this would not work with std::vector<bool>.
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 07f8d0aad4af0b07303b4e485b3630cc75bcb519..d751f40fff872b831338dc8aa08a04cb00d2838c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -221,16 +221,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
       const Literal& operand_literal) {
     const auto shape = instruction->shape();
     const auto* operand = instruction->operand(0);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
-    if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s",
-          ShapeUtil::HumanString(shape),
-          ShapeUtil::HumanString(operand->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, operand->shape()));
 
     Literal result(shape);
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 608a42bb60702aa075daca39535ca1672dcc5467..d95b6ad04f2c446b423a3aaef4de333ed2968883 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -50,9 +50,9 @@ namespace {
 static std::array<bool, 2> use_bf16_params{true, false};
 
 class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
-                         public HloVerifiedTestBase {
+                         public HloTestBase {
  protected:
-  HloEvaluatorTest() : HloVerifiedTestBase(), use_bfloat16_(GetParam()) {
+  HloEvaluatorTest() : HloTestBase(), use_bfloat16_(GetParam()) {
     evaluator_ = absl::make_unique<HloEvaluator>();
   }
 
@@ -60,14 +60,14 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     if (use_bfloat16_) {
       // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
       auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(&module()).ValueOrDie();
+      type_converter.Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*module().entry_computation(), arg_literals)
+    return evaluator_->Evaluate(*m_->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
-  // Evaluate function that takes in a local module instead of using module_
-  // that is in HloVerifiedTestBase. Once module_ in HloVerifiedTestBase is
+  // Evaluate function that takes in a local module instead of using m_
+  // that is in HloTestBase. Once m_ in HloTestBase is
   // removed, this should be the default Evaluate function.
   Literal EvaluateWithModule(
       HloModule* module, absl::Span<const Literal* const> arg_literals = {}) {
@@ -88,7 +88,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     auto c1 =
         b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
     b.AddInstruction(HloInstruction::CreateUnary(expected.shape(), opcode, c1));
-    module().AddEntryComputation(b.Build());
+    m_->AddEntryComputation(b.Build());
 
     Literal result = Evaluate();
 
@@ -108,7 +108,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
     b.AddInstruction(
         HloInstruction::CreateBinary(expected.shape(), opcode, c1, c2));
-    module().AddEntryComputation(b.Build());
+    m_->AddEntryComputation(b.Build());
 
     Literal result = Evaluate();
 
@@ -116,6 +116,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
   }
 
   bool use_bfloat16_;
+  std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
 };
 
 #define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
@@ -135,7 +136,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) {
   auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
   b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -156,7 +157,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
   auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
   b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -181,7 +182,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(on_false)));
   b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kSelect, c1, c2, c3));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -322,7 +323,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
       b.AddInstruction(HloInstruction::CreateParameter(2, shape, "rhs2"));
   b.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kAdd,
                                                 lhs_instruction, param_rhs2));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate(args);
 
@@ -346,7 +347,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) {
   const int64 permutation[] = {1, 2, 0, 4, 3};
   b.AddInstruction(
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -367,7 +368,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) {
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal.shape(), literal_instruction, {1, 2}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -386,7 +387,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal.shape(), literal_instruction,
       /*broadcast_dimensions=*/{}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -406,7 +407,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   Shape shape = ShapeUtil::MakeShape(S64, {4, 2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -428,7 +429,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   Shape shape = ShapeUtil::MakeShape(S64, {2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -448,7 +449,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -468,7 +469,7 @@ TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -503,7 +504,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   Shape shape = ShapeUtil::MakeShape(S32, {5, 2});
   b.AddInstruction(HloInstruction::CreatePad(
       shape, operand_instruction, padding_value_instruction, padding_config));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -530,7 +531,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
       CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}, {{0, 0, 0}}, {{0, 0, 0}}});
   b.AddInstruction(HloInstruction::CreatePad(
       shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -574,7 +575,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -619,7 +620,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -658,7 +659,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
                                              rhs_instruction, dot_dnums,
                                              DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -704,7 +705,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
                                              rhs_instruction, dot_dnums,
                                              DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -748,7 +749,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
                                              rhs_instruction, dot_dnums,
                                              DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -802,7 +803,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -857,7 +858,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -941,7 +942,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1019,7 +1020,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1079,7 +1080,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1143,7 +1144,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1215,7 +1216,7 @@ TEST_P(HloEvaluatorTest,
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1286,7 +1287,7 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction,
       /*feature_group_count=*/2, window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1297,11 +1298,12 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+class HloEvaluatorPreciseReduceTest : public HloTestBase {};
 
 // Tests that Reduce doesn't lose precision when adding many numbers (because
 // it accumulates its result in a double).
 TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder b(TestName());
 
   constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
@@ -1319,12 +1321,12 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m->AddEmbeddedComputation(add_computation.Build());
 
   HloInstruction* reduce_instruction = b.AddInstruction(
       HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
                                    /*dimensions_to_reduce=*/{0}, add_func));
-  module().AddEntryComputation(b.Build());
+  m->AddEntryComputation(b.Build());
 
   HloEvaluator hlo_eval;
   Literal result = hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
@@ -1337,7 +1339,7 @@ void BM_ReducePrecisely(int num_iters) {
   tensorflow::testing::StopTiming();
   HloComputation::Builder b("BM_ReducePrecisely");
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  config.set_debug_options(GetDebugOptionsFromFlags());
   HloModule module("BM_ReducePrecisely", config);
 
   constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
@@ -1396,14 +1398,14 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m_->AddEmbeddedComputation(add_computation.Build());
 
   Shape shape = ShapeUtil::MakeShape(F32, {2});
   b.AddInstruction(
       HloInstruction::CreateReduce(shape, arg_instruction, init_value,
                                    /*dimensions_to_reduce=*/{1}, add_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1438,7 +1440,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   max_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs));
-  auto max_func = module().AddEmbeddedComputation(max_computation.Build());
+  auto max_func = m_->AddEmbeddedComputation(max_computation.Build());
 
   Window window;
   WindowDimension dim;
@@ -1455,7 +1457,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, max_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1490,7 +1492,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   max_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs));
-  auto max_func = module().AddEmbeddedComputation(max_computation.Build());
+  auto max_func = m_->AddEmbeddedComputation(max_computation.Build());
 
   Window window;
   WindowDimension dim;
@@ -1507,7 +1509,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, max_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1541,7 +1543,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m_->AddEmbeddedComputation(add_computation.Build());
 
   Window window;
   WindowDimension dim;
@@ -1564,7 +1566,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1594,7 +1596,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m_->AddEmbeddedComputation(add_computation.Build());
 
   Window window;
 
@@ -1625,7 +1627,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1657,7 +1659,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
                                                /*start_indices=*/{0, 2},
                                                /*limit_indices=*/{3, 5},
                                                /*strides=*/{2, 3}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1691,7 +1693,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1727,7 +1729,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1764,7 +1766,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       shape, operand, update, start_indices));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1800,7 +1802,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateGetTupleElement(shape, tuple, 1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1839,7 +1841,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   b.AddInstruction(
       HloInstruction::CreateGetTupleElement(tuple2->shape(), outer_tuple, 1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1877,7 +1879,7 @@ TEST_P(HloEvaluatorTest, Reverse) {
 
   const Shape shape = ShapeUtil::MakeShape(F32, {4, 3, 2, 1});
   b.AddInstruction(HloInstruction::CreateReverse(shape, operand, {0, 1}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1966,7 +1968,7 @@ ENTRY main {
       slice_sizes={1, 3}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -1990,7 +1992,7 @@ ENTRY main {
       slice_sizes={3, 1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2014,7 +2016,7 @@ ENTRY main {
       slice_sizes={3, 1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
@@ -2039,7 +2041,7 @@ ENTRY main {
       slice_sizes={1,1,2}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2066,7 +2068,7 @@ ENTRY main {
       slice_sizes={1,1,2}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2092,7 +2094,7 @@ ENTRY main {
       slice_sizes={1,1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({1, 1});
@@ -2115,7 +2117,7 @@ ENTRY main {
       slice_sizes={1,1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
@@ -2139,7 +2141,7 @@ ENTRY main {
       slice_sizes={1, 0}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{}, {}}),
@@ -2161,7 +2163,7 @@ ENTRY main {
       slice_sizes={1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
   Literal start_indices =
@@ -2192,7 +2194,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2223,7 +2225,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2256,7 +2258,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2288,7 +2290,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2320,7 +2322,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<float>(
       {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({2, 1});
@@ -2354,7 +2356,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({1, 1});
@@ -2386,7 +2388,7 @@ ENTRY main {
       index_vector_dim=2
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
@@ -2418,7 +2420,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2455,7 +2457,7 @@ ENTRY main {
       index_vector_dim=0
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2491,7 +2493,7 @@ ENTRY main {
       index_vector_dim=0
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({1, 1});
@@ -2523,7 +2525,7 @@ ENTRY main {
       index_vector_dim=0
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
@@ -2555,7 +2557,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates = LiteralUtil::CreateR2<int32>({{}, {}});
@@ -2585,7 +2587,7 @@ ENTRY main {
       index_vector_dim=2
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
   Literal scatter_indices =
@@ -2736,7 +2738,7 @@ ENTRY main {
   ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_bf16
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal arg = LiteralUtil::CreateR1<bfloat16>(
       {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)});
@@ -2754,7 +2756,7 @@ ENTRY main {
   ROOT %slice = f32[2,2,2]{1,0,2} slice(f32[2,2,2]{0,1,2} %arg), slice={[0:2], [0:2], [0:2]}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal arg = LiteralUtil::CreateR3WithLayout<float>(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}},
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index ebed875eb4954bc9a9da3f182005fa3d44326493..b87fc3e34012e75ee07bff6c1e113dce404f83cb 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -161,9 +161,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                          HloOpcodeString(hlo_instruction->opcode()));
   }
 
-  // TODO(b/35950897): many of the stl functions used in the handlers are not
-  // overloaded for every XLA primitive type.
-
   template <typename NativeT,
             typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
                 nullptr>
@@ -596,7 +593,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  Status HandleDivide(HloInstruction* divide) {
+  Status HandleDivide(HloInstruction* divide) override {
     return HandleDivide<ElementwiseT>(divide);
   }
 
@@ -1556,10 +1553,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           const auto& row_data = row_to_sort.data<NativeT>();
 
           std::vector<NativeT> result_data(row_data.begin(), row_data.end());
-          std::sort(result_data.begin(), result_data.end(),
-                    [](const NativeT& a, const NativeT& b) {
-                      return SafeLess<NativeT>(a, b);
-                    });
+          std::stable_sort(result_data.begin(), result_data.end(),
+                           [](const NativeT& a, const NativeT& b) {
+                             return SafeLess<NativeT>(a, b);
+                           });
           Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(),
                                                   {sort_dim_elements}));
           sorted_row.PopulateR1(absl::Span<const NativeT>(result_data));
@@ -2546,12 +2543,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   template <typename NativeT,
             typename std::enable_if<
-                std::is_same<NativeT, float>::value ||
-                std::is_same<NativeT, int32>::value ||
-                std::is_same<NativeT, uint32>::value>::type* = nullptr>
+                std::is_integral<NativeT>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
-    std::vector<NativeT> data(iota->shape().dimensions(iota->iota_dimension()));
+    // Avoid using std::vector since std::vector<bool> does not convert to
+    // absl::Span<bool>.
+    absl::InlinedVector<NativeT, 1> data(
+        iota->shape().dimensions(iota->iota_dimension()));
     std::iota(data.begin(), data.end(), 0);
     auto result = LiteralUtil::CreateR1<NativeT>(data);
 
@@ -2568,9 +2567,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
   template <typename NativeT,
             typename std::enable_if<
-                !(std::is_same<NativeT, float>::value ||
-                  std::is_same<NativeT, int32>::value ||
-                  std::is_same<NativeT, uint32>::value)>::type* = nullptr>
+                !(std::is_integral<NativeT>::value ||
+                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
     return InvalidArgument("Unsupported type for iota");
   }
@@ -2722,17 +2720,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const auto shape = instruction->shape();
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
-    // is removed.
-    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()),
-          ShapeUtil::HumanString(rhs->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, rhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
@@ -2756,19 +2745,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
     const auto* ehs = instruction->operand(2);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
-    // broadcast is removed.
-    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
-          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()),
-          ShapeUtil::HumanString(rhs->shape()),
-          ShapeUtil::HumanString(ehs->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, lhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..631b3ad735f369922d10b37d11e2a1b1ba117e6b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+
+namespace xla {
+
+namespace {
+
+StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
+  if (instr->opcode() != HloOpcode::kGetDimensionSize) {
+    return false;
+  }
+  HloComputation* computation = instr->parent();
+
+  TF_ASSIGN_OR_RETURN(auto legal_shape,
+                      ShapeInference::InferGetDimensionSizeShape(
+                          instr->operand(0)->shape(), instr->dimension()));
+  TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape));
+  TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32));
+  uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
+  HloInstruction* new_instr = computation->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
+  TF_RETURN_IF_ERROR(computation->ReplaceInstruction(instr, new_instr));
+  return true;
+}
+
+}  // namespace
+
+StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
+  bool changed = false;
+  HloProto proto;
+  *proto.mutable_hlo_module() = module->ToProto();
+  for (auto* computation : module->computations()) {
+    // Replacing instructions will change the instruction list in the
+    // computation. So instead of iterating computation->instructions()
+    // directly, we make a copy of the list to avoid use-after-free.
+    std::vector<HloInstruction*> instrs(computation->instruction_count());
+    absl::c_copy(computation->instructions(), instrs.begin());
+    for (auto instruction : instrs) {
+      TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
+      changed = changed || replaced;
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f44c23a835b3bcc935caaa917e040e07c4e703
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Pass to replace a kGetDimensionSize instruction with a constant instruction.
+class HloGetDimensionSizeRewriter : public HloModulePass {
+ public:
+  absl::string_view name() const override {
+    return "hlo-get-dimension-size-rewriter";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a86aebdd5b64240e6e07d8e8050c0c8681cce765
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class HloGetDimensionSizeRewriterTest : public HloTestBase {
+ protected:
+  HloGetDimensionSizeRewriterTest() {}
+};
+
+TEST_F(HloGetDimensionSizeRewriterTest, Ok) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = s32[3,4] parameter(0)
+  size0 = u32[] get-dimension-size(p), dimensions={0}
+  size1 = u32[] get-dimension-size(p), dimensions={1}
+  ROOT mul = u32[] multiply(size0, size1)
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_TRUE(pass.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Multiply(op::Constant(), op::Constant()));
+}
+
+TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = s32[3]{0} parameter(0)
+  ROOT gds = s64[] get-dimension-size(p), dimensions={0}
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_FALSE(pass.Run(module.get()).ok());
+}
+
+TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = f32[2,5] parameter(0)
+  ROOT gds = u32[] get-dimension-size(p), dimensions={2}
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_FALSE(pass.Run(module.get()).ok());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 13a74fd8a115c5dc9a9518b226dfee4445cc7180..05cc1593e4ef4fc52b94e0536628645b1fa2abbc 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1043,6 +1043,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kDomain:
     case HloOpcode::kFusion:
     case HloOpcode::kMap:
+    case HloOpcode::kGetDimensionSize:
       return kGray;
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kAllToAll:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index f6ed86b41650fd331201814559386ff644092c23..cd95052580b3d203c2d2a586bc4d9fdbb9d19bf4 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -312,6 +312,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                                 proto.exponent_bits(), proto.mantissa_bits());
       break;
     case HloOpcode::kInfeed: {
+      TF_RET_CHECK(ShapeUtil::IsTuple(proto.shape()) &&
+                   (ShapeUtil::TupleElementCount(proto.shape()) == 2))
+          << "Infeed should have a tuple shape with 2 operands, but has: "
+          << proto.shape();
       const Shape& data_shape =
           ShapeUtil::GetTupleElementShape(proto.shape(), 0);
       TF_RET_CHECK(proto.operand_ids_size() == 1)
@@ -530,6 +534,12 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           absl::make_unique<ShardingMetadata>(exit_hlo_sharding));
       break;
     }
+    case HloOpcode::kGetDimensionSize:
+      TF_RET_CHECK(proto.operand_ids_size() == 1);
+      TF_RET_CHECK(proto.dimensions_size() == 1);
+      instruction = CreateGetDimensionSize(proto.shape(), operands(0),
+                                           proto.dimensions(0));
+      break;
     default: {
       instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
@@ -1001,6 +1011,14 @@ HloInstruction::CreateSelectAndScatter(
                                                     broadcast_dimensions);
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateGetDimensionSize(const Shape& shape,
+                                       HloInstruction* operand,
+                                       int64 dimension) {
+  return absl::make_unique<HloGetDimensionSizeInstruction>(shape, operand,
+                                                           dimension);
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -1109,7 +1127,11 @@ void HloInstruction::set_single_sharding(const HloSharding& sharding) {
 
 void HloInstruction::SetupDerivedInstruction(
     HloInstruction* derived_instruction) const {
-  if (sharding_ != nullptr) {
+  if (sharding_ != nullptr && ShapeUtil::CompatibleIgnoringElementType(
+                                  shape_, derived_instruction->shape())) {
+    // Only copy sharding if the shape of the two instruction is compatible
+    // because copying it between differently shaped instructions can produce
+    // invalid shardings.
     derived_instruction->set_sharding(*sharding_);
   } else {
     derived_instruction->clear_sharding();
@@ -1268,6 +1290,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kIota:
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
+    case HloOpcode::kGetDimensionSize:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1715,6 +1738,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kScatter:
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
+    case HloOpcode::kGetDimensionSize:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
@@ -1876,6 +1900,11 @@ void HloInstruction::set_while_body(HloComputation* computation) {
   called_computations_[kBodyComputationIndex] = computation;
 }
 
+HloInstruction* HloInstruction::while_init() const {
+  CHECK_EQ(HloOpcode::kWhile, opcode_);
+  return operands_[0];
+}
+
 HloComputation* HloInstruction::true_computation() const {
   CHECK_EQ(HloOpcode::kConditional, opcode_);
   return called_computations_[kTrueComputationIndex];
@@ -2440,6 +2469,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleAfterAll(this);
     case HloOpcode::kIota:
       return visitor->HandleIota(this);
+    case HloOpcode::kGetDimensionSize:
+      return visitor->HandleGetDimensionSize(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
@@ -2597,36 +2628,6 @@ Status HloInstruction::AcceptWithOperandOrder(
   return Status::OK();
 }
 
-namespace {
-
-// Returns true if the given order is a topological sort of the instructions
-// it contains.
-bool OrderIsTopologicalSort(const std::vector<const HloInstruction*>& order) {
-  // Create a map from instruction to its position in 'order'.
-  std::unordered_map<const HloInstruction*, int> order_position;
-  for (int i = 0; i < order.size(); i++) {
-    if (!order_position.insert({order[i], i}).second) {
-      // Instruction order[i] is duplicated in the order.
-      return false;
-    }
-  }
-  // Verify that the operand of each instruction in the order is also in the
-  // order *and* the operand's position is earlier (defs are before uses for
-  // all ops).
-  for (auto* instruction : order) {
-    for (auto* operand : instruction->operands()) {
-      if (!ContainsKey(order_position, operand) ||
-          order_position.at(operand) >= order_position.at(instruction)) {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-}  // namespace
-
 Status HloInstruction::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
   FunctionVisitor visitor(visitor_func);
@@ -2639,49 +2640,7 @@ Status HloInstruction::Accept(
   return this->Accept(&visitor);
 }
 
-Status HloInstruction::AcceptOrdered(
-    DfsHloVisitor* visitor, const std::vector<const HloInstruction*>& order) {
-  VLOG(2) << "HloInstruction::AcceptOrdered(%" << name() << ")";
-  TF_RET_CHECK(OrderIsTopologicalSort(order));
-
-  // Compute the predecessors of this instruction.
-  std::unordered_set<const HloInstruction*> predecessors;
-  TF_RETURN_IF_ERROR(this->Accept([&predecessors](HloInstruction* instruction) {
-    predecessors.insert(instruction);
-    return Status::OK();
-  }));
-
-  for (auto* const_instruction : order) {
-    if (!ContainsKey(predecessors, const_instruction)) {
-      // Instruction is not a predecessors of 'this'.
-      continue;
-    }
-
-    // The visitor can mark instructions as visited to skip particular
-    // instructions.
-    if (visitor->DidVisit(*const_instruction)) {
-      VLOG(3) << "Not visiting HLO %" << const_instruction->name()
-              << " as it was already visited.";
-      continue;
-    }
-
-    // TODO(b/78350259): Eliminate const laundering.
-    HloInstruction* instruction =
-        const_cast<HloInstruction*>(const_instruction);
-
-    TF_RETURN_IF_ERROR(visitor->Preprocess(instruction));
-    VLOG(2) << "Visiting HLO %" << instruction->name();
-    TF_RETURN_IF_ERROR(instruction->Visit(visitor));
-    visitor->SetVisited(*instruction);
-    TF_RETURN_IF_ERROR(visitor->Postprocess(instruction));
-  }
-
-  return visitor->FinishVisit(this);
-}
-
-const Shape& HloInstruction::shape() const {
-  return shape_;
-}
+const Shape& HloInstruction::shape() const { return shape_; }
 
 std::vector<int64> HloInstruction::OperandIndices(
     const HloInstruction* operand) const {
@@ -3080,6 +3039,10 @@ int64 HloInstruction::concatenate_dimension() const {
   return Cast<HloConcatenateInstruction>(this)->concatenate_dimension();
 }
 
+int64 HloInstruction::dimension() const {
+  return Cast<HloGetDimensionSizeInstruction>(this)->dimension();
+}
+
 bool HloInstruction::IsRank2Transpose() const {
   auto transpose = DynCast<HloTransposeInstruction>(this);
   return transpose != nullptr && transpose->IsRank2Transpose();
@@ -3259,6 +3222,11 @@ absl::optional<int64> HloInstruction::all_reduce_id() const {
   return Cast<HloAllReduceInstruction>(this)->all_reduce_id();
 }
 
+void HloInstruction::set_all_reduce_id(
+    const absl::optional<int64>& all_reduce_id) {
+  return Cast<HloAllReduceInstruction>(this)->set_all_reduce_id(all_reduce_id);
+}
+
 const ConvolutionDimensionNumbers&
 HloInstruction::convolution_dimension_numbers() const {
   if (auto convolution = DynCast<HloConvolutionInstruction>(this)) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 15a4da8dbe0053aad314989a6718ebd61532ab8b..95ad29235afa36dc4091feec54cd4b0f5f24048f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -767,6 +767,9 @@ class HloInstruction {
   // when we plumb a primordial token from the entry computation.
   static std::unique_ptr<HloInstruction> CreateToken();
 
+  static std::unique_ptr<HloInstruction> CreateGetDimensionSize(
+      const Shape& shape, HloInstruction* operand, int64 dimension);
+
   // Returns the opcode for this instruction.
   HloOpcode opcode() const { return opcode_; }
 
@@ -880,11 +883,15 @@ class HloInstruction {
       return false;
     }
 
-    // Use an explicit loop rather than ContainerEquals, because copying around
-    // std::functions may be too expensive in some cases.
-    for (size_t i = 0; i < operands().size(); ++i) {
-      if (!eq_operands(operand(i), other.operand(i))) {
-        return false;
+    // Two AllReduces are Identical if they have the same all_reduce_id.
+    // Their operands don't have to be Identical.
+    if (!this->IsCrossModuleAllReduce()) {
+      // Use an explicit loop rather than ContainerEquals, because copying
+      // around std::functions may be too expensive in some cases.
+      for (size_t i = 0; i < operands().size(); ++i) {
+        if (!eq_operands(operand(i), other.operand(i))) {
+          return false;
+        }
       }
     }
 
@@ -954,16 +961,6 @@ class HloInstruction {
   Status Accept(
       const std::function<Status(const HloInstruction*)>& visitor_func) const;
 
-  // Visits all instructions rooted at this instruction using the given visitor
-  // in the given order. 'order' must contain at least the set of instructions
-  // rooted at this node (ie, those accessible from a DFS traversal from this
-  // instruction). Instructions contained in 'order' which are not in the set of
-  // instructions rooted at this node are ignored. 'order' must also be a valid
-  // topological sort of these instructions (defs appear before uses) though
-  // need not be a DFS post-order.
-  Status AcceptOrdered(DfsHloVisitor* visitor,
-                       const std::vector<const HloInstruction*>& order);
-
   // Visit this instruction and only this instruction with the given visitor.
   template <typename HloInstructionPtr>
   Status Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor);
@@ -1004,6 +1001,8 @@ class HloInstruction {
   void set_while_condition(HloComputation* while_condition);
   void set_while_body(HloComputation* while_body);
 
+  HloInstruction* while_init() const;
+
   // Gets/sets the true and false HloComputation for Conditional. The setters
   // should only be called by HloModule or HloComputation methods.
   //
@@ -1324,6 +1323,9 @@ class HloInstruction {
   // Delegates to HloConcatenateInstruction::concatenate_dimension.
   int64 concatenate_dimension() const;
 
+  // Delegates to HloGetDimensionSizeInstruction::dimension.
+  int64 dimension() const;
+
   // Returns whether this instruction does a rank-2 transposition.
   bool IsRank2Transpose() const;
 
@@ -1442,6 +1444,7 @@ class HloInstruction {
 
   // Delegates to HloAllReduceInstruction::all_reduce_id.
   absl::optional<int64> all_reduce_id() const;
+  void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
 
   // Returns data on the window in a windowed operation such as
   // convolution.
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index d93351fe0435b5f29035dc4ea0621a8c576bfd5a..8048e332cb57747286758b75773b29ba154aa888 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 
@@ -39,7 +39,7 @@ namespace {
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
-class HloInstructionTest : public HloVerifiedTestBase {
+class HloInstructionTest : public HloTestBase {
  protected:
   Shape r0f32_ = ShapeUtil::MakeShape(F32, {});
 };
@@ -151,7 +151,7 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32_, "bar"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(add->operands(), UnorderedElementsAre(foo, bar));
@@ -188,7 +188,7 @@ TEST_F(HloInstructionTest, MultipleUsers) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, foo->user_count());
@@ -221,7 +221,7 @@ TEST_F(HloInstructionTest, RepeatedUser) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "foo"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(1, foo->user_count());
@@ -256,7 +256,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c0, param1));
   auto addtotal = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
@@ -305,7 +305,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright));
   auto neg2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, addtotal));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
@@ -327,7 +327,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
   //
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   Shape f32a100x10 = ShapeUtil::MakeShape(F32, {100, 10});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // Builds an x+1.0 computation to use in a Map.
   auto embedded_builder = HloComputation::Builder("f32+1");
@@ -375,7 +375,7 @@ TEST_F(HloInstructionTest, TrivialReduce) {
       HloInstruction::CreateParameter(1, r0f32, "y"));
   embedded_builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, paramx, paramy));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build());
 
   // Builds a parameter and an initial value and feeds them to the reduce.
@@ -416,7 +416,7 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       add_foobar, add_foofoo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -451,7 +451,7 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) {
       builder.AddInstruction(HloInstruction::CreateTuple({foo, bar, baz, foo}));
   auto add_foobar = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -479,7 +479,7 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -516,7 +516,7 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       add_foobar, add_foofoo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -546,7 +546,7 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) {
   auto exp = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({foo, bar}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, foo->user_count());
@@ -611,7 +611,7 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, exp, log));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   NodeCollectorAndPostProcessor visitor;
@@ -629,7 +629,7 @@ TEST_F(HloInstructionTest, SingletonFusionOp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f)));
   auto exp = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp}, HloInstruction::FusionKind::kLoop);
@@ -647,7 +647,7 @@ TEST_F(HloInstructionTest, BinaryFusionOp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.1f)));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant2));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add}, HloInstruction::FusionKind::kLoop);
@@ -669,7 +669,7 @@ TEST_F(HloInstructionTest, ChainFusionOp) {
   auto exp3 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop);
@@ -692,7 +692,7 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) {
   exp1->set_metadata(metadata);
   exp2->set_metadata(metadata);
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp2, exp1}, HloInstruction::FusionKind::kLoop);
@@ -749,7 +749,7 @@ TEST_F(HloInstructionTest, PreserveTupleShapeThroughClone) {
 TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
   // Create a fusion instruction containing a single unary operation.
   const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto make_map_computation = [&]() {
     auto builder = HloComputation::Builder("FusionMap");
@@ -817,7 +817,7 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
@@ -977,7 +977,7 @@ TEST_F(HloInstructionTest, FunctionVisitor) {
       HloInstruction::CreateUnary(f32, HloOpcode::kExp, param));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32, HloOpcode::kAdd, negate, exp));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   int visit_num = 0;
@@ -1006,7 +1006,7 @@ TEST_F(HloInstructionTest, FullyElementwise) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, x, y));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_TRUE(add->IsElementwise());
@@ -1016,7 +1016,7 @@ TEST_F(HloInstructionTest, FullyElementwise) {
 }
 
 TEST_F(HloInstructionTest, MapIsElementwise) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape r2f32 = ShapeUtil::MakeShapeWithLayout(F32, {10, 10}, {1, 0});
   HloComputation::Builder builder(TestName());
   HloComputation::Builder map_builder("id");
@@ -1067,7 +1067,7 @@ TEST_F(HloInstructionTest, PartiallyElementwise) {
   HloInstruction* max = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop);
@@ -1108,7 +1108,7 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) {
   HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
       r1f32, HloOpcode::kSubtract, min, broadcast));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {sub, broadcast, min}, HloInstruction::FusionKind::kLoop);
@@ -1151,7 +1151,7 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kLoop);
@@ -1192,7 +1192,7 @@ TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       s, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kLoop);
@@ -1204,7 +1204,7 @@ TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) {
 }
 
 TEST_F(HloInstructionTest, FusionEquality) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Create two fusion instructions containing a single unary operation.
@@ -1226,7 +1226,7 @@ TEST_F(HloInstructionTest, FusionEquality) {
 }
 
 TEST_F(HloInstructionTest, NestedFusionEquality) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Build a nested fusion computation.
@@ -1330,7 +1330,7 @@ TEST_F(HloInstructionTest, Stringification) {
             "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} "
             "%transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}");
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* loop = builder.AddInstruction(
@@ -1373,7 +1373,7 @@ TEST_F(HloInstructionTest, StringifyGather_0) {
                                        /*index_vector_dim=*/4),
                                    /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gather_instruction->ToString(),
@@ -1408,7 +1408,7 @@ TEST_F(HloInstructionTest, StringifyGather_1) {
                                        /*index_vector_dim=*/2),
                                    /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gather_instruction->ToString(),
@@ -1443,7 +1443,7 @@ TEST_F(HloInstructionTest, StringifyScatter) {
   update_builder.AddInstruction(
       HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p2"));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* update_computation =
       module->AddEmbeddedComputation(update_builder.Build());
 
@@ -1495,7 +1495,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationFusion) {
             "f32[5,20]{1,0} dot(f32[5,10]{1,0}, f32[10,20]{1,0}), "
             "lhs_contracting_dims={1}, rhs_contracting_dims={0}");
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kLoop);
@@ -1531,7 +1531,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationWhile) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({dot, reshape},
                                        HloInstruction::FusionKind::kLoop);
@@ -1587,7 +1587,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationConditional) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({dot, reshape},
                                        HloInstruction::FusionKind::kLoop);
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 88495e80000c4f87a778c4fad747f6bdf09b7a14..ed3b2f1564103969a1092f3215f8b6a377d2d2ae 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -370,6 +370,11 @@ HloAllReduceInstruction::HloAllReduceInstruction(
   AppendComputation(reduce_computation);
 }
 
+void HloAllReduceInstruction::set_all_reduce_id(
+    const absl::optional<int64>& all_reduce_id) {
+  all_reduce_id_ = all_reduce_id;
+}
+
 HloInstructionProto HloAllReduceInstruction::ToProto() const {
   HloInstructionProto proto = HloCollectiveInstruction::ToProto();
   // Proto3 is so sad.
@@ -2349,4 +2354,43 @@ HloInstructionProto HloDomainInstruction::ToProto() const {
 
   return proto;
 }
+
+HloGetDimensionSizeInstruction::HloGetDimensionSizeInstruction(
+    const Shape& shape, HloInstruction* operand, int64 dimension)
+    : HloInstruction(HloOpcode::kGetDimensionSize, shape),
+      dimension_(dimension) {
+  AppendOperand(operand);
+}
+
+HloInstructionProto HloGetDimensionSizeInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  proto.add_dimensions(dimension());
+  return proto;
+}
+
+std::vector<string> HloGetDimensionSizeInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& /*options*/) const {
+  return {StrCat("dimensions={", dimension(), "}")};
+}
+
+bool HloGetDimensionSizeInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+    /*eq_computations*/) const {
+  const auto& casted_other =
+      static_cast<const HloGetDimensionSizeInstruction&>(other);
+  return dimension() == casted_other.dimension();
+}
+
+std::unique_ptr<HloInstruction>
+HloGetDimensionSizeInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* /*context*/) const {
+  if (new_operands.size() != 1) {
+    LOG(FATAL) << "expects 1 operand";
+  }
+  return absl::make_unique<HloGetDimensionSizeInstruction>(
+      shape, new_operands[0], dimension());
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index bf4daf2be47ed06d2b88a331a56149d38fa646b3..0b07341cb94c1391c787ec8e0f5a3f17dccc96b2 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -252,6 +252,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
   }
 
   absl::optional<int64> all_reduce_id() const { return all_reduce_id_; }
+  void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -1385,6 +1386,33 @@ class HloDomainInstruction : public HloInstruction {
   std::unique_ptr<DomainMetadata> operand_side_metadata_;
   std::unique_ptr<DomainMetadata> user_side_metadata_;
 };
+
+class HloGetDimensionSizeInstruction : public HloInstruction {
+ public:
+  explicit HloGetDimensionSizeInstruction(const Shape& shape,
+                                          HloInstruction* operand,
+                                          int64 dimension);
+
+  // Returns the dimension sizes or numbers associated with this instruction.
+  int64 dimension() const { return dimension_; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  int64 dimension_;
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 1717770301e3666b0a1c23d20b7f2e3bac5f62e4..170ec93a334903cdc314f1950675ef30bc4cda5a 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -165,6 +165,7 @@ namespace opcode_matchers {
   }
 HLO_MATCHER(Abs);
 HLO_MATCHER(Add);
+HLO_MATCHER(AllToAll);
 HLO_MATCHER(Bitcast);
 HLO_MATCHER(Broadcast);
 HLO_MATCHER(BatchNormGrad);
@@ -178,6 +179,7 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
+HLO_MATCHER(CollectivePermute);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Domain);
 HLO_MATCHER(DynamicSlice);
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 5cee865b7ad34eded1743d9d5455bb40febf6182..d2740bcce26f04c5d7c8b64cfdaea53e3c697855 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -73,7 +73,7 @@ class ListScheduler {
   // Construct and return a memory-minimizing sequence of HLO instructions
   // containing the given HLO computation.
   static StatusOr<HloInstructionSequence> Run(
-      const HloComputation& computation,
+      HloComputation* computation,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_function,
       const absl::flat_hash_map<const HloComputation*, int64>&
@@ -98,7 +98,7 @@ class ListScheduler {
   // comparison operators.
   using Priority = std::pair<int64, int64>;
 
-  ListScheduler(const HloComputation& computation,
+  ListScheduler(HloComputation* computation,
                 const TuplePointsToAnalysis& points_to_analysis,
                 const LogicalBuffer::SizeFunction& size_function,
                 const absl::flat_hash_map<const HloComputation*, int64>&
@@ -111,7 +111,7 @@ class ListScheduler {
     // instruction. An HLO instruction "uses" a LogicalBuffer if the
     // LogicalBuffer is in an operand of the instruction as indicated by
     // points-to analysis.
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       absl::flat_hash_set<const LogicalBuffer*> instr_uses;
       for (auto* operand : instruction->operands()) {
         points_to_analysis.GetPointsToSet(operand).ForEachElement(
@@ -126,13 +126,13 @@ class ListScheduler {
 
     // Create map containing the number of unscheduled uses (hlo instructions)
     // of each logical buffer.
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       for (auto* buffer :
            points_to_analysis.GetBuffersDefinedByInstruction(instruction)) {
         unscheduled_use_count_[buffer] = 0;
       }
     }
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       for (const LogicalBuffer* buffer : buffer_uses_.at(instruction)) {
         ++unscheduled_use_count_[buffer];
       }
@@ -141,7 +141,7 @@ class ListScheduler {
     // Buffers live out of the computation have an implicit use at the end of
     // the computation.
     for (const LogicalBuffer* live_out_buffer :
-         points_to_analysis.GetPointsToSet(computation.root_instruction())
+         points_to_analysis.GetPointsToSet(computation->root_instruction())
              .CreateFlattenedSet()) {
       ++unscheduled_use_count_[live_out_buffer];
     }
@@ -157,7 +157,7 @@ class ListScheduler {
   // HloInstruction, plus some cached metadata, saved for the purposes of making
   // BytesFreedIfScheduled fast.
   struct ReadyListEntry {
-    const HloInstruction* instruction;
+    HloInstruction* instruction;
 
     // The total size of all buffers defined by this instruction.
     int64 bytes_defined;
@@ -171,7 +171,7 @@ class ListScheduler {
   };
 
   // Creates a ReadyListEntry for the given instruction.
-  ReadyListEntry MakeReadyListEntry(const HloInstruction* instruction) {
+  ReadyListEntry MakeReadyListEntry(HloInstruction* instruction) {
     ReadyListEntry entry;
     entry.instruction = instruction;
 
@@ -250,13 +250,13 @@ class ListScheduler {
     // Populate the ready list with instructions which have no operands or
     // control predecessors.
     absl::flat_hash_map<const HloInstruction*, int64> unscheduled_pred_count;
-    for (auto* instruction : computation_.instructions()) {
+    for (auto* instruction : computation_->instructions()) {
       // TODO(b/34466113): Replace this and above with successors() or
       // predecessors() when these methods are added to HloInstruction.
-      for (const HloInstruction* user : instruction->users()) {
+      for (HloInstruction* user : instruction->users()) {
         unscheduled_pred_count[user]++;
       }
-      for (const HloInstruction* succ : instruction->control_successors()) {
+      for (HloInstruction* succ : instruction->control_successors()) {
         unscheduled_pred_count[succ]++;
       }
     }
@@ -275,7 +275,7 @@ class ListScheduler {
       ready_instructions[inst] = it;
     };
 
-    for (auto* instruction : computation_.instructions()) {
+    for (auto* instruction : computation_->instructions()) {
       if (instruction->operands().empty() &&
           instruction->control_predecessors().empty()) {
         add_to_ready_queue(instruction);
@@ -287,7 +287,7 @@ class ListScheduler {
       // schedule.
       auto best_it = ready_queue.end();
       --best_it;
-      const HloInstruction* best = best_it->second.instruction;
+      HloInstruction* best = best_it->second.instruction;
       VLOG(2) << "Schedule instruction: " << best->ToShortString()
               << " Bytes freed: " << best_it->first.first;
       ready_queue.erase(best_it);
@@ -348,13 +348,13 @@ class ListScheduler {
         }
       }
     }
-    CHECK_EQ(schedule.size(), computation_.instruction_count());
-    CHECK_EQ(scheduled_instructions_.size(), computation_.instruction_count());
+    CHECK_EQ(schedule.size(), computation_->instruction_count());
+    CHECK_EQ(scheduled_instructions_.size(), computation_->instruction_count());
 
     return schedule;
   }
 
-  const HloComputation& computation_;
+  HloComputation* computation_;
   const TuplePointsToAnalysis& points_to_analysis_;
   const LogicalBuffer::SizeFunction& size_function_;
   // Computations are analyzed in post-order. When scheduling an instruction
@@ -386,13 +386,13 @@ int64 SumLogicalBufferSizes(
 }
 
 StatusOr<HloInstructionSequence> ScheduleComputationHelper(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
-  VLOG(2) << "Computation: " << computation.name();
+  VLOG(2) << "Computation: " << computation->name();
   if (algorithm) {
     return algorithm(computation, points_to_analysis, size_function,
                      memory_by_computation);
@@ -404,17 +404,17 @@ StatusOr<HloInstructionSequence> ScheduleComputationHelper(
 }  // namespace
 
 StatusOr<HloInstructionSequence> DFSMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // These variables are a hack to prevent overflows.
   int64 cumulative_total_size = 0;
-  int64 total_hlos = computation.parent()->instruction_count();
+  int64 total_hlos = computation->parent()->instruction_count();
   absl::flat_hash_map<const HloInstruction*, int64> extra_users;
   absl::flat_hash_map<const HloInstruction*, int64> total_sizes;
-  for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
+  for (const HloInstruction* hlo : computation->MakeInstructionPostOrder()) {
     if (ListScheduler::IgnoreInstruction(*hlo)) {
       extra_users[hlo] = 0;
       total_sizes[hlo] = 0;
@@ -448,8 +448,8 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size);
     extra_users[hlo] = std::min(extra_users[hlo], total_hlos);
   }
-  CHECK_EQ(extra_users.size(), computation.instruction_count());
-  CHECK_EQ(total_sizes.size(), computation.instruction_count());
+  CHECK_EQ(extra_users.size(), computation->instruction_count());
+  CHECK_EQ(total_sizes.size(), computation->instruction_count());
 
   // Construct a total order based on DFS post-order, visiting operands in
   // decreasing cumulative extra user order, and next by cumulative size, with a
@@ -459,7 +459,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     sequence.push_back(hlo);
     return Status::OK();
   });
-  TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder(
+  TF_RETURN_IF_ERROR(computation->AcceptWithOperandOrder(
       &visitor, [&extra_users, &total_sizes](const HloInstruction* a,
                                              const HloInstruction* b) {
         if (extra_users[a] != extra_users[b]) {
@@ -470,12 +470,12 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
         }
         return a->name() < b->name();
       }));
-  CHECK_EQ(sequence.size(), computation.instruction_count());
+  CHECK_EQ(sequence.size(), computation->instruction_count());
   return sequence;
 }  // namespace xla
 
 StatusOr<HloInstructionSequence> ListMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -485,16 +485,16 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
 }
 
 StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
-  return HloInstructionSequence(computation.MakeInstructionPostOrder());
+  return HloInstructionSequence(computation->MakeInstructionPostOrder());
 }
 
 StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -513,7 +513,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                           memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 list_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, list_sequence, points_to_analysis,
+                          *computation, list_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
@@ -522,7 +522,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                                          size_function, memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 dfs_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, dfs_sequence, points_to_analysis,
+                          *computation, dfs_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
@@ -532,7 +532,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                                memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 post_order_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, post_order_sequence, points_to_analysis,
+                          *computation, post_order_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory post order sequence: "
           << HumanReadableNumBytes(post_order_memory);
@@ -555,17 +555,17 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
 }
 
 StatusOr<HloSchedule> ScheduleModule(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    HloModule* module, const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm) {
-  HloSchedule schedule(&module);
+  HloSchedule schedule(module);
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(&module));
+                      TuplePointsToAnalysis::Run(module));
   absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
-  for (const auto* computation : module.MakeComputationPostOrder()) {
+  for (auto* computation : module->MakeComputationPostOrder()) {
     if (!computation->IsFusionComputation()) {
       TF_ASSIGN_OR_RETURN(HloInstructionSequence computation_sequence,
                           ScheduleComputationHelper(
-                              *computation, *points_to_analysis, size_function,
+                              computation, *points_to_analysis, size_function,
                               algorithm, memory_by_computation));
       memory_by_computation[computation] =
           HeapSimulator::MinimumMemoryForComputation(
@@ -583,11 +583,11 @@ StatusOr<HloSchedule> ScheduleModule(
 }
 
 StatusOr<HloInstructionSequence> ScheduleComputation(
-    const HloComputation& computation,
+    HloComputation* computation,
     const LogicalBuffer::SizeFunction& size_function) {
-  CHECK(!computation.IsFusionComputation());
+  CHECK(!computation->IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(computation.parent()));
+                      TuplePointsToAnalysis::Run(computation->parent()));
   absl::flat_hash_map<const HloComputation*, int64> empty_map;
   return ScheduleComputationHelper(computation, *points_to_analysis,
                                    size_function, nullptr, empty_map);
@@ -600,7 +600,24 @@ HloMemoryScheduler::HloMemoryScheduler(
 
 StatusOr<bool> HloMemoryScheduler::Run(HloModule* module) {
   TF_ASSIGN_OR_RETURN(HloSchedule schedule,
-                      ScheduleModule(*module, size_function_, algorithm_));
+                      ScheduleModule(module, size_function_, algorithm_));
+  TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));
+  return true;
+}
+
+StatusOr<bool> HloTrivialScheduler::Run(HloModule* module) {
+  HloSchedule schedule(module);
+  for (HloComputation* computation : module->MakeComputationPostOrder()) {
+    if (!computation->IsFusionComputation()) {
+      HloInstructionSequence& computation_sequence =
+          schedule.GetOrCreateSequence(computation);
+      TF_RETURN_IF_ERROR(computation->Accept(
+          [&computation_sequence](HloInstruction* instruction) {
+            computation_sequence.push_back(instruction);
+            return Status::OK();
+          }));
+    }
+  }
   TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));
   return true;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index a4c1d3db8170a1725043def576f913e09b352e5d..7227bfb27c74758d2b79e404afc9eb97a1ca894d 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -36,14 +36,14 @@ namespace xla {
 // that describes buffer aliasing, together with a target-specific size function
 // that maps a tensor's logical size to its padded size.
 typedef std::function<StatusOr<HloInstructionSequence>(
-    const HloComputation&, const TuplePointsToAnalysis&,
+    HloComputation*, const TuplePointsToAnalysis&,
     const LogicalBuffer::SizeFunction&,
     const absl::flat_hash_map<const HloComputation*, int64>&)>
     MemorySchedulerAlgorithm;
 
 // List scheduler
 StatusOr<HloInstructionSequence> ListMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -51,7 +51,7 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
 
 // DFS-order scheduler
 StatusOr<HloInstructionSequence> DFSMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -59,7 +59,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
 
 // Naive Post Order scheduler
 StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -69,7 +69,7 @@ StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
 // and the DFS scheduler, and chooses whichever returns a lower min-memory,
 // not accounting for fragmentation.
 StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -79,13 +79,13 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<HloSchedule> ScheduleModule(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    HloModule* module, const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm = {});
 
 // Computes the schedule for a single computation.
 // Currently only used by the GPU backend.
 StatusOr<HloInstructionSequence> ScheduleComputation(
-    const HloComputation& computation,
+    HloComputation* computation,
     const LogicalBuffer::SizeFunction& size_function);
 
 // A pass which schedules the HLO instructions in a module. The HloModule's
@@ -108,6 +108,15 @@ class HloMemoryScheduler : public HloModulePass {
   MemorySchedulerAlgorithm algorithm_;
 };
 
+// A pass which produces a naive, but correct schedule. The schedule is produced
+// using a DFS traversal of the graph with no attempt to minimize memory use.
+class HloTrivialScheduler : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "hlo-trivial-scheduler"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
 // A trivial pass which clears the schedule currently set on the
 // HloModule. After this pass runs HloModudle::has_schedule will return false.
 class HloDescheduler : public HloModulePass {
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index 214119fba881c4411a262cd4227b5cc49cef0d14..bc0d7e2bc00eab014f2660c95a51b966642eaee9 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -65,7 +65,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
   auto sub = builder.AddInstruction(
       HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloMemoryScheduler scheduler([](const BufferValue& buffer) {
@@ -78,7 +78,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
   TF_ASSERT_OK(module->schedule().Verify());
 
   // Verify that all instructions are in the sequence.
-  const std::vector<const HloInstruction*>& sequence =
+  const std::vector<HloInstruction*>& sequence =
       module->schedule().sequence(module->entry_computation()).instructions();
   EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size());
 
@@ -124,9 +124,9 @@ ENTRY root {
   };
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
+      ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
   // Verify that all instructions are in the sequence.
-  const std::vector<const HloInstruction*>& sequence =
+  const std::vector<HloInstruction*>& sequence =
       schedule.sequence(module->entry_computation()).instructions();
   EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size());
 
@@ -172,15 +172,16 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd,
                                                       tuple_elm, abs_abs2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule,
-                          ScheduleModule(*module,
-                                         [](const BufferValue& buffer) {
-                                           return ShapeUtil::ByteSizeOf(
-                                               buffer.shape(), TUPLE_SIZE);
-                                         },
-                                         ListMemoryScheduler));
+                          ScheduleModule(
+                              module.get(),
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape(),
+                                                             TUPLE_SIZE);
+                              },
+                              ListMemoryScheduler));
 
   // Verify that all instructions are in the sequence.
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -218,19 +219,19 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto fusion = computation->CreateFusionInstruction(
       {tuple, mul, add}, HloInstruction::FusionKind::kLoop);
 
   TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule,
-                          ScheduleModule(*module,
-                                         [](const BufferValue& buffer) {
-                                           return ShapeUtil::ByteSizeOf(
-                                               buffer.shape(), 2);
-                                         },
-                                         ListMemoryScheduler));
+                          ScheduleModule(
+                              module.get(),
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape(), 2);
+                              },
+                              ListMemoryScheduler));
 
   // Verify that all instructions are in the sequence.
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -242,7 +243,7 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) {
 }
 
 TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
 
   // param != 0
@@ -252,7 +253,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
       HloInstruction::CreateParameter(0, r1f32, "cond_param"));
   HloInstruction* zero_vector =
       cond_builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{0, 0, 0, 0}})));
+          LiteralUtil::CreateR1<float>({0, 0, 0, 0})));
   cond_builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
   auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
@@ -284,7 +285,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
   };
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
+      ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
   // Verify that all instructions are in the sequence.
   auto entry_computation = module->entry_computation();
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -309,5 +310,40 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
                     .ValueOrDie());
 }
 
+TEST_F(HloSchedulingTest, TrivialScheduler) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  param.b = (s32[], s32[]) parameter(0)
+  gte.0 = s32[] get-tuple-element(param.b), index=0
+  gte.1 = s32[] get-tuple-element(param.b), index=1
+  add = s32[] add(gte.0, gte.1)
+  ROOT tuple = (s32[], s32[]) tuple(gte.0, add)
+}
+
+cond {
+  param.c = (s32[], s32[]) parameter(0)
+  ROOT constant = pred[] constant(true)
+}
+
+ENTRY main {
+  init = (s32[], s32[]) parameter(0)
+  ROOT while = (s32[], s32[]) while(init), condition=cond, body=body
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+  EXPECT_FALSE(module->has_schedule());
+  TF_ASSERT_OK(HloTrivialScheduler().Run(module.get()).status());
+  ASSERT_TRUE(module->has_schedule());
+  TF_ASSERT_OK(module->schedule().Verify());
+
+  // Verify that a clone of the module also has a schedule.
+  std::unique_ptr<HloModule> clone = module->Clone();
+  ASSERT_TRUE(clone->has_schedule());
+  TF_ASSERT_OK(clone->schedule().Verify());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index bcd709c973920d36f6b7f16a1a1a38dbf7fdf0cf..59f44475df55311992d41aecfb1f2f4e53a2e316 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -242,6 +242,8 @@ HloModuleProto HloModule::ToProto() const {
   *proto.mutable_host_program_shape() =
       entry_computation_layout().ComputeProgramShape();
   *proto.mutable_input_output_alias() = input_output_alias_config().ToProto();
+  *proto.mutable_dynamic_parameter_binding() =
+      dynamic_parameter_binding().ToProto();
   return proto;
 }
 
@@ -325,6 +327,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   // Because we didn't uniquify the names or the ids, double-check that the
   // instruction and computation names and ids are unique from the proto.
+  TF_ASSIGN_OR_RETURN(module->dynamic_parameter_binding_,
+                      DynamicParameterBinding::CreateFromProto(
+                          proto.dynamic_parameter_binding()));
+
   absl::flat_hash_set<string> computation_names;
   absl::flat_hash_set<string> instruction_names;
   absl::flat_hash_set<int> computation_ids;
@@ -559,11 +565,28 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
 std::unique_ptr<HloModule> HloModule::Clone(const HloModuleConfig& config,
                                             const string& suffix) const {
   VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n";
-  auto module = absl::make_unique<HloModule>(name_ + "-" + suffix, config);
+  auto module = absl::make_unique<HloModule>(
+      absl::StrCat(name_, suffix.empty() ? "" : "-", suffix), config);
 
   HloCloneContext context(module.get(), suffix);
   auto cloned_computation = entry_computation_->Clone(suffix, &context);
   module->AddEntryComputation(std::move(cloned_computation));
+
+  if (has_schedule() && schedule().Verify().ok()) {
+    HloSchedule clone_schedule(module.get());
+    for (HloComputation* computation : computations()) {
+      if (schedule().is_computation_scheduled(computation)) {
+        HloInstructionSequence& clone_sequence =
+            clone_schedule.GetOrCreateSequence(
+                context.GetComputation(computation));
+        for (const HloInstruction* instruction :
+             schedule().sequence(computation).instructions()) {
+          clone_sequence.push_back(context.GetInstruction(instruction));
+        }
+      }
+    }
+    TF_CHECK_OK(module->set_schedule(std::move(clone_schedule)));
+  }
   return module;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 8a1f999e3ab076b87a651a915f4de93320e7067f..66622a1d260c28078d69b01b858fd292b697805b 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -103,11 +104,7 @@ class HloModule {
                                        HloCloneContext* context = nullptr);
 
   // Return a pointer to the entry computation of the module.
-  const HloComputation* entry_computation() const {
-    CHECK_NE(nullptr, entry_computation_);
-    return entry_computation_;
-  }
-  HloComputation* entry_computation() {
+  HloComputation* entry_computation() const {
     CHECK_NE(nullptr, entry_computation_);
     return entry_computation_;
   }
@@ -232,6 +229,16 @@ class HloModule {
     return input_output_alias_config_;
   }
 
+  // DynamicParameterBinding holds the list of bindings that indicates which
+  // parameter dimensions are dynamic and which parameters represent their
+  // runtime value.
+  DynamicParameterBinding& dynamic_parameter_binding() {
+    return dynamic_parameter_binding_;
+  }
+  const DynamicParameterBinding& dynamic_parameter_binding() const {
+    return dynamic_parameter_binding_;
+  }
+
   // Returns an id that is unique to this module across all modules created over
   // the lifetime of this process.
   int unique_id() const { return unique_id_; }
@@ -285,6 +292,9 @@ class HloModule {
   // alias_config indicates the alias information of input/output buffers that
   // are expected from the module.
   HloInputOutputAliasConfig input_output_alias_config_;
+
+  // Bindings for dynamic parameter mapping.
+  DynamicParameterBinding dynamic_parameter_binding_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 39f38b417ab0e8b54864176d8d1e0ad1a422eca6..620cb7e01ad1a060915f5b73474f6950ab18122a 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -63,7 +63,7 @@ class HloModuleTest : public HloTestBase {
 
 TEST_F(HloModuleTest, OneComputationPostOrder) {
   // Create a module with a single computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(CreateConstantComputation());
 
   EXPECT_THAT(module->MakeComputationPostOrder(),
@@ -72,7 +72,7 @@ TEST_F(HloModuleTest, OneComputationPostOrder) {
 
 TEST_F(HloModuleTest, TwoComputationsPostOrder) {
   // Create a module with two unconnected computations.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 = module->AddEntryComputation(CreateConstantComputation());
   auto computation2 =
       module->AddEmbeddedComputation(CreateConstantComputation());
@@ -88,7 +88,7 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) {
 
 TEST_F(HloModuleTest, CloneTest) {
   // Create and copy a module with a diamond call graph of computations.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 =
       module->AddEmbeddedComputation(CreateConstantComputation());
   auto computation2 =
@@ -111,7 +111,7 @@ TEST_F(HloModuleTest, CloneTest) {
 }
 
 TEST_F(HloModuleTest, CloneHasFusion) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // Create the fused computation.
   HloComputation* fused_computation;
@@ -154,7 +154,7 @@ TEST_F(HloModuleTest, CloneHasFusion) {
 
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 =
       module->AddEmbeddedComputation(CreateConstantComputation());
   auto computation2 =
@@ -174,7 +174,7 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
 
 TEST_F(HloModuleTest, LargeConstantToString) {
   // Create a module with a single computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("Constant");
   std::vector<float> values(16, 42.0);
   builder.AddInstruction(
@@ -194,8 +194,8 @@ TEST_F(HloModuleTest, LargeConstantToString) {
 }
 
 TEST_F(HloModuleTest, UniqueModuleId) {
-  auto module_a = CreateNewModule();
-  auto module_b = CreateNewModule();
+  auto module_a = CreateNewVerifiedModule();
+  auto module_b = CreateNewVerifiedModule();
   EXPECT_NE(module_a->unique_id(), module_b->unique_id());
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index e6bfb8025d4bfeba1d334d1f946e33841a2da092..70c7d70b41c5c7bc94d1fac83c0fcf71f155b5f0 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -83,6 +83,7 @@ namespace xla {
   V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
   V(kGather, "gather")                                       \
   V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
+  V(kGetDimensionSize, "get-dimension-size")                 \
   V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
   V(kGetTupleElement, "get-tuple-element")                   \
   V(kGt, "greater-than", kHloOpcodeIsComparison)             \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 23d41d91d6969ddf9062507e926ae39c1e1315d4..ca6a154809be46d6a0305c29e2b89219de408019 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -334,7 +334,7 @@ DependencyHloOrdering::DependencyHloOrdering(const HloModule* module)
   // ordering based on dependencies. ExecutesBefore will return true iff there
   // exists a path in the HLO computation graph from 'a' to 'b'.
   for (auto* computation : module->MakeNonfusionComputations()) {
-    predecessors_.emplace(computation, computation->ComputeReachability());
+    predecessors_.emplace(computation, HloReachabilityMap::Build(computation));
   }
 }
 
@@ -356,8 +356,7 @@ void SequentialHloOrdering::Initialize() {
   // Create a map from instruction to its order position.
   TF_DCHECK_OK(schedule_.Verify());
   for (const auto& computation_sequence : schedule_.sequences()) {
-    const std::vector<const HloInstruction*>& order =
-        computation_sequence.second.instructions();
+    const auto& order = computation_sequence.second.instructions();
     for (int i = 0; i < order.size(); ++i) {
       InsertOrDie(&order_position_, order[i], i);
     }
@@ -374,11 +373,10 @@ bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
   return order_position_.at(a) < order_position_.at(b);
 }
 
-const std::vector<const HloInstruction*>*
-SequentialHloOrdering::SequentialOrder(
+const HloInstructionSequence* SequentialHloOrdering::SequentialOrder(
     const HloComputation& computation) const {
   return schedule_.is_computation_scheduled(&computation)
-             ? &schedule_.sequence(&computation).instructions()
+             ? &schedule_.sequence(&computation)
              : nullptr;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h
index 66313492eb2dd10ac9a6000639ddb8991b367c0f..a07214c22c0989a438f12219e136a7e76ee0dcce 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.h
+++ b/tensorflow/compiler/xla/service/hlo_ordering.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -64,7 +65,7 @@ class HloOrdering {
 
   // Returns the sequential instruction order for the given computation, or
   // nullptr if the computation does not have a sequential ordering.
-  virtual const std::vector<const HloInstruction*>* SequentialOrder(
+  virtual const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const = 0;
 
   // Return the call graph of the module used to compute ordering.
@@ -96,7 +97,7 @@ class PredecessorHloOrdering : public HloOrdering {
 
   // Returns nullptr indicating the computation does not have a sequential
   // ordering.
-  const std::vector<const HloInstruction*>* SequentialOrder(
+  const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const override {
     return nullptr;
   }
@@ -185,7 +186,7 @@ class SequentialHloOrdering : public HloOrdering {
   ~SequentialHloOrdering() override = default;
 
   // Returns the sequential instruction order for the given computation.
-  const std::vector<const HloInstruction*>* SequentialOrder(
+  const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const override;
 
   string ToString() const override;
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index b045adc9640ac0ca8cf4a127fea2fbfcbb1aaf3f..3ca77e60cd5275c22eb0e338cd5437fc44b49958 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -53,7 +53,7 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
   //   %c = Constant(42.0f)
   //
   // This results in a diamond-shaped callgraph.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto builder_c = HloComputation::Builder("C");
@@ -126,7 +126,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
   //   %constant = Constant(1.0)
   //   return While(%constant, body, condition)
   //
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto body_builder = HloComputation::Builder("body");
@@ -176,7 +176,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
 
 TEST_F(HloOrderingTest, ParametersDefinedBeforeOthers) {
   // Entry parameter should always be defined before other instruction.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
@@ -209,7 +209,7 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) {
   //   %while = While(%constant, body, condition)
   //   %add = Add(%constant, %while)
   //
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto body_builder = HloComputation::Builder("body");
@@ -407,7 +407,7 @@ TEST_F(HloOrderingTest,
   //   %dead = Constant(123.0)
   //
   // %root should interfere with %dead.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto builder = HloComputation::Builder(TestName());
@@ -455,7 +455,7 @@ TEST_F(HloOrderingTest,
   //   ROOT %call = call({%c}), subcomputation
   //
   // %root should interfere with %dead.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto subbuilder = HloComputation::Builder(TestName() + ".sub");
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index e0011398aad133d07d31c419626e4be54228f9de..4bf287a9ed585889669c22bb61873be2887ff66a 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -47,11 +47,11 @@ const double kF16max = 65504;
 
 // Creates and returns a schedule created using the order of the instructions in
 // the HloComputation::instructions() vectors in the module.
-HloSchedule ScheduleFromInstructionOrder(const HloModule* module) {
+HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
   HloSchedule schedule(module);
-  for (const HloComputation* computation : module->computations()) {
+  for (HloComputation* computation : module->computations()) {
     if (!computation->IsFusionComputation()) {
-      for (const HloInstruction* instruction : computation->instructions()) {
+      for (HloInstruction* instruction : computation->instructions()) {
         schedule.GetOrCreateSequence(computation).push_back(instruction);
       }
     }
@@ -108,7 +108,7 @@ class HloParser {
   bool ParseInstructionList(HloComputation** computation,
                             const string& computation_name);
   bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
-  bool ParseInstruciontRhs(HloComputation::Builder* builder, const string& name,
+  bool ParseInstructionRhs(HloComputation::Builder* builder, const string& name,
                            LocTy name_loc);
   bool ParseControlPredecessors(HloInstruction* instruction);
   bool ParseLiteral(Literal* literal, const Shape& shape);
@@ -608,10 +608,10 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     *root_name = name;
   }
 
-  return ParseInstruciontRhs(builder, name, name_loc);
+  return ParseInstructionRhs(builder, name, name_loc);
 }
 
-bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
+bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
                                     const string& name, LocTy name_loc) {
   Shape shape;
   HloOpcode opcode;
@@ -1547,6 +1547,18 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
+    case HloOpcode::kGetDimensionSize:
+      optional<std::vector<tensorflow::int64>> dimensions;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &dimensions};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateGetDimensionSize(
+              shape, operands[0], (*dimensions)[0]));
+      break;
   }
 
   instruction->SetAndSanitizeName(name);
@@ -1806,6 +1818,10 @@ bool HloParser::SetValueInLiteral(tensorflow::int64 value,
     case U64:
       return SetValueInLiteralHelper<tensorflow::uint64>(value, linear_index,
                                                          literal);
+    case PRED:
+      // Bool type literals with rank >= 1 are printed in 0s and 1s.
+      return SetValueInLiteralHelper<bool>(static_cast<bool>(value),
+                                           linear_index, literal);
     default:
       LOG(FATAL) << "unknown integral primitive type "
                  << PrimitiveType_Name(shape.element_type());
@@ -2060,14 +2076,13 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         }
         if (lexer_.GetKind() == TokKind::kw_true ||
             lexer_.GetKind() == TokKind::kw_false) {
-          // TODO(congliu): bool type literals with rank >= 1 are actually
-          // printed in a compact form instead of "true" or "false". Fix that.
           if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true,
                                  linear_index++, literal)) {
             return false;
           }
           lexer_.Lex();
-        } else if (primitive_util::IsIntegralType(shape.element_type())) {
+        } else if (primitive_util::IsIntegralType(shape.element_type()) ||
+                   shape.element_type() == PRED) {
           LocTy loc = lexer_.GetLoc();
           tensorflow::int64 value;
           if (!ParseInt64(&value)) {
@@ -2705,7 +2720,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 
   // The str is expected to have 3 items, lhs, rhs, out, and it must look like
   // lhs_rhs->out, that is, the first separator is "_" and the second is "->".
-  std::vector<string> split1 = absl::StrSplit(str, "_");
+  std::vector<string> split1 = absl::StrSplit(str, '_');
   if (split1.size() != 2) {
     LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees "
                << str;
@@ -3386,7 +3401,7 @@ bool HloParser::ParseSingleInstruction(HloModule* module) {
     // e.g.
     //
     //  f32[10] fusion(...), calls={...}
-    if (!ParseInstruciontRhs(&builder, module->name(), lexer_.GetLoc())) {
+    if (!ParseInstructionRhs(&builder, module->name(), lexer_.GetLoc())) {
       return false;
     }
   } else {
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 81eeb9f13bf7f06123c0b35e9f3352c197866a7a..d830fa61438239005875f785f85cf2486123ebc9 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -44,7 +44,9 @@ Status ParseHloString(absl::string_view str, HloModule* module);
 // creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
 
-// Parses the result of HloSharding::ToString(), e.g. "{replicated}".
+// ParseHloString sharding from str. str is supposed to contain the body of the
+// sharding, i.e. just the rhs of the "sharding={...}" attribute string,
+// e.g., "{replicated}".
 StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
 // Parses the result of window_util::ToString(const Window&).
@@ -55,10 +57,6 @@ StatusOr<Window> ParseWindow(absl::string_view str);
 StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
     absl::string_view str);
 
-// ParseHloString sharding from str. str is supposed to contain the body of the
-// sharding, i.e. just the rhs of the "sharding={...}" attribute string.
-StatusOr<HloSharding> ParseSharding(absl::string_view str);
-
 // Parses the result of PaddingConfigToString(), e.g. "0_0x1_1".
 StatusOr<PaddingConfig> ParsePaddingConfig(absl::string_view str);
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 85d07d8092ce19089543f5f11be9f4a58cbf132f..88682e55fb37e6cacbeaf5826286cc9f70e57e3b 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -75,6 +75,18 @@ ENTRY %constant_pred () -> pred[] {
 
 )"
 },
+// pred array constant
+{
+"ConstantPredArray",
+R"(HloModule module
+
+ENTRY %constant_pred_array () -> pred[2,3] {
+  ROOT %constant = pred[2,3]{1,0} constant(pred[2,3] { { 0, 1, 0 }, { 1, 0, 1 } })
+}
+
+)"
+},
+
 // s32 constant
 {
 "ConstantS32",
@@ -183,7 +195,7 @@ ENTRY %add_constants () -> f32[] {
 R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
-  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
+  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { {1}, {2} }, {2, 42} ))
 }
 
 )"
@@ -575,7 +587,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
-  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
+  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
   %constant.1 = f32[2]{0} constant({2, 3})
   %constant.2 = f32[2]{0} constant({1, 2})
   ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
@@ -1138,6 +1150,25 @@ ENTRY CrossReplicaSumWithSubgroups {
   ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add
 }
 
+)"
+},
+// cross-replica-sum with all-reduce-id
+{
+"CrossReplicaSumAllReduce",
+R"(HloModule CRS
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY CRS {
+  input = f32[8]{0} parameter(0)
+  crs.1 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
+  ROOT crs.0 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
+}
+
 )"
 },
 // all-to-all
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc
index ee8cb12b231718e09f6ac0d05d7a6887f4c4d746..20384b9da6be4bab447b474f0e2240bcb277a620 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc
@@ -19,14 +19,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class HloPassPipelineTest : public HloVerifiedTestBase {
+class HloPassPipelineTest : public HloTestBase {
  protected:
   StatusOr<HloModuleGroup> ParseModuleGroup(
       absl::Span<const string> hlo_strings) {
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index 2d5197be9e6f69f698729e06b7506a5bc6260bcd..f968a4a94453f678f5c17e0b8d1df4aea70c93ea 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -104,5 +104,20 @@ bool IsScalarConstant(const HloInstruction* instruction) {
   return instruction->IsConstant() && ShapeUtil::IsScalar(instruction->shape());
 }
 
+bool ContainsInstrWithOpcode(const HloComputation* comp,
+                             const absl::flat_hash_set<HloOpcode>& opcodes) {
+  for (const auto* instr : comp->instructions()) {
+    if (opcodes.count(instr->opcode())) {
+      return true;
+    }
+    for (const HloComputation* subcomp : instr->called_computations()) {
+      if (ContainsInstrWithOpcode(subcomp, opcodes)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 }  // namespace hlo_query
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h
index c0826a6aee1f693484207a86ec258c6604d92318..215051f8834fc94eb9e32b508f34b13626ac9349 100644
--- a/tensorflow/compiler/xla/service/hlo_query.h
+++ b/tensorflow/compiler/xla/service/hlo_query.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_QUERY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_QUERY_H_
 
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
@@ -41,6 +43,12 @@ bool AllOperandsAreConstants(const HloInstruction& instruction);
 // Returns whether the instruction is a scalar constant.
 bool IsScalarConstant(const HloInstruction* instruction);
 
+// Determines whether the given computation contains an instruction with one of
+// the given opcodes.  Checks both comp's instructions and the instructions of
+// any computations nested within it.
+bool ContainsInstrWithOpcode(const HloComputation* comp,
+                             const absl::flat_hash_set<HloOpcode>& opcodes);
+
 // Returns an operand of an instruction with the given opcode. If there are
 // multiple matching operands, then the first matching operand is returned. If
 // there are no matching operands then nullptr is returned.
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index 961930f0a888e90f86e4354fa1373a303af8ec2f..4aa8067752481ffab29e1a573ffa49d4aa046f1f 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <queue>
+
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
 
 namespace xla {
@@ -22,7 +24,7 @@ HloReachabilityMap::HloReachabilityMap(
     : size_(instructions.size()) {
   bit_vectors_.reserve(size_);
   for (const HloInstruction* hlo : instructions) {
-    indices_[hlo] = bit_vectors_.size();
+    indices_[GetKey(hlo)] = bit_vectors_.size();
     bit_vectors_.emplace_back(size_);
   }
   CHECK_EQ(size_, indices_.size());  // instructions should be unique
@@ -71,4 +73,70 @@ bool HloReachabilityMap::IsConnected(const HloInstruction* a,
   return IsReachable(a, b) || IsReachable(b, a);
 }
 
+std::unique_ptr<HloReachabilityMap> HloReachabilityMap::Build(
+    const HloComputation* computation) {
+  const auto& all = computation->MakeInstructionPostOrder();
+  auto result = absl::make_unique<HloReachabilityMap>(all);
+  auto channel_dependency_map = computation->ComputeChannelDependencies();
+
+  std::vector<HloInstruction*> inputs;
+  for (const HloInstruction* hlo : all) {
+    inputs.assign(hlo->operands().begin(), hlo->operands().end());
+    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
+                  hlo->control_predecessors().end());
+
+    switch (hlo->opcode()) {
+      case HloOpcode::kRecvDone: {
+        auto it = channel_dependency_map.find(hlo->channel_id());
+        if (it != channel_dependency_map.end()) {
+          absl::c_copy(it->second, std::back_inserter(inputs));
+        }
+        break;
+      }
+      case HloOpcode::kCrossReplicaSum: {
+        auto all_reduce_id = hlo->all_reduce_id();
+        if (all_reduce_id) {
+          auto it = channel_dependency_map.find(all_reduce_id.value());
+          if (it != channel_dependency_map.end()) {
+            absl::c_copy(it->second, std::back_inserter(inputs));
+          }
+        }
+        break;
+      }
+      default:
+        break;
+    }
+
+    result->FastSetReachabilityToUnion(inputs, hlo);
+  }
+  return result;
+}
+
+void HloReachabilityMap::UpdateReachabilityThroughInstruction(
+    const HloInstruction* instruction) {
+  std::queue<const HloInstruction*> worklist;
+  worklist.push(instruction);
+
+  std::vector<HloInstruction*> inputs;
+
+  while (!worklist.empty()) {
+    const HloInstruction* item = worklist.front();
+    worklist.pop();
+
+    inputs.assign(item->operands().begin(), item->operands().end());
+    inputs.insert(inputs.end(), item->control_predecessors().begin(),
+                  item->control_predecessors().end());
+
+    if (SetReachabilityToUnion(inputs, item)) {
+      // Add immediate successors to worklist.
+      for (const HloInstruction* user : item->users()) {
+        worklist.push(user);
+      }
+      for (const HloInstruction* succ : item->control_successors()) {
+        worklist.push(succ);
+      }
+    }
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
index 5a5f01f8fd647c74217c80ce4a7633b8957e335f..7823b06a41b3052f6f50f7ffa358de5b23ba679f 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.h
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -16,27 +16,30 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
 
+#include <cstdio>
 #include <list>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-class HloInstruction;
-
 // A class for representing reachability between HloInstructions.
 //
-// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix
-// and it is up to the user of the class to set the adjacency matrix such that
-// it represents reachability, i.e. such that it is transitive. That the graph
-// be transitive is thus not an invariant of this class, but it is required for
-// the name of the class and its methods to make sense.
+// It has an adjacency matrix and it is up to the user of the class to set the
+// adjacency matrix such that it represents reachability, i.e. such that it is
+// transitive. That the graph be transitive is thus not an invariant of this
+// class, but it is required for the name of the class and its methods to make
+// sense.
 class HloReachabilityMap {
  public:
   // Sets up a graph with no edges and where the nodes correspond to the given
@@ -44,6 +47,15 @@ class HloReachabilityMap {
   explicit HloReachabilityMap(
       absl::Span<const HloInstruction* const> instructions);
 
+  // Computes and returns the reachability between HLO instructions in the
+  // computation. The returned HloReachabilityMap is constructed such that
+  // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a
+  // directed path (from producer to consumer) from 'a' to 'b'. Both data
+  // dependencies (operands) and control dependencies are considered for
+  // reachability. Trivially an instruction is reachable from itself.
+  static std::unique_ptr<HloReachabilityMap> Build(
+      const HloComputation* computation);
+
   // Set the reachability set of 'instruction' to the union of the reachability
   // sets of 'inputs'. Upon return, IsReachable(x, instruction) where
   // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true
@@ -70,6 +82,10 @@ class HloReachabilityMap {
   // adjacency matrix.
   void SetReachable(const HloInstruction* a, const HloInstruction* b);
 
+  // Updates the given reachability map after the immediate predecessor set
+  // (operands and control predecessors) of 'instruction' has changed.
+  void UpdateReachabilityThroughInstruction(const HloInstruction* instruction);
+
   // Returns true if "b" is reachable from "a"
   //
   // Note that this function only correctly answers queries about reachability
@@ -82,6 +98,11 @@ class HloReachabilityMap {
   // if the set of edges that have been provided to this class are transitive.
   bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
 
+  // Checks if an instruction is in the Reachability map.
+  bool IsPresent(const HloInstruction* a) const {
+    return indices_.contains(GetKey(a));
+  }
+
  private:
   // A bit-vector implementation specialized for this use case which provides a
   // fast bitwise OR operation not available in tensorflow::gtl::BitMap.
@@ -143,18 +164,24 @@ class HloReachabilityMap {
       absl::Span<const HloInstruction* const> inputs,
       const HloInstruction* instruction, BitVector* bit_vector);
 
+  uint64 GetKey(const HloInstruction* instruction) const {
+    uint64 unique_id = absl::bit_cast<uint32>(instruction->unique_id());
+    uint64 module_id =
+        absl::bit_cast<uint32>(instruction->parent()->parent()->unique_id());
+    return (module_id << 32) | unique_id;
+  }
   // Return the index of the given instruction. The value is used to index into
   // the vector of BitVectors and the BitVectors themselves.
   int GetIndex(const HloInstruction* instruction) const {
-    return FindOrDie(indices_, instruction);
+    return FindOrDie(indices_, GetKey(instruction));
   }
 
   // The number of instructions in the reachability map.
   const size_t size_;
 
-  // Dense assignment from HloInstruction* to number. These numbers index
-  // into the bit_vectors_ vector and into the bits within a BitVector.
-  absl::flat_hash_map<const HloInstruction*, int> indices_;
+  // Dense assignment from HloInstruction::unique_id to number. These numbers
+  // index into the bit_vectors_ vector and into the bits within a BitVector.
+  absl::flat_hash_map<uint64, int> indices_;
 
   // Bitvectors holding the reachability to each instruction. The bit vector for
   // instruction X includes ones for each instruction which X is reachable from.
diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc
index d9848cee0bfa904a90aea4626c3ee62c2cbb45b6..595176709806d54fc7c7c5ea301654717096b2d6 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc
@@ -20,13 +20,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
 
 namespace {
 
-class HloReachabilityTest : public HloVerifiedTestBase {};
+class HloReachabilityTest : public HloTestBase {};
 
 TEST_F(HloReachabilityTest, Reachability) {
   // Construct and test a reachability graph of the following form:
@@ -48,7 +48,8 @@ TEST_F(HloReachabilityTest, Reachability) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
   auto e = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
-  builder.Build();
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
 
   HloReachabilityMap reachability({a, b, c, d, e});
   reachability.SetReachable(a, a);
@@ -81,6 +82,130 @@ TEST_F(HloReachabilityTest, Reachability) {
   EXPECT_FALSE(reachability.SetReachabilityToUnion({b, c}, d));
 }
 
+TEST_F(HloReachabilityTest, NonTrivialReachability) {
+  // Test reachability of a non-trivial computation:
+  //
+  // const1    const2
+  //    |         |
+  //    | +-------+
+  //    | |       |
+  //    add ..   negate
+  //     |   .     |
+  //     |   .... exp
+  //     |         |
+  //     +---+   +-+---+
+  //         |   |     |
+  //       multiply   copy
+  //
+  // There is a control dependency from 'add' to 'exp'.
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      r0f32, HloOpcode::kAdd, constant1, constant2));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kNegate, constant2));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, negate));
+  auto mul = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, add, exp));
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kCopy, exp));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation =
+      module->AddEntryComputation(builder.Build(/*root_instruction=*/mul));
+
+  TF_CHECK_OK(add->AddControlDependencyTo(exp));
+  auto reachability = HloReachabilityMap::Build(computation);
+
+  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
+  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant1, add));
+  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
+  EXPECT_TRUE(reachability->IsReachable(constant1, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
+  EXPECT_TRUE(reachability->IsReachable(constant1, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
+  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant2, add));
+  EXPECT_TRUE(reachability->IsReachable(constant2, negate));
+  EXPECT_TRUE(reachability->IsReachable(constant2, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
+  EXPECT_TRUE(reachability->IsReachable(constant2, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(exp, constant1));
+  EXPECT_FALSE(reachability->IsReachable(exp, constant2));
+  EXPECT_FALSE(reachability->IsReachable(exp, add));
+  EXPECT_FALSE(reachability->IsReachable(exp, negate));
+  EXPECT_TRUE(reachability->IsReachable(exp, exp));
+  EXPECT_TRUE(reachability->IsReachable(exp, mul));
+  EXPECT_TRUE(reachability->IsReachable(exp, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(mul, constant1));
+  EXPECT_FALSE(reachability->IsReachable(mul, constant2));
+  EXPECT_FALSE(reachability->IsReachable(mul, add));
+  EXPECT_FALSE(reachability->IsReachable(mul, negate));
+  EXPECT_FALSE(reachability->IsReachable(mul, exp));
+  EXPECT_TRUE(reachability->IsReachable(mul, mul));
+  EXPECT_FALSE(reachability->IsReachable(mul, copy));
+
+  EXPECT_TRUE(reachability->IsConnected(constant1, copy));
+  EXPECT_TRUE(reachability->IsConnected(copy, constant1));
+  EXPECT_FALSE(reachability->IsConnected(negate, add));
+  EXPECT_FALSE(reachability->IsConnected(add, negate));
+
+  // Remove the control dependency then update and verify the reachability map
+  ASSERT_IS_OK(add->RemoveControlDependencyTo(exp));
+  reachability->UpdateReachabilityThroughInstruction(exp);
+
+  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
+  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant1, add));
+  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
+  EXPECT_FALSE(reachability->IsReachable(constant1, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
+  EXPECT_FALSE(reachability->IsReachable(constant1, copy));
+
+  // Change a use within the graph then update and verify the reachability map
+  ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1));
+  reachability->UpdateReachabilityThroughInstruction(negate);
+
+  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
+  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant2, add));
+  EXPECT_FALSE(reachability->IsReachable(constant2, negate));
+  EXPECT_FALSE(reachability->IsReachable(constant2, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
+  EXPECT_FALSE(reachability->IsReachable(constant2, copy));
+}
+
+TEST_F(HloReachabilityTest, ChannelReachability) {
+  const Shape shape = ShapeUtil::MakeShape(F32, {5, 7});
+  HloComputation::Builder builder("ChannelReachability");
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
+  auto send =
+      builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1));
+  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
+  auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
+  auto recv =
+      builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1));
+  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build(recv_done));
+  auto reachability = HloReachabilityMap::Build(computation);
+  EXPECT_TRUE(reachability->IsReachable(param, recv_done));
+  EXPECT_FALSE(reachability->IsReachable(send, recv));
+  EXPECT_FALSE(reachability->IsReachable(send_done, recv));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 49e46ecd00ee4370f3e93746348373b79febed3d..48add75523f02005c70bc6baf69a6b7d5aa4f7ef 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -130,10 +130,10 @@ using ItemList = absl::InlinedVector<Item*, 3>;
 // before arbitrary elements.
 class InstructionList {
  public:
-  explicit InstructionList(const std::vector<const HloInstruction*>& order) {
+  explicit InstructionList(const HloInstructionSequence& order) {
     int64 position = 0;
     Item* last = nullptr;
-    for (const HloInstruction* inst : order) {
+    for (HloInstruction* inst : order.instructions()) {
       // Add a new item to the linked list.
       Item* item = new Item;
       item->next = nullptr;
@@ -151,7 +151,7 @@ class InstructionList {
       // to be monotonically increasing through the list, and so is still useful
       // for quickly(-ish) determining the order of arbitrary instructions in
       // the list.
-      item->instruction = const_cast<HloInstruction*>(inst);
+      item->instruction = inst;
       item->position = position;
       position++;
 
@@ -927,7 +927,7 @@ Item* PickRematerializationCandidate(
 
 StatusOr<int64> HloRematerialization::ComputePeakMemory(
     const HloComputation* computation,
-    const std::vector<const HloInstruction*>& order) const {
+    const HloInstructionSequence& order) const {
   InstructionList instruction_list(order);
   MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_,
                              instruction_list);
@@ -971,8 +971,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
           << HumanReadableNumBytes(computation_peak_memory_.at(computation));
   CHECK(!ContainsKey(rematerialized_computations_, computation));
 
-  InstructionList instruction_list(
-      schedule->sequence(computation).instructions());
+  InstructionList instruction_list(schedule->sequence(computation));
   MemoryUsageTracker memory_tracker(computation, size_function_,
                                     *points_to_analysis_, instruction_list);
   bool changed = false;
@@ -1184,7 +1183,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   sequence.clear();
   for (auto* item = instruction_list.first(); item != nullptr;
        item = instruction_list.next(item)) {
-    const HloInstruction* instruction = item->instruction;
+    HloInstruction* instruction = item->instruction;
     sequence.push_back(instruction);
   }
   rematerialized_computations_.insert(computation);
@@ -1235,10 +1234,8 @@ StatusOr<bool> HloRematerialization::Run(HloModule* module) {
         if (node.context() == CallContext::kSequential) {
           TF_ASSIGN_OR_RETURN(
               computation_peak_memory_[node.computation()],
-              ComputePeakMemory(node.computation(),
-                                module->schedule()
-                                    .sequence(node.computation())
-                                    .instructions()));
+              ComputePeakMemory(node.computation(), module->schedule().sequence(
+                                                        node.computation())));
         }
         return Status::OK();
       },
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 70d83c04f07ca7fd0139f586869e8fe688f958f4..a07d348041b72bba45c6fd1f726f2a0065d01e53 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -87,9 +87,8 @@ class HloRematerialization : public HloModulePass {
   // peak memory is the maximum total size of all live HLO instruction values at
   // any program point. 'order' is the order in which the HLO instructions will
   // be emitted which is used to determine lifespans of HLO values.
-  StatusOr<int64> ComputePeakMemory(
-      const HloComputation* computation,
-      const std::vector<const HloInstruction*>& order) const;
+  StatusOr<int64> ComputePeakMemory(const HloComputation* computation,
+                                    const HloInstructionSequence& order) const;
 
   // Returns the peak memory usage of the called computations for the given
   // instruction. Zero is returned if the instruction calls no computations.
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index f7e82fb1f88e856305f6f481a451d4cd64ba4acf..22c3c40a93a1ddcd36659483fcc79fede32dd2c3 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -36,7 +36,7 @@ namespace op = xla::testing::opcode_matchers;
 
 using ::testing::_;
 
-class HloRematerializationTest : public HloVerifiedTestBase {
+class HloRematerializationTest : public HloTestBase {
  protected:
   // Creates and returns a computation which can benefit from
   // rematerialization. The computation looks like:
@@ -162,7 +162,7 @@ class HloRematerializationTest : public HloVerifiedTestBase {
 // Test rematerialization of a single computation produced by
 // MakeRematerializableComputation.
 TEST_F(HloRematerializationTest, SingleComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeRematerializableComputation());
 
@@ -177,7 +177,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
   // with rematerialization so pick a memory limit between these values (14KB).
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/14 * 1024, module));
+                              /*memory_limit_bytes=*/14 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -203,7 +203,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
 // MakeRematerializableComputation but with a sufficiently high memory limit
 // such that no instructions are rematerialized.
 TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeRematerializableComputation());
 
@@ -211,7 +211,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/20 * 1024, module));
+                              /*memory_limit_bytes=*/20 * 1024, module.get()));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
@@ -225,7 +225,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
 // computation should be the one chosen because rematerialization in the while
 // will presumably be more expensive.
 TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
   cond_builder.AddInstruction(
@@ -249,7 +249,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   // bit lower (17KB) to force rematerialization of the entry computation.
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/17 * 1024, module));
+                              /*memory_limit_bytes=*/17 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
@@ -261,7 +261,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
 // while. Both the entry computation and while body computation should have
 // computations rematerialized.
 TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
   cond_builder.AddInstruction(
@@ -282,7 +282,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/15 * 1024, module));
+                              /*memory_limit_bytes=*/15 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // Both computations should have rematerialized instructions added.
@@ -293,7 +293,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
 // Test rematerialization of a doubly nested computation. All computations
 // should have an instruction rematerialized.
 TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
   cond_builder.AddInstruction(
@@ -321,7 +321,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   // ~12K so pick something slightly larger.
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/13 * 1024, module));
+                              /*memory_limit_bytes=*/13 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // All computations should have rematerialized instructions added.
@@ -346,7 +346,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
   //
   //   F32[1024] add_2 = add(rng, add(tanh, add_1))  // LIVE: add_2 + add_1 +
   //                                                 //       rng + tanh + exp
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto param = builder.AddInstruction(
@@ -390,7 +390,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
   TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       RunHloRematerialization(
-          /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), module));
+          /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), module.get()));
   EXPECT_TRUE(changed);
   // The rng should not have been rematerialized.
   EXPECT_EQ(count_rngs(entry_computation), 1);
@@ -420,7 +420,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // The value %bcast is live across each call of Subcomputation (which requires
   // 8KB) though the value is not used in the calls. Rematerializing %bcast
   // across these calls reduces peak memory use from ~20KB down to ~16KB.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation* subcomputation = nullptr;
   {
@@ -482,7 +482,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // rematerialization).
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/22 * 1024, module));
+                              /*memory_limit_bytes=*/22 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -533,7 +533,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // (ie %bcast is used indirectly by %negate), otherwise the %negate operand
   // aliases %add_2.
   const bool indirectly_used = GetParam();
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation* subcomputation = nullptr;
   {
@@ -576,7 +576,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // rematerialization).
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/22 * 1024, module));
+                              /*memory_limit_bytes=*/22 * 1024, module.get()));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 0778ff52174ef89c476950f2c830268a63888382..8f6eb974c5179b420c8f961393ca923e0a3b3530 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -46,8 +46,8 @@ namespace xla {
         << "No computation exists in HLO module with id " << computation_id;
     const HloComputation* computation = comp_it->second;
 
-    absl::flat_hash_map<int64, const HloInstruction*> id_to_instruction;
-    for (const HloInstruction* instruction : computation->instructions()) {
+    absl::flat_hash_map<int64, HloInstruction*> id_to_instruction;
+    for (HloInstruction* instruction : computation->instructions()) {
       id_to_instruction[instruction->unique_id()] = instruction;
     }
 
@@ -81,9 +81,8 @@ StatusOr<HloScheduleProto> HloSchedule::ToProto() const {
   return std::move(proto);
 }
 
-void HloSchedule::set_sequence(
-    const HloComputation* computation,
-    absl::Span<const HloInstruction* const> sequence) {
+void HloSchedule::set_sequence(const HloComputation* computation,
+                               absl::Span<HloInstruction* const> sequence) {
   set_sequence(computation, HloInstructionSequence(sequence));
 }
 
@@ -114,8 +113,8 @@ Status HloSchedule::UpdateComputationSchedule(
     const HloComputation* computation) {
   // Map from unique ID to HloInstruction pointer for instructions in the
   // computation.
-  absl::flat_hash_map<int, const HloInstruction*> id_to_instruction;
-  for (const HloInstruction* instruction : computation->instructions()) {
+  absl::flat_hash_map<int, HloInstruction*> id_to_instruction;
+  for (HloInstruction* instruction : computation->instructions()) {
     InsertOrDie(&id_to_instruction, instruction->unique_id(), instruction);
   }
 
@@ -128,7 +127,7 @@ Status HloSchedule::UpdateComputationSchedule(
   // Map from HloInstruction X to newly added instructions (instruction is in
   // computation, but not in schedule) which use X. If an instruction is not in
   // the map, then it has no users which are newly added instructions.
-  absl::flat_hash_map<const HloInstruction*, std::vector<const HloInstruction*>>
+  absl::flat_hash_map<const HloInstruction*, std::vector<HloInstruction*>>
       new_instruction_uses;
 
   // For each newly added instruction, this is the count of the instruction's
@@ -138,9 +137,9 @@ Status HloSchedule::UpdateComputationSchedule(
 
   // Create a worklist of newly added instructions which are ready to be added
   // to the schedule. Initialize worklist with those that have zero operands.
-  std::queue<const HloInstruction*> worklist;
+  std::queue<HloInstruction*> worklist;
 
-  for (const HloInstruction* instruction : computation->instructions()) {
+  for (HloInstruction* instruction : computation->instructions()) {
     if (ids_in_schedule.count(instruction->unique_id()) == 0) {
       // This is a newly added instruction which is not in the schedule.
       if (instruction->operands().empty()) {
@@ -161,17 +160,17 @@ Status HloSchedule::UpdateComputationSchedule(
   // Lambda which schedules all instructions on the worklist.
   auto schedule_worklist = [&]() {
     while (!worklist.empty()) {
-      const HloInstruction* instruction = worklist.front();
+      HloInstruction* instruction = worklist.front();
       worklist.pop();
       new_sequence.push_back(instruction);
-      std::vector<const HloInstruction*>* new_users =
+      std::vector<HloInstruction*>* new_users =
           tensorflow::gtl::FindOrNull(new_instruction_uses, instruction);
       if (new_users != nullptr) {
         // This just-scheduled instruction has users which are newly added to
         // the module. Update the number of unscheduled operands and push the
         // newly added instruction to the worklist if it is ready to
         // schedule.
-        for (const HloInstruction* new_user : *new_users) {
+        for (HloInstruction* new_user : *new_users) {
           unscheduled_operand_count.at(new_user)--;
           CHECK_GE(unscheduled_operand_count.at(new_user), 0);
           if (unscheduled_operand_count.at(new_user) == 0) {
@@ -264,7 +263,10 @@ Status HloSchedule::Verify() const {
     }
 
     TF_RET_CHECK(instruction_position.size() ==
-                 computation->instruction_count());
+                 computation->instruction_count())
+        << "Schedule for computation " << computation->name() << " has "
+        << instruction_position.size() << " instructions, expected "
+        << computation->instruction_count();
     for (const HloInstruction* instruction : computation->instructions()) {
       TF_RET_CHECK(instruction_position.count(instruction) == 1)
           << "Instruction " << instruction->name() << " is not in schedule";
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 0a714101ee587aa847fa674bbde5586287c51f33..486ddbf499de80c634bc497158cd79ca066cc866 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -35,14 +35,14 @@ class HloInstructionSequence {
  public:
   HloInstructionSequence() = default;
   explicit HloInstructionSequence(
-      absl::Span<const HloInstruction* const> instructions) {
-    for (const HloInstruction* instruction : instructions) {
+      absl::Span<HloInstruction* const> instructions) {
+    for (HloInstruction* instruction : instructions) {
       push_back(instruction);
     }
   }
 
   // Adds the instruction to the end of the sequence.
-  void push_back(const HloInstruction* instruction) {
+  void push_back(HloInstruction* instruction) {
     instruction_sequence_.push_back(instruction);
     id_sequence_.push_back(instruction->unique_id());
   }
@@ -56,7 +56,7 @@ class HloInstructionSequence {
   int64 size() const { return instruction_sequence_.size(); }
 
   // Returns the sequence of HLO instructions.
-  const std::vector<const HloInstruction*>& instructions() const {
+  const std::vector<HloInstruction*>& instructions() const {
     return instruction_sequence_;
   }
 
@@ -65,7 +65,7 @@ class HloInstructionSequence {
 
  private:
   // The sequence as HloInstructions.
-  std::vector<const HloInstruction*> instruction_sequence_;
+  std::vector<HloInstruction*> instruction_sequence_;
 
   // The sequence of HLO instructions, represented by their unique IDs. The
   // sequence is stored as both HloInstructions and unique IDs because the
@@ -98,7 +98,7 @@ class HloSchedule {
 
   // Sets the sequence for the given computation to the given sequence.
   void set_sequence(const HloComputation* computation,
-                    absl::Span<const HloInstruction* const> sequence);
+                    absl::Span<HloInstruction* const> sequence);
   void set_sequence(const HloComputation* computation,
                     HloInstructionSequence sequence);
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
index 1424569ac1f62e4b965876141f1eb40be4f15bea..0e56e6f760e35ddcb45c6f58771d78405a09acfe 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
@@ -56,10 +56,10 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
-  const std::vector<const HloInstruction*>& entry_schedule =
+  const auto& entry_schedule =
       schedule.sequence(module->entry_computation()).instructions();
 
   EXPECT_EQ(entry_schedule.size(), 6);
@@ -90,7 +90,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -139,7 +139,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -183,7 +183,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -244,7 +244,7 @@ ENTRY %WhileLoop () -> s32[] {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
@@ -313,7 +313,7 @@ ENTRY %WhileLoop () -> s32[] {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index 45c684d66752862eec301b8943d350804f070309..c1073911ea9dc3811c195e27bcbae9b00929ad17 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -66,7 +66,7 @@ class HloSubcomputationUnificationTest : public HloTestBase {
 };
 
 TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -103,7 +103,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
 }
 
 TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -143,7 +143,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
 
 // Do not unify subcomputations with different parameter shapes.
 TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -184,7 +184,7 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
 // Regression test for b/31466798. Checks that entry_computation is still valid
 // after unification.
 TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   for (int i = 0; i < 2; ++i) {
     HloComputation::Builder builder("pow");
     auto x =
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
index 6fd734a2b9e6c8c9fca76a944ca3df4c3b8a212f..1e2b31a1f2bb4865faafc3d14e2b194e3aa171a1 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 
@@ -24,7 +24,7 @@ namespace {
 
 using ::tensorflow::GraphDef;
 
-class HloTfGraphBuilderTest : public HloVerifiedTestBase {
+class HloTfGraphBuilderTest : public HloTestBase {
  protected:
   HloTfGraphBuilderTest() {}
   HloTfGraphBuilder generator_;
diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h
index b6670d409b92e8be42f5cdb40fba8d662ae83958..1f01b0bb365450a933da9cc443db5223c06903f0 100644
--- a/tensorflow/compiler/xla/service/hlo_value.h
+++ b/tensorflow/compiler/xla/service/hlo_value.h
@@ -166,9 +166,6 @@ class HloValue : public BufferValue {
 
   // Whether this value is live out of the HLO module.
   bool live_out_of_module_ = false;
-
-  // Whether this value is live out of its computation.
-  bool live_out_of_computation_ = false;
 };
 
 std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 136824a33565d65663f1e484713c5180a762b25b..60d8a511b5743d4f342a2cc3a7c91c71acdbeaf8 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
@@ -755,6 +756,12 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) {
   return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes));
 }
 
+Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) {
+  return CheckShape(get_size,
+                    ShapeInference::InferGetDimensionSizeShape(
+                        get_size->operand(0)->shape(), get_size->dimension()));
+}
+
 Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
                                  const Shape& inferred_shape) {
   // If allow_mixed_precision_ is false, check if there are operands with
@@ -1331,6 +1338,15 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleCrossReplicaSum(HloInstruction* crs) override {
+    if (crs->all_reduce_id().has_value()) {
+      TF_RET_CHECK(crs->all_reduce_id().value() > 0)
+          << "All reduce id must be greater than 0 for "
+          << crs->ToShortString();
+    }
+    return Status::OK();
+  }
+
   Status Preprocess(HloInstruction* instruction) override {
     auto previous = instructions_by_name_.find(instruction->name());
     TF_RET_CHECK(previous == instructions_by_name_.end())
@@ -1410,6 +1426,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
         return target_metadata_->ShapeSize(shape);
       }));
 
+  TF_RETURN_IF_ERROR(module->dynamic_parameter_binding().Verify(*module));
+
   return false;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 83b6244d1be0e1eec66daabfcfd1be5a3c0131ac..9fbfd6a21c1f1148801000169046fbcbb37934fe 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -94,6 +94,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleGather(HloInstruction* gather) override;
   Status HandleScatter(HloInstruction* scatter) override;
   Status HandleAfterAll(HloInstruction* token) override;
+  Status HandleGetDimensionSize(HloInstruction* get_size) override;
 
   Status FinishVisit(HloInstruction*) override { return Status::OK(); }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index afe01e5487c3225815e01343d86e9fe894c2cde8..4bc557e4e62e7df4e25fda86fe417e84129b464c 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -35,7 +35,11 @@ namespace {
 
 using ::testing::HasSubstr;
 
-// This class cannot be converted to use HloVerifiedTestBase. It explicitly
+std::unique_ptr<HloModule> CreateUnverifiedModule() {
+  return absl::make_unique<HloModule>("module", HloModuleConfig());
+}
+
+// This class cannot be converted to use HloTestBase. It explicitly
 // uses HloTestBase to create and test malformed HLOs.
 class HloVerifierTest : public HloTestBase {
  public:
@@ -66,7 +70,7 @@ TEST_F(HloVerifierTest, NullInstructionParent) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   HloInstruction* negate = builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK(verifier().Run(module.get()).status());
@@ -85,7 +89,7 @@ TEST_F(HloVerifierTest, NullComputationParent) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK(verifier().Run(module.get()).status());
@@ -104,7 +108,7 @@ TEST_F(HloVerifierTest, DifferentOperandParents) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   HloInstruction* negate = builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloComputation::Builder emb_builder(TestName());
@@ -138,7 +142,7 @@ TEST_F(HloVerifierTest, ResetsShapeVerifierState) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add));
 
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Run the verifier twice.  It should fail both times, because it shouldn't
@@ -303,7 +307,7 @@ TEST_F(HloVerifierTest, NegativeInteriorPaddingNotAllowed) {
           HloInstruction::CreateConstant(LiteralUtil::Zero(F32))),
       padding_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto status = verifier().Run(module.get()).status();
@@ -327,7 +331,7 @@ TEST_F(HloVerifierTest, PadNegativeInteriorDilationNotAllowed) {
           HloInstruction::CreateConstant(LiteralUtil::Zero(F32).Clone())),
       padding_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(verifier().Run(module.get()).status().error_message(),
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index e103222b55faccf2d0286dce33c0f1ce5df01feb..90904ac00110457bcc3b8974816a7080c4ab89fc 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -90,20 +90,29 @@ string HumanReadableProfileBuilder::ToString() const {
         op.optimal_seconds < 0
             ? ""
             : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6),
-        op.flop_count <= 0 ? "" : HumanReadableNumFlops(op.flop_count, nsecs),
-        op.transcendental_count <= 0
-            ? ""
-            : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs),
+        op.flop_count > 0 && nsecs > 0
+            ? HumanReadableNumFlops(op.flop_count, nsecs)
+            : "",
+        op.transcendental_count > 0 && nsecs > 0
+            ? HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs)
+            : "",
         bytes_per_sec, bytes_per_cycle, op.name);
   };
 
-  float optimal_seconds_sum = 0.0;
+  double optimal_seconds_sum = 0;
   int64 total_flops = 0.;
   int64 total_transcendentals = 0.;
   int64 total_bytes = 0;
   for (const auto& op : op_infos_) {
     if (op.optimal_seconds > 0) {
-      optimal_seconds_sum += op.optimal_seconds;
+      // An op can run faster than the estimated optimum. For example, we might
+      // estimate a fusion's speed by looking at the size of its operands and
+      // result, but perhaps the fusion doesn't read the entirety of all of its
+      // inputs.  For the purposes of summing the instructions' optimal speeds,
+      // we treat the "optimum" as the smallest of either the estimated optimum
+      // and the actual speed.
+      optimal_seconds_sum +=
+          std::min(double{op.optimal_seconds}, CyclesToSeconds(op.cycles));
     }
     total_flops += std::max(op.flop_count, int64{0});
     total_transcendentals += std::max(op.transcendental_count, int64{0});
@@ -114,7 +123,7 @@ string HumanReadableProfileBuilder::ToString() const {
 
   print_op({is_entry_computation_ ? "[total] [entry]" : "[total]", "[total]",
             /*category=*/"", total_cycles_, total_flops, total_transcendentals,
-            total_bytes, optimal_seconds_sum},
+            total_bytes, static_cast<float>(optimal_seconds_sum)},
            /*is_total=*/true);
 
   // Sort ops in decreasing order of cycles, and print them.
@@ -155,8 +164,10 @@ string HumanReadableProfileBuilder::ToString() const {
         entry.text = op.name;
         entry.short_text = op.short_name;
         entry.category_text = op.category;
-        entry.metric =
-            CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6;
+        // Ignore ops that run faster than the estimated optimal here, as we do
+        // when calculating optimal_seconds_sum.
+        entry.metric = std::max(
+            0., CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6);
         total_discrepancy_in_microseconds += entry.metric;
         table.AddEntry(std::move(entry));
       }
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
index f85d31d5225b8012b68f851b2bfec219d736ba0d..cf6cf897fe11eda01ba6b22119bba34ac2bef8fe 100644
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
@@ -18,19 +18,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace {
 
-class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase {
+class ImplicitBroadcastRemoverTest : public HloTestBase {
  protected:
   ImplicitBroadcastRemover remover_;
 };
 
 TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
@@ -41,15 +42,16 @@ TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(remover_.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Parameter(), op::Parameter()));
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
@@ -60,13 +62,13 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
 
   EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
   EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
 
   EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter()));
@@ -76,6 +78,7 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
@@ -86,9 +89,9 @@ TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       shape, HloOpcode::kSubtract, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Subtract(op::Parameter(),
@@ -98,6 +101,7 @@ TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1});
@@ -108,9 +112,9 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       shape, HloOpcode::kSubtract, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root,
@@ -120,6 +124,7 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8});
@@ -132,9 +137,9 @@ TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
   builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
                                                        param0, param1, param2));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())),
@@ -147,6 +152,7 @@ TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
 
 TEST_F(ImplicitBroadcastRemoverTest,
        TernaryScalarAndDegenerateDimensionBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
@@ -159,9 +165,9 @@ TEST_F(ImplicitBroadcastRemoverTest,
   builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
                                                        param0, param1, param2));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()),
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 2d03aebc1aca4c55cca588072233b7a18e70a306..98246d5403e4aebc2f4d81e52145706355ddd9a9 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -16,12 +16,12 @@ limitations under the License.
 #include <ctype.h>
 
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
 namespace xla {
 namespace {
-class IndexedArrayAnalysisTest : public HloVerifiedTestBase {
+class IndexedArrayAnalysisTest : public HloTestBase {
  protected:
   void AssertArrayForRootExpressionIs(const string& hlo_text,
                                       const string& root_expression) {
@@ -61,12 +61,12 @@ class IndexedArrayAnalysisTest : public HloVerifiedTestBase {
                                           const string& root_expression,
                                           bool print_constants) {
     IndexedArrayAnalysis indexed_tensor_analysis;
-    ParseAndVerifyModule(hlo_text);
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                            ParseAndReturnVerifiedModule(hlo_text));
 
-    TF_ASSERT_OK_AND_ASSIGN(
-        IndexedArrayAnalysis::Array* const array_result,
-        indexed_tensor_analysis.GetArrayFor(
-            module().entry_computation()->root_instruction()));
+    TF_ASSERT_OK_AND_ASSIGN(IndexedArrayAnalysis::Array* const array_result,
+                            indexed_tensor_analysis.GetArrayFor(
+                                m->entry_computation()->root_instruction()));
     string string_result = CanonicalizeWhitespace(
         indexed_tensor_analysis.ToString(array_result, print_constants));
     LOG(INFO) << string_result;
@@ -481,8 +481,8 @@ ENTRY main {
   const char* expected_root_expression = R"(
 (scalar-indexed-const
   (constant s32[2,1,1,1,6] s32[2,1,1,1,6] {
-    { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } },
-    { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } } })
+    { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } },
+    { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } } })
   (reshape %indices to s32[])
   0->[])
 )";
@@ -512,8 +512,8 @@ ENTRY main {
   const char* expected_root_expression = R"(
 (scalar-indexed-const
   (constant s32[2,1,1,6] s32[2,1,1,6] {
-    { /*i0=0*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } },
-    { /*i0=1*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } } })
+    { /*i0=0*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } },
+    { /*i0=1*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } } })
   (reshape %indices to s32[5])
   0->[2])
 )";
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 69a4c160ee5c4539272c3085338dc6de1b9347ff..7f2d7e7cffc6debaaf9b64fffc5a8a7037ecdaa3 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -26,7 +26,9 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/fusion_queue.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -153,6 +155,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
+    case HloOpcode::kGetDimensionSize:
       return true;
   }
 
@@ -437,8 +440,7 @@ class ReversePostOrderFusionQueue : public FusionQueue {
 }  // namespace
 
 std::unique_ptr<FusionQueue> InstructionFusion::GetFusionQueue(
-    HloComputation* computation,
-    const std::function<bool(HloInstruction*)>& skip_producer) {
+    HloComputation* computation) {
   return absl::make_unique<ReversePostOrderFusionQueue>(computation);
 }
 
@@ -451,14 +453,11 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
   for (auto* computation : module->MakeNonfusionComputations()) {
     CHECK(!computation->IsFusionComputation());
     computation_ = computation;
-    reachability_ = computation_->ComputeReachability();
+    reachability_ = HloReachabilityMap::Build(computation_);
 
     HloInstructionSet do_not_duplicate =
         ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
-    auto fusion_queue =
-        GetFusionQueue(computation_, [&](HloInstruction* producer) {
-          return do_not_duplicate.count(producer) > 0;
-        });
+    auto fusion_queue = GetFusionQueue(computation_);
 
     // Instruction fusion effectively fuses edges in the computation graph
     // (producer instruction -> consumer instruction) so we iterate over all
@@ -489,9 +488,8 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
         HloInstruction* fusion_instruction;
         // Try "regular" fusion if the operand may be duplicated. Otherwise,
         // perform multi-output fusion, unless this creates a cycle.
-        // TODO(tjoerg): Consider making multi-output fusion the default.
-        if (ShouldFuse(instruction, i) &&
-            do_not_duplicate.count(operand) == 0) {
+        if (do_not_duplicate.count(operand) == 0 &&
+            ShouldFuse(instruction, i)) {
           fusion_queue->PreFusion(operand, instruction);
           fusion_instruction = Fuse(operand, instruction);
         } else if (ShouldFuseIntoMultiOutput(instruction, i) &&
@@ -565,15 +563,19 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput(
 
 bool InstructionFusion::MultiOutputFusionCreatesCycle(
     HloInstruction* producer, HloInstruction* consumer) {
-  return absl::c_any_of(
-      consumer->operands(), [&](const HloInstruction* consumer_operand) {
-        // The fusion algorithm traverses the HLO graph in reverse post order.
-        // Thus `cosumers` is visited before its operands (including
-        // `producer`). Therefore, consumer operands cannot have been fused yet.
-        // It is thus safe to use the pre-computed reachability map.
-        return consumer_operand != producer &&
-               reachability_->IsReachable(producer, consumer_operand);
-      });
+  auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) {
+    // A consumer operand may have been multii-output fused into a parallel
+    // consumer and thus be missing  from the oridinal reachability map.
+    if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) {
+      reachability_ = HloReachabilityMap::Build(consumer->parent());
+    }
+    return reachability_->IsReachable(a, b);
+  };
+  return absl::c_any_of(consumer->operands(),
+                        [&](const HloInstruction* consumer_operand) {
+                          return consumer_operand != producer &&
+                                 is_reachable(producer, consumer_operand);
+                        });
 }
 
 bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index f14c6675208c72112aea0179c238b58709d625b5..198bd7fce5f392e5e895b959523d4fe9cf208ba2 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
@@ -54,8 +55,7 @@ class InstructionFusion : public HloModulePass {
   // fused. The default implementation processes consumers in reverse post
   // order.
   virtual std::unique_ptr<FusionQueue> GetFusionQueue(
-      HloComputation* computation,
-      const std::function<bool(HloInstruction*)>& skip_producer);
+      HloComputation* computation);
 
   // Returns whether the given producer instruction should be fused into the
   // given consumer instruction. producer is necessarily an operand of consumer.
@@ -111,6 +111,10 @@ class InstructionFusion : public HloModulePass {
     return is_expensive_(instruction);
   }
 
+  // Whether multi-output fusion would introduce a cycle into the HLO graph.
+  bool MultiOutputFusionCreatesCycle(HloInstruction* producer,
+                                     HloInstruction* consumer);
+
   // Current HloComputation instance the loop fuser is traversing.
   HloComputation* computation_;
   HloModule* module_;
@@ -145,10 +149,6 @@ class InstructionFusion : public HloModulePass {
   // duplicated.
   std::function<bool(const HloInstruction& instruction)> is_expensive_;
 
-  // Whether multi-output fusion would introduce a cycle into the HLO graph.
-  bool MultiOutputFusionCreatesCycle(HloInstruction* producer,
-                                     HloInstruction* consumer);
-
   // Returns whether we may duplicate an instruction if we want to fuse it.
   bool may_duplicate_;
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index da1ad90959dc0ab1a840b3390281ce9d4999651e..6b483126499fe1e635a7d13cf597ec5d089c5b24 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) {
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape1, computation->root_instruction());
   EXPECT_FALSE(
@@ -133,7 +133,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) {
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape1, computation->root_instruction());
   EXPECT_FALSE(
@@ -149,7 +149,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) {
   auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {}), param0, {}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose1, computation->root_instruction());
   EXPECT_FALSE(
@@ -172,7 +172,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusible) {
   HloInstruction* unary = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(unary, computation->root_instruction());
   EXPECT_FALSE(
@@ -361,7 +361,7 @@ TEST_F(InstructionFusionTest, AllowUnaryDuplication) {
   HloInstruction* unary2 = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, unary1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(unary2, computation->root_instruction());
   EXPECT_TRUE(
@@ -385,7 +385,7 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
   HloInstruction* unary = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(unary, computation->root_instruction());
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index a06d6113e84630df14ff68280c248cccb9afaf06..7635fbfed6f6a51fc9d203251d9bebf43cc63fd9 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -37,7 +37,7 @@ namespace xla {
 namespace interpreter {
 
 InterpreterExecutable::InterpreterExecutable(
-    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<HloEvaluator> evaluator)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
                  /*hlo_profile_index_map=*/nullptr),
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 3b1ebce0c75457d65e6834c809fe488a9c4a159a..bda13d376360306c81230e41b01cefc6caff230d 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -42,7 +42,7 @@ namespace interpreter {
 // buffer allocation. Refer to interpreter/README.md for more.
 class InterpreterExecutable : public Executable {
  public:
-  InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module,
+  InterpreterExecutable(std::unique_ptr<HloModule> hlo_module,
                         std::unique_ptr<HloEvaluator> evaluator);
   ~InterpreterExecutable() override;
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index c9b40d3c6195f80a19272a0d98890049d02315b9..b0fc1af8b89d7327a00f77f471e90d143a92de7c 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -110,3 +110,5 @@ REGISTER_MODULE_INITIALIZER(
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform,
                                      multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     interpreter_platform);
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 6b03394669858ef0ffdbdd1a2bad90e9df9fbcd9..a90411922205c0006159ff99f35a70138b1bee4f 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -2092,6 +2092,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kTrace:
     case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
+    case HloOpcode::kGetDimensionSize:
       return true;
   }
 }
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 47bfca2fd6e1527b73e396151d3764867ac03697..61d8a0a4e6aa39e2e921acae1c65df1b3c329e46 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -35,7 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -49,20 +49,19 @@ namespace {
 
 using ::testing::ElementsAre;
 
-class LayoutAssignmentTest : public HloVerifiedTestBase {
+class LayoutAssignmentTest : public HloTestBase {
  protected:
-  void AssignLayouts(HloModule* module,
-                     ComputationLayout* entry_computation_layout,
+  void AssignLayouts(HloModule* m, ComputationLayout* entry_computation_layout,
                      ChannelLayoutConstraints* channel_constraints = nullptr) {
     LayoutAssignment layout_assignment(
         entry_computation_layout, LayoutAssignment::InstructionCanChangeLayout,
         /*channel_constraints=*/channel_constraints);
-    EXPECT_IS_OK(layout_assignment.Run(module).status());
+    EXPECT_IS_OK(layout_assignment.Run(m).status());
   }
 
-  std::vector<int64> LayoutOf(HloModule* module, absl::string_view name) {
+  std::vector<int64> LayoutOf(HloModule* m, absl::string_view name) {
     auto minor_to_major =
-        FindInstruction(module, name)->shape().layout().minor_to_major();
+        FindInstruction(m, name)->shape().layout().minor_to_major();
     return std::vector<int64>(minor_to_major.begin(), minor_to_major.end());
   }
 
@@ -91,7 +90,7 @@ class LayoutAssignmentTest : public HloVerifiedTestBase {
 TEST_F(LayoutAssignmentTest, ComputationLayout) {
   // Verify the layouts of the root and parameter instructions of a computation
   // match the ComputationLayout for two different layouts.
-  std::vector<std::initializer_list<int64>> minor_to_majors = {{0, 1}, {1, 0}};
+  std::vector<std::vector<int64>> minor_to_majors = {{0, 1}, {1, 0}};
   for (auto& minor_to_major : minor_to_majors) {
     auto builder = HloComputation::Builder(TestName());
     Shape ashape = ShapeUtil::MakeShape(F32, {42, 12});
@@ -101,8 +100,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) {
         HloInstruction::CreateParameter(1, ashape, "param1"));
     auto add = builder.AddInstruction(
         HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1));
-    auto module = CreateNewModule();
-    HloComputation* computation = module->AddEntryComputation(builder.Build());
+    auto m = CreateNewVerifiedModule();
+    HloComputation* computation = m->AddEntryComputation(builder.Build());
 
     Layout layout = LayoutUtil::MakeLayout(minor_to_major);
     Shape shape(ashape);
@@ -113,7 +112,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) {
     *computation_layout.mutable_parameter_layout(0) = shape_layout;
     *computation_layout.mutable_parameter_layout(1) = shape_layout;
     *computation_layout.mutable_result_layout() = shape_layout;
-    AssignLayouts(module, &computation_layout);
+    AssignLayouts(m.get(), &computation_layout);
     EXPECT_TRUE(LayoutUtil::Equal(layout, param0->shape().layout()));
     EXPECT_TRUE(LayoutUtil::Equal(layout, param1->shape().layout()));
     EXPECT_TRUE(LayoutUtil::Equal(layout, add->shape().layout()));
@@ -131,8 +130,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) {
       HloInstruction::CreateParameter(1, ashape, "param1"));
   builder.AddInstruction(
       HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   Layout col_major_layout = LayoutUtil::MakeLayout({1, 0});
   Shape col_major_shape(ashape);
@@ -149,7 +148,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) {
   *computation_layout.mutable_parameter_layout(1) = row_major;
   *computation_layout.mutable_result_layout() = col_major;
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
   EXPECT_TRUE(LayoutUtil::Equal(col_major_layout, param0->shape().layout()));
   EXPECT_TRUE(LayoutUtil::Equal(row_major_layout, param1->shape().layout()));
   EXPECT_TRUE(LayoutUtil::Equal(
@@ -160,7 +159,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
   // Verify that the layout of the fused parameters in a fusion instruction
   // match that of the fusion operands. Other fused instructions should have no
   // layout.
-  std::vector<std::initializer_list<int64>> minor_to_majors = {{0, 1}, {1, 0}};
+  std::vector<std::vector<int64>> minor_to_majors = {{0, 1}, {1, 0}};
   for (auto& minor_to_major : minor_to_majors) {
     auto builder = HloComputation::Builder(TestName());
     auto constant_literal1 = LiteralUtil::CreateR2WithLayout<float>(
@@ -180,8 +179,8 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
     auto negate2 = builder.AddInstruction(
         HloInstruction::CreateUnary(ashape, HloOpcode::kNegate, negate1));
 
-    auto module = CreateNewModule();
-    HloComputation* computation = module->AddEntryComputation(builder.Build());
+    auto m = CreateNewVerifiedModule();
+    HloComputation* computation = m->AddEntryComputation(builder.Build());
 
     auto fusion = computation->CreateFusionInstruction(
         {negate2, negate1, add}, HloInstruction::FusionKind::kLoop);
@@ -194,7 +193,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
     ComputationLayout computation_layout(computation->ComputeProgramShape());
     *computation_layout.mutable_result_layout() = shape_layout;
 
-    AssignLayouts(module, &computation_layout);
+    AssignLayouts(m.get(), &computation_layout);
 
     EXPECT_TRUE(LayoutUtil::Equal(
         layout, fusion->fused_parameter(0)->shape().layout()));
@@ -229,13 +228,13 @@ TEST_F(LayoutAssignmentTest, TupleLayout) {
   auto negate = builder.AddInstruction(HloInstruction::CreateUnary(
       constant0->shape(), HloOpcode::kNegate, get_element0));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_TRUE(
       LayoutUtil::LayoutsInShapesEqual(constant0->shape(), constant1->shape()));
@@ -267,17 +266,17 @@ TEST_F(LayoutAssignmentTest, TupleSelect) {
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple0->shape(), HloOpcode::kTupleSelect, pred, tuple0, tuple1));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape result_shape =
       ShapeUtil::MakeTupleShape({constant0->shape(), constant1->shape()});
   TF_CHECK_OK(computation_layout.mutable_result_layout()->CopyLayoutFromShape(
       result_shape));
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(result_shape, select->shape()));
 }
@@ -302,11 +301,11 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   auto nested_tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({inner_tuple, inner_tuple}));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape result_shape = nested_tuple->shape();
   *ShapeUtil::GetMutableSubshape(&result_shape, /*index=*/{0, 0}) =
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
@@ -316,7 +315,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
       result_shape));
 
   LayoutAssignment layout_assignment(&computation_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   // Layout assignment should have deep copied the result of the computation to
   // address the layout conflict. This results in several Tuple() and
@@ -329,12 +328,11 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   //  %tuple.1 = Tuple(%copy) layout=({0,1})
   //  %tuple.2 = Tuple(%tuple.0, %tuple.1) layout=(({1,0}), ({0,1}))
   //
-  EXPECT_TRUE(
-      AlgebraicSimplifier(/*is_layout_sensitive=*/true,
-                          [](const Shape&, const Shape&) { return false; })
-          .Run(module)
-          .ValueOrDie());
-  HloInstruction* root = module->entry_computation()->root_instruction();
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
+  options.set_is_layout_sensitive(true);
+  EXPECT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie());
+  HloInstruction* root = m->entry_computation()->root_instruction();
   // Verify layout of the root and the root's operands.
   EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape()));
   EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::GetSubshape(result_shape, {0}),
@@ -361,9 +359,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
   auto tanh = builder.AddInstruction(
       HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, reshape));
 
-  auto module = CreateNewModule();
-  HloComputation* computation =
-      module->AddEntryComputation(builder.Build(tanh));
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build(tanh));
 
   Shape ashape_with_layout(ashape);
   Shape bshape_with_layout(bshape);
@@ -374,7 +371,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ashape_with_layout);
   *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   auto log_minor_to_major =
       AsInt64Slice(log->shape().layout().minor_to_major());
@@ -403,8 +400,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) {
       HloInstruction::CreateTranspose(bshape, log, {1, 0}));
   auto tanh = builder.AddInstruction(
       HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, transpose));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build(tanh));
+  auto m = CreateNewVerifiedModule();
+  auto computation = m->AddEntryComputation(builder.Build(tanh));
 
   Shape ashape_with_layout(ashape);
   Shape bshape_with_layout(bshape);
@@ -415,7 +412,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) {
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ashape_with_layout);
   *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_TRUE(
       LayoutUtil::Equal(ashape_with_layout.layout(), log->shape().layout()));
@@ -439,9 +436,9 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) {
       HloInstruction::CreateBroadcast(bshape, param, {1, 2}));
   auto transpose = builder.AddInstruction(
       HloInstruction::CreateTranspose(cshape, broadcast, {2, 1, 0}));
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   HloComputation* computation =
-      module->AddEntryComputation(builder.Build(transpose));
+      m->AddEntryComputation(builder.Build(transpose));
 
   Shape input_shape_with_layout(ashape);
   Shape output_shape_with_layout(cshape);
@@ -454,7 +451,7 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) {
       ShapeLayout(input_shape_with_layout);
   *computation_layout.mutable_result_layout() =
       ShapeLayout(output_shape_with_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_THAT(broadcast->shape().layout().minor_to_major(),
               ElementsAre(0, 1, 2));
@@ -488,9 +485,8 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
       HloInstruction::CreateBroadcast(f32_234, tanh, {1, 2}));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({transpose, broadcast2}));
-  auto module = CreateNewModule();
-  HloComputation* computation =
-      module->AddEntryComputation(builder.Build(tuple));
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build(tuple));
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
   Shape param_shape_with_layout(f32_4);
@@ -507,7 +503,7 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
   *computation_layout.mutable_result_layout() =
       ShapeLayout(ShapeUtil::MakeTupleShape(
           {transpose_shape_with_layout, broadcast2_shape_with_layout}));
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1));
   EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(1, 0));
@@ -558,9 +554,8 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) {
       HloInstruction::CreateConcatenate(bshape, {param0, param1}, 1));
   auto reshape = builder.AddInstruction(
       HloInstruction::CreateReshape(cshape, concatenate));
-  auto module = CreateNewModule();
-  HloComputation* computation =
-      module->AddEntryComputation(builder.Build(reshape));
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build(reshape));
 
   Shape param0_shape_with_layout(ashape);
   Shape param1_shape_with_layout(ashape);
@@ -573,7 +568,7 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) {
   *computation_layout.mutable_parameter_layout(1) =
       ShapeLayout(param1_shape_with_layout);
   OperandsMustBeTheSameLayoutAssignment layout_assignment(&computation_layout);
-  EXPECT_IS_OK(layout_assignment.Run(module).status());
+  EXPECT_IS_OK(layout_assignment.Run(m.get()).status());
 
   EXPECT_EQ(HloOpcode::kCopy, concatenate->operand(0)->opcode());
   EXPECT_THAT(concatenate->operand(0)->shape().layout().minor_to_major(),
@@ -593,11 +588,11 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastFromOperand) {
       HloInstruction::CreateParameter(0, input_shape_with_layout, "param"));
   auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), param, {2, 3, 0, 1}));
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   HloComputation* computation =
-      module->AddEntryComputation(builder.Build(transpose));
+      m->AddEntryComputation(builder.Build(transpose));
   ComputationLayout computation_layout(computation->ComputeProgramShape());
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
@@ -611,11 +606,11 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
       HloInstruction::CreateBroadcast(input_shape, constant, {}));
   auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), broadcast, {2, 3, 0, 1}));
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   HloComputation* computation =
-      module->AddEntryComputation(builder.Build(transpose));
+      m->AddEntryComputation(builder.Build(transpose));
   ComputationLayout computation_layout(computation->ComputeProgramShape());
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
@@ -681,12 +676,12 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
-
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   std::unique_ptr<HloModule> compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
 
@@ -721,9 +716,10 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module().entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}),
        ShapeUtil::MakeTupleShape({
@@ -735,19 +731,19 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
           param_shape));
   computation_layout.mutable_result_layout()->ResetLayout(
       LayoutUtil::MakeLayout({2, 1, 0}));
-  AssignLayouts(&module(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
-  EXPECT_THAT(LayoutOf(&module(), "gte0"), ElementsAre(0, 1, 2));
-  EXPECT_THAT(LayoutOf(&module(), "gte1a"), ElementsAre(1, 2, 0));
-  EXPECT_THAT(LayoutOf(&module(), "gte1b"), ElementsAre(2, 0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "fresult"), ElementsAre(2, 1, 0));
-  EXPECT_THAT(FindInstruction(&module(), "gte1")
+  EXPECT_THAT(LayoutOf(m.get(), "gte0"), ElementsAre(0, 1, 2));
+  EXPECT_THAT(LayoutOf(m.get(), "gte1a"), ElementsAre(1, 2, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "gte1b"), ElementsAre(2, 0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "fresult"), ElementsAre(2, 1, 0));
+  EXPECT_THAT(FindInstruction(m.get(), "gte1")
                   ->shape()
                   .tuple_shapes(0)
                   .layout()
                   .minor_to_major(),
               ElementsAre(1, 2, 0));
-  EXPECT_THAT(FindInstruction(&module(), "gte1")
+  EXPECT_THAT(FindInstruction(m.get(), "gte1")
                   ->shape()
                   .tuple_shapes(1)
                   .layout()
@@ -757,7 +753,7 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
 
 TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
   auto builder = HloComputation::Builder(TestName());
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {128, 8});
   Shape tshape = ShapeUtil::MakeTupleShape({shape, shape});
   Shape result_tshape = ShapeUtil::MakeTupleShape({shape});
@@ -784,7 +780,7 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
     true_builder.AddInstruction(HloInstruction::CreateTuple({add}));
   }
   HloComputation* true_computation =
-      module->AddEmbeddedComputation(true_builder.Build());
+      m->AddEmbeddedComputation(true_builder.Build());
 
   auto false_builder = HloComputation::Builder(TestName() + "_FalseBranch");
   {
@@ -800,14 +796,14 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
     false_builder.AddInstruction(HloInstruction::CreateTuple({infeed_data}));
   }
   HloComputation* false_computation =
-      module->AddEmbeddedComputation(false_builder.Build());
+      m->AddEmbeddedComputation(false_builder.Build());
   builder.AddInstruction(HloInstruction::CreateConditional(
       result_tshape, pred, tuple, true_computation, tuple, false_computation));
 
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
   ComputationLayout computation_layout(computation->ComputeProgramShape());
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   const HloInstruction* true_root = true_computation->root_instruction();
   const HloInstruction* false_root = false_computation->root_instruction();
@@ -828,13 +824,13 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) {
           {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
   builder.AddInstruction(HloInstruction::CreateUnary(
       constant0->shape(), HloOpcode::kBitcast, constant0));
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   LayoutAssignment layout_assignment(&computation_layout);
-  Status error_status = layout_assignment.Run(module).status();
+  Status error_status = layout_assignment.Run(m.get()).status();
   EXPECT_FALSE(error_status.ok());
   EXPECT_THAT(
       error_status.error_message(),
@@ -861,9 +857,10 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module().entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})});
   TF_ASSERT_OK(
@@ -873,12 +870,12 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
       LayoutUtil::MakeLayout({1, 0}));
 
   ChannelLayoutConstraints channel_constraints;
-  AssignLayouts(&module(), &computation_layout, &channel_constraints);
+  AssignLayouts(m.get(), &computation_layout, &channel_constraints);
 
-  EXPECT_THAT(LayoutOf(&module(), "gte"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "root"), ElementsAre(1, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "gte"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "root"), ElementsAre(1, 0));
   EXPECT_TRUE(ShapeUtil::Equal(
-      ShapeUtil::GetSubshape(FindInstruction(&module(), "send")->shape(), {0}),
+      ShapeUtil::GetSubshape(FindInstruction(m.get(), "send")->shape(), {0}),
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0})));
 }
 
@@ -897,17 +894,17 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
       param = (f32[2,2]) parameter(0)
       gte = f32[2,2] get-tuple-element(param), index=0
       ar.0 = f32[2,2] cross-replica-sum(gte),
-        all_reduce_id=0, replica_groups={{0}}, to_apply=add,
+        all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=0}
       const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}})
       ROOT ar.1 = f32[2,2] cross-replica-sum(const),
-        all_reduce_id=0, replica_groups={{0}}, to_apply=add,
+        all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=1}
     })";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
                           ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})});
   TF_ASSERT_OK(
@@ -917,12 +914,12 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
       LayoutUtil::MakeLayout({1, 0}));
 
   ChannelLayoutConstraints channel_constraints;
-  AssignLayouts(module.get(), &computation_layout, &channel_constraints);
+  AssignLayouts(m.get(), &computation_layout, &channel_constraints);
 
-  EXPECT_THAT(LayoutOf(module.get(), "gte"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(module.get(), "ar.0"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(module.get(), "ar.1"), ElementsAre(0, 1));
-  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(LayoutOf(m.get(), "gte"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "ar.0"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "ar.1"), ElementsAre(0, 1));
+  const HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root->shape().layout().minor_to_major(), ElementsAre(1, 0));
 }
 
@@ -938,11 +935,12 @@ TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
@@ -966,11 +964,12 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
@@ -997,11 +996,12 @@ TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
@@ -1028,11 +1028,12 @@ TEST_F(LayoutAssignmentTest,
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
@@ -1050,11 +1051,12 @@ TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
@@ -1107,20 +1109,21 @@ TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module().entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
 
   // Sanity check to verify that there's a layout mismatch.
-  EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "ibuf"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "next_buf"), ElementsAre(1, 0));
 
-  AssignLayouts(&module(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   // Make sure that layout assignment did not magically eliminate the mismatch,
   // in which case the test didn't prove anything.
-  EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "ibuf"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "next_buf"), ElementsAre(1, 0));
 }
 
 TEST_F(LayoutAssignmentTest, CustomCallNotLayoutConstrained) {
@@ -1136,32 +1139,32 @@ ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] {
   // and result layout should match that of the computation.
   {
     TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<VerifiedHloModule> module,
+        std::unique_ptr<VerifiedHloModule> m,
         ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
-    ComputationLayout computation_layout = module->entry_computation_layout();
+    ComputationLayout computation_layout = m->entry_computation_layout();
     *computation_layout.mutable_parameter_layout(0) =
         ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 2, 1}));
     *computation_layout.mutable_result_layout() = ShapeLayout(
         ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {3, 2, 0, 1}));
-    AssignLayouts(module.get(), &computation_layout);
+    AssignLayouts(m.get(), &computation_layout);
 
-    HloInstruction* root = module->entry_computation()->root_instruction();
+    HloInstruction* root = m->entry_computation()->root_instruction();
     ASSERT_THAT(root, op::CustomCall(op::Parameter()));
     ExpectLayoutIs(root->shape(), {3, 2, 0, 1});
     ExpectLayoutIs(root->operand(0)->shape(), {0, 2, 1});
   }
   {
     TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<VerifiedHloModule> module,
+        std::unique_ptr<VerifiedHloModule> m,
         ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
-    ComputationLayout computation_layout = module->entry_computation_layout();
+    ComputationLayout computation_layout = m->entry_computation_layout();
     *computation_layout.mutable_parameter_layout(0) =
         ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 1, 2}));
     *computation_layout.mutable_result_layout() = ShapeLayout(
         ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {0, 2, 3, 1}));
-    AssignLayouts(module.get(), &computation_layout);
+    AssignLayouts(m.get(), &computation_layout);
 
-    HloInstruction* root = module->entry_computation()->root_instruction();
+    HloInstruction* root = m->entry_computation()->root_instruction();
     ASSERT_THAT(root, op::CustomCall(op::Parameter()));
     ExpectLayoutIs(root->shape(), {0, 2, 3, 1});
     ExpectLayoutIs(root->operand(0)->shape(), {0, 1, 2});
@@ -1179,24 +1182,24 @@ ENTRY %CustomCallWithLayoutConstraints (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<VerifiedHloModule> module,
+      std::unique_ptr<VerifiedHloModule> m,
       ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
-  ComputationLayout computation_layout = module->entry_computation_layout();
+  ComputationLayout computation_layout = m->entry_computation_layout();
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
   *computation_layout.mutable_parameter_layout(1) =
       ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}));
   *computation_layout.mutable_result_layout() = ShapeLayout(
       ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
-  AssignLayouts(module.get(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   // The custom call should be partially encapsulated in kCopy instructions
   // because of the layout mismatches.
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
               op::Copy(op::CustomCall(op::Copy(), op::Parameter())));
 
   const HloInstruction* custom_call =
-      module->entry_computation()->root_instruction()->operand(0);
+      m->entry_computation()->root_instruction()->operand(0);
   ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
   ExpectLayoutIs(custom_call->operand(0)->shape(), {0, 1});
   ExpectLayoutIs(custom_call->operand(1)->shape(), {1, 0});
@@ -1211,18 +1214,18 @@ ENTRY %CustomCallLayoutConstrainedZeroOperands () -> f32[1,2,3,4] {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<VerifiedHloModule> module,
+      std::unique_ptr<VerifiedHloModule> m,
       ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
-  ComputationLayout computation_layout = module->entry_computation_layout();
+  ComputationLayout computation_layout = m->entry_computation_layout();
   *computation_layout.mutable_result_layout() = ShapeLayout(
       ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
-  AssignLayouts(module.get(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
               op::Copy(op::CustomCall()));
 
   const HloInstruction* custom_call =
-      module->entry_computation()->root_instruction()->operand(0);
+      m->entry_computation()->root_instruction()->operand(0);
   ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
 }
 
@@ -1238,25 +1241,25 @@ ENTRY %CustomCallLayoutConstrainedTupleOperand (p0: f32[4,4], p1: f32[2,3]) -> f
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<VerifiedHloModule> module,
+      std::unique_ptr<VerifiedHloModule> m,
       ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
-  ComputationLayout computation_layout = module->entry_computation_layout();
+  ComputationLayout computation_layout = m->entry_computation_layout();
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
   *computation_layout.mutable_parameter_layout(1) =
       ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}));
   *computation_layout.mutable_result_layout() = ShapeLayout(
       ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
-  AssignLayouts(module.get(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
-  HloInstruction* root = module->entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   ExpectLayoutIs(root->shape(), {2, 1, 0, 3});
 
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
               op::Copy(op::CustomCall(op::Tuple())));
 
   const HloInstruction* custom_call =
-      module->entry_computation()->root_instruction()->operand(0);
+      m->entry_computation()->root_instruction()->operand(0);
   ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
   ExpectTupleLayoutIs(custom_call->operand(0)->shape(), {{1, 0}, {0, 1}});
 }
@@ -1273,36 +1276,34 @@ ENTRY %CustomCallLayoutConstrainedTupleResult (p0: f32[4,4]) -> (f32[4,4]{1,0},
   // Try with a couple different layouts. In each case the custom calls operand
   // and result layout should match that of the computation.
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<VerifiedHloModule> module,
+      std::unique_ptr<VerifiedHloModule> m,
       ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
-  ComputationLayout computation_layout = module->entry_computation_layout();
+  ComputationLayout computation_layout = m->entry_computation_layout();
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
   *computation_layout.mutable_result_layout() =
       ShapeLayout(ShapeUtil::MakeTupleShape(
           {ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}),
            ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0})}));
-  AssignLayouts(module.get(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
-  ExpectTupleLayoutIs(module->result_shape(), {{1, 0}, {1, 0}});
+  ExpectTupleLayoutIs(m->result_shape(), {{1, 0}, {1, 0}});
 
-  const HloInstruction* custom_call =
-      FindInstruction(module.get(), "custom-call");
+  const HloInstruction* custom_call = FindInstruction(m.get(), "custom-call");
   ExpectTupleLayoutIs(custom_call->shape(), {{1, 0}, {0, 1}});
 }
 
 Status AssignLayoutsToComputation(
-    HloModule* module,
-    ChannelLayoutConstraints* channel_constraints = nullptr) {
-  if (!module->entry_computation_layout().result_layout().LayoutIsSet()) {
-    module->mutable_entry_computation_layout()
+    HloModule* m, ChannelLayoutConstraints* channel_constraints = nullptr) {
+  if (!m->entry_computation_layout().result_layout().LayoutIsSet()) {
+    m->mutable_entry_computation_layout()
         ->mutable_result_layout()
         ->SetToDefaultLayout();
   }
   LayoutAssignment layout_assignment(
-      module->mutable_entry_computation_layout(),
+      m->mutable_entry_computation_layout(),
       LayoutAssignment::InstructionCanChangeLayout, channel_constraints);
-  return layout_assignment.Run(module).status();
+  return layout_assignment.Run(m).status();
 }
 
 TEST_F(LayoutAssignmentTest, OverwriteDiamondShapedConstraintsX) {
@@ -1325,16 +1326,16 @@ TEST_F(LayoutAssignmentTest, OverwriteDiamondShapedConstraintsX) {
   auto add = b.AddInstruction(
       HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, transpose, param1));
   b.AddInstruction(HloInstruction::CreateTuple({add, transpose}));
-  auto module = CreateNewVerifiedModule();
-  module->AddEntryComputation(b.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(b.Build());
   Shape ashape_major = ShapeUtil::MakeShapeWithLayout(F32, {12, 8}, {1, 0});
   Shape ashape_minor = ShapeUtil::MakeShapeWithLayout(F32, {12, 8}, {0, 1});
-  *module->mutable_entry_computation_layout()->mutable_result_layout() =
+  *m->mutable_entry_computation_layout()->mutable_result_layout() =
       ShapeLayout(ShapeUtil::MakeTupleShape({ashape_major, ashape_minor}));
   const Layout r2_dim0major = LayoutUtil::MakeLayout({1, 0});
-  ForceParameterLayout(module.get(), 0, r2_dim0major);
-  ForceParameterLayout(module.get(), 1, r2_dim0major);
-  TF_ASSERT_OK(AssignLayoutsToComputation(module.get()));
+  ForceParameterLayout(m.get(), 0, r2_dim0major);
+  ForceParameterLayout(m.get(), 1, r2_dim0major);
+  TF_ASSERT_OK(AssignLayoutsToComputation(m.get()));
 
   EXPECT_THAT(add->shape().layout().minor_to_major(), ElementsAre(1, 0));
   EXPECT_THAT(add->operand(0)->shape().layout().minor_to_major(),
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 850501a4b5c521f5a5cc29658a04ae4bb638e14f..728a66b388f0f9af480ff88b5e96990a26e36af5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -169,6 +169,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
     ],
@@ -197,14 +198,17 @@ cc_library(
     hdrs = ["sort_util.h"],
     deps = [
         ":ir_array",
+        ":kernel_support_library",
         ":llvm_loop",
         ":llvm_util",
         ":loop_emitter",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter",
         "//tensorflow/compiler/xla/service/gpu:partition_assignment",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@llvm//:core",
         "@llvm//:support",
     ],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index f4b05f29c38529b3cce81b4c8ee6fae5c00cafcc..1540a40ef820f483c27b3d0d81d24ebb265847b3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -108,6 +108,14 @@ class IrArray {
     Index(absl::Span<llvm::Value* const> multidim, llvm::Value* linear,
           const Shape& shape);
 
+    // Returns an index that adds `addend` to the given `dim` of the object.
+    Index AddOffsetToDim(llvm::Value* addend, int64 dim,
+                         llvm::IRBuilder<>* b) const {
+      IrArray::Index index = *this;
+      index[dim] = b->CreateAdd(index[dim], addend);
+      return index;
+    }
+
     const std::vector<llvm::Value*>& multidim() const { return multidim_; }
     llvm::Value* linear() const { return linear_; }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index e5fbdbd51b8a9aa14decadedd1eeb3bdbf831738..c26711e526c9b89cdedcb6aed9f93d41dd25dc83 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -52,6 +52,29 @@ Shape MergeDimensions(absl::Span<const size_t> segs, const Shape& shape) {
   return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
                                                   dimensions);
 }
+
+// Given an index for a shape, return the equivalent new index if the shape is
+// reshaped to another shape.
+IrArray::Index GetReshapedIndex(const IrArray::Index& index, const Shape& shape,
+                                const Shape& reshaped_shape,
+                                llvm::IRBuilder<>* b) {
+  auto bounds = shape.dimensions();
+  auto minor_to_major = shape.layout().minor_to_major();
+  llvm::Value* linear_index = index.GetConstantWithIndexType(0);
+  int64 multiplier = 1;
+  for (int i = 0; i < index.size(); ++i) {
+    int64 dim = minor_to_major[i];
+    llvm::Value* addend = b->CreateMul(
+        index[dim], index.GetConstantWithIndexType(multiplier), "linearizing",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    linear_index = b->CreateAdd(linear_index, addend, "",
+                                /*HasNUW=*/true, /*HasNSW=*/true);
+    multiplier *= bounds[dim];
+  }
+
+  return IrArray::Index(linear_index, reshaped_shape, b);
+}
+
 }  // namespace
 
 absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
@@ -60,28 +83,30 @@ absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
     return absl::nullopt;
   }
 
-  std::vector<int64> perm(a.dimensions().size());
-  {
-    auto layout_a_orig = LayoutUtil::MinorToMajor(a);
-    std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend());
-    auto layout_b_orig = LayoutUtil::MinorToMajor(b);
-    std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend());
-    for (size_t i = 0; i < perm.size(); ++i) {
-      perm[i] = PositionInContainer(layout_b, layout_a[i]);
-    }
+  std::vector<int64> permutation(a.dimensions().size());
+  absl::Span<const int64> minor_to_major_a = LayoutUtil::MinorToMajor(a);
+  std::vector<int64> major_to_minor_a(minor_to_major_a.rbegin(),
+                                      minor_to_major_a.rend());
+  absl::Span<const int64> minor_to_major_b = LayoutUtil::MinorToMajor(b);
+  std::vector<int64> major_to_minor_b(minor_to_major_b.rbegin(),
+                                      minor_to_major_b.rend());
+  for (size_t i = 0; i < permutation.size(); ++i) {
+    permutation[i] = PositionInContainer(major_to_minor_b, major_to_minor_a[i]);
   }
-  auto segs = ConsecutiveSegments(perm);
-  if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) {
-    Shape norm_a =
+
+  std::vector<size_t> segments = ConsecutiveSegments(permutation);
+  if ((3 == segments.size() && 0 == permutation[0]) || 2 == segments.size()) {
+    Shape descending_layout_shape =
         ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a);
-    Shape reduced_a = MergeDimensions(segs, norm_a);
-    auto reduced_a_dims = reduced_a.dimensions();
+    Shape normalized_shape = MergeDimensions(segments, descending_layout_shape);
+    absl::Span<const int64> normalized_dims =
+        AsInt64Slice(normalized_shape.dimensions());
     std::vector<int64> dims_021;
-    if (2 == segs.size()) {
+    if (2 == segments.size()) {
       // The logical component-0 is of size one.
-      dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]};
+      dims_021 = {1, normalized_dims[1], normalized_dims[0]};
     } else {
-      dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]};
+      dims_021 = {normalized_dims[0], normalized_dims[2], normalized_dims[1]};
     }
 
     return dims_021;
@@ -90,27 +115,117 @@ absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
   return absl::nullopt;
 }
 
-IrArray::Index GetUnreducedOutputIndex(
-    const IrArray::Index& reduced_output_index,
-    const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* b) {
-  auto bounds = reduced_output_shape.dimensions();
-  auto minor_to_major = reduced_output_shape.layout().minor_to_major();
-  llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0);
-  int64 multiplier = 1;
-  for (int i = 0; i < reduced_output_index.size(); ++i) {
-    int64 dim = minor_to_major[i];
-    llvm::Value* addend =
-        b->CreateMul(reduced_output_index[dim],
-                     reduced_output_index.GetConstantWithIndexType(multiplier),
-                     "linearizing",
-                     /*HasNUW=*/true, /*HasNSW=*/true);
-    linear_index = b->CreateAdd(linear_index, addend, "",
-                                /*HasNUW=*/true, /*HasNSW=*/true);
-    multiplier *= bounds[dim];
+KernelMappingScheme::KernelMappingScheme(
+    absl::Span<const int64> dims_in_elems, int64 tile_size_y, int64 tile_size_x,
+    absl::Span<const int64> req_block_sizes, int64 num_threads_y,
+    int64 num_threads_x, llvm::IRBuilder<>* b)
+    : b_(b),
+      dims_in_elems_(dims_in_elems),
+      tile_sizes_{1, tile_size_y, tile_size_x},
+      num_threads_x_(num_threads_x),
+      num_threads_y_(num_threads_y) {
+  DCHECK_EQ(dims_in_elems_.size(), 3);
+  DCHECK_EQ(req_block_sizes.size(), 3);
+
+  DCHECK_EQ(tile_size_y % num_threads_y_, 0);
+  DCHECK_EQ(tile_size_x % num_threads_x_, 0);
+
+  dims_in_tiles_ = ElementWiseCeilOfRatio<int64>(dims_in_elems_, tile_sizes_);
+  block_sizes_.reserve(req_block_sizes.size());
+  absl::c_transform(req_block_sizes, dims_in_tiles_,
+                    std::back_inserter(block_sizes_),
+                    [](const int64 requested_size, const int64 max_size) {
+                      return std::min(requested_size, max_size);
+                    });
+  dims_in_blocks_ = ElementWiseCeilOfRatio<int64>(dims_in_tiles_, block_sizes_);
+
+  VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") << "]";
+  VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") << "]";
+  VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",")
+           << "]";
+}
+
+IrArray::Index KernelMappingScheme::GetUnnormalizedIndex(
+    const IrArray::Index& normalized_shape_index,
+    const Shape& unnormalized_shape) {
+  DCHECK_EQ(normalized_shape_index.size(), dims_in_elems_.size());
+  Shape output_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      unnormalized_shape.element_type(), GetDimensionsInElements());
+  return GetReshapedIndex(normalized_shape_index, output_shape,
+                          unnormalized_shape, b_);
+}
+
+IrArray::Index KernelMappingScheme::EmitBlockIndex(llvm::Type* index_ty) {
+  llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_);
+  llvm_ir::AddRangeMetadata(0, GetNumberOfBlocks(),
+                            llvm::cast<llvm::Instruction>(block_id));
+  llvm::Value* linear_block_id =
+      b_->CreateIntCast(block_id, index_ty, /*isSigned=*/true, "block.id.x");
+  return IrArray::Index(linear_block_id,
+                        ShapeUtil::MakeShapeWithDescendingLayout(
+                            PRED /*arbitrary*/, dims_in_blocks_),
+                        b_);
+}
+
+IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin(
+    const IrArray::Index& block_index) {
+  IrArray::Index tile_index = block_index;
+  for (int i = 0; i < block_sizes_.size(); ++i) {
+    tile_index[i] = b_->CreateMul(
+        block_index[i],
+        llvm::ConstantInt::get(block_index[i]->getType(), block_sizes_[i]),
+        "block_origin." + std::to_string(i));
+  }
+  return tile_index;
+}
+
+IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin(
+    const IrArray::Index& tile_index) {
+  IrArray::Index elem_index = tile_index;
+  for (int i = DimY; i < DimTot; ++i) {
+    elem_index[i] =
+        b_->CreateMul(tile_index[i],
+                      llvm::ConstantInt::get(tile_index[i]->getType(),
+                                             GetTileSizeForDimension(i)),
+                      "tile_origin." + std::to_string(i));
   }
+  return elem_index;
+}
+
+llvm::GlobalVariable* KernelMappingScheme::GetSharedMemoryBufferForElementType(
+    llvm::Type* elem_ty, absl::string_view buffer_name) {
+  // If shared memory tranpose is needed, we use square tiles.
+  CHECK_EQ(GetTileSizeForDimensionX(), GetTileSizeForDimensionY());
+
+  // For Nvidia GPUs, the warp size is 32 threads and the shared memory bank is
+  // organized into 32-way. We usually use the warp size or a multiplier or a
+  // the warp size as the size for tiling. This may cause all elements in the
+  // same column of a tile use the same memory bank and therefore shared memory
+  // bank conflicts. Adding 1 to the minor dimension of the shared memory buffer
+  // can reduce such shared memory bank conflicts.
+  llvm::Type* buffer_type = llvm::ArrayType::get(
+      llvm::ArrayType::get(elem_ty, GetTileSizeForDimension(DimX) + 1),
+      GetTileSizeForDimension(DimY));
+  return llvm_ir::AllocateSharedMemoryTile(b_->GetInsertBlock()->getModule(),
+                                           buffer_type, buffer_name);
+}
 
-  return IrArray::Index(linear_index, unreduced_output_shape, b);
+std::tuple<llvm::Value*, llvm::Value*>
+KernelMappingScheme::EmitThreadYXCoordinate(llvm::Type* index_ty) {
+  // Calculate (y, x) coordinate of the thread in the 2D view of thread block
+  // defined by (num_thread_y, num_thread_x) from thread_id.
+  llvm::CallInst* thread_id_raw = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
+  llvm_ir::AddRangeMetadata(0, GetThreadsPerTile(), thread_id_raw);
+  llvm::Value* thread_id_int =
+      b_->CreateIntCast(thread_id_raw, index_ty,
+                        /*isSigned=*/true, "thread.id.x");
+  llvm::Value* num_thread_x =
+      llvm::ConstantInt::get(index_ty, GetNumberOfThreadsForDimensionX());
+  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x);
+  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x);
+  return std::make_tuple(y, x);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 5ea05b3188a1c0881e4c0c41625d530aff1b1205..06002d57b0d7daa07f903feebe67a60a083c0e7c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -28,23 +28,160 @@ namespace llvm_ir {
 // If a shape can be viewed as three logical components 0-1-2 in the order of
 // major to minor, a 0-2-1-transpose changes the order of such logical
 // components to 0-2-1. We call the shape being transposed the input shape and
-// the transposed shape the output shape. The logical view of the input and
-// output shapes for the transpose are called the 0-1-2 shape or reduced input
-// shape and the 0-2-1 shape or the reduced output shape respectively. The
-// original input and output shapes are called the unreduced input and output
-// shapes.
-
+// the transposed shape the output shape. The logical view of the input/output
+// shapes for the transpose are called the 0-1-2/0-2-1 shapes or the normalized
+// shapes. The original input/output shapes are called unnormalized shapes.
+//
 // If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the
-// reduced shape of `b` or the 0-2-1 shape.
+// normalized shape of `b` or the 0-2-1 shape.
 absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
                                                      const Shape& b);
 
-// Return the unreduced output index corresponding to the given reduced output
-// index.
-IrArray::Index GetUnreducedOutputIndex(
-    const IrArray::Index& reduced_output_index,
-    const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* b);
+// A tile is a spatial subdivision of a tensor. We group tensor elements into
+// tiles so that we can launch kernels to process the tensor elements in blocks
+// of tiles.
+//
+// A kernel mapping scheme describes a method to partition the tensors accessed
+// by an unnested HLO instruction into tiles and blocks of tiles, and the
+// associated information to use hardware threads to process the tensor elements
+// in blocks of tiles.
+//
+// Currently, there are two main use cases for a tiling scheme. First, we
+// implement kernels with 0-2-1 memory transpose using shared memory to improve
+// memory access pattern. Second, we implement reduction to contiguous
+// dimensions in layout, with or without memory tranpsose, to achieve better
+// memory access pattern as well as to reduce the need numbers of executed
+// expensive instructions, such as thread synchronization related instructions
+// and atomic operations. For both use cases, we can apply a normalization to
+// the original tensors, to collapse contiguous dimensions for the same purpose
+// and produce normlized three dimensional tensors. For this reason, the tiling
+// scheme class only needs to handle normalized three dimensional tensors and
+// two dimensional tiles.
+//
+// The current implementation of the class is somewhat NVIDIA GPU oriented. This
+// situation can be improved when there is a need though. The idea of 0-2-1
+// transpose using shared memory can be found in the following CUDA algorithm in
+// TensorFlow: https://goo.gl/MStRV6.
+//
+// We use a thread block to process a tile because we want to use the HW thread
+// block synchronization primitives to synchronize the processing of all the
+// elements in the same tile. A thread block can be viewed as a two dimensional
+// array of threads, described by the number of threads for the Y and X
+// dimensions. A thread block (num_threads_y, num_threads_x) processes a tile of
+// (tile_size_y, tile_size_x) as follows: each thread in the thread block
+// processes one element in the tile so that all the threads in the thread block
+// together process a subdivision of the tile that has the same dimension as the
+// thread block array. Then the thread block moves on to process the next
+// subdivision of the tile until the whole tile is processed. Therefore, each
+// thread in the thread block processes
+// tile_size_x/num_threads_x * tile_size_y/num_threads_y elements in a tile.
+//
+// There are situations where we want a thread block to process multiple
+// tiles. We can't group those tiles into a bigger tiles because we limit a tile
+// to a two dimensional spatial subdivision of a tensor. For example, when we
+// use tiling to implement reduction with tranpose, we want the partial sum
+// produced by each thread to accumulate values for more elements before using
+// shlf_down and atomic_add instructions for further reduction, to amortize the
+// cost of such expensive instructions. The concept of tile block is introduced
+// for this purpose. A tile block is a three dimensional array of tiles, of
+// which some dimensions may be degenerated to only one tile.
+class KernelMappingScheme {
+ public:
+  enum { DimZ = 0, DimY, DimX, DimTot };
+
+ public:
+  // dims_in_elems: the normalized tensor dimensions.
+  // req_block_sizes: the requested block size in number of tiles for each
+  //   dimension. The actual block size is set to min(req_block_size,
+  //   dims_in_number_of_blocks).
+  explicit KernelMappingScheme(absl::Span<const int64> dims_in_elems,
+                               int64 tile_size_y, int64 tile_size_x,
+                               absl::Span<const int64> req_block_sizes,
+                               int64 num_threads_y, int64 num_threads_x,
+                               llvm::IRBuilder<>* b);
+
+  absl::Span<const int64> GetDimensionsInElements() const {
+    return dims_in_elems_;
+  }
+  absl::Span<const int64> GetDimensionsInTiles() const {
+    return dims_in_tiles_;
+  }
+  absl::Span<const int64> GetDimensionsInBlocks() const {
+    return dims_in_blocks_;
+  }
+
+  int64 GetNumberOfTilesInTotal() const {
+    return absl::c_accumulate(dims_in_tiles_, 1LL, std::multiplies<int64>());
+  }
+  int64 GetNumberOfTilesInOneBlock() const {
+    return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
+  }
+
+  int64 GetNumberOfBlocks() const {
+    return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
+  }
+
+  int64 GetTileSizeForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return tile_sizes_[d];
+  }
+  int64 GetTileSizeForDimensionX() const {
+    return GetTileSizeForDimension(DimX);
+  }
+  int64 GetTileSizeForDimensionY() const {
+    return GetTileSizeForDimension(DimY);
+  }
+
+  absl::Span<const int64> GetBlockSizes() const { return block_sizes_; }
+
+  int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; }
+  int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; }
+
+  int64 GetThreadsPerTile() const {
+    return GetNumberOfThreadsForDimensionX() *
+           GetNumberOfThreadsForDimensionY();
+  }
+
+  IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
+  // Returns the index for the first tile in the block with the given block
+  // index.
+  IrArray::Index GetTileIndexForBlockOrigin(const IrArray::Index& block_index);
+  // Returns the index for the first element in the tile with the given tile
+  // index.
+  IrArray::Index GetElementIndexForTileOrigin(const IrArray::Index& tile_index);
+
+  std::tuple<llvm::Value*, llvm::Value*> EmitThreadYXCoordinate(
+      llvm::Type* index_ty);
+
+  IrArray::Index GetUnnormalizedIndex(
+      const IrArray::Index& normalized_shape_index,
+      const Shape& unnormalized_shape);
+
+  llvm::GlobalVariable* GetSharedMemoryBufferForElementType(
+      llvm::Type* elem_ty, absl::string_view buffer_name);
+
+ private:
+  llvm::IRBuilder<>* b_;
+  // The number of elements in each dimension.
+  absl::Span<const int64> dims_in_elems_;
+
+  // The number of elements for each dimension of a tile.
+  std::vector<int64> tile_sizes_;
+  // The number of tiles in each dimension. It is computed from dims_in_elem_
+  // and tile_sizes_.
+  std::vector<int64> dims_in_tiles_;
+
+  // The number of tiles for each dimension of a tile block.
+  std::vector<int64> block_sizes_;
+  // The number of blocks in each dimension of a tile block. It is computed from
+  // dims_in_tile_ and block_sizes_.
+  std::vector<int64> dims_in_blocks_;
+
+  // Number of threads used to process elements in the X direction of a tile.
+  int64 num_threads_x_;
+  // Number of threads used to process elements in the Y direction of a tile.
+  int64 num_threads_y_;
+};
 
 // A class to represent information for tiled parameters to support IR emission
 // for 021 transpose.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 2e5aebb74c29b809ae5c323b1912043d9f160d67..df78726166eea953b57e72a5a5fc81ee246aca34 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Target/TargetOptions.h"
@@ -83,10 +84,9 @@ string DumpModuleToString(const llvm::Module& module) {
   return AsString(buffer_string);
 }
 
-llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id,
-                                 absl::Span<llvm::Value* const> operands,
-                                 absl::Span<llvm::Type* const> overloaded_types,
-                                 llvm::IRBuilder<>* b) {
+llvm::CallInst* EmitCallToIntrinsic(
+    llvm::Intrinsic::ID intrinsic_id, absl::Span<llvm::Value* const> operands,
+    absl::Span<llvm::Type* const> overloaded_types, llvm::IRBuilder<>* b) {
   llvm::Module* module = ModuleFromIRBuilder(b);
   llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(
       module, intrinsic_id, AsArrayRef(overloaded_types));
@@ -260,6 +260,17 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
       /*AddNull=*/false);
 }
 
+llvm::GlobalVariable* AllocateSharedMemoryTile(llvm::Module* module,
+                                               llvm::Type* tile_type,
+                                               absl::string_view name) {
+  const int kNVPTXSharedMemoryAddrSpace = 3;
+  return new llvm::GlobalVariable(
+      *module, tile_type,
+      /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage,
+      llvm::UndefValue::get(tile_type), AsStringRef(name), nullptr,
+      llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace);
+}
+
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
                                             absl::string_view name,
                                             llvm::IRBuilder<>* b,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index f59baff263fe7184c6b0821c9dbd9eee205586a6..c604c7c870adf734a29017e6accbd159317a9548 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -101,10 +102,9 @@ string SanitizeFunctionName(string function_name);
 // intrinsics (for example, "minnum") must include a type in overloaded_types
 // for each overloaded type. Typically, overloaded intrinsics have only a single
 // overloaded type.
-llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id,
-                                 absl::Span<llvm::Value* const> operands,
-                                 absl::Span<llvm::Type* const> overloaded_types,
-                                 llvm::IRBuilder<>* b);
+llvm::CallInst* EmitCallToIntrinsic(
+    llvm::Intrinsic::ID intrinsic_id, absl::Span<llvm::Value* const> operands,
+    absl::Span<llvm::Type* const> overloaded_types, llvm::IRBuilder<>* b);
 
 // Emit float max. Emit maxnum intrinsic is fast math is disabled, or
 // fcmp+select otherwise
@@ -155,6 +155,11 @@ StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
 llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
                                            llvm::Module* module);
 
+// Allocates a tile of shared memory.
+llvm::GlobalVariable* AllocateSharedMemoryTile(llvm::Module* module,
+                                               llvm::Type* tile_type,
+                                               absl::string_view name);
+
 // Inserts an allocate of the requested type at the entry point of the
 // function that the builder is currently building. The insert point
 // of the builder is set to the same place after calling this function
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 05ba4a40da413f0e774214e55ef69d023afc48e2..e22c2173c271fc9571be1ddb0759d2b31562dc98 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -18,7 +18,9 @@ limitations under the License.
 #include <vector>
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -28,10 +30,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -39,147 +43,365 @@ namespace xla {
 namespace llvm_ir {
 
 namespace {
-// Adds the inner comparison loop where we compare elements pointed to by
-// 'keys_index' and 'compare_keys_index'.
-void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index,
-                     const IrArray::Index& compare_keys_index,
-                     const IrArray& keys_array,
-                     const std::vector<IrArray>& values_arrays,
-                     llvm::IRBuilder<>* b) {
-  // if (is_smaller_index &&
-  //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
-  llvm::Value* is_smaller_index = b->CreateICmpSLT(
-      keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
-  int64 dimension_to_sort_bound =
-      keys_array.GetShape().dimensions(dimension_to_sort);
-  auto if_data = EmitIfThenElse(
-      b->CreateAnd(is_smaller_index,
-                   b->CreateICmpSLT(compare_keys_index[dimension_to_sort],
-                                    keys_index.GetConstantWithIndexType(
-                                        dimension_to_sort_bound))),
-      "smaller_comparison_index", b, /*emit_else=*/false);
-  SetToFirstInsertPoint(if_data.true_block, b);
-  auto key1 = keys_array.EmitReadArrayElement(keys_index, b);
-  auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b);
-  auto compare_key1 = key1;
-  auto compare_key2 = key2;
-  auto key_type = keys_array.GetShape().element_type();
-  bool is_signed_comparison = true;
-  if (primitive_util::IsFloatingPointType(key_type)) {
-    // We would like a total order of floating point numbers so that the sort
-    // has a predictable behavior in the presence of NaNs. Rather than using
-    // floating point comparison, we use the following trick:
-    // If f is a float, and
-    // x = bit_cast<int32>(f);
-    // y = x < 0 ? 0x7FFFFFFF - x : x;
-    // then y is ordered as an int32 such that finite values have the obvious
-    // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning
-    // and end of the ordering.
-    auto k = b->getInt(llvm::APInt::getSignedMaxValue(
-        key1->getType()->getPrimitiveSizeInBits()));
-    auto comparison_type = k->getType();
-    auto zero = llvm::ConstantInt::get(comparison_type, 0);
-    auto maybe_flip = [&](llvm::Value* v) {
-      return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
-                             b->CreateSub(k, v), v);
-    };
-    compare_key1 = b->CreateBitCast(key1, comparison_type);
-    compare_key2 = b->CreateBitCast(key2, comparison_type);
-    compare_key1 = maybe_flip(compare_key1);
-    compare_key2 = maybe_flip(compare_key2);
-  } else if (!primitive_util::IsSignedIntegralType(key_type)) {
-    is_signed_comparison = false;
+
+// Adds the inner comparison loop body where we compare elements.
+void EmitCompareLoopBody(
+    int64 iteration_bound, PrimitiveType key_type, int64 num_values,
+    int64 iota_values_parameter_index, llvm::Value* element_pair_index,
+    int64 xor_mask, llvm::Type* index_type,
+    std::function<llvm::Value*(int64 operand, llvm::Value* index)> read_element,
+    std::function<void(int64 operand, llvm::Value* index, llvm::Value* value)>
+        write_element,
+    llvm::IRBuilder<>* b, bool needs_bounds_checks = true) {
+  auto index_typed_constant = [&](int64 value) {
+    return llvm::ConstantInt::get(index_type, value);
+  };
+  // The 'xor_mask' determines which elements are compared against each other.
+  // Index 'current_keys_index' will be compared with 'current_keys_index' xor
+  // 'xor_mask'. This means that we will always compare a block of consecutive
+  // elements against elements from the adjacent block of the same size. When
+  // 'xor_mask' is a power of 2, it immediately identifies the size of such a
+  // block. We can also have 'xor_mask' being 2^k - 1 (for some value of k). In
+  // that case, we essentially flip the last 'k' - 1 bits when computing the
+  // position of the element to compare to, so the block size is 2^(k - 1).
+  int64 block_size = xor_mask;
+  // Check if it is a value 2^k - 1.
+  if (xor_mask > 1 && (xor_mask & (xor_mask + 1)) == 0) {
+    block_size = (xor_mask + 1) / 2;
   }
-  auto comparison =
-      b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
-                                         : llvm::ICmpInst::ICMP_ULT,
-                    compare_key2, compare_key1);
-  // If key2 < key1
-  auto if_smaller_data =
-      EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false);
-  SetToFirstInsertPoint(if_smaller_data.true_block, b);
-  // Swap key1 with key2.
-  keys_array.EmitWriteArrayElement(keys_index, key2, b);
-  keys_array.EmitWriteArrayElement(compare_keys_index, key1, b);
-  for (const auto& values_array : values_arrays) {
-    // Also swap the values.
-    auto value1 = values_array.EmitReadArrayElement(keys_index, b);
-    auto value2 = values_array.EmitReadArrayElement(compare_keys_index, b);
-    values_array.EmitWriteArrayElement(keys_index, value2, b);
-    values_array.EmitWriteArrayElement(compare_keys_index, value1, b);
+  auto current_keys_index = element_pair_index;
+  if (block_size == 1) {
+    // If the block size is 1, we take every second element and compare it to
+    // the next one.
+    current_keys_index =
+        b->CreateMul(current_keys_index, index_typed_constant(2));
+  } else if (block_size * 2 < iteration_bound) {
+    // current_keys_index iterates through the 'left' elements of the element
+    // pairs to be compared. We first need to compute the comparison block to
+    // which the element belongs. The block id of that block is index /
+    // block_size.
+    auto block_id =
+        b->CreateUDiv(current_keys_index, index_typed_constant(block_size));
+    // The index of the 'left' element within its block is simply the remainder
+    // when dividing by 'block_size'.
+    auto index_within_block =
+        b->CreateURem(current_keys_index, index_typed_constant(block_size));
+    // The first element of the 'left' block of elements that is compared
+    // against elements from the adjacent 'right' block of elements is
+    // 'block_id' * (2 * 'block_size').
+    auto first_element_in_block =
+        b->CreateMul(block_id, index_typed_constant(2 * block_size));
+    current_keys_index =
+        b->CreateAdd(first_element_in_block, index_within_block);
   }
+  auto compare_keys_index =
+      b->CreateXor(current_keys_index, index_typed_constant(xor_mask));
+  // current_keys_index < compare_keys_index
+  llvm::Value* is_smaller_index =
+      b->CreateICmpSLT(current_keys_index, compare_keys_index);
+  // compare_keys_index < iteration_bound
+  llvm::Value* index_is_inbounds = b->CreateICmpSLT(
+      compare_keys_index, index_typed_constant(iteration_bound));
+  llvm::Value* do_comparison =
+      needs_bounds_checks ? b->CreateAnd(is_smaller_index, index_is_inbounds)
+                          : b->getInt1(true);
+
+  // if (is_smaller_index && index_is_inbounds)
+  KernelSupportLibrary ksl(b);
+  ksl.IfReturnVoid("smaller_comparison_index", do_comparison, [&]() {
+    auto key1 = read_element(0, current_keys_index);
+    auto key2 = read_element(0, compare_keys_index);
+    auto compare_key1 = key1;
+    auto compare_key2 = key2;
+    bool is_signed_comparison = true;
+    if (primitive_util::IsFloatingPointType(key_type)) {
+      // We would like a total order of floating point numbers so that the
+      // sort has a predictable behavior in the presence of NaNs. Rather
+      // than using floating point comparison, we use the following trick:
+      // If f is a float, and
+      // x = bit_cast<int32>(f);
+      // y = x < 0 ? 0x7FFFFFFF - x : x;
+      // then y is ordered as an int32 such that finite values have the
+      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
+      // the beginning and end of the ordering.
+      auto k = b->getInt(llvm::APInt::getSignedMaxValue(
+          key1->getType()->getPrimitiveSizeInBits()));
+      auto comparison_type = k->getType();
+      auto zero = llvm::ConstantInt::get(comparison_type, 0);
+      auto maybe_flip = [&](llvm::Value* v) {
+        return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
+                               b->CreateSub(k, v), v);
+      };
+      compare_key1 = b->CreateBitCast(key1, comparison_type);
+      compare_key2 = b->CreateBitCast(key2, comparison_type);
+      compare_key1 = maybe_flip(compare_key1);
+      compare_key2 = maybe_flip(compare_key2);
+    } else if (!primitive_util::IsSignedIntegralType(key_type)) {
+      is_signed_comparison = false;
+    }
+    // If key2 < key1
+    auto is_smaller_than =
+        b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
+                                           : llvm::ICmpInst::ICMP_ULT,
+                      compare_key2, compare_key1);
+    if (iota_values_parameter_index >= 0) {
+      auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2);
+      auto key_index1 =
+          read_element(iota_values_parameter_index, current_keys_index);
+      auto key_index2 =
+          read_element(iota_values_parameter_index, compare_keys_index);
+      auto index_is_smaller_than =
+          b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1);
+      is_smaller_than = b->CreateOr(
+          is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
+    }
+    ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() {
+      // Swap key1 with key2.
+      write_element(0, current_keys_index, key2);
+      write_element(0, compare_keys_index, key1);
+      for (int64 i = 1; i <= num_values; ++i) {
+        // Also swap the values.
+        auto value1 = read_element(i, current_keys_index);
+        auto value2 = read_element(i, compare_keys_index);
+        write_element(i, current_keys_index, value2);
+        write_element(i, compare_keys_index, value1);
+      }
+    });
+  });
+}
+
+void EmitTiledCompareLoop(
+    const IrArray::Index& tiled_keys_index, int64 dimension_to_sort,
+    int64 dimension_to_sort_bound, PrimitiveType keys_type,
+    absl::Span<const int64> xor_masks, const std::vector<IrArray>& params,
+    const std::vector<llvm::Value*>& param_shmem_buffers,
+    int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) {
+  KernelSupportLibrary ksl(b);
+  llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b);
+  llvm_ir::AddRangeMetadata(0, tile_size / 2,
+                            llvm::cast<llvm::Instruction>(thread_id));
+  thread_id = b->CreateIntCast(thread_id, tiled_keys_index.GetType(),
+                               /*isSigned=*/true, "thread.id.x");
+
+  auto copy_loop_body =
+      [&](std::function<void(llvm::Value * cache_index, llvm::Value * index)>
+              read_or_write) {
+        auto value_one = tiled_keys_index.GetConstantWithIndexType(1);
+        auto current_keys_index =
+            b->CreateShl(tiled_keys_index[dimension_to_sort], value_one);
+        // We want to copy two adjacent elements. We first check whether the
+        // first index position is within bounds.
+        ksl.IfReturnVoid(
+            "smaller_keys_index",
+            b->CreateICmpSLT(current_keys_index,
+                             tiled_keys_index.GetConstantWithIndexType(
+                                 dimension_to_sort_bound)),
+            [&]() {
+              auto cache_index = b->CreateShl(thread_id, value_one);
+              read_or_write(cache_index, current_keys_index);
+              // Increment to go the next index position.
+              current_keys_index = b->CreateAdd(current_keys_index, value_one);
+              // Here we check whether the next index position is within bounds.
+              ksl.IfReturnVoid(
+                  "inner_smaller_keys_index",
+                  b->CreateICmpSLT(current_keys_index,
+                                   tiled_keys_index.GetConstantWithIndexType(
+                                       dimension_to_sort_bound)),
+                  [&]() {
+                    cache_index = b->CreateAdd(cache_index, value_one);
+                    read_or_write(cache_index, current_keys_index);
+                  });
+            });
+      };
+
+  // Copy operand tiles from the operand buffers to shared memory.
+  IrArray::Index keys_index = tiled_keys_index;
+  for (int64 i = 0; i < params.size(); ++i) {
+    copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
+      keys_index[dimension_to_sort] = index;
+      auto value = params[i].EmitReadArrayElement(keys_index, b);
+      b->CreateStore(value,
+                     b->CreateGEP(param_shmem_buffers[i],
+                                  {tiled_keys_index.GetConstantWithIndexType(0),
+                                   cache_index}));
+    });
+  }
+  // Wait until all reads have happened.
+  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
+
+  // Now emit the bodies of the comparison loops.
+  auto read_element = [&](int64 operand, llvm::Value* index) {
+    return b->CreateLoad(
+        b->CreateGEP(param_shmem_buffers[operand],
+                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+  };
+  auto write_element = [&](int64 operand, llvm::Value* index,
+                           llvm::Value* value) {
+    b->CreateStore(
+        value,
+        b->CreateGEP(param_shmem_buffers[operand],
+                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+  };
+  for (int64 xor_mask : xor_masks) {
+    // The index of the element pair to be compared within the tile stored in
+    // shared memory. We order the element pairs by the element with the smaller
+    // index.
+    auto element_pair_index = thread_id;
+    // If 'dimension_to_sort_bound' is evenly divisible by 'tile_size', we don't
+    // need any bounds checks.
+    if (dimension_to_sort_bound % tile_size) {
+      // Otherwise we need a bounds check for the last tile. The last tile has
+      // size 'dimension_to_sort_bound' % 'tile_size'.
+      ksl.IfReturnVoid(
+          "is_last_tile",
+          b->CreateICmpUGE(
+              b->CreateMul(tiled_keys_index[dimension_to_sort],
+                           tiled_keys_index.GetConstantWithIndexType(2)),
+              tiled_keys_index.GetConstantWithIndexType(
+                  RoundDownToNearest(dimension_to_sort_bound, tile_size))),
+          [&]() {
+            EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type,
+                                params.size() - 1, iota_values_parameter_index,
+                                element_pair_index, xor_mask,
+                                tiled_keys_index.GetType(), read_element,
+                                write_element, b);
+          },
+          [&]() {
+            EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
+                                iota_values_parameter_index, element_pair_index,
+                                xor_mask, tiled_keys_index.GetType(),
+                                read_element, write_element, b,
+                                /*needs_bounds_checks=*/false);
+          });
+    } else {
+      EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
+                          iota_values_parameter_index, element_pair_index,
+                          xor_mask, tiled_keys_index.GetType(), read_element,
+                          write_element, b, /*needs_bounds_checks=*/false);
+    }
+    // Wait until all comparisons have happened.
+    llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
+  }
+
+  // Copy the operand tiles back from shared memory to the operand buffers.
+  for (int64 i = 0; i < params.size(); ++i) {
+    copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
+      keys_index[dimension_to_sort] = index;
+      auto value = b->CreateLoad(b->CreateGEP(
+          param_shmem_buffers[i],
+          {tiled_keys_index.GetConstantWithIndexType(0), cache_index}));
+      params[i].EmitWriteArrayElement(keys_index, value, b);
+    });
+  }
+  // We should normally synchronize here to make sure all writes have happened.
+  // However the very next thing each thread does is reading 2 elements from the
+  // operand buffer and writing it into the same location in shared memory from
+  // which it previously copied it to the operand buffer, and we synchronize
+  // after this has happened. We can be sure that a thread always writes to the
+  // same location in shared memory because we have exactly tile_size / 2 many
+  // threads, and the linear index calculated by ParallelLoopEmitter uses
+  // linear_index = blockIdx.x * blockDim.x + threadIdx.x;
 }
 }  // namespace
 
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
                        const std::vector<IrArray>& values_arrays,
-                       absl::string_view name, llvm::Value* xor_mask,
-                       llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions* launch_dimensions) {
-  const Shape& keys_shape = keys_array.GetShape();
+                       int64 iota_values_parameter_index,
+                       absl::string_view name,
+                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
+                       const gpu::LaunchDimensions& launch_dimensions,
+                       int64 num_iterations_in_sort_dim,
+                       const int64 tile_size) {
+  // Iterate through the keys shape in physical order, but skip the dimension to
+  // sort and make it the innermost loop which is the loop where the comparisons
+  // happen. In the dimension to sort, if we use tiling, we iterate through it
+  // in tiles of 64 elements each, so we use another loop that happens within
+  // one thread to process this tile worth of data (thereby combining several
+  // comparison stages of the bitonic sort algorithm because they all happen
+  // within those 64 elements and are therefore independent of the other
+  // comparisons).
 
-  // Create loop nests which loop through the operand dimensions. The sort
-  // dimension is handled in the innermost loop which performs the sorting.
-  ForLoopNest loop_nest(name, b);
-  IrArray::Index keys_index =
-      loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
-  if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) {
-    SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b);
+  const Shape& keys_shape = keys_array.GetShape();
+  int64 rank = ShapeUtil::Rank(keys_shape);
+  int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
+  std::vector<int64> dimensions_in_iteration_order(rank);
+  std::vector<int64> iteration_order_to_logical_order(rank);
+  int64 dim = 0;
+  for (int64 dimension : LayoutUtil::MinorToMajor(keys_shape)) {
+    if (dimension != dimension_to_sort) {
+      dimensions_in_iteration_order[dim] = keys_shape.dimensions(dimension);
+      iteration_order_to_logical_order[dim++] = dimension;
+    }
   }
+  dimensions_in_iteration_order[dim] = num_iterations_in_sort_dim;
+  iteration_order_to_logical_order[dim] = dimension_to_sort;
 
-  // 'compare_keys_index' is the index of the element that 'keys_index' should
-  // be compared to.
-  IrArray::Index compare_keys_index(keys_index.GetType());
-  for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) {
-    if (dimension != dimension_to_sort) {
-      compare_keys_index.push_back(keys_index[dimension]);
-    } else {
-      compare_keys_index.push_back(nullptr);
+  Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
+                                               dimensions_in_iteration_order);
+  std::vector<IrArray> params(1, keys_array);
+  params.insert(params.end(), values_arrays.begin(), values_arrays.end());
+
+  // Allocate shared memory for the tiled compare loop.
+  std::vector<llvm::Value*> param_shmem_buffers(params.size(), nullptr);
+  if (xor_masks.size() > 1) {
+    llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    for (int64 i = 0; i < params.size(); ++i) {
+      llvm::Type* tile_type =
+          llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
+                                   params[i].GetShape().element_type(), module),
+                               tile_size);
+      param_shmem_buffers[i] = llvm_ir::AllocateSharedMemoryTile(
+          module, tile_type, absl::StrCat(name, "_tile_param_", i));
     }
   }
 
-  // Naive C++ code for the inner compare loop:
-  //
-  // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
-  //   int64 j = i ^ xor_mask;
-  //   if (i < j && j < dimension_to_sort_bound) {
-  //     int64 min_key = std::min(keys[i], keys[j]);
-  //     keys[j] = std::max(keys[i], keys[j]);
-  //     keys[i] = min_key;
-  //   }
-  // }
-  //
-  // This follows the algorithm described on Wikipedia:
-  // https://en.wikipedia.org/wiki/Bitonic_sorter
-
-  int64 dimension_to_sort_bound =
-      keys_array.GetShape().dimensions(dimension_to_sort);
-  Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
-                                             {dimension_to_sort_bound});
   auto compare_loop_body_emitter =
-      [&](const IrArray::Index& compare_index) -> Status {
-    keys_index[dimension_to_sort] = compare_index[0];
-    compare_keys_index[dimension_to_sort] =
-        b->CreateXor(compare_index[0], xor_mask);
-    EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
-                    keys_array, values_arrays, b);
+      [&](const IrArray::Index& tiles_index) -> Status {
+    // Naive C++ code for the inner compare loop:
+    //
+    // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+    //   int64 j = i ^ xor_mask;
+    //   /* emitted in EmitCompareLoopBody() */
+    //   if (i < j && j < dimension_to_sort_bound) {
+    //     int64 min_key = std::min(keys[i], keys[j]);
+    //     keys[j] = std::max(keys[i], keys[j]);
+    //     keys[i] = min_key;
+    //   }
+    // }
+    //
+    // This follows the algorithm described on Wikipedia:
+    // https://en.wikipedia.org/wiki/Bitonic_sorter
+    IrArray::Index keys_index(tiles_index.GetType(), rank);
+    for (int64 i = 0; i < rank; ++i) {
+      keys_index[iteration_order_to_logical_order[i]] = tiles_index[i];
+    }
+    if (xor_masks.size() > 1) {
+      EmitTiledCompareLoop(keys_index, dimension_to_sort,
+                           dimension_to_sort_bound, keys_shape.element_type(),
+                           xor_masks, params, param_shmem_buffers,
+                           iota_values_parameter_index, tile_size, b);
+    } else {
+      auto read_element = [&](int64 operand, llvm::Value* index) {
+        keys_index[dimension_to_sort] = index;
+        return params[operand].EmitReadArrayElement(keys_index, b);
+      };
+      auto write_element = [&](int64 operand, llvm::Value* index,
+                               llvm::Value* value) {
+        keys_index[dimension_to_sort] = index;
+        params[operand].EmitWriteArrayElement(keys_index, value, b);
+      };
+      EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(),
+                          values_arrays.size(), iota_values_parameter_index,
+                          tiles_index[rank - 1], xor_masks[0],
+                          tiles_index.GetType(), read_element, write_element,
+                          b);
+    }
     return Status::OK();
   };
-  if (launch_dimensions != nullptr) {
-    TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter,
-                                                compare_shape,
-                                                *launch_dimensions, b)
-                           .EmitLoop(name));
-  } else {
-    TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b)
-                           .EmitLoop(name));
-  }
-
-  // Set the IR builder insert point to the exit basic block of the outer most
-  // loop. This ensures later instructions are inserted after this loop nest.
-  b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
-
-  return Status::OK();
+  return gpu::ParallelLoopEmitter(compare_loop_body_emitter, iteration_shape,
+                                  launch_dimensions, b)
+      .EmitLoop(name);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 2f3bcda2307bcbb35a03b9e71dbbe44e366b3820..685f9383acba416f51681270e4037d56abb4b6ea 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
@@ -29,13 +30,17 @@ namespace xla {
 namespace llvm_ir {
 // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
 // dimension of 'keys_array'. All other dimensions are kept as-is. This
-// implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr,
-// the inner compare loop will not be parallelized.
+// implements the inner loop of BitonicSort. It is assumed that 'xor_masks'
+// contains only powers of 2, or values 2^k - 1 (k > 0). If
+// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand
+// that is a iota and can be used to make the sorting stable.
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
                        const std::vector<IrArray>& values_arrays,
-                       absl::string_view name, llvm::Value* xor_mask,
-                       llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions* launch_dimensions);
+                       int64 iota_values_parameter_index,
+                       absl::string_view name,
+                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
+                       const gpu::LaunchDimensions& launch_dimensions,
+                       int64 num_iterations_in_sort_dim, int64 tile_size);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index cca37556173bb95ef062b59ab0a4bf9ca7c496fe..2180ac845dd71da3a67b0a818866540764ce0848 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -220,4 +220,10 @@ StatusOr<const ShapedBuffer*> LocalService::GlobalDataToShapedBuffer(
   return buffers[replica_number];
 }
 
+StatusOr<GlobalDataHandle> LocalService::RegisterReplicatedBuffers(
+    std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag) {
+  return allocation_tracker_.RegisterReplicatedBuffers(
+      std::move(replicated_buffers), tag);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 3b4f0b50832d6d2b64528ffb63eb5c7375396aec..f56ba32b04b9bf3aba75654bdb98887ad22e6791 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -63,6 +63,11 @@ class LocalService : public Service {
   StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
       const GlobalDataHandle& data, int replica_number);
 
+  // Registers a vector of shaped buffers of device memory, one per replica, and
+  // returns a corresponding handle that can be used for talking to XLA clients.
+  StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
+      std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag);
+
  private:
   explicit LocalService(const ServiceOptions& options,
                         std::unique_ptr<Backend> backend);
diff --git a/tensorflow/compiler/xla/service/map_inliner_test.cc b/tensorflow/compiler/xla/service/map_inliner_test.cc
index 84059dd0f71ee8fc0a25703cbab2268d7dc149a8..fd18bfdc3e7f4b5f94237c554c3e6ca8bd065a35 100644
--- a/tensorflow/compiler/xla/service/map_inliner_test.cc
+++ b/tensorflow/compiler/xla/service/map_inliner_test.cc
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -35,7 +35,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-using MapInlinerTest = HloVerifiedTestBase;
+using MapInlinerTest = HloTestBase;
 
 // Test that `map` with `max` is transformed to `max`
 TEST_F(MapInlinerTest, MapMax) {
@@ -59,12 +59,12 @@ TEST_F(MapInlinerTest, MapMax) {
       HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
   MapInliner inliner;
-  EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
               op::Maximum(lhs, rhs));
 
@@ -93,12 +93,12 @@ TEST_F(MapInlinerTest, MapConstant) {
       HloInstruction::CreateMap(lhs->shape(), {lhs}, const2_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEmbeddedComputation(std::move(const2_f32));
   hlo_module->AddEntryComputation(std::move(computation));
   HloInstruction* root = hlo_module->entry_computation()->root_instruction();
   MapInliner inliner;
-  EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   root = hlo_module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Constant()));
 
@@ -131,12 +131,12 @@ TEST_F(MapInlinerTest, MapSubtractOppositeOrder) {
     HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
   MapInliner inliner;
-  EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
           op::Subtract(rhs, lhs));
 
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index 2ca527bc4cb8f66a085c1e6a7cbb8ddaedbfc07e..9ccdd7d8d818b9fa3aa77cdd10d37ca18928b448 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -257,7 +258,7 @@ bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1,
 }
 
 void MultiOutputFusion::RecomputeReachability() {
-  reachability_ = computation_->ComputeReachability();
+  reachability_ = HloReachabilityMap::Build(computation_);
 }
 
 void MultiOutputFusion::UpdateReachability(
@@ -317,9 +318,9 @@ bool MultiOutputFusion::Perform() {
                 << instr2->fused_instructions_computation()->ToString(
                        HloPrintOptions().set_indent_amount(1));
       }
+      Update(instr1, instr2);
       HloInstruction* ret = Fuse(instr1, instr2);
       set_is_fused(ret == instr1 ? instr2 : instr1);
-      Update(instr1, instr2);
       changed = true;
       VLOG(2) << "After fusion, \t this: " << ret->name() << "\n"
               << ret->fused_instructions_computation()->ToString(
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h
index 9508ab2ed1d38ec40983d8892ec8875b848fb21b..1c7583ece720f9e4d4b71a6279b976fed40e10cb 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.h
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 6152cdc6099a182f1ed98f9501613e0aa123cdbb..f196d9b7f586474f4a5e997b26acf93b732afdda 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -90,8 +90,8 @@ namespace xla {
 // are provided below.
 //
 // Example nullary instruction:
-//   Param()                        == Op().WithOpcode(HloOpcode::kParam)
-//   Param(&a)                      == Op(&a).WithOpcode(HloOpcode::kParam)
+//   Parameter()                    == Op().WithOpcode(HloOpcode::kParameter)
+//   Parameter(&a)                  == Op(&a).WithOpcode(HloOpcode::kParameter)
 //
 // Example unary instruction:
 //   Abs()                             == Op().WithOpcode(HloOpcode::kAbs)
@@ -1067,8 +1067,10 @@ XLA_UNOP_PATTERN(RoundNearestAfz)
 XLA_UNOP_PATTERN(Bitcast)
 XLA_UNOP_PATTERN(Broadcast)
 XLA_UNOP_PATTERN(Ceil)
+XLA_UNOP_PATTERN(Convert)
 XLA_UNOP_PATTERN(Copy)
 XLA_UNOP_PATTERN(Cos)
+XLA_UNOP_PATTERN(CrossReplicaSum)
 XLA_UNOP_PATTERN(Exp)
 XLA_UNOP_PATTERN(Fft)
 XLA_UNOP_PATTERN(Floor)
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index c522e7ae23b734090f85d241bf365fccc37f0adb..c227106511c2c17b44569d3b696cd7d764226e81 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_join.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -59,20 +59,15 @@ string CanonicalPlatformName(const string& name) {
 
 /* static */ StatusOr<std::vector<se::Platform*>>
 PlatformUtil::GetSupportedPlatforms() {
-  se::MultiPlatformManager::PlatformMap platform_map;
-  se::port::Status platforms_status = se::MultiPlatformManager::WithPlatforms(
-      [&platform_map](se::MultiPlatformManager::PlatformMap* map) {
-        platform_map = *map;
-        return se::port::Status::OK();
-      });
-  if (platform_map.empty()) {
+  std::vector<se::Platform*> all_platforms =
+      se::MultiPlatformManager::AllPlatforms();
+  if (all_platforms.empty()) {
     LOG(WARNING) << "no executor platforms available: platform map is empty";
   }
 
   // Gather all platforms which have an XLA compiler.
   std::vector<se::Platform*> platforms;
-  for (auto& platform_pair : platform_map) {
-    auto* platform = platform_pair.second;
+  for (se::Platform* platform : all_platforms) {
     auto compiler_status = Compiler::GetForPlatform(platform);
     if (compiler_status.ok()) {
       platforms.push_back(platform);
@@ -222,8 +217,8 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) {
     // fix the number of devices to one.  However we do let the user override
     // this behavior to help run tests on the host that run models in parallel
     // across multiple devices.
-    device_count = legacy_flags::GetDebugOptionsFromFlags()
-                       .xla_force_host_platform_device_count();
+    device_count =
+        GetDebugOptionsFromFlags().xla_force_host_platform_device_count();
   }
   std::vector<se::StreamExecutor*> stream_executors(device_count, nullptr);
   VLOG(1) << "Initializing devices";
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
index 688cceff0cd10df62a4093f00ad3331ca77652e0..b70cb7057477a338bfb36ebab76237b30d018e41 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
@@ -111,7 +111,7 @@ StatusOr<bool> ReducePrecisionInsertion::insert_on_inputs(
       VLOG(2) << "Adding to operand " << i << ": " << operand;
 
       if (!is_valid_shape(operand->shape())) {
-        VLOG(2) << "Skipped: value is not an F32 vector";
+        VLOG(2) << "Skipped: value is not of type F32";
         continue;
       }
 
@@ -168,7 +168,7 @@ StatusOr<bool> ReducePrecisionInsertion::insert_on_outputs(
             << instruction->ToString();
 
     if (!is_valid_shape(instruction->shape())) {
-      VLOG(2) << "Skipped: value is not an F32 nonscalar array";
+      VLOG(2) << "Skipped: value is not of type F32";
       continue;
     }
 
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
index 0b4e82e8d606cf2cacfab42d07c2201939d5e10b..76c6a87f176ec9c6f8e49c25278c6dad703e3c7c 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
@@ -118,13 +118,7 @@ class ReducePrecisionInsertion : public HloModulePass {
     // equivalent behavior can be obtained by adding ReducePrecision
     // instructions after the instructions that pull the F32 arrays out of
     // the tuples.
-    //
-    // TODO(b/64093391): Remove the IsScalar check once this won't cause
-    // failures on the GPU backend if the ReducePrecision instruction ends up
-    // inserted between a scalar constant and the init_value argument of a
-    // Reduce operation.
-    return shape.element_type() == PrimitiveType::F32 &&
-           !ShapeUtil::IsScalar(shape);
+    return shape.element_type() == PrimitiveType::F32;
   }
 
   // Is this instruction one such that following or preceding it with a new
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
index 69e4b534bd8e3aeab8b729f3e594a10b4368f15f..efeec96571455d8a9e4b7837dd7286392c12f1a3 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
@@ -54,7 +54,34 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Confirm expected state before adding ops.
+  EXPECT_EQ(computation->root_instruction(), b);
+  EXPECT_EQ(b->operand(0), a);
+
+  EXPECT_TRUE(InsertOps(module.get(), HloReducePrecisionOptions::OP_INPUTS,
+                        [](const HloInstruction* instruction) {
+                          return instruction->opcode() == HloOpcode::kCos;
+                        }));
+
+  // Confirm expected graph after adding ops.
+  EXPECT_EQ(computation->root_instruction(), b);
+  EXPECT_THAT(b->operand(0), op::ReducePrecision(a));
+}
+
+TEST_F(ReducePrecisionInsertionTest, BeforeUnaryScalarInstruction) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {});
+
+  // Create a simple graph with a parameter feeding a unary cosine function.
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
+
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -84,7 +111,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeBinaryInstruction) {
   HloInstruction* c = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -113,7 +140,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeZeroInputInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -146,7 +173,7 @@ TEST_F(ReducePrecisionInsertionTest, AvoidAddingDuplicateInstructions) {
   HloInstruction* d = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, b, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -178,7 +205,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterRootInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -215,7 +242,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterNonRootInstruction) {
   HloInstruction* c = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_cos, b_cos));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -242,7 +269,7 @@ TEST_F(ReducePrecisionInsertionTest, OutputIsNotFloat) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -268,7 +295,7 @@ TEST_F(ReducePrecisionInsertionTest, ShouldReduceOutputPrecisionIsFalse) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -294,7 +321,7 @@ TEST_F(ReducePrecisionInsertionTest, InsertionIsNotRecursive) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, a, 8, 23));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -321,7 +348,7 @@ TEST_F(ReducePrecisionInsertionTest, SkipRedundantReducePrecisionAfter) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, x, 5, 10));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -349,7 +376,7 @@ TEST_F(ReducePrecisionInsertionTest, AddNonRedundantReducePrecision) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, x, 8, 23));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -375,7 +402,7 @@ TEST_F(ReducePrecisionInsertionTest, IgnoreOpsInsideFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
@@ -411,7 +438,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInHeadOfFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
@@ -458,7 +485,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInTailOfFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index fcf269eee925c2ddb7511d70e71bd815e4b8c24a..341659b15c4c7355d39739ee171a4a749d87e929 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -34,9 +34,10 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class ReshapeMoverTest : public HloVerifiedTestBase {};
+class ReshapeMoverTest : public HloTestBase {};
 
 TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -50,12 +51,12 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
@@ -74,6 +75,7 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
 // Verifies that the reshape is not moved, since rng0 is trivially reshapable
 // and therefore there is no nontrivial reshapes to move.
 TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto rng0 = builder.AddInstruction(HloInstruction::CreateRng(
@@ -92,18 +94,19 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, const1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(rng0), const1));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(rng0), const1));
 }
 
 TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -117,12 +120,12 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -130,6 +133,7 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -143,11 +147,11 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Add(param0, param1)));
@@ -177,6 +181,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
 // |
 // reshape4
 TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto const0 = builder.AddInstruction(
@@ -196,12 +201,12 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, const0, reshape1, reshape2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(const0, reshape1, reshape2));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Select(op::Reshape(const0), param1, param2)));
@@ -221,6 +226,7 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
 // Verifies that the reshape0 does not sink below add, because param1 is not
 // trivially reshapable nor is a Reshape/Transpose.
 TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -232,11 +238,11 @@ TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), param1));
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), param1));
@@ -257,6 +263,7 @@ TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) {
 // Verifies that we don't unnecessarily sink reshapes, which are in fact
 // trivial reshapes.
 TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {3, 2});
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -275,12 +282,12 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, pred, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(pred, op::Reshape(const0), op::Reshape(const1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(pred, op::Reshape(const0), op::Reshape(const1)));
@@ -309,6 +316,7 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
 //
 // (note that reshape1 here is trivial).
 TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -320,12 +328,12 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, const1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), const1));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Add(param0, op::Reshape(const1))));
@@ -348,6 +356,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
 // For now we treat it as non-trivial, so we verify that we don't sink the
 // reshapes in this case.
 TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {1, 1, 3});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -362,12 +371,12 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(const1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(const1)));
@@ -376,6 +385,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) {
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -389,14 +399,14 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add},
                                        HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(),
               op::Fusion(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Fusion(param0, param1)));
@@ -405,6 +415,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto pred_shape = ShapeUtil::MakeShape(PRED, {8, 7});
@@ -423,13 +434,13 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, reshape_pred, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
       op::Select(op::Reshape(pred), op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Select(pred, param0, param1)));
@@ -438,6 +449,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
 }
 
 TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {});
   auto pred_shape = ShapeUtil::MakeShape(PRED, {});
@@ -452,11 +464,11 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, reshape_pred, param0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->root_instruction(),
               op::Select(op::Reshape(pred), param0, param1));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(op::Reshape(pred), param0, param1));
@@ -477,6 +489,7 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
 //
 // We expect reshape{0,1} AND reshape{2,3} to be lifted.
 TEST_F(ReshapeMoverTest, MultiplePasses) {
+  auto m = CreateNewVerifiedModule();
   auto shape1 = ShapeUtil::MakeShape(F32, {1, 8, 1, 7});
   auto shape2 = ShapeUtil::MakeShape(F32, {8, 7, 1});
   auto shape3 = ShapeUtil::MakeShape(F32, {8, 7});
@@ -500,14 +513,14 @@ TEST_F(ReshapeMoverTest, MultiplePasses) {
   builder.AddInstruction(HloInstruction::CreateBinary(shape3, HloOpcode::kAdd,
                                                       reshape2, reshape3));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
       op::Add(op::Reshape(param2),
               op::Reshape(op::Add(op::Reshape(param0), op::Reshape(param1)))));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -526,11 +539,11 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_TRUE(changed);
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Transpose(op::Multiply()));
 }
 
@@ -555,8 +568,8 @@ TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -580,10 +593,10 @@ TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_TRUE(changed);
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Reshape(), op::Reshape(), op::Reshape()));
 }
 
@@ -597,10 +610,10 @@ TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_TRUE(changed);
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Reshape(op::Add()));
 }
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 6f9094a5c2e882f4bc1531efdef654a6afa2ddb6..13fd6bc0093f3bb94c61fc46dc16ecfea03eb326 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -292,7 +292,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     config->set_seed(execution_options->seed());
     config->set_debug_options(execution_options->debug_options());
   } else {
-    config->set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config->set_debug_options(GetDebugOptionsFromFlags());
   }
 
   if (execute_backend_ != nullptr &&
@@ -760,38 +760,6 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   return Status::OK();
 }
 
-Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
-                              ExecuteResponse* result) {
-  ExecuteGraphParallelRequest parallel_arg;
-  *parallel_arg.add_requests() = *arg;
-  ExecuteParallelResponse parallel_result;
-  TF_RETURN_IF_ERROR(ExecuteGraphParallel(&parallel_arg, &parallel_result));
-  return PickParallelResponse(parallel_result, result);
-}
-
-Status Service::PickParallelResponse(
-    const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
-  // The "result device" selection is a bit hacky, but better than assuming it
-  // is device 0. We have b/76035356 for restructuring the client API to clean
-  // up the current asymmetries and support more functionalities.
-  for (int64 i = 0; i < parallel_result.responses_size(); ++i) {
-    TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
-                        allocation_tracker_.ResolveForReplica(
-                            parallel_result.responses(i).output(), 0));
-    const Shape& shape = buffer->on_host_shape();
-    if (!ShapeUtil::IsEmptyTuple(shape)) {
-      *result = parallel_result.responses(i);
-      VLOG(3) << "Fetching result from device " << i << ": "
-              << ShapeUtil::HumanString(shape);
-      return Status::OK();
-    }
-  }
-  TF_RET_CHECK(parallel_result.responses_size() > 0);
-  *result = parallel_result.responses(0);
-  VLOG(1) << "Defaulting to device 0 result";
-  return Status::OK();
-}
-
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const HloModuleProto& module_proto,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -836,10 +804,8 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
   return std::move(executable);
 }
 
-Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
-                             ExecuteResponse* result) {
-  VLOG(1) << "running execute-graph request";
-
+Status Service::Compile(const CompileRequest* arg, CompileResponse* result) {
+  VLOG(1) << "running compile request";
   if (!arg->has_computation()) {
     return InvalidArgument("computations may not be empty");
   }
@@ -847,22 +813,21 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
     return InvalidArgument("programe shape may not be empty");
   }
 
-  // If we received multiple device handles, we must partition the module.
   if (arg->execution_options().device_handles_size() > 1) {
-    return ExecuteOneToN(arg, result);
+    return InvalidArgument(
+        "The compile request does not support multiple device handles.");
   }
 
-  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
-                                              SingleComputationDeviceHandle()));
-  TF_ASSIGN_OR_RETURN(
-      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-      ResolveAndValidateArguments(arg->arguments(), replicas));
-
+  std::vector<const Shape*> argument_shapes;
+  absl::c_transform(arg->input_shape_with_layout(),
+                    std::back_inserter(argument_shapes),
+                    [](const Shape& shape) { return &shape; });
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
       CreateModuleConfig(arg->computation().host_program_shape(),
-                         replicated_arguments.front(),
-                         arg->execution_options()));
+                         argument_shapes, &arg->execution_options()));
+  VLOG(3) << "Compile created HloModuleConfig computation layout: "
+          << module_config->entry_computation_layout().ToString();
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
@@ -871,6 +836,48 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
                       execute_backend_->default_stream_executor(),
                       /*device_allocator=*/nullptr));
 
+  *result->mutable_handle() = compilation_cache_.Insert(std::move(executable));
+
+  VLOG(1) << "successfully completed 'compile' request";
+  return Status::OK();
+}
+
+Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
+  VLOG(1) << "running execute request";
+  if (!arg->has_handle()) {
+    return InvalidArgument("execution handle should not be empty");
+  }
+  TF_ASSIGN_OR_RETURN(auto executable,
+                      compilation_cache_.LookUp(arg->handle()));
+
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arg->arguments(), replicas));
+
+  // Check that the replicated_arguments has the same shape and layout as the
+  // module config used when creating the exectuable.
+  const int64 num_module_args =
+      executable->module_config().entry_computation_layout().parameter_count();
+  if (num_module_args != arg->arguments_size()) {
+    return InvalidArgument(
+        "The executable expects %lld arguments, but sees %lld.",
+        num_module_args, arg->arguments_size());
+  }
+  for (int64 i = 0; i < num_module_args; i++) {
+    const Shape& shape_module =
+        executable->module_config().entry_computation_layout().parameter_shape(
+            i);
+    const Shape& shape_arg = replicated_arguments.front()[i]->on_host_shape();
+    if (!ShapeUtil::Equal(shape_module, shape_arg)) {
+      return InvalidArgumentStrCat(
+          "The executable exepcts the ", i, "th argument in shape ",
+          ShapeUtil::HumanStringWithLayout(shape_module), " but sees ",
+          ShapeUtil::HumanStringWithLayout(shape_arg));
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(auto stream,
                       execute_backend_->BorrowStream(
                           execute_backend_->default_stream_executor()));
@@ -884,9 +891,10 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
 
   TF_ASSIGN_OR_RETURN(
       *result->mutable_output(),
-      ExecuteAndRegisterResult(
-          executable.get(), replicated_arguments, execute_backend_.get(),
-          "result of " + arg->computation().name(), result->mutable_profile()));
+      ExecuteAndRegisterResult(executable.get(), replicated_arguments,
+                               execute_backend_.get(),
+                               "result of " + executable->module().name(),
+                               result->mutable_profile()));
 
   if (executable->dumping_snapshot()) {
     TF_ASSIGN_OR_RETURN(
@@ -898,7 +906,7 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
     TF_RETURN_IF_ERROR(executable->DumpHloSnapshot());
   }
 
-  VLOG(1) << "successfully completed 'execute-graph' request";
+  VLOG(1) << "successfully completed 'execute' request";
   return Status::OK();
 }
 
@@ -949,21 +957,6 @@ Status Service::TransferToClient(const TransferToClientRequest* arg,
   return Status::OK();
 }
 
-namespace {
-
-// Creates a clone of the given shaped buffer with the given device ordinal. The
-// shape and DeviceMemoryBase values of the clone are identical to the original.
-std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
-    const ShapedBuffer& shaped_buffer, int device_ordinal) {
-  auto clone = absl::make_unique<ShapedBuffer>(
-      shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(),
-      shaped_buffer.platform(), device_ordinal);
-  clone->buffers() = shaped_buffer.buffers();
-  return clone;
-}
-
-}  // namespace
-
 Status Service::TransferToServer(const TransferToServerRequest* arg,
                                  TransferToServerResponse* result) {
   TF_ASSIGN_OR_RETURN(Literal literal,
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 8cf1a7b9f01fbb3572c6849c8b18e14174ced89f..11e1a79552fbd944ab28da129b08cfe676fb08e9 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -22,11 +22,12 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/allocation_tracker.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
+#include "tensorflow/compiler/xla/service/compilation_cache.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/execution_tracker.h"
@@ -90,11 +91,14 @@ class Service : public ServiceInterface {
   Status DeconstructTuple(const DeconstructTupleRequest* arg,
                           DeconstructTupleResponse* result) override;
 
-  // Executes a computation with the provided global data passed as
-  // immutable arguments. The request contains the whole computation graph.
-  // Returns global data output and execution timing.
-  Status ExecuteGraph(const ExecuteGraphRequest* arg,
-                      ExecuteResponse* result) override;
+  // Compiles a computation into an executable. The request contains the whole
+  // computation graph. Returns the handle to the executable.
+  Status Compile(const CompileRequest* arg, CompileResponse* result) override;
+
+  // Executes an executable with the provided global data passes as immutable
+  // arguments. The request contains the handle to the executable. Returns
+  // global data output and execution timing.
+  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override;
 
   // Executes one or more computations in parallel with the provided global data
   // passed as immutable arguments. Returns global data output for each
@@ -179,10 +183,6 @@ class Service : public ServiceInterface {
       absl::Span<const ShapedBuffer* const> arguments,
       const ExecutionOptions& execution_options);
 
-  // Picks a parallel response and fills the result.
-  Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
-                              ExecuteResponse* result);
-
   // Prepare the executors for executing parallel.
   StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
       const ExecutionOptions& execution_options, int64 requests_size,
@@ -254,11 +254,6 @@ class Service : public ServiceInterface {
       Backend* backend, absl::Span<const DeviceHandle> device_handles,
       absl::Span<const string> result_tags, ExecutionProfile* profile);
 
-  // Executes a single computation which has more than one target device.
-  // The N devices are expected to all return an empty tuple, but one, which
-  // will be the result of this computation.
-  Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result);
-
   // Convenience function which checks whether the given client_shape
   // (presumably passed by the client to set the result layout) is valid for the
   // given computation result shape.
@@ -281,6 +276,9 @@ class Service : public ServiceInterface {
 
   ServiceOptions options_;
 
+  // Cache containing previously built Executables.
+  CompilationCache compilation_cache_;
+
   // Tracks channels created via the API.
   ChannelTracker channel_tracker_;
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 2f8f092303ed1821d9bff021da0e835f1878f5ed..2bfc1676bddc66bdc90052589ed3024510c24d8f 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -2031,6 +2031,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   return operand_shape;
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferGetDimensionSizeShape(
+    const Shape& shape, int64 dimension) {
+  if (dimension < 0 || dimension >= ShapeUtil::Rank(shape)) {
+    return InvalidArgument("GetDimensionSize dimension out of bounds: %d.",
+                           dimension);
+  }
+
+  // TODO(b/119580730): Remove this restriction when very large dimension size
+  // is needed.
+  if (shape.dimensions(dimension) > std::numeric_limits<uint32>::max()) {
+    return InvalidArgument(
+        "GetDimensionSize's input shape is %s, the %dth dimension exceeds the "
+        "UINT_MAX limit.",
+        ShapeUtil::HumanString(shape), dimension);
+  }
+
+  return ShapeUtil::MakeShape(U32, {});
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferSliceShape(
     const Shape& arg, absl::Span<const int64> starts,
     absl::Span<const int64> limits, absl::Span<const int64> strides) {
@@ -2833,6 +2852,15 @@ Status ValidateScatterDimensionNumbers(
     }
   }
 
+  // Validate window size.
+  auto window_size = dim_numbers.update_window_dims_size() +
+                     dim_numbers.inserted_window_dims_size();
+  if (window_size != ShapeUtil::Rank(operand_shape)) {
+    return InvalidArgument(
+        "Scatter op has window of size %d; doesn't match operand of rank %d.",
+        window_size, ShapeUtil::Rank(operand_shape));
+  }
+
   // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers.
   if (dim_numbers.scatter_dims_to_operand_dims_size() !=
       scatter_indices_shape[dim_numbers.index_vector_dim()]) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index cd4e5ab52ca5e33424f2b78f83cc94961b254493..31ef4b2e41078f87731a1eff58e37409a6004ba4 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -291,6 +291,9 @@ class ShapeInference {
       const Shape& updates_shape, const ProgramShape& to_apply_shape,
       const ScatterDimensionNumbers& scatter_dim_numbers);
 
+  static StatusOr<Shape> InferGetDimensionSizeShape(const Shape& shape,
+                                                    int64 dimension);
+
  private:
   // Helper that infers the shape produced by performing an element-wise binary
   // operation with the given LHS and RHS shapes.
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 7b65e8c1c9d2bc730c6c8550e9265b69fdde71cf..4639e32db4d59080a9e85e46983fac61d9e76be9 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -2673,5 +2673,23 @@ TEST_F(ScatterGatherShapeInferenceTest,
       << statusor.status();
 }
 
+TEST_F(ScatterGatherShapeInferenceTest,
+       InvalidScatterDimNumbers_InsufficientWindowDims) {
+  StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+      f32_5d_tensor_50_49_48_47_46_, s64_scalar_,
+      ShapeUtil::MakeShape(F32, {30, 29, 28, 27}), to_apply_,
+      HloScatterInstruction::MakeScatterDimNumbers(
+          /*update_window_dims=*/{0, 1, 2, 3},
+          /*inserted_window_dims=*/{},
+          /*scatter_dims_to_operand_dims=*/{0},
+          /*index_vector_dim=*/0));
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(
+      statusor.status().error_message(),
+      HasSubstr(
+          "Scatter op has window of size 4; doesn't match operand of rank 5."))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 56952e3adae59656605a12fd499162504a2a3379..28a30b5ee2dbcb5012804578d4d037c241045309 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -157,4 +157,23 @@ void ScopedShapedBuffer::Deallocate() {
   }
 }
 
+ScopedShapedBuffer ScopedShapedBuffer::TakeSubTree(ShapeIndexView index) {
+  const xla::Shape& sub_on_host_shape =
+      xla::ShapeUtil::GetSubshape(on_host_shape(), {index});
+  const xla::Shape& sub_on_device_shape =
+      xla::ShapeUtil::GetSubshape(on_device_shape(), {index});
+
+  ScopedShapedBuffer output(sub_on_host_shape, sub_on_device_shape,
+                            memory_allocator(), device_ordinal());
+  auto src_it = buffers().find(index);
+  auto dst_it = output.buffers().begin();
+  while (dst_it != output.buffers().end()) {
+    dst_it->second = src_it->second;
+    src_it->second = tensorflow::se::DeviceMemoryBase(nullptr, 0);
+    ++src_it;
+    ++dst_it;
+  }
+  return output;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index e1d26da4a20c0105be304b1a34c81515fcdc6b7f..f5210c9cfa6b29853bcd0f5bfd581ee3e116a509 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -176,6 +176,11 @@ class ScopedShapedBuffer : public ShapedBuffer {
   // It's the caller's job to ensure that the memory contained therein is freed.
   TF_MUST_USE_RESULT ShapedBuffer release();
 
+  // Extracts the sub-tree rooted at 'index' and returns a ScopedShapedBuffer
+  // that holds ownership of the subtree. Sets the buffers corresponding to the
+  // subtree to null in 'this'.
+  ScopedShapedBuffer TakeSubTree(ShapeIndexView index);
+
  protected:
   void Deallocate();
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
index d69e6362e91e4696dab3c46d99a981c67b593a1c..ca64bd3c8dd2baa686db2b85c937a034b37ab22b 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace xla {
@@ -107,5 +109,79 @@ TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) {
   // TestAllocator's destructor checks that all memory was freed.
 }
 
+TEST(ScopedShapedBufferTest, TestTakeSubTree) {
+  TestAllocator allocator;
+
+  Shape s = ShapeUtil::MakeShape(F32, {1});
+  s = xla::ShapeUtil::MakeTupleShape(std::vector<xla::Shape>(2, s));
+  s = xla::ShapeUtil::MakeTupleShape(std::vector<xla::Shape>(3, s));
+
+  ScopedShapedBuffer sb(s, s, &allocator, /*device_ordinal=*/0);
+  sb.buffers().ForEachMutableElement(
+      [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
+        TF_ASSERT_OK_AND_ASSIGN(
+            OwningDeviceMemory m,
+            allocator.Allocate(/*device_ordinal=*/0, /*size=*/77));
+        *buffer = m.Forget();
+      });
+  ShapeTree<se::DeviceMemoryBase> buffers = sb.buffers();
+
+  // Takes a subtree out of 'sb', and verifies the buffers are as expected.
+  xla::ShapeIndex subtree_index = {1};
+  ScopedShapedBuffer output = sb.TakeSubTree(subtree_index);
+
+  output.buffers().ForEachElement([&](const xla::ShapeIndex& sub_index,
+                                      const se::DeviceMemoryBase& buffer) {
+    xla::ShapeIndex orig_index = subtree_index;
+    for (int i : sub_index) {
+      orig_index.push_back(i);
+    }
+    EXPECT_TRUE(buffers.find(orig_index)->second.IsSameAs(buffer));
+  });
+  sb.buffers().ForEachElement(
+      [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) {
+        if (ShapeIndexView(index).StartsWith(subtree_index)) {
+          EXPECT_TRUE(buffer.is_null());
+        } else {
+          EXPECT_TRUE(buffers.find(index)->second.IsSameAs(buffer));
+        }
+      });
+}
+
+// Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs
+// (cardinality of each non-leaf node's children).
+void BM_TakeSubTree(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  TestAllocator allocator;
+  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = xla::ShapeUtil::MakeTupleShape(shapes);
+  }
+  xla::ScopedShapedBuffer shaped_buffer(shape, shape, /*allocator=*/&allocator,
+                                        /*device_ordinal=*/0);
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    // Extract a buffer from approximately the middle of the first level of the
+    // tree.
+    (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release();
+  }
+  tensorflow::testing::StopTiming();
+}
+
+BENCHMARK(BM_TakeSubTree)
+    ->ArgPair(1, 4)
+    ->ArgPair(1, 8)
+    ->ArgPair(1, 32)
+    ->ArgPair(1, 64)
+    ->ArgPair(1, 128)
+    ->ArgPair(1, 256)
+    ->ArgPair(1, 512)
+    ->ArgPair(2, 4)
+    ->ArgPair(2, 8)
+    ->ArgPair(2, 32)
+    ->ArgPair(2, 64)
+    ->ArgPair(2, 128);
+
 }  // anonymous namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 79b5c09abb355cd067a4891af558c8c44d80d88e..17cdaa74fc328d156292f5af828d4222a9a01f1f 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -172,7 +172,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary(
       add->shape(), HloOpcode::kMultiply, add, sub));
 
-  auto module = CreateNewModule("fuse_with_constant_operands");
+  auto module = CreateNewVerifiedModule("fuse_with_constant_operands");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(mul));
   HloInstruction* call = module->OutlineExpressionFromComputation(
@@ -247,7 +247,7 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
       conv_shape.ValueOrDie(), x, transpose_y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -302,7 +302,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) {
       conv_shape.ValueOrDie(), x, transpose_y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -362,7 +362,7 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
       conv_shape.ValueOrDie(), transpose_x, y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -428,7 +428,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
       conv_shape.ValueOrDie(), transpose_x, y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index d9ebebf74ed846aa05326a4df72019ef3e71ad88..10ef2d38fa21c3e93c270535bc99b2f76435337d 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -48,7 +48,7 @@ class TuplePointsToAnalysisTest : public HloTestBase {
   }
 
   void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
+    module_ = CreateNewUnverifiedModule();
     module_->AddEntryComputation(std::move(computation));
   }
 
@@ -809,7 +809,7 @@ TEST_F(FusionPointsToAnalysisTest, FusionParam0TwoUsers) {
 class PointsToAnalysisTestBase : public HloTestBase {
  protected:
   void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
+    module_ = CreateNewUnverifiedModule();
     computation_ = module_->AddEntryComputation(std::move(computation));
   }
 
@@ -1176,7 +1176,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
     return builder.Build();
   };
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   HloComputation* cond_computation =
       module_->AddEmbeddedComputation(make_cond());
   HloComputation* body_computation =
@@ -1211,7 +1211,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
   auto add = sub_builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
   sub_computation->CreateFusionInstruction({add, ones},
                                            HloInstruction::FusionKind::kLoop);
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
index 516754e2110ee50a597818c4a8bcfbfbb76c5cec..65b0f8c804475d8f22fff9798e79c9881a51f1f1 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -34,7 +34,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-class TupleSimplifierTest : public HloVerifiedTestBase {
+class TupleSimplifierTest : public HloTestBase {
  protected:
   void Run(HloModule* module, bool change_expected) {
     TupleSimplifier simplifier;
@@ -65,10 +65,10 @@ TEST_F(TupleSimplifierTest, TupleOfParameters) {
   HloInstruction* param2 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, scalar_shape_, "param2"));
   builder.AddInstruction(HloInstruction::CreateTuple({param0, param1, param2}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  Run(module, /*change_expected=*/false);
+  Run(module.get(), /*change_expected=*/false);
 }
 
 TEST_F(TupleSimplifierTest, GteOfTupleOfParameter) {
@@ -78,10 +78,10 @@ TEST_F(TupleSimplifierTest, GteOfTupleOfParameter) {
       HloInstruction::CreateParameter(0, tuple_shape_, "param"));
   builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  Run(module, /*change_expected=*/false);
+  Run(module.get(), /*change_expected=*/false);
 }
 
 TEST_F(TupleSimplifierTest, GteOfTuple) {
@@ -98,12 +98,12 @@ TEST_F(TupleSimplifierTest, GteOfTuple) {
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), gte);
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), param1);
 }
@@ -125,13 +125,13 @@ TEST_F(TupleSimplifierTest, GteOfTupleChain) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, element));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Negate(op::GetTupleElement(op::Tuple())));
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), op::Negate(op::Parameter()));
 }
@@ -157,12 +157,12 @@ TEST_F(TupleSimplifierTest, NestedGteOfTuples) {
         ShapeUtil::GetTupleElementShape(element->shape(), 0), element, 0));
   }
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), element);
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -182,12 +182,12 @@ TEST_F(TupleSimplifierTest, TupleOfGteInstructions) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), tuple);
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), tuple_param);
 }
@@ -207,19 +207,19 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), tuple);
 
-  Run(module, /*change_expected=*/false);
+  Run(module.get(), /*change_expected=*/false);
 
   EXPECT_THAT(computation->root_instruction(), tuple);
 }
 
 TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) {
   //  Verify that the root computation can be excluded
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloInstruction* p0;
   HloInstruction* p1;
@@ -281,7 +281,7 @@ TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) {
     entry = module->AddEntryComputation(builder.Build());
   }
 
-  Run(module, /*change_expected=*/true, /*exclude_entry=*/true);
+  Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/true);
 
   EXPECT_THAT(c0->root_instruction(), p0);
   EXPECT_THAT(c1->root_instruction(), p1);
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index 541b117e0299c94de330604ec5c16e20f07c425f..68e2569f66bea9ec1223e454d1ead0efc7b9498e 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 
 namespace xla {
 
@@ -229,4 +232,96 @@ optional<int64> ComputeWhileLoopTripCount(HloInstruction* while_op,
   return nullopt;
 }
 
+// If the only user of this instruction is a get-tuple-element, return that
+// get-tuple-element, otherwise return null. If this runs before CSE/DCE, we may
+// get a false negative if there are several copies of the same GTE, or there
+// are unused GTEs, but we can live with this.
+static HloInstruction* GetOnlyGTE(HloInstruction* inst) {
+  if (inst->user_count() != 1) {
+    return nullptr;
+  }
+
+  HloInstruction* user = inst->users().back();
+  if (user->opcode() != HloOpcode::kGetTupleElement) {
+    return nullptr;
+  }
+  return user;
+}
+
+optional<int64> ComputeWhileLoopTripCountUpperBound(HloInstruction* while_op) {
+  // If we know the exact trip count, it's also the upper bound.
+  auto exact_trip_count = ComputeWhileLoopTripCount(while_op);
+  if (exact_trip_count) {
+    VLOG(2) << "Loop has exact trip count.";
+    return exact_trip_count;
+  }
+
+  // There is one more case we know how to handle. If the loop condition only
+  // looks at one element of the tuple, and the loop body sets this element to a
+  // constant, there are two options:
+  // 1) Evaluating the condition on this constant returns true. In this case,
+  // the loop either executes 0 times, or is an infinite loop, depending on the
+  // init value.
+  // 2) Evaluating the condition on this constant returns false. In this case,
+  // the loop executes 0 or 1 times, depending on the init value. This means
+  // that, regardless of the init value, the upper bound on the trip count is 1.
+
+  // Check whether the condition depends on a single parameter, and find out
+  // which.
+  auto* while_cond = while_op->while_condition();
+  auto* while_cond_param = while_cond->parameter_instruction(0);
+  auto* cond_gte = GetOnlyGTE(while_cond_param);
+  if (!cond_gte) {
+    VLOG(2) << "Induction variable not found in loop condition: "
+            << while_cond->root_instruction()->ToString();
+    return nullopt;
+  }
+
+  // Now check whether this gets set to a constant by the while body.
+  auto* while_body = while_op->while_body();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_body_root->opcode() != HloOpcode::kTuple) {
+    VLOG(3) << "While body's root is not a tuple instruction: "
+            << while_body_root->ToString();
+    return nullopt;
+  }
+
+  int64 indvar_index = cond_gte->tuple_index();
+  auto* while_body_indvar = while_body_root->operand(indvar_index);
+  if (while_body_indvar->opcode() != HloOpcode::kConstant) {
+    VLOG(3) << "While body does not set the IV to a constant: "
+            << while_body_indvar->ToString();
+    return nullopt;
+  }
+
+  // We have a constant. Evaluate the condition on this constant.
+  HloEvaluator evaluator(/*max_loop_iterations=*/0);
+  Literal fake_input = Literal::CreateFromShape(while_cond_param->shape());
+  TF_CHECK_OK(fake_input.CopyFrom(while_body_indvar->literal(),
+                                  /*dest_shape_index=*/{indvar_index},
+                                  /*src_shape_index=*/{}));
+  StatusOr<Literal> eval_result =
+      evaluator.Evaluate<Literal>(*while_cond, {std::move(fake_input)});
+
+  if (!eval_result.ok()) {
+    VLOG(2) << "Couldn't evaluate while loop condition.";
+    return nullopt;
+  }
+
+  Literal cond_result_pred = std::move(eval_result.ValueOrDie());
+  CHECK(ShapeUtil::Equal(cond_result_pred.shape(),
+                         ShapeUtil::MakeShape(PRED, {})));
+
+  // Per the explanation above, if the evaluated condition returns false, the
+  // loop executes at most once.
+  bool cond_returns_true = cond_result_pred.GetFirstElement<bool>();
+  if (!cond_returns_true) {
+    VLOG(2) << "Upper bound on the trip count is 1";
+    return 1;
+  }
+
+  VLOG(2) << "Loop has no known upper bound on the trip count.";
+  return nullopt;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h
index bf497f4892b95c927379411468a66d8961465413..ac69a727bd6b403672a676400993fb7d8afc0a55 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.h
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.h
@@ -28,6 +28,10 @@ namespace xla {
 absl::optional<int64> ComputeWhileLoopTripCount(HloInstruction *while_op,
                                                 int64 max_value_returned = 128);
 
+// Returns an upper bound on the trip count of the loop if it's statically
+// known, nullopt otherwise.
+absl::optional<int64> ComputeWhileLoopTripCountUpperBound(
+    HloInstruction *while_op);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis_test.cc b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1da0fbeac89a93eaaef893e5f25dd3b87cc1d5d5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
+
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class WhileLoopAnalysisTest : public HloTestBase {};
+
+TEST_F(WhileLoopAnalysisTest, SingleIterationUpperBound) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], s32[]) parameter(0)
+      val = f32[2] get-tuple-element(p_body), index=0
+      const = s32[] constant(-1)
+      ROOT root = (f32[2], s32[]) tuple(val, const)
+    }
+
+    condition {
+      p_cond = (f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=1
+      const = s32[] constant(42)
+      ROOT result = pred[] equal-to(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] parameter(1)
+      while_init = (f32[2], s32[]) tuple(param.0, param.1)
+      ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  HloInstruction* while_op = module->entry_computation()->root_instruction();
+  EXPECT_EQ(*ComputeWhileLoopTripCountUpperBound(while_op), 1);
+}
+
+TEST_F(WhileLoopAnalysisTest, NoUpperBound) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], s32[]) parameter(0)
+      val = f32[2] get-tuple-element(p_body), index=0
+      const = s32[] constant(42)
+      ROOT root = (f32[2], s32[]) tuple(val, const)
+    }
+
+    condition {
+      p_cond = (f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=1
+      const = s32[] constant(42)
+      ROOT result = pred[] equal-to(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] parameter(1)
+      while_init = (f32[2], s32[]) tuple(param.0, param.1)
+      ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  HloInstruction* while_op = module->entry_computation()->root_instruction();
+  EXPECT_EQ(ComputeWhileLoopTripCountUpperBound(while_op), absl::nullopt);
+}
+
+TEST_F(WhileLoopAnalysisTest, ExactBound) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], s32[]) parameter(0)
+      val = f32[2] get-tuple-element(p_body), index=0
+      index = s32[] get-tuple-element(p_body), index=1
+      one = s32[] constant(1)
+      inc = s32[] add(index, one)
+      ROOT root = (f32[2], s32[]) tuple(val, inc)
+    }
+
+    condition {
+      p_cond = (f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=1
+      const = s32[] constant(42)
+      ROOT result = pred[] less-than(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] constant(0)
+      while_init = (f32[2], s32[]) tuple(param.0, param.1)
+      ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  HloInstruction* while_op = module->entry_computation()->root_instruction();
+  EXPECT_EQ(*ComputeWhileLoopTripCountUpperBound(while_op), 42);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
index 067cfcc17d65860a249de4d9e31703df12091d3a..8b381dec07397c1427e98bc30511ac21dc577610 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
@@ -46,8 +46,9 @@ static Status ReplaceUsesWhileKeepingLoopInvariance(
   return Status::OK();
 }
 
-StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody(
+StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop(
     HloInstruction* while_instr) {
+  HloComputation* while_cond = while_instr->while_condition();
   HloComputation* while_body = while_instr->while_body();
 
   const HloInstruction& init_value = *while_instr->operand(0);
@@ -57,24 +58,48 @@ StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody(
 
   bool changed = false;
 
-  for (HloInstruction* invariant_gte :
-       WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) {
-    int64 index = invariant_gte->tuple_index();
+  absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>
+      conditional_gte_index_to_insts =
+          WhileUtil::GetGTEsMapForWhileConditional(*while_cond);
+  std::vector<HloInstruction*> invariant_body_gtes =
+      WhileUtil::GetInvariantGTEsForWhileBody(*while_body);
+
+  for (HloInstruction* invariant_body_gte : invariant_body_gtes) {
+    int64 index = invariant_body_gte->tuple_index();
     const HloInstruction& invariant_value = *init_value.operand(index);
 
-    // Should have at least one user that's not while_body_root.
-    if (invariant_gte->user_count() <= 1) {
+    // Original value should be a constant.
+    if (invariant_value.opcode() != HloOpcode::kConstant) {
       continue;
     }
 
-    if (invariant_value.opcode() == HloOpcode::kConstant) {
-      auto* constant_instr =
+    // Sink into the while_body.
+    // Should have at least one user that's not while_body_root.
+    if (invariant_body_gte->user_count() > 1) {
+      HloInstruction* constant_instr =
           while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk"));
       TF_RETURN_IF_ERROR(ReplaceUsesWhileKeepingLoopInvariance(
-          invariant_gte, constant_instr, while_body->root_instruction(),
+          invariant_body_gte, constant_instr, while_body->root_instruction(),
           index));
       changed = true;
     }
+
+    // Check if there is a corresponding GTE in while_conditional.
+    auto it = conditional_gte_index_to_insts.find(index);
+    if (it == conditional_gte_index_to_insts.end()) {
+      continue;
+    }
+
+    for (HloInstruction* invariant_cond_gte : it->second) {
+      // Should have at least one user.
+      if (invariant_cond_gte->user_count() > 0) {
+        HloInstruction* constant_instr = while_cond->AddInstruction(
+            invariant_value.Clone(/*suffix=*/".sunk"));
+        TF_RETURN_IF_ERROR(
+            invariant_cond_gte->ReplaceAllUsesWith(constant_instr));
+        changed = true;
+      }
+    }
   }
 
   return changed;
@@ -115,10 +140,8 @@ StatusOr<bool> WhileLoopConstantSinking::Run(HloModule* module) {
   }
 
   for (HloInstruction* while_instr : while_instrs) {
-    // We only sink into while loop bodies, but this can be extended to
-    // transform conditions as well.
     TF_ASSIGN_OR_RETURN(bool result,
-                        TrySinkingConstantsIntoWhileBody(while_instr));
+                        TrySinkingConstantsIntoWhileLoop(while_instr));
     changed |= result;
   }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
index 577bad6c7062d2ee40271e407e8eed7655fa13bf..a866bc1264b4013bb7530b5e02b546e6f78d676b 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
@@ -23,8 +23,8 @@ limitations under the License.
 namespace xla {
 
 // Sinks while loop invariant values that happen to be constants into the while
-// loop body.  This is probably not a win in isolation but may unlock further
-// optimizations like constant folding.
+// loop body and conditional. This is probably not a win in isolation but may
+// unlock further optimizations like constant folding.
 //
 //   state = (..., const, ...)
 //   while (pred(state)) {
@@ -46,22 +46,19 @@ namespace xla {
 // tuple trivially loop invariant.  WhileLoopSimplifier will later get rid of
 // `v`.
 //
-// We only sink into while loop bodies, but this can be extended to transform
-// conditions as well.
-//
 // TODO(b/79121449):  We should also sink broadcasts of constants.
 class WhileLoopConstantSinking : public HloModulePass {
  public:
   ~WhileLoopConstantSinking() override = default;
 
   absl::string_view name() const override {
-    return "while-loop-invariant-code-motion";
+    return "while-loop-constant-sinking";
   }
 
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  StatusOr<bool> TrySinkingConstantsIntoWhileBody(HloInstruction* while_instr);
+  StatusOr<bool> TrySinkingConstantsIntoWhileLoop(HloInstruction* while_instr);
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index d17b86fab5b14d13250a03fc8f74abb9661ed5ce..75d406435b6f58faecc86b82c33e9e2dd6bccbea 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -242,5 +242,178 @@ ENTRY entry {
     }
   }
 }
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalSinkConstant) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[],f32[]) parameter(0)
+  p_body.0 = f32[] get-tuple-element((f32[],f32[]) p_body), index=0
+  const = f32[] constant(1)
+  add = f32[] add(p_body.0, const)
+  p_body.1 = f32[] get-tuple-element((f32[],f32[]) p_body), index=1
+  ROOT root = (f32[],f32[]) tuple(add, p_body.1)
+}
+
+condition {
+  p_cond = (f32[],f32[]) parameter(0)
+  p_cond.0 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=0
+  p_cond.1 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=1
+  ROOT result = pred[] less-than(p_cond.0, p_cond.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = f32[] constant(10)
+  while_init = (f32[],f32[]) tuple(const_0, const_1)
+  ROOT while = (f32[],f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant()));
+}
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalTupleShapedConstants) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_b = (f32[],(f32[],f32[])) parameter(0)
+  p_b.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_b), index=0
+  p_b.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_b), index=1
+  p_b.1.0 = f32[] get-tuple-element((f32[],f32[]) p_b.1), index=0
+  add = f32[] add(p_b.0, p_b.1.0)
+  ROOT root = (f32[],(f32[],f32[])) tuple(add, p_b.1)
+}
+
+condition {
+  p_c = (f32[],(f32[],f32[])) parameter(0)
+  p_c.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_c), index=0
+  p_c.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_c), index=1
+  p_c.1.1 = f32[] get-tuple-element((f32[],f32[]) p_c.1), index=1
+  ROOT result = pred[] less-than(p_c.0, p_c.1.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = (f32[], f32[]) constant((f32[], f32[]) (1, 10))
+  while_init = (f32[],(f32[],f32[])) tuple(const_0, const_1)
+  ROOT while = (f32[],(f32[],f32[])) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(),
+              op::Lt(_, op::GetTupleElement(op::Constant())));
+}
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalDontCreateDeadConstant) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[],f32[],f32[]) parameter(0)
+  p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0
+  const = f32[] constant(1)
+  add = f32[] add(p_body.0, const)
+  p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1
+  p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2
+  ROOT root = (f32[],f32[],f32[]) tuple(add, p_body.1, p_body.2)
+}
+
+condition {
+  p_cond = (f32[],f32[],f32[]) parameter(0)
+  p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0
+  p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1
+  p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
+  ROOT result = pred[] less-than(p_cond.0, p_cond.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = f32[] constant(10)
+  const_2 = f32[] constant(12)
+  while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2)
+  ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant()));
+  for (const HloInstruction* inst : while_condition->instructions()) {
+    if (inst->opcode() == HloOpcode::kConstant) {
+      EXPECT_GT(inst->user_count(), 0);
+    }
+  }
+}
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalMultipleSameIndexGTEs) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[],f32[],f32[]) parameter(0)
+  p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0
+  const = f32[] constant(1)
+  add.0 = f32[] add(p_body.0, const)
+  p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1
+  add.1 = f32[] add(p_body.1, const)
+  p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2
+  ROOT root = (f32[],f32[],f32[]) tuple(add.0, add.1, p_body.2)
+}
+
+condition {
+  p_cond = (f32[],f32[],f32[]) parameter(0)
+  p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0
+  p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
+  lt.0 = pred[] less-than(p_cond.0, p_cond.2)
+  p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1
+  p_cond.2.c = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
+  lt.1 = pred[] less-than(p_cond.1, p_cond.2.c)
+  ROOT result = pred[] and(lt.0, lt.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = f32[] constant(0)
+  const_2 = f32[] constant(12)
+  while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2)
+  ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(),
+              op::And(op::Lt(_, op::Constant()), op::Lt(_, op::Constant())));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index 9795b2830b6d9add82b89ac76b5438ddc3d2bfe8..41011176ffa91e885bc58364d1fb19617d3518ad 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -19,7 +19,9 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -143,6 +145,12 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
   string while_instr_name = while_instr->ToString(print_no_metadata);
   VLOG(2) << "Trying to hoist from " << while_instr_name;
 
+  auto maybe_upper_bound = ComputeWhileLoopTripCountUpperBound(while_instr);
+  if (maybe_upper_bound && *maybe_upper_bound <= 1) {
+    VLOG(2) << "Loop has a trip count of at most 1, skipping.";
+    return false;
+  }
+
   HloComputation* while_body = while_instr->while_body();
 
   // Maps instructions in the while body to instructions hoisted outside the
@@ -180,6 +188,13 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
     return false;
   }
 
+  // LICM in the presence of domain instructions is complex, bail.
+  for (auto* instruction : while_body->MakeInstructionPostOrder()) {
+    if (instruction->opcode() == HloOpcode::kDomain) {
+      return false;
+    }
+  }
+
   // instructions_to_replace[i] is hoisted into a loop invariant instruction
   // replacement_instructions[i].
   std::vector<HloInstruction*> instructions_to_replace;
@@ -193,6 +208,37 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
       continue;
     }
 
+    if (!hoist_size_inflating_ops_) {
+      // Check that hoisting the instruction doesn't cause a significant memory
+      // blow-up. LICM extends the live-range of the output of the hoisted
+      // instruction to be the entire while loop, which may be problematic on
+      // platforms where memory is limited. This can be especially harmful if
+      // the instruction has a significantly larger output than its input, e.g.
+      // kIota, kBroadcast or kConstant.
+      int64 input_size = 0, output_size = 0;
+
+      for (auto* operand : instruction->operands()) {
+        ShapeUtil::ForEachSubshape(
+            operand->shape(),
+            [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) {
+              if (ShapeUtil::IsArray(subshape)) {
+                input_size += ShapeUtil::ByteSizeOfElements(subshape);
+              }
+            });
+      }
+      ShapeUtil::ForEachSubshape(
+          instruction->shape(),
+          [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) {
+            if (ShapeUtil::IsArray(subshape)) {
+              output_size += ShapeUtil::ByteSizeOfElements(subshape);
+            }
+          });
+
+      if (output_size > input_size) {
+        continue;
+      }
+    }
+
     auto is_invariant = [&](HloInstruction* op) {
       return hoisted_instructions.find(op) != hoisted_instructions.end() ||
              unhoisted_invariant_instructions.count(op) ||
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
index 3031899f71e0fd77f20448d9d7489798af01615c..bd6232dc0a988775a0490abbf6125daad8476295 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
@@ -34,8 +34,14 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
   // Setting `hoist_constants` to false can be help if LICM is run in the mid
   // level HLO pipeline because hoisting constants out of while loop bodies can
   // break optimizations like constant folding.
-  explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false)
-      : hoist_constants_(hoist_constants) {}
+  // Setting `hoist_size_inflating_ops` to false will forbid hoisting
+  // instructions where the size of the output(s) is larger than the size of the
+  // input(s). This is useful on platforms on which it's important to prevent
+  // blow-ups in memory size.
+  explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false,
+                                        bool hoist_size_inflating_ops = true)
+      : hoist_constants_(hoist_constants),
+        hoist_size_inflating_ops_(hoist_size_inflating_ops) {}
   ~WhileLoopInvariantCodeMotion() override = default;
 
   absl::string_view name() const override {
@@ -49,6 +55,7 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
       HloInstruction* while_instr);
 
   bool hoist_constants_;
+  bool hoist_size_inflating_ops_;
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 32e69c335b713c438bd7fcb2053709b0624f58ed..8e7c4bc8828552e197b41f874c070d496b85a382 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -26,7 +26,7 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class WhileLoopInvariantCodeMotionTest : public HloVerifiedTestBase {
+class WhileLoopInvariantCodeMotionTest : public HloTestBase {
  public:
   // Makes a computation which has one parameter, of the given shape, and always
   // returns PRED[]{true}.  This is useful as a dummy loop condition.
@@ -58,6 +58,7 @@ HloComputation* WhileLoopInvariantCodeMotionTest::MakeAlwaysTrueComputation(
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -76,19 +77,18 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  HloComputation* entry_computation =
-      module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  HloComputation* entry_computation = m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
   HloInstruction* transformed_while;
@@ -100,6 +100,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -135,19 +136,18 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, divide_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  HloComputation* entry_computation =
-      module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  HloComputation* entry_computation = m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
   HloInstruction* transformed_while;
@@ -173,6 +173,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
 TEST_F(WhileLoopInvariantCodeMotionTest,
        DontHoistTriviallyLoopVaryingComputation) {
   // Basic negative test: the add expression is not loop invariant.
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
 
@@ -189,20 +190,20 @@ TEST_F(WhileLoopInvariantCodeMotionTest,
             scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
     builder.AddInstruction(HloInstruction::CreateTuple({gte_0, add_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
 
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add()));
@@ -210,6 +211,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest,
 
 TEST_F(WhileLoopInvariantCodeMotionTest,
        DontHoistLoopVaryingComputationWithAlternatingTuples) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -228,25 +230,26 @@ TEST_F(WhileLoopInvariantCodeMotionTest,
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_1, gte_0, add_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
 
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add()));
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto token_shape = ShapeUtil::MakeTokenShape();
   Shape while_shape =
@@ -267,7 +270,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
@@ -277,14 +280,14 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateTuple({scalar_param, scalar_param, token}));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
   builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0));
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   ASSERT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(),
@@ -294,6 +297,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
 TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   // The bitcast's user, an outfeed, can't be hoisted, so don't hoist the
   // bitcast either.
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
   auto token_shape = ShapeUtil::MakeTokenShape();
@@ -317,7 +321,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
@@ -327,15 +331,15 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateTuple({scalar_param, scalar_param, token}));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
   builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0));
 
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(),
@@ -346,6 +350,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
   // The bitcast's user can be hoisted, so hoist the bitcast too.
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
   Shape while_shape =
@@ -367,21 +372,20 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_inst}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
 
-  HloComputation* entry_computation =
-      module().AddEntryComputation(builder.Build());
+  HloComputation* entry_computation = m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
   HloInstruction* transformed_while;
@@ -396,6 +400,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -416,22 +421,23 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_result}));
 
-    while_body = module().AddEmbeddedComputation(builder.Build());
+    while_body = m->AddEmbeddedComputation(builder.Build());
   }
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
 
@@ -439,7 +445,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
     HloComputation::Builder builder(TestName() + ".passthrough");
     HloInstruction* param = builder.AddInstruction(
         HloInstruction::CreateParameter(0, while_shape, "param"));
-    HloComputation* result = module().AddEmbeddedComputation(builder.Build());
+    HloComputation* result = m->AddEmbeddedComputation(builder.Build());
 
     result->AddInstruction(
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
@@ -450,11 +456,11 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 }
 
@@ -482,14 +488,14 @@ ENTRY entry {
 )";
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) {
-  ParseAndVerifyModule(kConstantHoistingTestCase);
+  auto m = ParseAndReturnVerifiedModule(kConstantHoistingTestCase).ValueOrDie();
 
   TF_ASSERT_OK_AND_ASSIGN(
       bool simplified_loop,
-      WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(&module()));
+      WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
-  HloComputation* while_body = module().GetComputationWithName("wide.body");
+  HloComputation* while_body = m->GetComputationWithName("wide.body");
   ASSERT_NE(while_body, nullptr);
 
   // We expect the while body to be the equivalent of:
@@ -523,10 +529,98 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) {
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, DoesNotHoistConstantByDefault) {
-  ParseAndVerifyModule(kConstantHoistingTestCase);
+  auto m = ParseAndReturnVerifiedModule(kConstantHoistingTestCase).ValueOrDie();
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
+  EXPECT_FALSE(simplified_loop);
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], f32[2], f32[2], s32[]) parameter(0)
+      val.0 = f32[2] get-tuple-element(p_body), index=0
+      val.1 = f32[2] get-tuple-element(p_body), index=1
+      add = f32[2] add(val.0, val.1)
+      const = s32[] constant(-1)
+      ROOT root = (f32[2], f32[2], f32[2], s32[]) tuple(val.0, val.1, add, const)
+    }
+
+    condition {
+      p_cond = (f32[2], f32[2], f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=3
+      const = s32[] constant(42)
+      ROOT result = pred[] equal-to(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] parameter(1)
+      while_init = (f32[2], f32[2], f32[2], s32[]) tuple(param.0, param.0, param.0, param.1)
+      ROOT while = (f32[2], f32[2], f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(module.get()));
+  EXPECT_FALSE(simplified_loop);
+}
+
+const char* const kInflatingTestCase = R"(
+HloModule ModuleWithWhile
+
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+
+body {
+  p_body = (f32[]) parameter(0)
+  iota = f32[1024, 1024] iota(), iota_dimension=0
+  add = f32[1024, 1024] add(iota, iota)
+  constant = f32[] constant(1.0)
+  reduce = f32[] reduce(f32[1024, 1024] add, f32[] constant), dimensions={0,1}, to_apply=mul
+  ROOT root = (f32[]) tuple(reduce)
+}
+
+condition {
+  p_cond = (f32[]) parameter(0)
+  ROOT result = pred[] constant(true)
+}
+
+ENTRY entry {
+  param = f32[] parameter(0)
+  while_init = (f32[]) tuple(param)
+  ROOT while = (f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+TEST_F(WhileLoopInvariantCodeMotionTest, HoistsInflatingByDefault) {
+  auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool simplified_loop,
+      WhileLoopInvariantCodeMotion(/*hoist_constants=*/true).Run(m.get()));
+  EXPECT_TRUE(simplified_loop);
+
+  HloComputation* while_body = m->GetComputationWithName("wide.body");
+  ASSERT_NE(while_body, nullptr);
+  EXPECT_THAT(while_body->instructions(), Not(Contains(op::Iota())));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, NoHoistInflating) {
+  auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool simplified_loop,
+      WhileLoopInvariantCodeMotion(/*hoist_constants=*/true,
+                                   /*hoist_size_inflating_ops=*/false)
+          .Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 630d71e5ca25e9d282ce6283284a32d6f725a193..c4790a7f199a90ca81e5503b4256bd95df88d4f4 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -19,41 +19,19 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 
 namespace xla {
 
+namespace m = match;
 using absl::optional;
-
-// Determines whether the given instruction is a send/recv node, or has a
-// subcomputation which contains a send/recv node.
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr);
-
-// Determines whether the given computation contains a send or recv node.
-static bool ContainsSendOrRecv(const HloComputation* comp) {
-  for (const auto* instr : comp->instructions()) {
-    if (IsOrContainsSendOrRecv(instr)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr) {
-  if (instr->opcode() == HloOpcode::kSend ||
-      instr->opcode() == HloOpcode::kSendDone ||
-      instr->opcode() == HloOpcode::kRecv ||
-      instr->opcode() == HloOpcode::kRecvDone) {
-    return true;
-  }
-  for (const auto& subcomp : instr->called_computations()) {
-    if (ContainsSendOrRecv(subcomp)) {
-      return true;
-    }
-  }
-  return false;
-}
+using hlo_query::ContainsInstrWithOpcode;
 
 // Tries to remove elements in a while loop's tuple that aren't used within the
 // loop.
@@ -253,7 +231,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Create the new while condition, body, and init value.
   std::unique_ptr<HloComputation> new_while_cond =
       while_cond->CloneWithReplacements(
-          make_while_computation_replacements(while_cond), /*extras=*/{});
+          make_while_computation_replacements(while_cond));
 
   std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       while_body_replacements = make_while_computation_replacements(while_body);
@@ -266,8 +244,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   while_body_replacements.emplace(
       while_body_root, HloInstruction::CreateTuple(new_while_body_root_elems));
   std::unique_ptr<HloComputation> new_while_body =
-      while_body->CloneWithReplacements(std::move(while_body_replacements),
-                                        /*extras=*/{});
+      while_body->CloneWithReplacements(std::move(while_body_replacements));
 
   // Add a new while_init instruction that repackages the old while_init
   // instruction's elements.  We rely on the AlgebraicSimplifier and DCE to
@@ -329,6 +306,147 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   return true;
 }
 
+// Removes each loop parameter (i.e. member of the while loop tuple) that is a
+// constant and is the same in the while loop body and the while loop init.
+static StatusOr<bool> TryRemoveConstantParams(HloInstruction* while_op) {
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+
+  absl::flat_hash_set<int64> constant_tuple_indices;
+  const auto& while_shape = while_init->shape();
+  for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+    auto* init_elem = while_init->operand(i);
+    auto* body_elem = while_body_root->operand(i);
+    if (init_elem->opcode() == HloOpcode::kConstant &&
+        body_elem->opcode() == HloOpcode::kConstant &&
+        init_elem->literal() == body_elem->literal()) {
+      constant_tuple_indices.insert(i);
+    }
+  }
+
+  if (constant_tuple_indices.empty()) {
+    return false;
+  }
+
+  // OK, we found some constant elements of the while parameter!  Eliminate
+  // them.
+  std::vector<Shape> new_while_shape_elems;
+  for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+    if (!constant_tuple_indices.count(i)) {
+      new_while_shape_elems.push_back(while_shape.tuple_shapes(i));
+    }
+  }
+  Shape new_while_shape = ShapeUtil::MakeTupleShape(new_while_shape_elems);
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  // Returns a new tuple without the elements of constant_tuple_indices.
+  auto remove_constant_elems = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), while_shape));
+
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      if (!constant_tuple_indices.count(i)) {
+        tuple_elems.push_back(
+            add_new_instr(HloInstruction::CreateGetTupleElement(
+                while_shape.tuple_shapes(i), instr, i)));
+      }
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  auto add_constant_elems = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
+
+    std::vector<HloInstruction*> tuple_elems;
+    int64 j = 0;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      if (constant_tuple_indices.count(i)) {
+        tuple_elems.push_back(while_init->mutable_operand(i));
+      } else {
+        tuple_elems.push_back(
+            add_new_instr(HloInstruction::CreateGetTupleElement(
+                while_shape.tuple_shapes(i), instr, j)));
+        ++j;
+      }
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Special case: constant_tuple_indices covers the whole while parameter, so
+  // the new while shape is the empty tuple.  In this case, the value of the
+  // while loop is simply equal to the value of `init`.
+  //
+  // It's unfortunate to special-case this, but it's simpler than the
+  // alternative.  The problem is that if our while parameter has no
+  // non-constant elems, the tuple returned by `add_constant_elems` won't depend
+  // on instr (the loop body/cond parameter), and therefore
+  // CloneWithReplacementPairs will *leave the parameter out entirely*, creating
+  // invalid HLO.
+  if (ShapeUtil::IsEmptyTuple(new_while_shape)) {
+    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, while_init));
+    return true;
+  }
+
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          add_constant_elems(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  std::unique_ptr<HloComputation> new_while_body =
+      while_body->CloneWithReplacementPairs(
+          {
+              while_body->parameter_instruction(0),
+              add_constant_elems(add_new_instr(HloInstruction::CreateParameter(
+                  0, new_while_shape,
+                  while_cond->parameter_instruction(0)->name()))),
+          },
+          {
+              while_body->root_instruction(),
+              remove_constant_elems(
+                  add_new_instr(while_body->root_instruction()->Clone())),
+          });
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op,
+      add_constant_elems(
+          computation->AddInstruction(HloInstruction::CreateWhile(
+              new_while_shape,
+              module->AddEmbeddedComputation(std::move(new_while_cond)),
+              module->AddEmbeddedComputation(std::move(new_while_body)),
+              add_new_instr(remove_constant_elems(while_init)))))));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return true;
+}
+
 // Tries to remove a while loop from the graph.
 //
 //  - Loops with trip count of 0 can be replaced by the loop's "init" value.
@@ -458,6 +576,414 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
   return changed_cond || changed_body;
 }
 
+// Converts a flat list of instructions into a tuple of the desired shape.  For
+// example, given a tuple shape ((x, x), x) and instructions {A, B, C}, returns
+// a tuple of value ((A, B), C).
+//
+// desired_shape must be a tuple.  (This precondition allows us to return a
+// unique_ptr rather than a raw ptr.)
+static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
+    absl::Span<HloInstruction*> instrs, const Shape& desired_shape,
+    std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
+  CHECK(ShapeUtil::IsTuple(desired_shape))
+      << ShapeUtil::HumanString(desired_shape);
+
+  // For each child shape in `desired_shape`, slice out the correct number of
+  // `instrs` and call UnflattenTupleInstr recursively.  At each step we remove
+  // elements from `instrs` so that it only contains instructions we have not
+  // yet processed.
+  std::vector<HloInstruction*> elems;
+  for (int64 i = 0; i < desired_shape.tuple_shapes_size(); ++i) {
+    const Shape& subshape = desired_shape.tuple_shapes(i);
+    if (!ShapeUtil::IsTuple(subshape)) {
+      elems.push_back(instrs[0]);
+      instrs.remove_prefix(1);
+      continue;
+    }
+
+    // Count the number of leaf nodes underneath desired_shape[i].
+    int64 num_leaves = 0;
+    ShapeUtil::ForEachSubshape(
+        subshape, [&](const Shape& s, const ShapeIndex& /*index*/) {
+          if (!ShapeUtil::IsTuple(s)) {
+            ++num_leaves;
+          }
+        });
+
+    std::unique_ptr<HloInstruction> subinstr =
+        UnflattenTupleInstr(instrs.subspan(0, num_leaves),
+                            desired_shape.tuple_shapes(i), new_instrs);
+    elems.push_back(subinstr.get());
+    new_instrs->push_back(std::move(subinstr));
+    instrs.remove_prefix(num_leaves);
+  }
+  return HloInstruction::CreateTuple(elems);
+}
+
+// Builds a vector whose elements are the values in the flattened tuple for
+// `instr`.  For example, if `instr` is a tuple of form ((A, B), C), returns the
+// vector {A, B, C} (or kGetTupleElement ops which point to A, B, and C).
+static std::vector<HloInstruction*> GetFlatTupleElems(
+    HloInstruction* instr,
+    std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
+  const auto& shape = instr->shape();
+  if (!ShapeUtil::IsTuple(shape)) {
+    return {instr};
+  }
+  std::vector<HloInstruction*> elems;
+  for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) {
+    const Shape& subshape = shape.tuple_shapes(i);
+    new_instrs->push_back(
+        HloInstruction::CreateGetTupleElement(subshape, instr, i));
+    auto* gte = new_instrs->back().get();
+    auto flattened_subshape = GetFlatTupleElems(gte, new_instrs);
+    elems.insert(elems.end(), flattened_subshape.begin(),
+                 flattened_subshape.end());
+  }
+  return elems;
+}
+
+static StatusOr<bool> TryFlattenNestedTuples(HloInstruction* while_op) {
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+  Shape while_shape = while_init->shape();
+  if (!ShapeUtil::IsNestedTuple(while_shape)) {
+    return false;
+  }
+
+  std::vector<Shape> flattened_shape_elems;
+  ShapeUtil::ForEachSubshape(while_shape,
+                             [&](const Shape& s, const ShapeIndex& /*index*/) {
+                               if (!ShapeUtil::IsTuple(s)) {
+                                 flattened_shape_elems.push_back(s);
+                               }
+                             });
+  Shape flattened_shape = ShapeUtil::MakeTupleShape(flattened_shape_elems);
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  auto nested = [&](HloInstruction* instr) {
+    std::vector<HloInstruction*> gtes;
+    const Shape& flat_shape = instr->shape();
+    for (int64 i = 0; i < flat_shape.tuple_shapes_size(); ++i) {
+      gtes.push_back(add_new_instr(HloInstruction::CreateGetTupleElement(
+          flat_shape.tuple_shapes(i), instr, i)));
+    }
+    auto nested_instr =
+        UnflattenTupleInstr(absl::MakeSpan(gtes), while_shape, &new_instrs);
+    CHECK(ShapeUtil::Compatible(nested_instr->shape(), while_shape))
+        << ShapeUtil::HumanString(nested_instr->shape()) << " vs "
+        << ShapeUtil::HumanString(while_shape);
+    return nested_instr;
+  };
+
+  auto flattened = [&](HloInstruction* instr) {
+    return HloInstruction::CreateTuple(GetFlatTupleElems(instr, &new_instrs));
+  };
+
+  // Create a new while-condition computation, where parameter 0 has flat shape
+  // but all uses of it go through the nested shape.
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          nested(add_new_instr(HloInstruction::CreateParameter(
+              0, flattened_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  // Create a new while-body computation, where parameter 0 has a flat shape and
+  // all uses of it go through the nested shape, and where the root has a flat
+  // shape constructed from the old nested root.
+  std::unique_ptr<HloComputation> new_while_body =
+      while_body->CloneWithReplacementPairs(
+          {
+              while_body->parameter_instruction(0),
+              nested(add_new_instr(HloInstruction::CreateParameter(
+                  0, flattened_shape,
+                  while_body->parameter_instruction(0)->name()))),
+          },
+          {
+              while_body->root_instruction(),
+              flattened(add_new_instr(while_body->root_instruction()->Clone())),
+          });
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op, nested(computation->AddInstruction(HloInstruction::CreateWhile(
+                    flattened_shape,
+                    module->AddEmbeddedComputation(std::move(new_while_cond)),
+                    module->AddEmbeddedComputation(std::move(new_while_body)),
+                    computation->AddInstruction(flattened(while_init)))))));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return true;
+}
+
+// Tries to merge loop induction variables of a given type.
+//
+// In this pass we're only concerned with elements of the loop's tuple that
+// are effective-scalars of type `elem_ty`.  Some terminology:
+//
+//  - The trip counter is the first element of the loop's tuple that starts at
+//    0 and does x++ on each iteration.
+//
+//  - An induction variable is an element of the loop's tuple that is not the
+//    trip counter and does `x += <constant>` on each iteration of the loop.
+//    Negative constants are OK.
+//
+// This pass adds a trip counter if one isn't already present, then replaces
+// each induction variable with
+//
+//   <initial_value> + <trip_count> * <constant>.
+//
+// This reduces the number of scalar operations in the loop, which is important
+// e.g. on GPUs, where each scalar operation is nontrivially expensive because
+// it's a separate kernel launch.
+//
+// Returns the new loop if a change was made, or null if no change was made.
+// Note that the new loop is not a valid replacement for the old loop; it may
+// need to be wrapped in a tuple that changes its shape.  We return the loop
+// itself so that you can call TryMergeInductionVariables in a loop, once for
+// each integral type elem_ty.
+static StatusOr<HloInstruction*> TryMergeInductionVariables(
+    HloInstruction* while_op, PrimitiveType elem_ty) {
+  CHECK(primitive_util::IsIntegralType(elem_ty)) << PrimitiveType_Name(elem_ty);
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return nullptr;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+  Shape while_shape = while_init->shape();
+
+  // The tuple index of the trip counter, if one is present.
+  absl::optional<int64> trip_counter;
+  // Maps the tuple index of each induction variable to its constant increment.
+  absl::flat_hash_map<int64, const HloConstantInstruction*> induction_vars;
+  for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
+    const auto& elem_shape = while_body_root->operand(i)->shape();
+    if (!ShapeUtil::IsEffectiveScalar(elem_shape) ||
+        elem_shape.element_type() != elem_ty) {
+      continue;
+    }
+
+    HloInstruction* constant;
+    if (!Match(while_body_root->mutable_operand(i),
+               m::AddAnyOrder(m::GetTupleElement(m::Parameter(), i),
+                              m::Constant(&constant)))) {
+      continue;
+    }
+    if (!trip_counter && constant->literal().IsAll(1) &&
+        while_init->operand(i)->IsConstant() &&
+        while_init->operand(i)->literal().IsAll(0)) {
+      VLOG(10) << "Found existing trip counter at index " << i;
+      trip_counter = i;
+    } else {
+      VLOG(10) << "Found induction variable at index " << i;
+      induction_vars.emplace(i, Cast<HloConstantInstruction>(constant));
+    }
+  }
+
+  // There's only something to simplify if we can either:
+  //
+  //  - combine one or more induction vars with an existing trip counter, or
+  //  - replace two or more induction variables with a new trip counter.
+  //
+  // Put another way, there's only something to simplify if the number of
+  // induction vars plus the number of existing trip counters (0 or 1) is >= 2.
+  if (induction_vars.size() + (trip_counter.has_value() ? 1 : 0) < 2) {
+    return nullptr;
+  }
+
+  // OK, we're going to do the transformation!  Set up some helpers.
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  auto add_binary_op = [&](const Shape& shape, HloOpcode opcode,
+                           HloInstruction* lhs, HloInstruction* rhs) {
+    // Reshape lhs/rhs to the output shape if necessary.  This deals with the
+    // fact that induction variables need only be effective scalars, not true
+    // scalars.
+    if (!ShapeUtil::Compatible(shape, lhs->shape())) {
+      lhs = add_new_instr(HloInstruction::CreateReshape(shape, lhs));
+    }
+    if (!ShapeUtil::Compatible(shape, rhs->shape())) {
+      rhs = add_new_instr(HloInstruction::CreateReshape(shape, rhs));
+    }
+    return add_new_instr(HloInstruction::CreateBinary(shape, opcode, lhs, rhs));
+  };
+
+  auto add_gte = [&](HloInstruction* src, int64 idx) {
+    return add_new_instr(HloInstruction::CreateGetTupleElement(
+        src->shape().tuple_shapes(idx), src, idx));
+  };
+
+  // Our new while loop will have the same shape as the old while loop, except
+  // we'll add a trip counter to the end if it wasn't originally present.
+  Shape new_while_shape = while_shape;
+  bool added_trip_counter = false;
+  if (!trip_counter) {
+    VLOG(10) << "Adding new trip counter to end of loop's tuple.";
+    trip_counter = new_while_shape.tuple_shapes_size();
+    *new_while_shape.add_tuple_shapes() =
+        ShapeUtil::MakeShape(elem_ty, /*dimensions=*/{});
+    added_trip_counter = true;
+  }
+
+  // Converts `instr` into a tuple of the "old" form -- that is, to a tuple with
+  // shape `while_body->shape()` and where the induction variables are "reified"
+  // (i.e. they have value <init> + <counter> * <constant>).
+  auto convert_to_old_form = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      const auto& elem_shape = while_shape.tuple_shapes(i);
+      if (!induction_vars.count(i)) {
+        tuple_elems.push_back(add_gte(instr, i));
+        continue;
+      }
+      tuple_elems.push_back(add_binary_op(
+          elem_shape, HloOpcode::kAdd, add_gte(instr, i),
+          add_binary_op(elem_shape, HloOpcode::kMultiply,
+                        add_gte(instr, *trip_counter),
+                        add_new_instr(induction_vars.at(i)->Clone()))));
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Converts `root` into a tuple of the "new" form -- that is, to a tuple with
+  // shape `new_while_shape` and where the induction variables (but not trip
+  // counters) are replaced with their unchanging <loop_body_param> values.
+  auto convert_to_new_form = [&](HloInstruction* old_root,
+                                 HloParameterInstruction* loop_body_param) {
+    CHECK(ShapeUtil::Compatible(old_root->shape(), while_shape));
+    std::vector<HloInstruction*> tuple_elems;
+
+    // In the new form, induction variables come from `init`, everything else
+    // (including the trip counter if it's not one we created ourselves) comes
+    // from the `root` tuple unmodified.
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      tuple_elems.push_back(
+          add_gte((induction_vars.count(i) ? loop_body_param : old_root), i));
+    }
+    // If we created a trip counter ourselves, add 1 to it in the next
+    // iteration.
+    if (added_trip_counter) {
+      tuple_elems.push_back(add_binary_op(
+          new_while_shape.tuple_shapes(*trip_counter), HloOpcode::kAdd,
+          add_gte(loop_body_param, *trip_counter),
+          add_new_instr(
+              HloInstruction::CreateConstant(LiteralUtil::One(elem_ty)))));
+    }
+
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Creates a new init tuple, which is the same as the old init tuple except if
+  // we added a trip counter, it's set to 0.
+  auto get_new_while_init = [&](HloInstruction* init) {
+    CHECK(ShapeUtil::Compatible(init->shape(), while_shape));
+    if (!added_trip_counter) {
+      return init;
+    }
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      tuple_elems.push_back(add_gte(init, i));
+    }
+    tuple_elems.push_back(add_new_instr(
+        HloInstruction::CreateConstant(LiteralUtil::Zero(elem_ty))));
+    return add_new_instr(HloInstruction::CreateTuple(tuple_elems));
+  };
+
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          convert_to_old_form(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  // Creating the new while body proceeds in two steps.  First we convert the
+  // users of the parameter to the old form.  Then as a second
+  // CloneWithReplacement operation we convert the root to the new form.  We
+  // have to do this in two steps because the new root needs to use the new
+  // param0, and during the first clone operation, only the *old-form* param0 is
+  // accessible.
+  //
+  // We have to add temp_new_while_body to the module because cloning a
+  // computation touches the module (to get its NameUniquer).
+  HloComputation* temp_new_while_body =
+      module->AddEmbeddedComputation(while_body->CloneWithReplacementPairs({
+          while_body->parameter_instruction(0),
+          convert_to_old_form(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_body->parameter_instruction(0)->name()))),
+      }));
+  std::unique_ptr<HloComputation> new_while_body =
+      temp_new_while_body->CloneWithReplacementPairs({
+          temp_new_while_body->root_instruction(),
+          convert_to_new_form(
+              add_new_instr(temp_new_while_body->root_instruction()->Clone()),
+              Cast<HloParameterInstruction>(
+                  temp_new_while_body->parameter_instruction(0))),
+      });
+  TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(temp_new_while_body));
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  auto* new_while = computation->AddInstruction(HloInstruction::CreateWhile(
+      new_while_shape,
+      module->AddEmbeddedComputation(std::move(new_while_cond)),
+      module->AddEmbeddedComputation(std::move(new_while_body)),
+      get_new_while_init(while_init)));
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op, convert_to_old_form(new_while)));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return new_while;
+}
+
 StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
   XLA_VLOG_LINES(3,
                  "WhileLoopSimplifier::Run(), before:\n" + module->ToString());
@@ -478,32 +1004,77 @@ StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
   for (HloInstruction* while_op : while_ops) {
     // We can't remove while loops that contain send/recv nodes, because we rely
     // on the particular loop structure around the node matching on the send and
-    // recv sides.  Removing dead while params requires us to remove the loop
+    // recv sides.  Other while simplifications require us to remove the loop
     // and replace it with a new one, so we can't do that either.
-    if (ContainsSendOrRecv(while_op->while_body()) ||
-        ContainsSendOrRecv(while_op->while_condition())) {
+    if (ContainsInstrWithOpcode(while_op->while_body(),
+                                {HloOpcode::kSend, HloOpcode::kSendDone,
+                                 HloOpcode::kRecv, HloOpcode::kRecvDone}) ||
+        ContainsInstrWithOpcode(while_op->while_condition(),
+                                {HloOpcode::kSend, HloOpcode::kSendDone,
+                                 HloOpcode::kRecv, HloOpcode::kRecvDone})) {
       VLOG(2) << "Not attempting to simplify while loop because it contains a "
                  "send/recv node: "
               << while_op->ToShortString();
       continue;
     }
 
-    StatusOr<bool> result = TryPropagateConstant(while_op);
-    TF_RETURN_IF_ERROR(result.status());
-    changed |= result.ValueOrDie();
+    TF_ASSIGN_OR_RETURN(bool result, TryPropagateConstant(while_op));
+    changed |= result;
+
+    TF_ASSIGN_OR_RETURN(result, TryRemoveWhileLoop(while_op));
+    changed |= result;
+    if (result) {
+      // Don't continue simplifying after successfully removing the while loop
+      // -- that would result in use-after-free nastiness.
+      continue;
+    }
+
+    // TODO(b/119281462): Cowardly refuse to perform any of the following
+    // optimizations in the presence of kDomain instructions.  It seems that
+    // modifying a while loop's tuple doesn't work when kDomain is present.
+    if (ContainsInstrWithOpcode(while_op->while_body(), {HloOpcode::kDomain}) ||
+        ContainsInstrWithOpcode(while_op->while_condition(),
+                                {HloOpcode::kDomain})) {
+      continue;
+    }
+
+    // Each of the optimizations below modifies the while loop itself if it's
+    // successful, meaning that `while_op` is no longer valid after one of these
+    // transformations returns true.
 
-    result = TryRemoveWhileLoop(while_op);
-    TF_RETURN_IF_ERROR(result.status());
-    if (result.ValueOrDie()) {
-      changed = true;
-      // Don't try to remove dead while params after successfully removing the
-      // while loop -- that would result in use-after-free nastiness.
+    TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op));
+    changed |= result;
+    if (result) {
       continue;
     }
 
-    result = TryRemoveDeadWhileParams(while_op);
-    TF_RETURN_IF_ERROR(result.status());
-    changed |= result.ValueOrDie();
+    TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op));
+    changed |= result;
+    if (result) {
+      continue;
+    }
+
+    TF_ASSIGN_OR_RETURN(result, TryRemoveConstantParams(while_op));
+    changed |= result;
+    if (result) {
+      continue;
+    }
+
+    bool merged_induction_vars = false;
+    // Notably missing from this list are S16 and U16.  These don't currently
+    // work because S/U16 literals are not implemented.
+    for (auto elem_ty : {S8, U8, S32, U32, S64, U64}) {
+      TF_ASSIGN_OR_RETURN(auto* new_while_op,
+                          TryMergeInductionVariables(while_op, elem_ty));
+      if (new_while_op) {
+        while_op = new_while_op;
+        changed = true;
+        merged_induction_vars = true;
+      }
+    }
+    if (merged_induction_vars) {
+      continue;
+    }
   }
 
   XLA_VLOG_LINES(3,
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index 0bc5a0107bbcfb3b29a01d593fb79b89a863e49b..a378f179c63c788cd205ddbb784dee0e6b2106d7 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -25,11 +25,22 @@ namespace xla {
 // HLO pass that makes the following transformations on while loops:
 //
 //  - A while loop with static trip count of 0 is deleted.
+//
 //  - A while loop with static trip count of 1 is replaced by its body (sans
 //    loop).
+//
 //  - Elements of a while loop's tuple that the loop doesn't use are removed
 //    from the tuple.
 //
+//  - If the while loop's parameter is a nested tuple, it's flattened to a
+//    single-level tuple.  This is good because it usually reduces the number of
+//    kTuple instructions, but also because it unlocks additional optimizations
+//    (e.g. removing unused loop parameters).
+//
+// Flattening nested while loop tuples adds a whole mess of likely unnecessary
+// kGetTupleElement and kTuple operations to the graph.  We expect that tuple
+// simplifier will be run afterwards.
+//
 class WhileLoopSimplifier : public HloModulePass {
  public:
   ~WhileLoopSimplifier() override {}
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 1c892ba179ec67ccc9dbfe93d925551d6977ba15..4950e8269e9cf0723d717bd1734518d104c0c9f2 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -17,28 +17,45 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_replace.h"
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_cse.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
+using ::testing::_;
 namespace op = xla::testing::opcode_matchers;
 
-class WhileLoopSimplifierTest : public HloVerifiedTestBase {
+// Returns the first kWhile instruction within m's entry computation.
+HloInstruction* FindFirstWhile(HloModule* m) {
+  const auto& instrs = m->entry_computation()->instructions();
+  return *absl::c_find_if(instrs, [](const HloInstruction* instr) {
+    return instr->opcode() == HloOpcode::kWhile;
+  });
+}
+
+class WhileLoopSimplifierTest : public HloTestBase {
  protected:
   // Makes an HloModule that contains a loop with `num_iters` iteration.
-  void MakeModuleWithSimpleLoop(int num_iters);
+  TF_MUST_USE_RESULT std::unique_ptr<VerifiedHloModule>
+  MakeModuleWithSimpleLoop(int num_iters);
 
   // Similar to MakeModuleWithSimpleLoop except that the loop bound is passed to
   // the loop-condition through an element of a tuple which is the
   // loop-condition parameter.
-  void MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters);
+  TF_MUST_USE_RESULT std::unique_ptr<VerifiedHloModule>
+  MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters);
 };
 
-void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
+std::unique_ptr<VerifiedHloModule>
+WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
   string hlo_string_template = R"(
   HloModule SimpleLoop
   SimpleLoop.body {
@@ -67,10 +84,11 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
 
   string hlo_string = absl::StrReplaceAll(
       hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}});
-  ParseAndVerifyModule(hlo_string);
+  return ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 }
 
-void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
+std::unique_ptr<VerifiedHloModule>
+WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
     int num_iters) {
   string hlo_string_template = R"(
   HloModule SimpleLoopWithIndirectLoopBound
@@ -104,60 +122,55 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
 
   string hlo_string = absl::StrReplaceAll(
       hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}});
-  ParseAndVerifyModule(hlo_string);
+  return ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithZeroIterationSimiplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/0);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/0);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Constant(), op::Constant()));
 }
 
 TEST_F(WhileLoopSimplifierTest,
        LoopWithZeroIterationTupleElementLoopBoundSimplified) {
-  MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Constant(), op::Constant(), op::Constant()));
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithOneIterationSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Add(), op::Multiply()));
 }
 
 TEST_F(WhileLoopSimplifierTest,
        LoopWithOneIterationTupleELementLoopBoundSimplified) {
-  MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Add(), op::Multiply(), op::Constant()));
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithTwoIterationsNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/2);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/2);
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest,
        LoopWithControlDependencySimplifiedDependencyPreserved) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* true_op = while_op->while_body()->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
   TF_ASSERT_OK(true_op->AddControlDependencyTo(
       while_op->while_body()->root_instruction()));
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction()->control_predecessors(),
               ElementsAre(op::Constant()))
       << computation->ToString();
@@ -166,9 +179,8 @@ TEST_F(WhileLoopSimplifierTest,
 // Loops that contain send/recv nodes can't be simplified; the loop structure
 // around send/recv nodes must be preserved.
 TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
@@ -179,13 +191,12 @@ TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) {
       token,
       /*channel_id=*/0));
   while_body->AddInstruction(HloInstruction::CreateSendDone(send));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
@@ -194,7 +205,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
       HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}), token,
                                  /*channel_id=*/0));
   while_body->AddInstruction(HloInstruction::CreateRecvDone(recv));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // The limitation on not being able to simplify loops that contain infeeds (and
@@ -202,16 +213,15 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
 // fact that our infrastructure sees simplifying such a loop as tantamount to
 // removing the non-removable instruction.
 TEST_F(WhileLoopSimplifierTest, LoopWithInfeedNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
   auto token = while_body->AddInstruction(HloInstruction::CreateToken());
   while_body->AddInstruction(HloInstruction::CreateInfeed(
       ShapeUtil::MakeShape(F32, {1}), token, "config"));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // A non-tuple shaped loop shouldn't be simplified or crash the compiler.
@@ -236,8 +246,8 @@ TEST_F(WhileLoopSimplifierTest, NonTupleShapedLoopNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // A while loop that does nothing else besides swapping tuple elements
@@ -268,8 +278,8 @@ TEST_F(WhileLoopSimplifierTest, LoopSwappingTupleElementsNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // Construct a loop where we assign a constant to tuple element 0 in each
@@ -297,8 +307,8 @@ TEST_F(WhileLoopSimplifierTest,
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // Nothing to simplify in a while loop whose tuple has 0 elements.
@@ -320,8 +330,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithEmptyTupleNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // While loop where one tuple element is used twice in the body, and thus can't
@@ -348,8 +358,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithElemUsedTwiceNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // This while loop has three tuple elements.  Element 0 is unused and should be
@@ -390,16 +400,15 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  HloModule* the_module = &module();
-  EXPECT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 
   // The original while instruction is still left in the module as a dead
   // instruction, find a while instruction with a different name as the new
   // while instruction.
   HloInstruction* new_while_op =
-      *std::find_if(the_module->entry_computation()->instructions().begin(),
-                    the_module->entry_computation()->instructions().end(),
+      *std::find_if(m->entry_computation()->instructions().begin(),
+                    m->entry_computation()->instructions().end(),
                     [&](const HloInstruction* instr) {
                       return (instr->opcode() == HloOpcode::kWhile &&
                               instr->name() != "while");
@@ -440,8 +449,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithNonTupleBodyShapeNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest,
@@ -473,8 +482,8 @@ TEST_F(WhileLoopSimplifierTest,
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
@@ -505,8 +514,233 @@ TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+}
+
+TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = ((s32[1]), (s32[2], s32[3], (s32[4]))) parameter(0)
+    ta = (s32[1]) get-tuple-element(param), index=0
+    a = s32[1] get-tuple-element(ta), index=0
+    a.1 = s32[1] add(a, a)
+    tbcd = (s32[2], s32[3], (s32[4])) get-tuple-element(param), index=1
+    ROOT tuple = ((s32[1]), (s32[2], s32[3], (s32[4]))) tuple(ta, tbcd)
+  }
+  Cond {
+    param = ((s32[1]), (s32[2], s32[3], (s32[4]))) parameter(0)
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    b = s32[2] constant({0,1})
+    c = s32[3] constant({0,1,2})
+    d = s32[4] constant({0,1,2,3})
+    ta = (s32[1]) tuple(a)
+    td = (s32[4]) tuple(d)
+    tbcd = (s32[2], s32[3], (s32[4])) tuple(b, c, td)
+    init = ((s32[1]), (s32[2], s32[3], (s32[4]))) tuple(ta, tbcd)
+    ROOT while = ((s32[1]), (s32[2], s32[3], (s32[4]))) while(init),
+      condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  Shape flat_tuple =
+      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])")
+          .ValueOrDie();
+  SCOPED_TRACE(m->ToString());
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(), flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      m->entry_computation()->root_instruction()->shape(),
+      ShapeUtil::ParseShapeString("((s32[1]), (s32[2], s32[3], (s32[4])))")
+          .ValueOrDie()));
+}
+
+// Edge-case: All elements of the loop carry are constants which can be removed,
+// leaving us with a nullary loop.  This is a special case, we just replace the
+// loop with its init.
+TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarry) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1]) parameter(0)
+    a = s32[1] constant({0})
+    ROOT tuple = (s32[1]) tuple(a)
+  }
+  Cond {
+    param = (s32[1]) parameter(0)
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    init = (s32[1]) tuple(a)
+    ROOT while = (s32[1]) while(init), condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(op::Constant()));
+}
+
+TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] get-tuple-element(param), index=0
+    a.1 = s32[1] add(a, a)
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({10,10,10})
+    ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a.1, b, c)
+  }
+  Cond {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    /* Use each tuple element.  The verifier will then ensure that if any of
+     * these get modified, they're replaced with values of the correct shape. */
+    a = s32[1] get-tuple-element(param), index=0
+    b = s32[2] get-tuple-element(param), index=1
+    c = s32[3] get-tuple-element(param), index=2
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    /* Only `b` should be simplified away.  `a` is not a constant within the
+     * loop, and `c`'s value changes depending on whether we run 0 or 1
+     * iterations of the loop. */
+    a = s32[1] constant({0})
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({2,2,2})
+    init = (s32[1], s32[2], s32[3]) tuple(a,b,c)
+    ROOT while = (s32[1], s32[2], s32[3]) while(init),
+      condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  // Run the tuple simplifier to make the resulting HLO a bit easier to check.
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  Shape new_while_shape =
+      ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      m->entry_computation()->root_instruction()->shape(),
+      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie()));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(_, op::Constant(), _));
+}
+
+const char* const kSimpleMergeInductionVariablesModule = R"(
+  HloModule Test
+  Body {
+    param = (TYPE[], TYPE[], TYPE[]) parameter(0)
+
+    a = TYPE[] get-tuple-element(param), index=0
+    one = TYPE[] constant(1)
+    a1 = TYPE[] add(a, one)
+
+    b = TYPE[] get-tuple-element(param), index=1
+    negone = TYPE[] constant(-1)
+    b1 = TYPE[] add(b, negone)
+
+    c = TYPE[] add(a, b)
+
+    ROOT tuple = (TYPE[], TYPE[], TYPE[]) tuple(a1,b1,c)
+  }
+  Cond {
+    param = (TYPE[], TYPE[], TYPE[]) parameter(0)
+    a = TYPE[] get-tuple-element(param), index=0
+    b = TYPE[] get-tuple-element(param), index=1
+    sum = TYPE[] power(a, b)
+    ten = TYPE[] constant(10)
+    ROOT cond = pred[] less-than(sum, ten)
+  }
+  ENTRY Loop {
+    a = TYPE[] constant(10)
+    b = TYPE[] constant(100)
+    c = TYPE[] constant(0)
+    init = (TYPE[], TYPE[], TYPE[]) tuple(a,b,c)
+    while = (TYPE[], TYPE[], TYPE[]) while(init), condition=Cond, body=Body
+
+    a1 = TYPE[] get-tuple-element(while), index=0
+    b1 = TYPE[] get-tuple-element(while), index=1
+    ROOT sum = TYPE[] add(a1, b1)
+  })";
+
+TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) {
+  string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule,
+                                          {{"TYPE", "s32"}});
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find, and run the tuple simplifier to make the resulting HLO
+  // easier to check.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  // We should have added a new loop counter for s32[] to the end of the tuple.
+  SCOPED_TRACE(m->ToString());
+  Shape new_while_shape =
+      ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      new_while_shape));
+
+  EXPECT_THAT(new_while->while_body()->root_instruction(),
+              op::Tuple(op::GetTupleElement(op::Parameter(), 0),
+                        op::GetTupleElement(op::Parameter(), 1), op::Add(),
+                        op::Add(op::GetTupleElement(op::Parameter(), 3),
+                                op::Constant())));
+  EXPECT_THAT(new_while->while_condition()->root_instruction(),
+              op::Lt(op::Power(op::Add(), op::Add()), op::Constant()));
+}
+
+// We shouldn't merge S16 induction variables; we can't create constants of this
+// type because S16 literals are not implemented.
+TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) {
+  string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule,
+                                          {{"TYPE", "s16"}});
+  EXPECT_FALSE(
+      WhileLoopSimplifier()
+          .Run(ParseAndReturnVerifiedModule(hlo_string).ValueOrDie().get())
+          .ValueOrDie());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index 1f583ca44b7d20d56f27560f4a97a38c3fcc3026..039ccda7322f5efda6a827efbeda1225c3596cc0 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -270,4 +272,17 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
   return result;
 }
 
+/*static*/ absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>
+WhileUtil::GetGTEsMapForWhileConditional(
+    const HloComputation& while_conditional) {
+  absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>> result;
+  for (HloInstruction* user :
+       while_conditional.parameter_instruction(0)->users()) {
+    if (user->opcode() == HloOpcode::kGetTupleElement) {
+      result[user->tuple_index()].push_back(user);
+    }
+  }
+  return result;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h
index 524dcec5f12689027ef76b8ae180bcbcc7cff601..cba41ccd8b184ba3d867bc170724aee71e777788 100644
--- a/tensorflow/compiler/xla/service/while_util.h
+++ b/tensorflow/compiler/xla/service/while_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
@@ -85,6 +87,13 @@ class WhileUtil {
   // Assumes `while_body` is the body computation of the while loop in question.
   static std::vector<HloInstruction*> GetInvariantGTEsForWhileBody(
       const HloComputation& while_body);
+
+  // Returns a map of index to GetTupleElement instructions in
+  // `while_conditional` that access elements in the parameter tuple. Assumes
+  // `while_conditional` is the conditional computation of the while loop in
+  // question.
+  static absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>
+  GetGTEsMapForWhileConditional(const HloComputation& while_conditional);
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
index b9ef18892d7aa859f6b0b505db4c004e4f5c5066..a546a6d39cc55d1f327b8449c7d26cd4c95dbf98 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -45,7 +45,8 @@ class ZeroSizedHloEliminationTest : public HloTestBase {
                 0, ShapeUtil::MakeShape(F32, {3, 0}), "zero sized param"))) {}
 
   StatusOr<bool> RunZeroSizedElimination() {
-    auto module = CreateNewModule("zero_sized_elimination_test_module");
+    auto module =
+        CreateNewUnverifiedModule("zero_sized_elimination_test_module");
     module->AddEntryComputation(builder_.Build());
     return ZeroSizedHloElimination{}.Run(module.get());
   }
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 14c35e7b84f07bebac33a9753ac26a8ee1418f1e..33edbd1b20d01bf132f2a152625d5f49a45f26f9 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -47,8 +47,11 @@ class ServiceInterface {
   virtual Status ResetDevice(const ResetDeviceRequest* arg,
                              ResetDeviceResponse* result) = 0;
 
-  virtual Status ExecuteGraph(const ExecuteGraphRequest* arg,
-                              ExecuteResponse* result) = 0;
+  virtual Status Compile(const CompileRequest* arg,
+                         CompileResponse* result) = 0;
+
+  virtual Status Execute(const ExecuteRequest* arg,
+                         ExecuteResponse* result) = 0;
 
   virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
                                       ExecuteParallelResponse* result) = 0;
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index df610102b4c7fa08c0b7030124939009130f89f4..7bf97729165bef98fabc29040e02203eee68a53c 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -667,12 +667,11 @@ void ShapeTree<T>::CopySubtreeFrom(const ShapeTree<T>& other,
 template <typename T>
 bool ShapeTree<T>::operator==(const ShapeTree<T>& other) const {
   bool equal = true;
-  ForEachElement(
-      [this, &other, &equal](const ShapeIndex& index, const T& data) {
-        if (data != other.element(index)) {
-          equal = false;
-        }
-      });
+  ForEachElement([&other, &equal](const ShapeIndex& index, const T& data) {
+    if (data != other.element(index)) {
+      equal = false;
+    }
+  });
   return equal;
 }
 
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index c8ff55e7845785d9292516b823fb591cc28cbfad..2b6c484bc4f205be0180403eeac2dd391029b110 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -52,10 +52,10 @@ class ShapeTreeTest : public ::testing::Test {
 
 TEST_F(ShapeTreeTest, DefaultConstructor) {
   ShapeTree<int> int_tree;
-  EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(int_tree.shape()));
 
   ShapeTree<bool> bool_tree;
-  EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(bool_tree.shape()));
 }
 
 void ShapeTreeTest::TestShapeConstructor(const Shape& shape,
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 17120e610cb26dda41fffd28fdb2b9e8bdffb973..7d011bfc658a1f0fc27d93027be355f49966bd62 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -74,6 +74,11 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) {
   return out;
 }
 
+bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const {
+  return size() >= prefix.size() &&
+         indices_.subspan(0, prefix.size()) == prefix.indices_;
+}
+
 namespace {
 
 // Returns whether the given primitive type corresponds to an array shape.
@@ -367,10 +372,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return IsTuple(shape) && TupleElementCount(shape) == 0;
 }
 
-/* static */ bool ShapeUtil::IsNil(const Shape& shape) {
-  return IsEmptyTuple(shape);
-}
-
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
   CHECK(IsTuple(shape)) << HumanString(shape);
   return shape.tuple_shapes_size();
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 191ab04759f2d0ae87d988cba0d303f1ab696432..7f72e57d008a71c7aa01262610dfb745641976b7 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -100,6 +101,11 @@ class ShapeIndex {
 
   string ToString() const;
 
+  template <typename H>
+  friend H AbslHashValue(H h, const ShapeIndex& index) {
+    return H::combine(std::move(h), index.indices_);
+  }
+
  private:
   container_type indices_;
 };
@@ -147,6 +153,9 @@ class ShapeIndexView {
 
   string ToString() const;
 
+  // Returns true if this shape index starts with 'prefix'.
+  bool StartsWith(ShapeIndexView prefix) const;
+
  private:
   absl::Span<const int64> indices_;
 };
@@ -465,9 +474,6 @@ class ShapeUtil {
   // Returns true if shape is an empty tuple.
   static bool IsEmptyTuple(const Shape& shape);
 
-  // Returns true if shape is the nil shape (an empty tuple).
-  static bool IsNil(const Shape& shape);
-
   // Returns the number of elements in the given tuple shape.
   // Precondition: IsTuple(shape)
   static int64 TupleElementCount(const Shape& shape);
@@ -751,10 +757,18 @@ class ShapeUtil {
       pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
     }
 
+    tensorflow::mutex mu;
+    Status status;  // Guarded by mu
+
     while (n < rank) {
       if (pool != absl::nullopt) {
-        pool->Schedule(
-            [indexes, &visitor_function] { visitor_function(indexes); });
+        pool->Schedule([indexes, &visitor_function, &mu, &status] {
+          StatusOr<bool> result = visitor_function(indexes);
+          if (!result.ok()) {
+            tensorflow::mutex_lock lock(mu);
+            status = status.ok() ? result.status() : status;
+          }
+        });
       } else {
         TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
         if (!should_continue) {
@@ -772,7 +786,9 @@ class ShapeUtil {
       }
     }
 
-    return Status::OK();
+    // Waits for the scheduled work to complete.
+    pool.reset();
+    return status;
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 0c647369a37e70f93abe1732963d2ddc7730c214..11b493323cb4a44909bc535d1bbc04fda7506728 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -376,12 +376,12 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) {
 }
 
 TEST(ShapeUtilTest, NilShape) {
-  EXPECT_TRUE(ShapeUtil::IsNil(ShapeUtil::MakeNil()));
-  EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {1, 2, 3})));
-  EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {0, 1})));
-  EXPECT_FALSE(ShapeUtil::IsNil(
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeNil()));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {1, 2, 3})));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {0, 1})));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {})})));
-  EXPECT_FALSE(ShapeUtil::IsNil(
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {0})})));
 }
 
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index d395c9a4ceecfbd38076ac51f5a18da2ef098abb..20493a354cf486051ec3f47146e48c01a92af83b 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -44,7 +44,7 @@ cc_library(
     testonly = True,
     srcs = ["xla_internal_test_main.cc"],
     deps = [
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -117,12 +117,12 @@ cc_library(
     deps = [
         ":literal_test_util",
         ":test_utils",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
@@ -135,50 +135,13 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
 )
 
-cc_library(
-    name = "hlo_verified_test_base",
-    testonly = True,
-    srcs = ["hlo_verified_test_base.cc"],
-    hdrs = ["hlo_verified_test_base.h"],
-    deps = [
-        ":hlo_test_base",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service:hlo_verifier",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
-tf_cc_test(
-    name = "hlo_verified_test_base_test",
-    srcs = ["hlo_verified_test_base_test.cc"],
-    deps = [
-        ":hlo_test_base",
-        ":hlo_verified_test_base",
-        ":test_macros_cpu",
-        ":test_utils",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service:hlo_verifier",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 tf_cc_binary(
     name = "local_client_aot_test_helper",
     srcs = ["local_client_aot_test_helper.cc"],
@@ -335,6 +298,31 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conv_depthwise_test",
+    timeout = "long",
+    srcs = ["conv_depthwise_test.cc"],
+    blacklisted_backends = [
+        # disabled because of a break b/119590850.
+        "cpu",
+        "gpu",
+    ],
+    shard_count = 50,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
@@ -868,7 +856,8 @@ xla_test(
     name = "convolution_test",
     timeout = "long",
     srcs = ["convolution_test.cc"],
-    shard_count = 25,
+    shard_count = 40,
+    tags = ["optonly"],
     deps = CONVOLUTION_TEST_DEPS + [
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index c131bfd6a6e6d8f3a929145fa06247c3addc5550..0615f9425c1289d666641f4d581946b44b4895ce 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2478,8 +2478,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
   Ne(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,2] {
-  { 00 },
-  { 01 }
+  { 0, 0 },
+  { 0, 1 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2492,8 +2492,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
   Ge(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 1100 },
-  { 0001 }
+  { 1, 1, 0, 0 },
+  { 0, 0, 0, 1 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2506,8 +2506,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
   Gt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 0100 },
-  { 0000 }
+  { 0, 1, 0, 0 },
+  { 0, 0, 0, 0 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2520,8 +2520,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
   Le(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 1011 },
-  { 1111 }
+  { 1, 0, 1, 1 },
+  { 1, 1, 1, 1 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2534,8 +2534,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
   Lt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 0011 },
-  { 1110 }
+  { 0, 0, 1, 1 },
+  { 1, 1, 1, 0 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2744,12 +2744,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   Array3D<int> expected_3d(
       {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}});
   const string expected = R"(pred[2,3,2] {
-{ { 01 },
-  { 00 },
-  { 00 } },
-{ { 01 },
-  { 10 },
-  { 01 } }
+{
+  { 0, 1 },
+  { 0, 0 },
+  { 0, 0 }
+},
+{
+  { 0, 1 },
+  { 1, 0 },
+  { 0, 1 }
+}
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index 9966e4606ef7f104487182e0240e64e4c9e4d834..9930bfc95c297093584d427397cac042c296050f 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -42,7 +42,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) {
       ShapeUtil::MakeShape(F32, {}), input, {}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -58,7 +58,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) {
       ShapeUtil::MakeShape(F32, {2, 2}), input, {}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -81,7 +81,7 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
   builder.AddInstruction(HloInstruction::CreateTuple({element1, element2}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -102,7 +102,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
       ShapeUtil::MakeShape(F32, {2, 2}), input, {0, 1}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -121,7 +121,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) {
       ShapeUtil::MakeShape(F32, {2, 2}), input, {1, 0}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -138,7 +138,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) {
       ShapeUtil::MakeShape(F32, {2, 3, 2}), input, {0, 2}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -158,7 +158,7 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) {
       ShapeUtil::MakeShape(F32, {2, 2, 3, 3}), input, {1}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -183,7 +183,7 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
       ShapeUtil::MakeShape(F32, {3, 3, 3, r1_size}), input, {3}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -214,7 +214,7 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
       ShapeUtil::MakeShape(F32, {32, 64, 7, 7}), input, {1}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -230,7 +230,7 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) {
       ShapeUtil::MakeShape(F32, {64, 64, 3, 3}), input, {}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   LOG(INFO) << hlo_module->ToString();
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
@@ -253,7 +253,7 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
       ShapeUtil::MakeShape(F32, {3, 3, 2, 2}), input, {2, 3}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -287,7 +287,7 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
       ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), input, {0, 1, 2}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 9811a015e91d866d6f4de6ebb6dac536ed6c7e06..4f5b525a34252db9e967a55af0d1bf39a2dd830e 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -492,6 +492,32 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
 
+XLA_TEST_F(ConcatTest, ConcatDeeplyNested) {
+  XlaBuilder builder(TestName());
+  auto a_literal = LiteralUtil::CreateR1<float>({256.0});
+  auto a = Parameter(&builder, 0, a_literal.shape(), "x");
+  auto b = ConcatInDim(&builder, {a, a}, 0);
+  auto c = ConcatInDim(&builder, {b, b}, 0);
+  auto d = ConcatInDim(&builder, {c, c}, 0);
+  auto e = ConcatInDim(&builder, {d, d}, 0);
+  auto f = ConcatInDim(&builder, {e, e}, 0);
+  auto g = ConcatInDim(&builder, {f, f}, 0);
+  auto h = ConcatInDim(&builder, {g, g}, 0);
+  auto i = ConcatInDim(&builder, {h, h}, 0);
+  auto j = ConcatInDim(&builder, {i, i}, 0);
+  auto k = ConcatInDim(&builder, {j, j}, 0);
+  auto l = ConcatInDim(&builder, {k, k}, 0);
+  auto m = ConcatInDim(&builder, {l, l}, 0);
+  auto n = ConcatInDim(&builder, {m, m}, 0);
+  auto o = ConcatInDim(&builder, {n, n}, 0);
+  auto p = ConcatInDim(&builder, {o, o}, 0);
+  auto q = ConcatInDim(&builder, {p, p}, 0);
+  ConcatInDim(&builder, {q, q}, 0);
+  std::vector<float> expected(131072, 256.0);
+  auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie();
+  ComputeAndCompareR1<float>(&builder, expected, {a_data.get()});
+}
+
 // Describes a binary rank-2 concatenation test.
 struct R2BinarySpec {
   int64 lhs_dim0;
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60ce576ceb20b89b59e72d821e63b0ccdee51b0b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct DepthwiseConvolution2DSpec {
+  int64 output_feature, window, stride, pad, lhs_dilate;
+  std::vector<int64> activation_dims;
+  std::vector<int64> activation_layout;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> kernel_layout;
+  std::vector<int64> output_dims;
+  std::vector<int64> output_layout;
+};
+
+class DepthwiseConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<DepthwiseConvolution2DSpec, bool>> {};
+
+static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<DepthwiseConvolution2DSpec> config_set;
+  std::vector<std::vector<int64>> config_options = {
+      {128, 6, 3, 64},  {256, 5, 3, 256},  {256, 5, 2, 144}, {144, 5, 3, 64},
+      {144, 5, 2, 256}, {8, 48, 17, 8},    {128, 20, 6, 64}, {128, 1, 2, 144},
+      {256, 1, 2, 64},  {64, 14, 12, 172}, {16, 9, 4, 16}};
+
+  for (auto option : config_options) {
+    int64 feature = option[0];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[3];
+
+    std::vector<int64> kernel_layout = {3, 2, 1, 0};
+    DepthwiseConvolution2DSpec config;
+    config.output_feature = feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size, feature};
+    config.activation_layout = {3, 0, 2, 1};
+
+    config.kernel_dims = {kernel_size, kernel_size, 1, feature};
+    config.kernel_layout = {3, 2, 1, 0};
+
+    if (activation_size == 1 && kernel_size == 2) {
+      // Test for outer dim.
+      config.output_dims = {batch, activation_size + kernel_size - 1,
+                            activation_size + kernel_size, feature};
+    } else if (feature == 256) {
+      // Restrict dilation-based tests only to one feature configuration.
+      config.stride = activation_size - 1;
+      config.pad = 0;
+      config.lhs_dilate = feature / 32;
+      config.output_dims = {batch, feature / 32,
+                            activation_size - kernel_size + 1, feature};
+    } else {
+      config.stride = config.pad = config.lhs_dilate = -1;
+      config.output_dims = {batch, activation_size - kernel_size + 1,
+                            activation_size - kernel_size + 1, feature};
+    }
+
+    // Try this layout for all kernel shapes.
+    config.output_layout = {3, 0, 2, 1};
+    config_set.push_back(config);
+
+    // Try other layouts only for certain kernel shapes.
+    if (kernel_size % 2 == 0) {
+      config.activation_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.output_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.activation_layout = {3, 0, 2, 1};
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string DepthwiseConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<DepthwiseConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
+      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"), data_type);
+  // -1 indicates non-existence.
+  if (spec.stride != -1) {
+    absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1");
+  }
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextDepthwiseConvolution2D(
+    const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv, is_scheduled=true
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d  pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f,
+          feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.window, spec.window, spec.window, spec.output_feature);
+
+  } else if (spec.stride == -1) {
+    return absl::StrFormat(
+        R"(
+      HloModule TensorFlowDepthwiseConv, is_scheduled=true
+
+      ENTRY main {
+        activation = %s[%s]{%s} parameter(0)
+        kernel = %s[%s]{%s} parameter(1)
+        ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+            window={size=%dx%d}, dim_labels=b01f_01io->b01f,
+            feature_group_count=%d
+      }
+      )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.output_feature);
+  } else {
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv, is_scheduled=true
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, 
+          dim_labels=b01f_01io->b01f, feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature);
+  }
+}
+
+XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) {
+  const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text =
+      BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(
+      hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status {
+        BFloat16MixedPrecisionRemoval remover;
+        TF_RETURN_IF_ERROR(remover.Run(module).status());
+        Despecializer despecializer;
+        return despecializer.Run(module).status();
+      }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    DepthwiseConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 3aebf784664dac14ba2ea45c5a229b7b2e4fc39d..b52d30fd6624c26ad62bd0c5f6a6d74175e4539f 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -597,7 +597,692 @@ TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) {
 }
 
 template <typename T>
-class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
+class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 5};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(6864),  static_cast<T>(7296),  static_cast<T>(7746),
+         static_cast<T>(8214),  static_cast<T>(8700),  static_cast<T>(7809),
+         static_cast<T>(8286),  static_cast<T>(8781),  static_cast<T>(9294),
+         static_cast<T>(9825),  static_cast<T>(10644), static_cast<T>(11256),
+         static_cast<T>(11886), static_cast<T>(12534), static_cast<T>(13200),
+         static_cast<T>(11589), static_cast<T>(12246), static_cast<T>(12921),
+         static_cast<T>(13614), static_cast<T>(14325)});
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+
+    auto filter_r = filter_r1.Reshape(filter_dims);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(
+    Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes,
+    TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {256, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048 * 256, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {256, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048 * 256, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 5};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(6864),  static_cast<T>(7296),  static_cast<T>(7746),
+         static_cast<T>(8214),  static_cast<T>(8700),  static_cast<T>(7809),
+         static_cast<T>(8286),  static_cast<T>(8781),  static_cast<T>(9294),
+         static_cast<T>(9825),  static_cast<T>(10644), static_cast<T>(11256),
+         static_cast<T>(11886), static_cast<T>(12534), static_cast<T>(13200),
+         static_cast<T>(11589), static_cast<T>(12246), static_cast<T>(12921),
+         static_cast<T>(13614), static_cast<T>(14325)});
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(
+    Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes,
+    TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({3, 0, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 1024};
+    std::vector<int64> filter_dims = {3, 3, 1, 1024};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/1024);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(4096, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 1024}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
@@ -656,8 +1341,8 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) {
+TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, Types) {
   this->RunTest();
 }
 
@@ -951,6 +1636,18 @@ ENTRY Test {
   EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
 }
 
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF32ForwardReversed)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f32[3,56,56,16] parameter(0)
+  %arg1 = f32[3,3,3,32] parameter(1)
+  ROOT %conv = f32[54,54,16,32] convolution(%arg0, %arg1), window={size=3x3 rhs_reversal=1x1}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
 XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 1407e68d9a336b6bb1c960711015430f872aa912..3622f2c1e84639baed13059b21b20609d1347da6 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -45,7 +45,7 @@ class CopyOpTest : public HloTestBase {
     builder.AddInstruction(HloInstruction::CreateUnary(
         constant->shape(), HloOpcode::kCopy, constant));
     auto computation = builder.Build();
-    auto module = CreateNewModule();
+    auto module = CreateNewUnverifiedModule();
     module->AddEntryComputation(std::move(computation));
 
     Literal result = ExecuteAndTransfer(std::move(module), {});
@@ -98,7 +98,7 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
 
   auto computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
 
   Literal result = ExecuteAndTransfer(std::move(module), {&literal});
@@ -119,7 +119,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2Twice) {
 
   auto computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   Literal result = ExecuteAndTransfer(std::move(module), {});
   LiteralTestUtil::ExpectR2Near<float>({{1.0, 2.0}, {3.0, 4.0}}, result,
@@ -143,7 +143,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   Literal result = ExecuteAndTransfer(std::move(module), {});
 
@@ -175,7 +175,7 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   ForceResultLayout(module.get(), LayoutUtil::MakeLayout({1, 2, 0}));
   Literal result = ExecuteAndTransfer(std::move(module), {});
@@ -209,7 +209,7 @@ void CopyOpTest::TestCopyConstantLayoutR4(size_t n1, size_t n2, size_t n3,
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   ForceResultLayout(module.get(), LayoutUtil::MakeLayout(permutation));
   Literal result = ExecuteAndTransfer(std::move(module), {});
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 001490c6a8c568656437465054ee4db40d0d8dee..738b6442354b01364278e3e3c713aa2cdb5cf47d 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -70,7 +70,7 @@ class CustomCallTest : public HloTestBase {
 };
 
 XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto constant = builder.AddInstruction(
@@ -85,7 +85,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
 }
 
 XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   Array2D<float> array(2, 2);
@@ -106,7 +106,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
 }
 
 XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
   auto input = b.AddInstruction(
@@ -130,7 +130,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
 }
 
 XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
   auto input =
@@ -155,7 +155,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
   // The argument and result of the computation are set to different layouts,
   // but the custom call is layout constrained to a fixed operand and result
   // layout, so the correct result should be produced.
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
   auto input =
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 4d4b676a538947c8dd92a7e34db72e45766cae2c..d1fddf9d6b494a822610e41307fa103dc90bdef3 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -81,7 +81,7 @@ class FusionTest : public HloTestBase {
     }
 
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
 
     auto prim_type = primitive_util::NativeToPrimitiveType<T>();
 
@@ -183,7 +183,7 @@ XLA_TEST_F(FusionTest, Test) {
   //                     (-{{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}),
   //              {{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})) = {{0.5}, {2.72}}
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<float>({{1.0}, {2.0}, {3.0}})));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -231,7 +231,7 @@ XLA_TEST_F(FusionTest, Parameter) {
   // Build a computation and fuse part of it so the fusion instruction has an
   // operand parameter.
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<float>({{1.0, 2.0, 3.0}})));
   auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
@@ -266,7 +266,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
       ShapeUtil::MakeShapeWithLayout(F32, {rand_dim0_size, dim1_size}, {1, 0});
   // Build simple fusion computation: y = x^2 (elementwise).
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto two = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
@@ -290,7 +290,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
 
 XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
   auto const_array = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -314,7 +314,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
 
 XLA_TEST_F(FusionTest, ReshapeToScalar) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto single_element_array = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR2<int32>({{5}})));
   auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -329,7 +329,7 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) {
 
 XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -344,7 +344,7 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
 
 XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}})));
   auto reshape1 = builder.AddInstruction(
@@ -359,7 +359,7 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
 
 XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR3<int32>({{{7}}})));
   auto reshape1 = builder.AddInstruction(
@@ -374,7 +374,7 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
 
 XLA_TEST_F(FusionTest, Reshape__1by1by1) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -389,7 +389,7 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) {
 
 XLA_TEST_F(FusionTest, Reshape__) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(
@@ -404,7 +404,7 @@ XLA_TEST_F(FusionTest, Reshape__) {
 
 XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(
@@ -419,7 +419,7 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
 
 XLA_TEST_F(FusionTest, Transpose_2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -434,7 +434,7 @@ XLA_TEST_F(FusionTest, Transpose_2by3) {
 
 XLA_TEST_F(FusionTest, Transpose_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -449,7 +449,7 @@ XLA_TEST_F(FusionTest, Transpose_3by3) {
 
 XLA_TEST_F(FusionTest, Reverse) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -465,7 +465,7 @@ XLA_TEST_F(FusionTest, Reverse) {
 
 XLA_TEST_F(FusionTest, ReverseNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -483,7 +483,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) {
 
 XLA_TEST_F(FusionTest, BroadcastNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
   auto broadcast1 = builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -501,7 +501,7 @@ XLA_TEST_F(FusionTest, BroadcastNegate) {
 
 XLA_TEST_F(FusionTest, SliceNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto slice1 = builder.AddInstruction(HloInstruction::CreateSlice(
@@ -519,7 +519,7 @@ XLA_TEST_F(FusionTest, SliceNegate) {
 
 XLA_TEST_F(FusionTest, DynamicSliceNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto const1 = builder.AddInstruction(
@@ -541,7 +541,7 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) {
 
 XLA_TEST_F(FusionTest, ReshapeNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto reshape1 = builder.AddInstruction(
@@ -559,7 +559,7 @@ XLA_TEST_F(FusionTest, ReshapeNegate) {
 
 XLA_TEST_F(FusionTest, TransposeNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}})));
   auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -587,7 +587,7 @@ std::unique_ptr<HloComputation> MakeReduceTestComputation() {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -607,7 +607,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -630,7 +630,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
   auto const1 = builder.AddInstruction(
@@ -682,7 +682,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
 // into a fusion, it should remain shared, rather than being duplicated
 // within the fusion.
 XLA_TEST_F(FusionTest, SharedConstant) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 7ab2ecda58666acd7e9b8587d200a902b75822f3..989a7c705a8254f99e5cc0e97dfde5942f146964 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -23,8 +23,8 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -85,6 +85,25 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
 
 }  // namespace
 
+Status VerifiedHloModule::Verify() {
+  if (computation_count() == 0) {
+    // The computation was never built. Nothing to verify.
+    return Status::OK();
+  }
+  return verifier_.Run(this).status();
+}
+
+void VerifiedHloModule::VerifyOrAddFailure(const string& message) {
+  Status status = Verify();
+  if (!status.ok()) {
+    ADD_FAILURE() << "HloVerifier failed on module " << name()
+                  << (message.empty() ? "" : absl::StrCat(" (", message, ")"))
+                  << ": " << status;
+    LOG(ERROR) << "Contents of bad module:";
+    XLA_LOG_LINES(tensorflow::ERROR, ToString());
+  }
+}
+
 HloTestBase::HloTestBase(bool verifier_layout_sensitive,
                          bool allow_mixed_precision_in_hlo_verifier,
                          std::function<bool(const HloInstruction*)>
@@ -100,17 +119,40 @@ HloTestBase::HloTestBase(se::Platform* test_platform,
                          bool allow_mixed_precision_in_hlo_verifier,
                          std::function<bool(const HloInstruction*)>
                              instruction_can_change_layout_func)
-    : test_runner_(test_platform), reference_runner_(reference_platform) {
+    : test_runner_(test_platform),
+      reference_runner_(reference_platform),
+      verifier_layout_sensitive_(verifier_layout_sensitive),
+      allow_mixed_precision_in_hlo_verifier_(
+          allow_mixed_precision_in_hlo_verifier) {
   hlo_verifier_ = absl::make_unique<HloVerifier>(
       /*layout_sensitive=*/verifier_layout_sensitive,
       /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier,
       instruction_can_change_layout_func);
 }
 
-std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
+std::unique_ptr<HloModule> HloTestBase::CreateNewUnverifiedModule(
+    const string& name) {
   return absl::make_unique<HloModule>(name, GetModuleConfigForTest());
 }
 
+std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
+    const string& name) {
+  return absl::make_unique<VerifiedHloModule>(
+      name, GetModuleConfigForTest(), verifier_layout_sensitive_,
+      allow_mixed_precision_in_hlo_verifier_);
+}
+
+StatusOr<std::unique_ptr<VerifiedHloModule>>
+HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
+                                          const HloModuleConfig& config) {
+  auto module = absl::make_unique<VerifiedHloModule>(
+      TestName(), config, verifier_layout_sensitive_,
+      allow_mixed_precision_in_hlo_verifier_);
+  TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
+  TF_RETURN_IF_ERROR(module->Verify());
+  return std::move(module);
+}
+
 /* static */
 StatusOr<bool> HloTestBase::RunHloPass(HloPassInterface* hlo_pass,
                                        HloModule* module) {
@@ -135,7 +177,7 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) {
 }
 
 DebugOptions HloTestBase::GetDebugOptionsForTest() {
-  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  auto debug_options = GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
   debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 217428befa474448cf2dcbae2eb6cb5b0e61d44c..1d1e7f437296a7493ef7da07039fcf6d273f35bc 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/base/macros.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/backend.h"
@@ -38,6 +39,31 @@ limitations under the License.
 
 namespace xla {
 
+// An HLO module derived class which verifies itself on destruction. This class
+// is intended to be used in unit tests. Any verification errors are raised via
+// ADD_FAILURE.
+class VerifiedHloModule : public HloModule {
+ public:
+  VerifiedHloModule(const string& name, const HloModuleConfig& config,
+                    bool verifier_layout_sensitive,
+                    bool allow_mixed_precision_in_hlo_verifier)
+      : HloModule(name, config),
+        verifier_(verifier_layout_sensitive,
+                  allow_mixed_precision_in_hlo_verifier) {}
+
+  ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
+
+  // Verifies the module using HloVerifier and returns the status.
+  Status Verify();
+
+  // Verifies the module and flags any error with ADD_FAILURE. 'message' is
+  // included in the failure message.
+  void VerifyOrAddFailure(const string& message);
+
+ private:
+  HloVerifier verifier_;
+};
+
 // A base class for tests which build and/or run HLO code. The class includes
 // support for running an HLO module on two platforms and compare the results.
 // This is a lower level of abstraction than using the client interface and
@@ -72,7 +98,22 @@ class HloTestBase : public ::testing::Test {
   // options from command-line flags. If you want a fresh HloModule object and
   // then add HloComputations to it, it's recommended to use this method in your
   // tests.
-  std::unique_ptr<HloModule> CreateNewModule(const string& name = TestName());
+  //
+  // This returns a vanilla HloModule that doesn't run the HLO verifier on
+  // destruction.
+  ABSL_DEPRECATED("Use CreateNewVerifiedModule instead.")
+  std::unique_ptr<HloModule> CreateNewUnverifiedModule(
+      const string& name = TestName());
+
+  // Like CreateNewUnverifiedModule, except the HloModule returned here runs the
+  // HLO verifier on destruction.
+  std::unique_ptr<VerifiedHloModule> CreateNewVerifiedModule(
+      const string& name = TestName());
+
+  // Parses the given string and returns module as a VerifiedHloModule.
+  StatusOr<std::unique_ptr<VerifiedHloModule>> ParseAndReturnVerifiedModule(
+      absl::string_view hlo_text,
+      const HloModuleConfig& config = HloModuleConfig());
 
   // Runs the hlo_pass with the provided module and returns the result. This
   // function also verifies that the module remains unchanged when hlo_pass
@@ -247,6 +288,8 @@ class HloTestBase : public ::testing::Test {
   HloRunner test_runner_;
   HloRunner reference_runner_;
 
+  bool verifier_layout_sensitive_;
+  bool allow_mixed_precision_in_hlo_verifier_;
   std::unique_ptr<HloVerifier> hlo_verifier_;
 
   ErrorSpec error_spec_{0.0001};
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
deleted file mode 100644
index 8bd0a729b77f3ec14204952cb0062103c823883e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-
-#include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/service/hlo_parser.h"
-#include "tensorflow/compiler/xla/service/hlo_verifier.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-
-Status VerifiedHloModule::Verify() {
-  if (computation_count() == 0) {
-    // The computation was never built. Nothing to verify.
-    return Status::OK();
-  }
-  return verifier_.Run(this).status();
-}
-
-void VerifiedHloModule::VerifyOrAddFailure(const string& message) {
-  Status status = Verify();
-  if (!status.ok()) {
-    ADD_FAILURE() << "HloVerifier failed on module " << name()
-                  << (message.empty() ? "" : absl::StrCat(" (", message, ")"))
-                  << ": " << status;
-  }
-}
-
-HloVerifiedTestBase::HloVerifiedTestBase(bool layout_sensitive,
-                                         bool allow_mixed_precision)
-    : HloTestBase(
-          /*verifier_layout_sensitive=*/layout_sensitive,
-          /*allow_mixed_precision_in_hlo_verifier=*/allow_mixed_precision),
-      verifier_layout_sensitive_(layout_sensitive),
-      allow_mixed_precision_in_hlo_verifier_(allow_mixed_precision) {}
-
-HloModule& HloVerifiedTestBase::module() {
-  if (!module_) {
-    module_ = CreateNewVerifiedModule(TestName());
-  }
-  return *module_;
-}
-
-HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) {
-  modules_.emplace_back(CreateNewVerifiedModule(name));
-  return modules_.back().get();
-}
-
-void HloVerifiedTestBase::ParseAndVerifyModule(absl::string_view hlo_text,
-                                               const HloModuleConfig& config) {
-  CHECK(!module_) << "Called ParseModule when test already has a module.";
-  module_ = CreateNewVerifiedModule(TestName());
-  TF_CHECK_OK(ParseHloString(hlo_text, module_.get()));
-  module_->VerifyOrAddFailure("after parsing");
-}
-
-StatusOr<std::unique_ptr<VerifiedHloModule>>
-HloVerifiedTestBase::ParseAndReturnVerifiedModule(
-    absl::string_view hlo_text, const HloModuleConfig& config) {
-  auto module = CreateNewVerifiedModule(TestName());
-  TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
-  TF_RETURN_IF_ERROR(module->Verify());
-  return std::move(module);
-}
-
-std::unique_ptr<VerifiedHloModule> HloVerifiedTestBase::CreateNewVerifiedModule(
-    const string& name) {
-  return absl::make_unique<VerifiedHloModule>(
-      name, GetModuleConfigForTest(), verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
deleted file mode 100644
index 388a99bb36408665edbc20ade6c6a733d64db88d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_
-#define TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_
-
-#include <functional>
-#include <memory>
-#include <utility>
-
-#include "absl/base/macros.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-
-namespace xla {
-
-// An HLO module derived class which verifies itself on destruction. This class
-// is intended to be used in unit tests. Any verification errors are raised via
-// ADD_FAILURE.
-class VerifiedHloModule : public HloModule {
- public:
-  VerifiedHloModule(const string& name, const HloModuleConfig& config,
-                    bool verifier_layout_sensitive,
-                    bool allow_mixed_precision_in_hlo_verifier)
-      : HloModule(name, config),
-        verifier_(verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
-
-  ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
-
-  // Verifies the module using HloVerifier and returns the status.
-  Status Verify();
-
-  // Verifies the module and flags any error with ADD_FAILURE. 'message' is
-  // included in the failure message.
-  void VerifyOrAddFailure(const string& message);
-
- private:
-  HloVerifier verifier_;
-};
-
-// A base class for HLO tests that stores a default VerifiedHloModule.
-class HloVerifiedTestBase : public HloTestBase {
- protected:
-  HloVerifiedTestBase(bool layout_sensitive = false,
-                      bool allow_mixed_precision = false);
-
-  // Constructs a default shape verifier.
-  std::unique_ptr<ShapeVerifier> MakeShapeVerifier();
-
-  // Returns the default HloModule, lazily creating it if necessary via
-  // HloTestBase::CreateNewModule().
-  ABSL_DEPRECATED("Use CreateNewVerifiedModule() instead.")
-  HloModule& module();
-
-  ABSL_DEPRECATED("Use ParseAndReturnVerifiedModule() instead.")
-  void ParseAndVerifyModule(absl::string_view hlo_text,
-                            const HloModuleConfig& config = HloModuleConfig());
-
-  // Parses the given string and returns module as a VerifiedHloModule.
-  StatusOr<std::unique_ptr<VerifiedHloModule>> ParseAndReturnVerifiedModule(
-      absl::string_view hlo_text,
-      const HloModuleConfig& config = HloModuleConfig());
-
-  // Creates a new module for a test, and stores it in modules_ so it can be
-  // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent
-  // creation of unverified modules.
-  ABSL_DEPRECATED("Use CreateNewVerifiedModule() instead.")
-  HloModule* CreateNewModule(const string& name = TestName());
-
-  // Creates and returns a verified HLO module with the given name.
-  std::unique_ptr<VerifiedHloModule> CreateNewVerifiedModule(
-      const string& name = TestName());
-
- private:
-  // It is confusing to store modules created by module() and CreateNewModule()
-  // in different fields, but it allows us to migrate tests to
-  // HloVerifiedTestBase more easily, so it's a win because we can verify more
-  // modules. See b/80488902.
-  //
-  // Lazily populated. Access via module().
-  std::unique_ptr<VerifiedHloModule> module_;
-
-  // Populated by calls to CreateNewModule.
-  std::vector<std::unique_ptr<VerifiedHloModule>> modules_;
-
-  bool verifier_layout_sensitive_;
-  bool allow_mixed_precision_in_hlo_verifier_;
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc
deleted file mode 100644
index 5c0263e811f94c90a69a460525ffa0c65127ebb5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/hlo_verifier.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-namespace {
-
-// This class includes unit tests which are expected to fail because invalid HLO
-// modules are intentionally built. Unfortunately, Tensorflow doesn't appear to
-// include the necessary gunit parts to test this test machinery (needs the
-// macro EXPECT_NONFATAL_FAILURE). The disabled tests can be run with the
-// disabled tests enabled and failures can be manually compared against
-// expectations.
-class HloVerifiedTestBaseTest : public HloVerifiedTestBase {};
-
-XLA_TEST_F(HloVerifiedTestBaseTest, NoModule) {
-  // Test shouldn't fail if no module is created at all.
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, GoodLazilyCreatedModule) {
-  // Use module() to lazily create an empty module, build it up, and verify no
-  // failures.
-  HloModule& hlo_module = module();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  hlo_module.AddEntryComputation(builder.Build());
-}
-
-// This test is expected to fail. See test class comment.
-XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_BadLazilyCreatedModule) {
-  // Use module() to lazily create an empty module and build up an invalid
-  // module.
-  HloModule& hlo_module = module();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  hlo_module.AddEntryComputation(builder.Build());
-
-  *hlo_module.entry_computation()->root_instruction()->mutable_shape() =
-      ShapeUtil::MakeShape(PRED, {1, 2, 3});
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, GoodCreateNewModule) {
-  // Call CreateNewModule and build up a valid module.
-  HloModule* module = CreateNewModule();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  module->AddEntryComputation(builder.Build());
-}
-
-// This test is expected to fail. See test class comment.
-XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_BadCreateNewModule) {
-  // Call CreateNewModule and build up a invalid module.
-  HloModule* module = CreateNewModule();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  module->AddEntryComputation(builder.Build());
-
-  *module->entry_computation()->root_instruction()->mutable_shape() =
-      ShapeUtil::MakeShape(PRED, {1, 2, 3});
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndVerifyModuleGood) {
-  const char* const hlo_string = R"(
-HloModule ParseAndVerifyModuleGood
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[] add(x,y)
-}
-)";
-
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_EQ(module().entry_computation()->instruction_count(), 3);
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndReturnVerifiedModuleGood) {
-  const char* const hlo_string = R"(
-HloModule ParseAndReturnVerifiedModuleGood
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[] add(x,y)
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  EXPECT_EQ(module->entry_computation()->instruction_count(), 3);
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndReturnVerifiedModuleInvalidText) {
-  const char* const hlo_string = R"(
-HloModule ParseAndReturnVerifiedModuleGood
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[] add(x,y)
-}
-
-RANDOM GARBAGE
-)";
-
-  ASSERT_IS_NOT_OK(ParseAndReturnVerifiedModule(hlo_string).status());
-}
-
-// This test is expected to fail. See test class comment.
-XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_ParseAndReturnVerifiedModuleBad) {
-  const char* const hlo_string = R"(
-HloModule ParseAndReturnVerifiedModuleBad
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[1234] add(x,y)
-}
-)";
-
-  ASSERT_IS_NOT_OK(ParseAndReturnVerifiedModule(hlo_string).status());
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index c622b295094e53e63d0ed692d428bc97724c787c..a78ccacec114858740bf1b9c04e9b688bca5818d 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -68,7 +68,7 @@ class LLVMCompilerTest : public ::testing::Test {
     builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
 
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
     hlo_module->AddEntryComputation(builder.Build());
 
     compiler->SetPreOptimizationHook(pre_opt_hook);
@@ -90,7 +90,7 @@ class LLVMCompilerTest : public ::testing::Test {
     builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
 
-    std::unique_ptr<HloModule> hlo_module = CreateNewModule();
+    std::unique_ptr<HloModule> hlo_module = CreateNewUnverifiedModule();
     hlo_module->AddEntryComputation(builder.Build());
 
     auto module_group = absl::make_unique<HloModuleGroup>("test_module_group");
@@ -124,9 +124,9 @@ class LLVMCompilerTest : public ::testing::Test {
     return ::testing::UnitTest::GetInstance()->current_test_info()->name();
   }
 
-  static std::unique_ptr<HloModule> CreateNewModule() {
+  static std::unique_ptr<HloModule> CreateNewUnverifiedModule() {
     HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config.set_debug_options(GetDebugOptionsFromFlags());
     return absl::make_unique<HloModule>(TestName(), config);
   }
 };
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index ca7637a0cfa5d837dfd86aadafd1e5cc19ffc22e..3f5135438fc59bea98527b1be30ee49339edd455 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -62,7 +62,7 @@ class MultiOutputFusionTest : public HloTestBase {
 
   void RunTest2D(bool manual_fusion, int64 size) {
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
 
     const Shape elem_shape0 = ShapeUtil::MakeShapeWithLayout(F32, {}, {});
     const Shape elem_shape2 =
@@ -122,7 +122,7 @@ class MultiOutputFusionTest : public HloTestBase {
 
   void RunTest1D(bool manual_fusion, int size) {
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
 
     const Shape elem_shape_F32 =
         ShapeUtil::MakeShapeWithDescendingLayout(F32, {size});
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 58539e6b061b0cec1cc660b52e78894e5deeea56..774eb8d2a85914c52597144e70838ee117ee1134 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -87,8 +87,8 @@ TEST_F(PredTest, ConstantR2Pred) {
   XlaBuilder builder(TestName());
   ConstantR2<bool>(&builder, {{false, true, true}, {true, false, false}});
   const string expected = R"(pred[2,3] {
-  { 011 },
-  { 100 }
+  { 0, 1, 1 },
+  { 1, 0, 0 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc
index 7e1f4aa0eb4801876d9bdbac6a4d7f1d09f81ba8..32de0fdf78f9c442e17c55e1b951e39122dac5ef 100644
--- a/tensorflow/compiler/xla/tests/scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/scatter_test.cc
@@ -129,6 +129,42 @@ ENTRY main {
   RunTest(hlo_text, &operand, &scatter_indices, &updates);
 }
 
+XLA_TEST_F(ScatterTest, TensorFlowScatterV2_InversePermutation) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV2
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  permutation = s32[3,4] parameter(0)
+  reshape = s32[3,4,1] reshape(permutation)
+  operand = s32[3,4] iota(), iota_dimension=1
+  updates = s32[3,4,1,1] iota(), iota_dimension=1
+  iota = s32[3,4,1] iota(), iota_dimension=0
+  indices = s32[3,4,2] concatenate(iota, reshape), dimensions={2}
+  ROOT scatter = s32[3,4] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={2,3},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=2
+}
+)";
+  Literal permutation =
+      LiteralUtil::CreateR2<int32>({{1, 3, 2, 0}, {3, 0, 2, 1}, {2, 3, 1, 0}});
+  HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsForTest());
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_text, config));
+  auto actual = ExecuteAndTransfer(std::move(module), {&permutation});
+  Literal expected =
+      LiteralUtil::CreateR2<int32>({{3, 0, 2, 1}, {1, 3, 2, 0}, {3, 2, 0, 1}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, actual));
+}
+
 XLA_TEST_F(ScatterTest, SimpleR4) {
   const char* hlo_text = R"(
 HloModule SimpleR4
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 2cc33ab0963afe8ba2d8e9a6972dcf0622e27c48..3fb69419e735bfd9c5054673e0687f5139a410cb 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -166,6 +166,26 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
 }
 
+TEST_F(SliceTest, SliceOfReshape) {
+  Array2D<int> values(2 * 3 * 24, 7);
+  values.FillIota(1);
+  XlaBuilder builder(TestName());
+  auto original = ConstantR2FromArray2D(&builder, values);
+  auto reshape = Reshape(original, {24, 3, 2, 7});
+  Slice(reshape, {0, 0, 0, 0}, {11, 3, 2, 7}, {1, 1, 1, 1});
+  ComputeAndCompare(&builder, {});
+}
+
+TEST_F(SliceTest, SliceOfCollapsingReshape) {
+  Array4D<int> values(2, 3, 5, 7);
+  values.FillIota(1);
+  XlaBuilder builder(TestName());
+  auto original = ConstantR4FromArray4D(&builder, values);
+  auto reshape = Reshape(original, {2 * 3 * 5, 7});
+  Slice(reshape, {0, 0}, {4, 7}, {1, 1});
+  ComputeAndCompare(&builder, {});
+}
+
 XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) {
   Array4D<float> values(2, 4, 6, 8);
   values.FillRandom(3.14f);
@@ -253,7 +273,6 @@ XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run<bool>(GetParam()); }
 
-
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index b34fd0f2e873214c509533f29553af914ddc984d..a2b7c26331b3cc89ed0413efe8eb31c2b9e37038 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -28,7 +28,7 @@ namespace {
 class TokenHloTest : public HloTestBase {};
 
 XLA_TEST_F(TokenHloTest, SingleTokenInstruction) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(HloInstruction::CreateToken());
 
@@ -38,8 +38,22 @@ XLA_TEST_F(TokenHloTest, SingleTokenInstruction) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, LiteralUtil::CreateToken()));
 }
 
+XLA_TEST_F(TokenHloTest, TokenInTuple) {
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto token = builder.AddInstruction(HloInstruction::CreateToken());
+  builder.AddInstruction(HloInstruction::CreateTuple({token}));
+
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Execute(std::move(module), {}));
+  Literal token_literal = LiteralUtil::CreateToken();
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(result, LiteralUtil::MakeTuple({&token_literal})));
+}
+
 XLA_TEST_F(TokenHloTest, TokenTree) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
   auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
@@ -54,7 +68,7 @@ XLA_TEST_F(TokenHloTest, TokenTree) {
 }
 
 XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
@@ -75,7 +89,7 @@ XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) {
 }
 
 XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(HloInstruction::CreateParameter(
       0,
@@ -95,7 +109,7 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
 }
 
 XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 376559500efad6a756f8a0f60f0a522db047c0e5..ca036f1ae0d5e31a3f83d9d31c80e070c2a666df 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -91,8 +91,8 @@ Status ParseOneProfileOutputLine(
   string match_usecs = "([0-9.]+) usec";
   string match_flops = "([^ ]*)";
   string match_trops = "([^ ]*)";
-  string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
-  string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
+  string match_bytes_per_sec = "([0-9.TGMKi]*)(?:B/s)?";
+  string match_bytes_per_cycle = "([0-9.TGMKi]*)(?:B/cycle)?";
 
   // The underlined part is what we're trying to match with match_opcode:
   //
@@ -307,6 +307,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
   string profile_output;
   ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape,
                          matrix_shape);
+  SCOPED_TRACE(profile_output);
 
   std::vector<string> profile_output_lines =
       absl::StrSplit(profile_output, '\n');
@@ -318,14 +319,13 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.cend());
 
-  auto while_body_profile_end = std::find_if(
-      while_body_profile_start, profile_output_lines.end(),
-      [](absl::string_view s) {
-        return absl::StartsWith(s, "********** microseconds report **********");
-      });
+  auto while_body_profile_end =
+      std::find_if(while_body_profile_start, profile_output_lines.end(),
+                   [](absl::string_view s) {
+                     return absl::StartsWith(s, "********** microseconds ");
+                   });
 
-  // We emit a blank line before the "********** microseconds report **********"
-  // line.
+  // We emit a blank line before the "microseconds report" line.
   while_body_profile_end--;
 
   ASSERT_NE(while_body_profile_end, profile_output_lines.end());
@@ -380,7 +380,7 @@ static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);
 
   auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 15603619b62d8f45cdce97ac7d83924a78f88cf3..dca0aa52a533130372759156a3238f1a3b10ca42 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "absl/strings/match.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
   if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
     LOG(ERROR) << "\n" << usage;
@@ -49,7 +49,7 @@ GTEST_API_ int main(int argc, char** argv) {
       // different API than Tensorflow's.
       testing::InitGoogleTest(&argc, argv);
 #if defined(PLATFORM_GOOGLE)
-      base::SetFlag(&FLAGS_benchmarks, pattern);
+      absl::SetFlag(&FLAGS_benchmarks, pattern);
       RunSpecifiedBenchmarks();
 #else
       tensorflow::testing::Benchmark::Run(pattern);
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 3a086c66bbb37965b1ad7c83a93f0054ae723e87..8926bbed2b54fceaaf0e6e991f0e881d35731ef4 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -33,6 +33,7 @@ cc_library(
     name = "dumped_computation_to_graphviz_library",
     srcs = ["dumped_computation_to_graphviz.cc"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -40,7 +41,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
@@ -78,6 +78,7 @@ cc_library(
     name = "replay_computation_library",
     srcs = ["replay_computation.cc"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -91,7 +92,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:testing",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service/gpu:infeed_manager",
@@ -207,13 +207,13 @@ tf_cc_binary(
     name = "dumped_computation_to_tf_graphdef",
     srcs = ["dumped_computation_to_tf_graphdef.cc"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:hlo_proto",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
index c866a13de7543fc948311f94708bc6b904717b62..b623556468fb4a5d96be614b6c067d5a1df51a6f 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,7 +54,7 @@ void RealMain(absl::Span<char* const> args) {
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
     XlaComputation computation =
         client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    DebugOptions debug_options = GetDebugOptionsFromFlags();
     debug_options.set_xla_generate_hlo_graph(".*");
     ComputationStats stats =
         client->GetComputationStats(computation, debug_options)
@@ -68,7 +68,7 @@ void RealMain(absl::Span<char* const> args) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
index 07ef5ff656bb48519a700a1d7d6c60b655a40ed6..f8bb9a6b1e217fc4e6e15c8a3302be61ed339c82 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -53,7 +53,7 @@ void RealMain(absl::Span<char* const> args) {
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
     XlaComputation computation =
         client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    DebugOptions debug_options = GetDebugOptionsFromFlags();
     debug_options.set_xla_generate_hlo_graph(".*");
     debug_options.set_xla_hlo_dump_as_graphdef(true);
     ComputationStats stats =
@@ -68,7 +68,7 @@ void RealMain(absl::Span<char* const> args) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 109411f99b6eb000474b0c61783c51f42d43bb6d..47be9f5adf1063463d7678579a7f394684aaf357 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -47,8 +47,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -191,8 +191,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
 
   // Run the computation num_runs times, and return the result from the last
   // execution.
-  const bool xla_hlo_profile =
-      legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile();
+  const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile();
   StreamExecutorMemoryAllocator allocator(
       client->platform(),
       {client->platform()->ExecutorForDevice(0).ValueOrDie()});
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 8ce741647414a1fa75e6d706ec1e719ace7b7cc8..b015f4328a15473db862b753c907975856383a79 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -387,6 +387,19 @@ T CeilOfRatio(T dividend, T divisor) {
   return tensorflow::MathUtil::CeilOfRatio<T>(dividend, divisor);
 }
 
+template <typename T>
+std::vector<T> ElementWiseCeilOfRatio(absl::Span<const T> dividends,
+                                      absl::Span<const T> divisors) {
+  std::vector<T> ceil_of_ratios;
+  CHECK_EQ(dividends.size(), divisors.size());
+  ceil_of_ratios.reserve(dividends.size());
+  absl::c_transform(dividends, divisors, std::back_inserter(ceil_of_ratios),
+                    [](const T dividend, const T divisor) {
+                      return CeilOfRatio<T>(dividend, divisor);
+                    });
+  return ceil_of_ratios;
+}
+
 // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio
 // then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16
 template <typename T>
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 8ea8dbab2574ca1e24271e7c1c7762d4a6b6a8de..51c73b3d17e4c32d9a8a14d3055ab56f02922af3 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -185,6 +185,17 @@ bool HasWindowReversal(const Window& window) {
   return false;
 }
 
+bool AllOrNoneReversed(const Window& window) {
+  if (window.dimensions().empty()) {
+    return true;
+  }
+  bool reversed = window.dimensions()[0].window_reversal();
+  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
+                     [&](const WindowDimension& dim) {
+                       return dim.window_reversal() == reversed;
+                     });
+}
+
 bool HasDilation(const Window& window) {
   return HasBaseDilation(window) || HasWindowDilation(window);
 }
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 1fb9e855fc16f334eb0e83dfd27b307b2149628f..099d7ecdd5c732ffc8c6ff6370288a2fc4144fa2 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -56,6 +56,7 @@ bool HasWindowDilation(const Window& window);
 bool HasDilation(const Window& window);
 
 bool HasWindowReversal(const Window& window);
+bool AllOrNoneReversed(const Window& window);
 
 // Returns true if the given logical dimension is inactive in the sense that it
 // has window bound 1, no striding and no padding.
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 65948ef4b0c3d51805b15634e6215f192e740aaa..28df3b03f398841460189910bc3a5096dfb0d367 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -322,6 +322,34 @@ message UnregisterRequest {
 message UnregisterResponse {
 }
 
+message CompileRequest {
+  // The graph to be compiled.
+  HloModuleProto computation = 1;
+
+  // Options that affect how XLA compiles code to service this request.
+  ExecutionOptions execution_options = 2;
+
+  // The layouts of the input arguments. If not set, the default layout will be
+  // used. Although the real arguments are not needed in compilation, the
+  // layouts of the arguments can affect the compilation.
+  repeated Shape input_shape_with_layout = 3;
+}
+
+message CompileResponse {
+  // The handle to the executable.
+  ExecutionHandle handle = 1;
+}
+
+message ExecuteRequest {
+  ExecutionHandle handle = 1;
+
+  // The shape and layout of the arguments must be the same as the those of the
+  // executable's parameters.
+  repeated GlobalDataHandle arguments = 2;
+}
+
+// TODO(b/118493728): Remove this and ExecuteGraphParallelRequest and replace
+// the uses with calls to Compile and Execute.
 message ExecuteGraphRequest {
   HloModuleProto computation = 1;
   repeated GlobalDataHandle arguments = 2;
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index b6bd919e2b26a109cb9dfd2a6aaba86f1732cff1..683ccc40f162ead3a248aee83d9abf3086a1ac93 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -332,11 +332,13 @@ message LiteralProto {
   repeated double f64s = 9;
   repeated float c64s = 12;  // Stored as interleaved real, imag floats.
   repeated LiteralProto tuple_literals = 10;
-  // The F16s and BF16s are encoded in little endian byte order
+  // The F16s, BF16s, U16s and S16s are encoded in little endian byte order
   bytes f16s = 11;
   bytes bf16s = 13;
+  bytes u16s = 16;
+  bytes s16s = 17;
   repeated int64 sparse_indices = 14;
-  // Next = 16
+  // Next = 18
 }
 
 message WindowDimension {
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index 9e3d2454d16730c1d1f93cb384db88544380f77e..67f475846e5f16060c1080759b0acb4216c4e72b 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -12,6 +12,7 @@ cc_library(
     hdrs = ["xrt_state_ops.h"],
     deps = [
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -21,7 +22,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:hlo_proto",
diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto
index 5678f0905ff5b8956e0811026e7450acba8815e9..6ab77fbaaf0cbe23503ebc71775f52af01e41a74 100644
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@@ -6,6 +6,24 @@ import "tensorflow/compiler/tf2xla/host_compute_metadata.proto";
 import "tensorflow/compiler/xla/xla_data.proto";
 import "tensorflow/compiler/xla/service/hlo.proto";
 
+message DeviceAssignment {
+  message ComputationDevice {
+    message DeviceMeshCoordinates {
+      // The mesh coordinates for the device. Usually (X, Y, Core), in the order
+      // in which they are returned in the TopologyProto.
+      //  X    = value(0)
+      //  Y    = value(1)
+      //  Core = value(2)
+      repeated int32 value = 1;
+    }
+    // As many replicas as there are in the replicated computation.
+    repeated DeviceMeshCoordinates replica_devices = 1;
+  }
+  // As many ComputationDevice as many there are computations (number
+  // of cores per replica).
+  repeated ComputationDevice computation_devices = 1;
+}
+
 // Options for an XLA compilation.
 message XLAComputationConfig {
   // The number of replicas the computation will be run on. If this is
@@ -23,6 +41,11 @@ message XLAComputationConfig {
   // computation. per_core_args_and_result_shapes is optional for a
   // single-core computation.
   repeated xla.ProgramShape per_core_program_shape = 5;
+  // Describes how replicated computation instances should be assigned to
+  // devices. There are num_cores_per_replica computations, and each one will be
+  // sent and executed to the set of replica device numbers described in the
+  // DeviceAssignment proto.
+  DeviceAssignment device_assignment = 6;
 }
 
 // Options and XLA computation for a compilation.
diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index a513aa1e7c49d64a860c740fffde156fb5bcbcf3..f6c6560c1c354ed8a36b98b1f564835eb9958e55 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -9,8 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
 py_library(
     name = "all_reduce_py",
     srcs = ["__init__.py"],
@@ -29,29 +27,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nccl_ops",
-    ],
-)
-
-tf_py_test(
-    name = "all_reduce_test",
-    srcs = ["python/all_reduce_test.py"],
-    additional_deps = [
-        ":all_reduce",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:state_ops",
+        "//tensorflow/python/distribute:all_reduce",
     ],
 )
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 25f4b4b8d341331db79321338a88cabfe325eea5..238cdaf8a79812df3f043d9d070bbcfd443f6e1e 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -18,842 +18,5 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import math
-
-from tensorflow.python.framework import device as device_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nccl_ops
-
-
-def _flatten_tensors(tensors):
-  """Check tensors for isomorphism and flatten.
-
-  Args:
-    tensors: list of T `tf.Tensor` which must all have the same shape.
-
-  Returns:
-    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
-    shape: the original shape of each element of input tensors
-
-  Raises:
-    ValueError: tensors are empty or non-isomorphic or have unknown shape.
-  """
-  if not tensors:
-    raise ValueError("tensors cannot be empty")
-  shape = tensors[0].shape
-  for tensor in tensors:
-    shape = shape.merge_with(tensor.shape)
-  if not shape.is_fully_defined():
-    raise ValueError("Tensors must have statically known shape.")
-  if len(shape) != 1:
-    reshaped = []
-    for t in tensors:
-      with ops.colocate_with(t):
-        reshaped.append(array_ops.reshape(t, [-1]))
-    tensors = reshaped
-  return tensors, shape
-
-
-def _reshape_tensors(tensors, shape):
-  """Reshape tensors flattened by _flatten_tensors.
-
-  Args:
-    tensors: list of T `tf.Tensor` of identical length 1D tensors.
-    shape: list of integers describing the desired shape.  Product of
-      the elements must equal the length of each tensor.
-
-  Returns:
-    list of T `tf.Tensor` which are the reshaped inputs.
-  """
-  reshaped = []
-  for t in tensors:
-    with ops.colocate_with(t):
-      reshaped.append(array_ops.reshape(t, shape))
-  return reshaped
-
-
-def _padded_split(tensor, pieces):
-  """Like split for 1D tensors but pads-out case where len % pieces != 0.
-
-  Args:
-    tensor: T `tf.Tensor` that must be 1D.
-    pieces: a positive integer specifying the number of pieces into which
-      tensor should be split.
-
-  Returns:
-    list of T `tf.Tensor` of length pieces, which hold the values of
-      thin input tensor, in order.  The final tensor may
-      be zero-padded on the end to make its size equal to those of all
-      of the other tensors.
-
-  Raises:
-    ValueError: The input tensor is not 1D.
-  """
-  shape = tensor.shape
-  if 1 != len(shape):
-    raise ValueError("input tensor must be 1D")
-  tensor_len = shape.dims[0].value
-  with ops.colocate_with(tensor):
-    if tensor_len % pieces != 0:
-      # pad to an even length
-      chunk_size = 1 + tensor_len // pieces
-      if pieces > tensor_len:
-        # This is an edge case that should not come up in practice,
-        # i.e. a different reduction algorithm would be better,
-        # but we'll make it work just for completeness.
-        pad_len = pieces - tensor_len
-        extended_whole = array_ops.concat(
-            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        parts = array_ops.split(extended_whole, pieces)
-        return parts, pad_len
-      elif (pieces - 1) * chunk_size >= tensor_len:
-        # Another edge case of limited real interest.
-        pad_len = (pieces * chunk_size) % tensor_len
-        extended_whole = array_ops.concat(
-            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        parts = array_ops.split(extended_whole, pieces)
-        return parts, pad_len
-      else:
-        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
-        pad_len = chunk_size - last_chunk_size
-        piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
-        parts = array_ops.split(tensor, piece_lens)
-        parts[-1] = array_ops.concat(
-            [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        return parts, pad_len
-    else:
-      return array_ops.split(tensor, pieces), 0
-
-
-def _strip_padding(tensors, pad_len):
-  """Strip the suffix padding added by _padded_split.
-
-  Args:
-    tensors: list of T `tf.Tensor` of identical length 1D tensors.
-    pad_len: number of elements to be stripped from the end of each tensor.
-
-  Returns:
-    list of T `tf.Tensor` which are the stripped inputs.
-
-  Raises:
-    ValueError: tensors must be a non-empty list of 1D tensors, and
-      each must be longer than pad_len.
-  """
-  if not tensors:
-    raise ValueError("tensors cannot be empty")
-  shape = tensors[0].shape
-  if len(shape) > 1:
-    raise ValueError("tensors must be 1D")
-  prefix_len = int(shape[0] - pad_len)
-  if prefix_len < 0:
-    raise ValueError("pad_len longer than tensor")
-  stripped = []
-  for t in tensors:
-    with ops.colocate_with(t):
-      stripped.append(array_ops.slice(t, [0], [prefix_len]))
-  return stripped
-
-
-def _ragged_split(tensor, pieces):
-  """Like split for 1D tensors but allows case where len % pieces != 0.
-
-  Args:
-    tensor: T `tf.Tensor` that must be 1D.
-    pieces: a positive integer specifying the number of pieces into which
-      tensor should be split.
-
-  Returns:
-    list of T `tf.Tensor` of length pieces, which hold the values of
-      the input tensor, in order.  The final tensor may be shorter
-      than the others, which will all be of equal length.
-
-  Raises:
-    ValueError: input tensor must be 1D.
-  """
-  shape = tensor.shape
-  if 1 != len(shape):
-    raise ValueError("input tensor must be 1D")
-  tensor_len = shape.dims[0].value
-  chunk_size = tensor_len // pieces
-  with ops.colocate_with(tensor):
-    if tensor_len != (pieces * chunk_size):
-      # last piece will be short
-      assert pieces > 1
-      last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
-      assert last_chunk_size > 0
-      piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
-      return array_ops.split(tensor, piece_lens)
-    else:
-      return array_ops.split(tensor, pieces)
-
-
-def _ring_permutations(num_workers, num_subchunks, gpu_perm):
-  """"Generate an array of device index arrays, one for each subchunk.
-
-  In the basic ring reduction algorithm there are size(T)/num_devices
-  data chunks and each device process one chunk per tick, i.e. sending
-  one chunk and receiving one chunk.  The idea of subchunking is that
-  each device processes num_subchunks smaller data regions per tick,
-  and the ring rank permutation is different for each subchunk index
-  so that a device is potentially sending to and receiving from
-  num_subchunks different other devices at each tick.  Where multiple
-  independent data channels exist between devices, this strategy
-  supplies a method of using them in parallel.
-
-  Args:
-    num_workers: number of worker tasks
-    num_subchunks: number of subchunks into which to divide each per-GPU chunk.
-    gpu_perm: an array of integers in [0, num_gpus-1] giving the default
-      ring order of GPUs at each worker.  Other permutations will be generated
-      by rotating this array and splicing together per-worker instances.
-
-  Raises:
-    ValueError: the number of subchunks may not exceed the number of GPUs.
-
-  Returns:
-    pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
-        preceding device in the permutation for that subchunk.  The
-        device index of GPU i at worker j is i + (j * num_gpus).
-    rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
-       local rank of device d in the permutation for that subchunk.
-  """
-  num_gpus = len(gpu_perm)
-  devices = num_workers * num_gpus
-  if devices == 0:
-    return [], []
-  if num_subchunks > num_gpus:
-    raise ValueError(
-        "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus))
-  rotation_interval = max(1, int(num_gpus / num_subchunks))
-  perms_by_s = []
-  for s in range(0, num_subchunks):
-    full_order = []
-    offset = s * rotation_interval
-    for w in range(0, num_workers):
-      default_order = [(w * num_gpus) + i for i in gpu_perm]
-      dev_order = default_order[offset:] + default_order[:offset]
-      full_order += dev_order
-    perms_by_s.append(full_order)
-  pred_by_s_d = [[-1 for d in range(0, devices)]
-                 for s in range(0, num_subchunks)]
-  rank_by_s_d = [[-1 for d in range(0, devices)]
-                 for s in range(0, num_subchunks)]
-  for s in range(0, num_subchunks):
-    for d in range(0, devices):
-      for t in range(0, devices):
-        if d == perms_by_s[s][t]:
-          rank_by_s_d[s][d] = t
-          pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices]
-          break
-  return (pred_by_s_d, rank_by_s_d)
-
-
-def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
-                          gpu_perm, red_op, un_op=None):
-  """Construct a subgraph performing a ring-style all-reduce of input_tensors.
-
-  Args:
-    input_tensors: a list of T `tf.Tensor` objects, which must all
-      have the same shape and type.
-    num_workers: number of worker tasks spanned by input_tensors.
-    num_subchunks: number of subchunks each device should process in one tick.
-    gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at
-      each worker.  All workers must have the same number of
-      GPUs with the same rank ordering.  If NVLINK is available, this should
-      be a ring order supported by NVLINK edges.
-    red_op: a binary operator for elementwise reduction.
-    un_op: an optional unary operator to apply to fully reduced values.
-
-  Raises:
-    ValueError: empty input_tensors or they don't all have same
-    size.
-
-  Returns:
-    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
-  """
-  if len(input_tensors) < 2:
-    raise ValueError("input_tensors must be length 2 or longer")
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  devices = [t.device for t in input_tensors]
-  (pred_by_s_d, rank_by_s_d) = _ring_permutations(
-      num_workers, num_subchunks, gpu_perm)
-  chunks_by_dev, pad_len = _build_ring_gather(
-      input_tensors, devices,
-      num_subchunks, pred_by_s_d, rank_by_s_d, red_op)
-  if un_op:
-    chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev)
-  output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d,
-                                       chunks_by_dev)
-  if pad_len > 0:
-    output_tensors = _strip_padding(output_tensors, pad_len)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_ring_gather(input_tensors, devices, num_subchunks,
-                       pred_by_s_d, rank_by_s_d, red_op):
-  """Construct a subgraph for the first (reduction) pass of ring all-reduce.
-
-  Args:
-    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
-      shape and type.
-    devices: array of device name strings
-    num_subchunks: number of subchunks each device should process in one tick.
-    pred_by_s_d: as produced by _ring_permutations
-    rank_by_s_d: as produced by _ring_permutations
-    red_op: a binary operator for elementwise reduction
-
-  Raises:
-    ValueError: tensors must all be one dimensional.
-
-  Returns:
-    list of list of T `tf.Tensor` of (partially) reduced values where
-    exactly num_subchunks chunks at each device are fully reduced.
-  """
-  num_devices = len(input_tensors)
-  if num_devices == 0:
-    return []
-  if num_devices == 1:
-    return input_tensors
-  shape = input_tensors[0].shape
-  if 1 != len(shape):
-    raise ValueError("input tensors must be 1D")
-  num_chunks = num_devices * num_subchunks
-  num_ticks = num_devices - 1
-  # Initialize chunks_by_dev with splits of the input tensors.
-  chunks_by_dev = []
-  split_pad_len = 0
-  for d in range(0, num_devices):
-    with ops.device(devices[d]):
-      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
-      chunks_by_dev.append(splits)
-  # Reduction phase
-  for tick in range(0, num_ticks):
-    # One new partial reduction for every chunk
-    new_partial_reductions = [None for _ in range(0, num_chunks)]
-    # Compute reductions with respect to last tick's values
-    for d in range(0, num_devices):
-      with ops.device(devices[d]):
-        for s in range(0, num_subchunks):
-          rank = rank_by_s_d[s][d]
-          seg_index = (rank + num_devices - (2 + tick)) % num_devices
-          pred_dev = pred_by_s_d[s][d]
-          chunk_index = (seg_index * num_subchunks) + s
-          new_partial_reductions[chunk_index] = red_op(
-              chunks_by_dev[pred_dev][chunk_index],
-              chunks_by_dev[d][chunk_index])
-    # Update chunks_by_dev with the new values at the end of the tick.
-    for d in range(0, num_devices):
-      for s in range(0, num_subchunks):
-        rank = rank_by_s_d[s][d]
-        seg_index = (rank + num_devices - (2 + tick)) % num_devices
-        chunk_index = (seg_index * num_subchunks) + s
-        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
-  return chunks_by_dev, split_pad_len
-
-
-def _apply_unary_to_chunks(f, chunks_by_dev):
-  """Apply a unary op to each tensor in chunks_by_dev, on same device.
-
-  Args:
-    f: a unary function over T `tf.Tensor`.
-    chunks_by_dev: list of lists of T `tf.Tensor`.
-
-  Returns:
-    new list of lists of T `tf.Tensor` with the same structure as
-    chunks_by_dev containing the derived tensors.
-  """
-  output = []
-  for x in chunks_by_dev:
-    with ops.colocate_with(x[0]):
-      output.append([f(t) for t in x])
-  return output
-
-
-def _build_ring_scatter(pred_by_s_d, rank_by_s_d,
-                        chunks_by_dev):
-  """Construct subgraph for second (scatter) pass of ring all-reduce.
-
-  Args:
-    pred_by_s_d: as produced by _ring_permutations
-    rank_by_s_d: as produced by _ring_permutations
-    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
-      (device, chunk)
-
-  Raises:
-    ValueError: chunks_by_dev is not well-formed
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors, one
-    at each device corresponding to the outer dimension of chunks_by_dev.
-  """
-  num_devices = len(chunks_by_dev)
-  num_chunks = len(chunks_by_dev[0])
-  if 0 != num_chunks % num_devices:
-    raise ValueError(
-        "Expect number of chunks per device to be divisible by num_devices")
-  num_subchunks = int(num_chunks / num_devices)
-  num_ticks = num_devices - 1
-  for tick in range(0, num_ticks):
-    passed_values = [None for _ in range(0, num_chunks)]
-    for d in range(0, num_devices):
-      with ops.colocate_with(chunks_by_dev[d][0]):
-        for s in range(0, num_subchunks):
-          rank = rank_by_s_d[s][d]
-          seg_index = (rank + num_devices - (1 + tick)) % num_devices
-          pred_dev = pred_by_s_d[s][d]
-          chunk_index = (seg_index * num_subchunks) + s
-          passed_values[chunk_index] = array_ops.identity(
-              chunks_by_dev[pred_dev][chunk_index])
-    for d in range(0, num_devices):
-      for s in range(0, num_subchunks):
-        rank = rank_by_s_d[s][d]
-        seg_index = (rank + num_devices - (1 + tick)) % num_devices
-        chunk_index = (seg_index * num_subchunks) + s
-        chunks_by_dev[d][chunk_index] = passed_values[chunk_index]
-  # Join chunks at each device.
-  output = []
-  for x in chunks_by_dev:
-    with ops.colocate_with(x[0]):
-      output.append(array_ops.concat(x, 0))
-  return output
-
-
-def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
-  """Construct a subgraph for recursive halving-doubling all-reduce.
-
-  The recursive halving-doubling algorithm is described in
-  http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf
-
-  The concept is to arrange the participating n devices in
-  a linear sequence where devices exchange data pairwise
-  with one other device in each round.  During the gather
-  phase there are lg(n) rounds where devices exchange
-  increasingly smaller sub-tensors with another device
-  at increasingly greater distances, until at the top
-  each device has 1/n of the fully reduced values.  During the
-  scatter phase each device exchanges its fully reduced
-  sub-tensor (which doubles in length at each round)
-  with one other device at increasingly smaller distances
-  until each device has all of the fully reduced values.
-
-  Note: this preliminary version requires that len(input_tensors) be a
-    power of 2.  TODO(tucker): relax this restriction.  Also, the
-    number of elements in each tensor must be divisible by 2^h where h
-    is the number of hops in each phase.  This will also be relaxed in
-    the future with edge-case specific logic.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
-    red_op: a binary elementwise reduction Op.
-    un_op: an optional unary elementwise Op to apply to reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors, one
-    at each device of input_tensors.
-
-  Raises:
-    ValueError: num_devices not a power of 2, or tensor len not divisible
-    by 2 the proper number of times.
-  """
-  devices = [t.device for t in input_tensors]
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op)
-  if un_op:
-    reduced_shards = [un_op(t) for t in reduced_shards]
-  output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_recursive_hd_gather(input_tensors, devices, red_op):
-  """Construct the gather phase of recursive halving-doubling all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
-    devices: a list of strings naming the devices hosting input_tensors,
-      which will also be used to host the (partial) reduction values.
-    red_op: a binary elementwise reduction Op.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensor shards.
-
-  Raises:
-    ValueError: num_devices not a power of 2, or tensor len not divisible
-    by 2 the proper number of times.
-  """
-  num_devices = len(devices)
-  num_hops = int(math.log(num_devices, 2))
-  if num_devices != (2 ** num_hops):
-    raise ValueError("num_devices must be a power of 2")
-  chunks = input_tensors
-  for h in range(0, num_hops):
-    span = 2 ** h
-    group_size = span * 2
-    new_chunks = [[] for _ in devices]
-    for d in range(0, num_devices):
-      if (d % group_size) >= (group_size / 2):
-        # skip right half of a pair
-        continue
-      left_dev = devices[d]
-      right_dev = devices[d + span]
-      left_split = array_ops.split(chunks[d], 2)
-      right_split = array_ops.split(chunks[d+span], 2)
-      with ops.device(left_dev):
-        new_chunks[d] = red_op(left_split[0], right_split[0])
-      with ops.device(right_dev):
-        new_chunks[d + span] = red_op(left_split[1], right_split[1])
-    chunks = new_chunks
-  return chunks
-
-
-def _build_recursive_hd_scatter(input_tensors, devices):
-  """Construct the scatter phase of recursive halving-doublng all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
-    devices: a list of strings naming the devices on which the reconstituted
-      full tensors should be placed.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors.
-  """
-  num_devices = len(devices)
-  num_hops = int(math.log(num_devices, 2))
-  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
-  chunks = input_tensors
-  for h in reversed(range(0, num_hops)):
-    span = 2 ** h
-    group_size = span * 2
-    new_chunks = [[] for _ in devices]
-    for d in range(0, num_devices):
-      if (d % group_size) >= (group_size / 2):
-        # skip right half of a pair
-        continue
-      left_idx = d
-      right_idx = d + span
-      left_dev = devices[left_idx]
-      right_dev = devices[right_idx]
-      with ops.device(left_dev):
-        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
-                                                 chunks[right_idx]], 0)
-      with ops.device(right_dev):
-        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
-                                                  chunks[right_idx]], 0)
-    chunks = new_chunks
-  return chunks
-
-
-def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
-  """Construct a subgraph for shuffle all-reduce.
-
-  Shuffle reduce is essentially the algorithm implemented when using
-  parameter servers.  Suppose tensor length is n, there are d devices
-  and g gather shards.  Each device sends a n/g length sub-tensor to
-  each gather shard.  The gather shards perform a reduction across d
-  fragments, then broadcast the result back to each device.  The
-  devices then join the g fully reduced fragments they receive from
-  the shards.  The gather shards could perform d-1 pairwise
-  reductions, or one d-way reduction.  The first is better where
-  reduction Op time is low compared to transmission time, the second
-  better in the other case.
-
-  Args:
-    input_tensors: list of T @(tf.Tensor} values to be reduced.
-    gather_devices: list of names of devices on which reduction shards
-      should be placed.
-    red_op: an n-array elementwise reduction Op
-    un_op: optional elementwise unary Op to be applied to fully-reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  dst_devices = [t.device for t in input_tensors]
-  reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
-                                         red_op, un_op)
-  output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
-  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.
-
-  Args:
-    input_tensors: list of T @(tf.Tensor} values to be reduced.
-    gather_devices: list of names of devices on which reduction shards
-      should be placed.
-    red_op: the binary reduction Op
-    un_op: optional elementwise unary Op to be applied to fully-reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced shards.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  num_source_devices = len(input_tensors)
-  num_gather_devices = len(gather_devices)
-  shape = input_tensors[0].shape
-  if len(shape) != 1:
-    raise ValueError("input_tensors must be 1D")
-  shards_by_source = []
-  for d in range(0, num_source_devices):
-    with ops.colocate_with(input_tensors[d]):
-      shards_by_source.append(
-          _ragged_split(input_tensors[d], num_gather_devices))
-  reduced_shards = []
-  for d in range(0, num_gather_devices):
-    with ops.device(gather_devices[d]):
-      values = [s[d] for s in shards_by_source]
-      red_shard = red_op(values)
-      if un_op:
-        red_shard = un_op(red_shard)
-      reduced_shards.append(red_shard)
-  return reduced_shards
-
-
-def _build_shuffle_scatter(reduced_shards, dst_devices):
-  """Build the scatter phase of shuffle all-reduce.
-
-  Args:
-    reduced_shards:  list of T @(tf.Tensor} fully reduced shards
-    dst_devices: list of names of devices at which the fully-reduced value
-      should be reconstituted.
-
-  Returns:
-    list of T `tf.Tensor` scattered tensors.
-  """
-  num_devices = len(dst_devices)
-  out_tensors = []
-  for d in range(0, num_devices):
-    with ops.device(dst_devices[d]):
-      out_tensors.append(array_ops.concat(reduced_shards, 0))
-  return out_tensors
-
-
-def _split_by_task(devices, values):
-  """Partition devices and values by common task.
-
-  Args:
-    devices: list of device name strings
-    values: list of T `tf.tensor` of same length as devices.
-
-  Returns:
-    (per_task_devices, per_task_values) where both values are
-    lists of lists with isomorphic structure: the outer list is
-    indexed by task, and the inner list has length of the number
-    of values belonging to that task.  per_task_devices contains
-    the specific devices to which the values are local, and
-    per_task_values contains the corresponding values.
-
-  Raises:
-    ValueError: devices must be same length as values.
-  """
-  num_devices = len(devices)
-  if num_devices != len(values):
-    raise ValueError("len(devices) must equal len(values)")
-  per_task_devices = collections.OrderedDict()
-  per_task_values = collections.OrderedDict()
-  for d in range(num_devices):
-    d_spec = device_lib.DeviceSpec.from_string(devices[d])
-    if not hasattr(d_spec, "task") or d_spec.task is None:
-      assert False, "failed to parse device %s" % devices[d]
-    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
-    if index not in per_task_devices:
-      per_task_devices[index] = []
-      per_task_values[index] = []
-    per_task_devices[index].append(devices[d])
-    per_task_values[index].append(values[d])
-
-  return (list(per_task_devices.values()), list(per_task_values.values()))
-
-
-def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
-  """Build a subgraph that does one full all-reduce, using NCCL.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    red_op: binary elementwise reduction operator.  Must be one of
-      {tf.add}
-    un_op: optional unary elementwise Op to apply to fully-reduce values.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: red_op not supported.
-  """
-  if red_op == math_ops.add:
-    output_tensors = nccl_ops.all_sum(input_tensors)
-  else:
-    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
-  if un_op:
-    un_op_wrapped = []
-    for t in output_tensors:
-      with ops.colocate_with(t):
-        un_op_wrapped.append(un_op(t))
-    output_tensors = un_op_wrapped
-  return output_tensors
-
-
-def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
-  """Construct a subgraph for NCCL hybrid all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    red_op: binary elementwise reduction operator.
-    upper_level_f: function for reducing one value per worker, across
-      workers.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  devices = [t.device for t in input_tensors]
-  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
-  num_workers = len(per_worker_devices)
-  up_values = [None for w in range(0, num_workers)]
-  up_devices = up_values[:]
-  down_values = up_values[:]
-  # First stage: reduce within each worker using NCCL
-  for w in range(0, num_workers):
-    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
-    # NOTE: these reductions will not run to completion unless
-    # every output value is used.  Since we only need one, we
-    # need to put control dependencies on the rest.
-    with ops.control_dependencies(worker_values):
-      with ops.device(worker_values[0].device):
-        up_values[w] = array_ops.identity(worker_values[0])
-      up_devices[w] = per_worker_devices[w][0]
-  # Second stage: Apply upper_level_f to reduce across first device at
-  # each worker
-  level_2_output = upper_level_f(up_values)
-  # Third stage: propagate within each worker using NCCL Broadcast
-  for w in range(0, num_workers):
-    dst_tensors = []
-    with ops.device(per_worker_devices[w][0]):
-      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
-    for d in per_worker_devices[w]:
-      with ops.device(d):
-        dst_tensors.append(array_ops.identity(broadcast_src))
-    down_values[w] = dst_tensors
-  output_tensors = [v for sublist in down_values for v in sublist]
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _reduce_non_singleton(input_tensors, red_f, un_op):
-  """If input_tensors has more than one element apply red_f, else apply un_op."""
-  if len(input_tensors) > 1:
-    return red_f(input_tensors)
-  else:
-    if not un_op:
-      return input_tensors
-    output_tensors = []
-    for t in input_tensors:
-      with ops.colocate_with(t):
-        output_tensors.append(un_op(t))
-    return output_tensors
-
-
-def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Ring across workers."""
-  def upper_builder(y):
-    return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op)
-  def upper_level_f(x):
-    return _reduce_non_singleton(x, upper_builder, un_op)
-  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
-
-
-def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Recursive-HD across workers."""
-  upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op)
-  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
-
-
-def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op,
-                            shuffle_red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Shuffle across workers."""
-  upper_level_f = lambda x: build_shuffle_all_reduce(x, gather_devices,
-                                                     shuffle_red_op, un_op)
-  return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f)
-
-
-def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
-  """Construct a subgraph for Shuffle hybrid all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    gather_devices: list of device names on which to host gather shards.
-    red_op: binary elementwise reduction operator.
-    upper_level_f: function for reducing one value per worker, across
-      workers.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  # First stage, reduce across each worker using gather_devices.
-  devices = [t.device for t in input_tensors]
-  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
-  num_workers = len(per_worker_devices)
-  up_values = []
-  if len(gather_devices) != num_workers:
-    raise ValueError("For shuffle hybrid, gather_devices must contain one "
-                     "device per worker. ")
-  for w in range(0, num_workers):
-    reduced_shards = _build_shuffle_gather(
-        per_worker_values[w], [gather_devices[w]], red_op)
-    up_values.append(reduced_shards[0])
-  # Second stage, apply upper_level_f.
-  level_2_output = upper_level_f(up_values)
-  # Third stage, apply shuffle scatter at each worker.
-  output_tensors = []
-  for w in range(0, num_workers):
-    output_tensors += _build_shuffle_scatter(
-        [level_2_output[w]], per_worker_devices[w])
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def build_shuffle_then_ring(input_tensors, gather_devices, subdiv,
-                            red_n_op, red_op, un_op=None):
-  """Construct hybrid of Shuffle within workers, Ring across workers."""
-  def upper_builder(tensors):
-    return build_ring_all_reduce(tensors, len(tensors), subdiv, [0],
-                                 red_op, un_op)
-  def upper_level_f(tensors):
-    return _reduce_non_singleton(tensors, upper_builder, un_op)
-  return _build_shuffle_hybrid(
-      input_tensors, gather_devices, red_n_op, upper_level_f)
-
-
-def build_shuffle_then_shuffle(input_tensors, first_gather_devices,
-                               second_gather_devices, red_op, un_op=None):
-  """Construct hybrid of Shuffle within workers, Shuffle across workers."""
-  def upper_builder(tensors):
-    return build_shuffle_all_reduce(tensors, second_gather_devices,
-                                    red_op, un_op)
-  def upper_level_f(tensors):
-    return _reduce_non_singleton(tensors, upper_builder, un_op)
-  return _build_shuffle_hybrid(
-      input_tensors, first_gather_devices, red_op, upper_level_f)
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.python.distribute.all_reduce import *
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6d2d70c99b4cc804f2c8bf57afdc8c11f1f73516
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
@@ -0,0 +1,36 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark")
+
+py_library(
+    name = "benchmark_base",
+    srcs = [
+        "benchmark_base.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "cartpole_benchmark",
+    size = "enormous",
+    srcs = ["cartpole_benchmark.py"],
+    tags = [
+        "local",
+        "manual",
+        "no_oss",
+        "notap",
+        "nozapfhahn",
+    ],
+    deps = [
+        ":benchmark_base",
+        # Note: required gym dependency may need to be added here.
+    ],
+)
+
+tf_py_logged_benchmark(
+    name = "cartpole_logged_benchmark",
+    target = "//tensorflow/contrib/autograph/examples/benchmarks:cartpole_benchmark",
+)
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..93c694849c4dc3faca71e7f9d8614649a7784f99
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common benchmarking code.
+
+See https://www.tensorflow.org/community/benchmarks for usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+import tensorflow as tf
+
+
+class ReportingBenchmark(tf.test.Benchmark):
+  """Base class for a benchmark that reports general performance metrics.
+
+  Subclasses only need to call one of the _profile methods, and optionally
+  report_results.
+  """
+
+  def time_execution(self, name, target, iters, warm_up_iters=5):
+    for _ in range(warm_up_iters):
+      target()
+
+    all_times = []
+    for _ in range(iters):
+      iter_time = time.time()
+      target()
+      all_times.append(time.time() - iter_time)
+
+    avg_time = np.average(all_times)
+
+    extras = dict()
+    extras['all_times'] = all_times
+
+    if isinstance(name, tuple):
+      extras['name'] = name
+      name = '_'.join(str(piece) for piece in name)
+
+    self.report_benchmark(
+        iters=iters, wall_time=avg_time, name=name, extras=extras)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f553be58e94f11e45f0697558348fbbd26bfb91
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py
@@ -0,0 +1,492 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A basic RL cartpole benchmark.
+
+The RL model uses the OpenAI Gym environment to train a simple network using
+the policy gradients method. The training scales the gradients for each step
+by the episode's cumulative discounted reward and averages these gradients over
+a fixed number of games before applying the optimization step.
+
+For benchmarking purposes, we replace the OpenAI Gym environment to a fake
+that returns random actions and rewards and never ends the episode. This way
+the benchmarks compare the same amount of computation at each step.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gym
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.contrib import eager
+from tensorflow.contrib.autograph.examples.benchmarks import benchmark_base
+from tensorflow.python import autograph as ag
+from tensorflow.python.eager import context
+
+#
+# AutoGraph implementation
+#
+
+
+@ag.convert()
+def graph_append_discounted_rewards(destination, rewards, discount_rate):
+  """Discounts episode rewards and appends them to destination."""
+  ag.set_element_type(rewards, tf.float32)
+
+  cdr = 0.0
+  reverse_discounted = []
+  ag.set_element_type(reverse_discounted, tf.float32)
+
+  for i in range(len(rewards) - 1, -1, -1):
+    cdr = cdr * discount_rate + rewards[i]
+    cdr.set_shape(())
+    reverse_discounted.append(cdr)
+
+  retval = destination
+  # Note: AutoGraph doesn't yet support reversed() so we use a loop instead.
+  for i in range(len(reverse_discounted) - 1, -1, -1):
+    retval.append(reverse_discounted[i])
+
+  return retval
+
+
+class GraphPolicyNetwork(tf.keras.Model):
+  """Policy network for the cart-pole reinforcement learning problem.
+
+  The forward path of the network takes an observation from the cart-pole
+  environment (length-4 vector) and outputs an action.
+  """
+
+  def __init__(self, hidden_size):
+    super(GraphPolicyNetwork, self).__init__()
+    self._hidden_layer = tf.keras.layers.Dense(
+        hidden_size, activation=tf.nn.elu)
+    self._output_layer = tf.keras.layers.Dense(1)
+
+  def call(self, inputs):
+    """Calculates logits and action.
+
+    Args:
+      inputs: Observations from a step in the cart-pole environment, of shape
+        `(batch_size, input_size)`
+
+    Returns:
+      logits: the logits output by the output layer. This can be viewed as the
+        likelihood vales of choosing the left (0) action. Shape:
+        `(batch_size, 1)`.
+      actions: randomly selected actions ({0, 1}) based on the logits. Shape:
+        `(batch_size, 1)`.
+    """
+    hidden = self._hidden_layer(inputs)
+    logits = self._output_layer(hidden)
+
+    left_prob = tf.nn.sigmoid(logits)
+    action_probs = tf.concat([left_prob, 1.0 - left_prob], 1)
+
+    actions = tf.multinomial(tf.log(action_probs), 1)
+    return logits, actions
+
+  # TODO(mdan): Move this method out of the class.
+  @ag.convert()
+  def train(self, cart_pole_env, optimizer, discount_rate, num_games,
+            max_steps_per_game):
+    var_list = tf.trainable_variables()
+    grad_list = [
+        tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list
+    ]
+
+    step_counts = []
+    discounted_rewards = []
+    ag.set_element_type(discounted_rewards, tf.float32)
+    ag.set_element_type(step_counts, tf.int32)
+
+    # Note: we use a shared object, cart_pole_env here. Because calls to the
+    # object's method are made through py_func, TensorFlow cannot detect its
+    # data dependencies. Hence we must manually synchronize access to it
+    # and ensure the control dependencies are set in such a way that
+    # calls to reset(), take_one_step, etc. are made in the correct order.
+    sync_counter = tf.constant(0)
+
+    for _ in tf.range(num_games):
+      with tf.control_dependencies([sync_counter]):
+        obs = cart_pole_env.reset()
+        with tf.control_dependencies([obs]):
+          sync_counter += 1
+
+        game_rewards = []
+        ag.set_element_type(game_rewards, tf.float32)
+
+        for step in tf.range(max_steps_per_game):
+          logits, actions = self(obs)  # pylint:disable=not-callable
+          logits = tf.reshape(logits, ())
+          actions = tf.reshape(actions, ())
+
+          labels = 1.0 - tf.cast(actions, tf.float32)
+          loss = tf.nn.sigmoid_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+          grads = tf.gradients(loss, var_list)
+
+          for i in range(len(grads)):
+            grad_list[i].append(grads[i])
+
+          with tf.control_dependencies([sync_counter]):
+            obs, reward, done = cart_pole_env.step(actions)
+            with tf.control_dependencies([obs]):
+              sync_counter += 1
+            obs = tf.reshape(obs, (1, 4))
+
+          game_rewards.append(reward)
+          if reward < 0.1 or done:
+            step_counts.append(step + 1)
+            break
+
+        discounted_rewards = graph_append_discounted_rewards(
+            discounted_rewards, game_rewards, discount_rate)
+
+    discounted_rewards = ag.stack(discounted_rewards)
+    discounted_rewards.set_shape((None,))
+    mean, variance = tf.nn.moments(discounted_rewards, [0])
+    normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)
+
+    for i in range(len(grad_list)):
+      g = ag.stack(grad_list[i])
+
+      # This block just adjusts the shapes to match for multiplication.
+      r = normalized_rewards
+      if r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+      if r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+
+      grad_list[i] = tf.reduce_mean(g * r, axis=0)
+
+    optimizer.apply_gradients(
+        zip(grad_list, var_list), global_step=tf.train.get_global_step())
+
+    return ag.stack(step_counts)
+
+
+@ag.convert()
+def graph_train_model(policy_network, cart_pole_env, optimizer, iterations):
+  """Trains the policy network for a given number of iterations."""
+  i = tf.constant(0)
+  mean_steps_per_iteration = []
+  ag.set_element_type(mean_steps_per_iteration, tf.int32)
+
+  while i < iterations:
+    steps_per_game = policy_network.train(
+        cart_pole_env,
+        optimizer,
+        discount_rate=0.95,
+        num_games=20,
+        max_steps_per_game=200)
+    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
+    i += 1
+
+  return ag.stack(mean_steps_per_iteration)
+
+
+class GraphGymCartpoleEnv(object):
+  """An env backed by OpenAI Gym's CartPole environment.
+
+  Used to confirm a functional model only.
+  """
+
+  def __init__(self):
+    cart_pole_env = gym.make('CartPole-v1')
+    cart_pole_env.seed(0)
+    cart_pole_env.reset()
+    self.env = cart_pole_env
+
+  def reset(self):
+    obs = ag.utils.wrap_py_func(self.env.reset, tf.float64, ())
+    obs = tf.reshape(obs, (1, 4))
+    obs = tf.cast(obs, tf.float32)
+    return obs
+
+  def step(self, actions):
+
+    def take_one_step(actions):
+      obs, reward, done, _ = self.env.step(actions)
+      obs = obs.astype(np.float32)
+      reward = np.float32(reward)
+      return obs, reward, done
+
+    return ag.utils.wrap_py_func(take_one_step,
+                                 (tf.float32, tf.float32, tf.bool), (actions,))
+
+
+class GraphRandomCartpoleEnv(object):
+  """An environment that returns random actions and never finishes.
+
+  Used during benchmarking, it will cause training to run a constant number of
+  steps.
+  """
+
+  def reset(self):
+    return tf.random.normal((1, 4))
+
+  def step(self, actions):
+    with tf.control_dependencies([actions]):
+      random_obs = tf.random.normal((1, 4))
+      fixed_reward = tf.constant(0.001)
+      done = tf.constant(False)
+      return random_obs, fixed_reward, done
+
+
+#
+# Eager implementation
+#
+
+
+def eager_append_discounted_rewards(discounted_rewards, rewards, discount_rate):
+  cdr = 0.0
+  reverse_discounted = []
+
+  for i in range(len(rewards) - 1, -1, -1):
+    cdr = cdr * discount_rate + rewards[i]
+    reverse_discounted.append(cdr)
+
+  discounted_rewards.extend(reversed(reverse_discounted))
+  return discounted_rewards
+
+
+class EagerPolicyNetwork(tf.keras.Model):
+  """Policy network for the cart-pole reinforcement learning problem.
+
+  The forward path of the network takes an observation from the cart-pole
+  environment (length-4 vector) and outputs an action.
+  """
+
+  def __init__(self, hidden_size):
+    super(EagerPolicyNetwork, self).__init__()
+    self._hidden_layer = tf.keras.layers.Dense(
+        hidden_size, activation=tf.nn.elu)
+    self._output_layer = tf.keras.layers.Dense(1)
+
+  def call(self, inputs):
+    """Calculates logits and action.
+
+    Args:
+      inputs: Observations from a step in the cart-pole environment, of shape
+        `(batch_size, input_size)`
+
+    Returns:
+      logits: the logits output by the output layer. This can be viewed as the
+        likelihood vales of choosing the left (0) action. Shape:
+        `(batch_size, 1)`.
+      actions: randomly selected actions ({0, 1}) based on the logits. Shape:
+        `(batch_size, 1)`.
+    """
+    hidden = self._hidden_layer(inputs)
+    logits = self._output_layer(hidden)
+
+    left_prob = tf.nn.sigmoid(logits)
+    action_probs = tf.concat([left_prob, 1.0 - left_prob], 1)
+
+    self._grad_fn = eager.implicit_gradients(
+        self._get_cross_entropy_and_save_actions)
+
+    actions = tf.multinomial(tf.log(action_probs), 1)
+    return logits, actions
+
+  def _get_cross_entropy_and_save_actions(self, inputs):
+    logits, actions = self(inputs)  # pylint:disable=not-callable
+    self._current_actions = actions
+    labels = 1.0 - tf.cast(actions, tf.float32)
+    return tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
+
+  def train(self, cart_pole_env, optimizer, discount_rate, num_games,
+            max_steps_per_game):
+    grad_list = None
+
+    step_counts = []
+    discounted_rewards = []
+
+    for _ in range(num_games):
+      obs = cart_pole_env.reset()
+
+      game_rewards = []
+
+      for step in range(max_steps_per_game):
+        grads_and_vars = self._grad_fn(tf.constant([obs], dtype=tf.float32))
+        grads, var_list = zip(*grads_and_vars)
+        actions = self._current_actions.numpy()[0][0]
+
+        if grad_list is None:
+          grad_list = [[g] for g in grads]
+        else:
+          for i in range(len(grads)):
+            grad_list[i].append(grads[i])
+
+        obs, reward, done = cart_pole_env.step(actions)
+
+        game_rewards.append(reward)
+        if reward < 0.1 or done:
+          step_counts.append(step + 1)
+          break
+
+      discounted_rewards = eager_append_discounted_rewards(
+          discounted_rewards, game_rewards, discount_rate)
+
+    discounted_rewards = tf.stack(discounted_rewards)
+    mean, variance = tf.nn.moments(discounted_rewards, [0])
+    normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)
+
+    for i in range(len(grad_list)):
+      g = tf.stack(grad_list[i])
+
+      r = normalized_rewards
+      while r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+
+      grad_list[i] = tf.reduce_mean(g * r, axis=0)
+
+    optimizer.apply_gradients(
+        zip(grad_list, var_list), global_step=tf.train.get_global_step())
+
+    return tf.stack(step_counts)
+
+
+def eager_train_model(policy_network, cart_pole_env, optimizer, iterations):
+  """Trains the policy network for a given number of iterations."""
+  mean_steps_per_iteration = []
+
+  for _ in range(iterations):
+    steps_per_game = policy_network.train(
+        cart_pole_env,
+        optimizer,
+        discount_rate=0.95,
+        num_games=20,
+        max_steps_per_game=200)
+    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
+
+  return mean_steps_per_iteration
+
+
+class EagerGymCartpoleEnv(object):
+  """An env backed by OpenAI Gym's CartPole environment.
+
+  Used to confirm a functional model only.
+  """
+
+  def __init__(self):
+    cart_pole_env = gym.make('CartPole-v1')
+    cart_pole_env.seed(0)
+    cart_pole_env.reset()
+    self.env = cart_pole_env
+
+  def reset(self):
+    return self.env.reset()
+
+  def step(self, actions):
+    obs, reward, done, _ = self.env.step(actions)
+    return obs, reward, done
+
+
+class EagerRandomCartpoleEnv(object):
+  """An environment that returns random actions and never finishes.
+
+  Used during benchmarking, it will cause training to run a constant number of
+  steps.
+  """
+
+  def reset(self):
+    return np.random.normal(size=(4,))
+
+  def step(self, actions):
+    with tf.control_dependencies([actions]):
+      random_obs = np.random.normal(size=(4,))
+      fixed_reward = 0.001
+      done = False
+      return random_obs, fixed_reward, done
+
+
+def graph_demo_training():
+  """Not used in the benchmark. Used to confirm a functional model."""
+  with tf.Graph().as_default():
+    tf.set_random_seed(0)
+
+    network = GraphPolicyNetwork(hidden_size=5)
+    network.build((1, 4))
+    env = GraphGymCartpoleEnv()
+    opt = tf.train.AdamOptimizer(0.05)
+
+    train_ops = graph_train_model(network, env, opt, iterations=5)
+
+    with tf.Session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(tf.local_variables_initializer())
+      steps_per_iteration = sess.run(train_ops)
+      for i, steps in enumerate(steps_per_iteration):
+        print('Step {} iterations: {}'.format(i, steps))
+
+
+def eager_demo_training():
+  with context.eager_mode():
+    network = EagerPolicyNetwork(hidden_size=5)
+    network.build((1, 4))
+    env = EagerGymCartpoleEnv()
+    opt = tf.train.AdamOptimizer(0.05)
+
+    steps_per_iteration = eager_train_model(network, env, opt, iterations=5)
+    for i, steps in enumerate(steps_per_iteration):
+      print('Step {} iterations: {}'.format(i, steps))
+
+
+class RLCartPoleBenchmark(benchmark_base.ReportingBenchmark):
+  """Actual benchmark.
+
+  Trains the RL agent a fixed number of times, on random environments that
+  result in constant number of steps.
+  """
+
+  def benchmark_cartpole(self):
+
+    def train_session(sess, ops):
+      return lambda: sess.run(ops)
+
+    def train_eager(network, env, opt):
+      return lambda: eager_train_model(network, env, opt, iterations=10)
+
+    for model_size in (10, 100, 1000):
+      with tf.Graph().as_default():
+        network = GraphPolicyNetwork(hidden_size=model_size)
+        network.build((1, 4))
+        env = GraphRandomCartpoleEnv()
+        opt = tf.train.AdamOptimizer(0.05)
+        train_ops = graph_train_model(network, env, opt, iterations=10)
+
+        with tf.Session() as sess:
+          sess.run(tf.global_variables_initializer())
+          sess.run(tf.local_variables_initializer())
+
+          self.time_execution(('cartpole', 'autograph', model_size),
+                              train_session(sess, train_ops), 20)
+
+      with context.eager_mode():
+        network = EagerPolicyNetwork(hidden_size=model_size)
+        network.build((1, 4))
+        env = EagerRandomCartpoleEnv()
+        opt = tf.train.AdamOptimizer(0.05)
+
+        self.time_execution(('cartpole', 'eager', model_size),
+                            train_eager(network, env, opt), 20)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index 55faad983f2bcf2f3fa633669bd371608e2e925b..3e4d0dc1cec76b068c1c846eb476eec615e4f613 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -18,8 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import function
+from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import gen_batch_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -101,12 +102,15 @@ def batch_function(num_batch_threads,
   def decorator(fn):  # pylint: disable=missing-docstring
 
     def decorated(*args):  # pylint: disable=missing-docstring
-      types = [arg.dtype for arg in args]
 
-      @function.Defun(*types)
+      @function.defun()
       def computation(*computation_args):
         return fn(*computation_args)
 
+      computation = computation.get_concrete_function(
+          *[tensor_spec.TensorSpec(dtype=x.dtype, shape=x.shape, name=str(i))
+            for i, x in enumerate(args)])
+
       with ops.name_scope("batch") as name:
         for a in args:
           if not isinstance(a, ops.Tensor):
@@ -123,7 +127,7 @@ def batch_function(num_batch_threads,
             f=computation,
             in_tensors=list(args),
             captured_tensors=computation.captured_inputs,
-            Tout=[o.type for o in computation.definition.signature.output_arg])
+            Tout=[o.dtype for o in computation.outputs])
 
     return decorated
 
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index 01ee8703a93836d607ee9b765c51c79fe3bb974f..9109b9c1c91cefa4c52bad49de23336a6e05e1ef 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -219,6 +219,7 @@ class BatchOpsTest(test.TestCase):
 
       @batch_ops.batch_function(1, 10, 100000)
       def computation(in_t):
+        self.assertTrue(in_t.shape is not None)
         return in_t + 1
 
       inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 14b6fc4ac26f74f54628ae37ad6437c7d3e8caba..d3b23d949ee2c7674c3918d39e8b71d76eefcfec 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -132,6 +132,7 @@ py_library(
     srcs = ["estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":custom_loss_head",
         ":estimator_utils",
         ":model",
         "//tensorflow/contrib/boosted_trees:losses",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index a3df272e6924792128fc38fd153b9527b58b486e..b314b4d74df882a421d9a2ecce2629a63d5c5248 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -41,7 +41,8 @@ def make_custom_export_strategy(name,
                                 convert_fn,
                                 feature_columns,
                                 export_input_fn,
-                                use_core_columns=False):
+                                use_core_columns=False,
+                                feature_engineering_fn=None):
   """Makes custom exporter of GTFlow tree format.
 
   Args:
@@ -52,6 +53,7 @@ def make_custom_export_strategy(name,
     export_input_fn: A function that takes no arguments and returns an
       `InputFnOps`.
     use_core_columns: A boolean, whether core feature columns were used.
+    feature_engineering_fn: Feature eng function to be called on the input.
 
   Returns:
     An `ExportStrategy`.
@@ -59,9 +61,12 @@ def make_custom_export_strategy(name,
   base_strategy = saved_model_export_utils.make_export_strategy(
       serving_input_fn=export_input_fn, strip_default_attrs=True)
   input_fn = export_input_fn()
+  features = input_fn.features
+  if feature_engineering_fn is not None:
+    features, _ = feature_engineering_fn(features, labels=None)
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
-       input_fn.features, feature_columns, use_core_columns)
+       features, feature_columns, use_core_columns)
 
   def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None):
     """A wrapper to export to SavedModel, and convert it to other formats."""
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index ca73e4af2fbd0a383d02fa7111f59161701661df..358404cd946bbc56d2f7228be8fe4223749c850b 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
-from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.feature_column import feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 38d19976ef38a295a172e935f70bdae3c67f01e2..a178820841c4c8bcb7f5742babdb6d0f4825de31 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.boosted_trees.estimator_batch import model
 from tensorflow.contrib.boosted_trees.python.utils import losses
 from tensorflow.contrib.learn.python.learn.estimators import estimator
@@ -26,7 +28,8 @@ from tensorflow.python.estimator.canned import head as core_head_lib
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.losses import losses as core_losses
-
+from tensorflow.contrib.boosted_trees.estimator_batch import custom_loss_head
+from tensorflow.python.ops import array_ops
 
 # ================== Old estimator interface===================================
 # The estimators below were designed for old feature columns and old estimator
@@ -414,6 +417,108 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+# When using this estimator, make sure to regularize the hessian (at least l2,
+# min_node_weight)!
+# TODO(nponomareva): extend to take multiple quantiles in one go.
+class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
+  """An estimator that does quantile regression and returns quantile estimates.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               quantiles,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               use_core_libs=False,
+               output_leaf_index=False,
+               override_global_step_value=None,
+               num_quantiles=100):
+    """Initializes a GradientBoostedDecisionTreeQuantileRegressor instance.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      quantiles: a list of quantiles for the loss, each between 0 and 1.
+      label_dimension: Dimension of regression label. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`). When label_dimension>1, it is
+        recommended to use multiclass strategy diagonal hessian or full hessian.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+      override_global_step_value: If after the training is done, global step
+        value must be reset to this value. This should be used to reset global
+        step to a number > number of steps used to train the current ensemble.
+        For example, the usual way is to train a number of trees and set a very
+        large number of training steps. When the training is done (number of
+        trees were trained), this parameter can be used to set the global step
+        to a large value, making it look like that number of training steps ran.
+        If None, no override of global step will happen.
+      num_quantiles: Number of quantiles to build for numeric feature values.
+    """
+
+    if len(quantiles) > 1:
+      raise ValueError('For now, just one quantile per estimator is supported')
+
+    def _quantile_regression_head(quantile):
+      # Use quantile regression.
+      head = custom_loss_head.CustomLossHead(
+          loss_fn=functools.partial(
+              losses.per_example_quantile_regression_loss, quantile=quantile),
+          link_fn=array_ops.identity,
+          logit_dimension=label_dimension)
+      return head
+
+    learner_config.num_classes = max(2, label_dimension)
+
+    super(GradientBoostedDecisionTreeQuantileRegressor, self).__init__(
+        model_fn=model.model_builder,
+        params={
+            'head': _quantile_regression_head(quantiles[0]),
+            'feature_columns': feature_columns,
+            'learner_config': learner_config,
+            'num_trees': num_trees,
+            'weight_column_name': weight_column_name,
+            'examples_per_layer': examples_per_layer,
+            'logits_modifier_function': logits_modifier_function,
+            'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
+            'output_leaf_index': False,
+            'override_global_step_value': override_global_step_value,
+            'num_quantiles': num_quantiles,
+        },
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
+
 # ================== New Estimator interface===================================
 # The estimators below use new core Estimator interface and must be used with
 # new feature columns and heads.
@@ -437,12 +542,42 @@ def core_multiclass_head(
 
   # pylint:disable=protected-access
   head_fn = core_head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-      n_classes=n_classes, loss_fn=loss_fn, loss_reduction=loss_reduction)
+      n_classes=n_classes,
+      loss_fn=loss_fn,
+      loss_reduction=loss_reduction,
+      weight_column=weight_column)
   # pylint:enable=protected-access
 
   return head_fn
 
 
+# For quantile regression, use this head with Core..Estimator, or use
+# Core..QuantileRegressor directly,
+def core_quantile_regression_head(
+    quantiles,
+    label_dimension=1,
+    weight_column=None,
+    loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS):
+  """Core head for quantile regression problems."""
+
+  def loss_fn(labels, logits):
+    result = losses.per_example_quantile_regression_loss(
+        labels=labels,
+        predictions=logits,
+        weights=weight_column,
+        quantile=quantiles)
+    return result[0]
+
+  # pylint:disable=protected-access
+  head_fn = core_head_lib._regression_head(
+      label_dimension=label_dimension,
+      loss_fn=loss_fn,
+      loss_reduction=loss_reduction,
+      weight_column=weight_column)
+  # pylint:enable=protected-access
+  return head_fn
+
+
 class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
   """An estimator using gradient boosted decision trees.
 
@@ -606,3 +741,104 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
 
     super(CoreGradientBoostedDecisionTreeRanker, self).__init__(
         model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+# When using this estimator, make sure to regularize the hessian (at least l2,
+# min_node_weight)!
+# TODO(nponomareva): extend to take multiple quantiles in one go.
+class CoreGradientBoostedDecisionTreeQuantileRegressor(
+    core_estimator.Estimator):
+  """An estimator that does quantile regression and returns quantile estimates.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               quantiles,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               output_leaf_index=False,
+               num_quantiles=100):
+    """Initializes a core version of GradientBoostedDecisionTreeEstimator.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      quantiles: a list of quantiles for the loss, each between 0 and 1.
+      label_dimension: Dimension of regression label. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`). When label_dimension>1, it is
+        recommended to use multiclass strategy diagonal hessian or full hessian.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+      num_quantiles: Number of quantiles to build for numeric feature values.
+    """
+    if len(quantiles) > 1:
+      raise ValueError('For now, just one quantile per estimator is supported')
+
+    def _model_fn(features, labels, mode, config):
+      return model.model_builder(
+          features=features,
+          labels=labels,
+          mode=mode,
+          config=config,
+          params={
+              'head':
+                  core_quantile_regression_head(
+                      quantiles[0], label_dimension=label_dimension),
+              'feature_columns':
+                  feature_columns,
+              'learner_config':
+                  learner_config,
+              'num_trees':
+                  num_trees,
+              'weight_column_name':
+                  weight_column_name,
+              'examples_per_layer':
+                  examples_per_layer,
+              'center_bias':
+                  center_bias,
+              'logits_modifier_function':
+                  logits_modifier_function,
+              'use_core_libs':
+                  True,
+              'output_leaf_index':
+                  output_leaf_index,
+              'override_global_step_value':
+                  None,
+              'num_quantiles':
+                  num_quantiles,
+          },
+          output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+    super(CoreGradientBoostedDecisionTreeQuantileRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index c155128c0e4ccf928349ee6453baff4384222096..ee052ac60387d8f993e4942dd7dff39e191dd3a4 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -47,8 +48,8 @@ def _multiclass_train_input_fn():
   features = {
       "x": constant_op.constant([[2.], [1.], [1.], [5.], [3.5], [4.6], [3.5]])
   }
-  label = constant_op.constant(
-      [[1], [0], [0], [2], [2], [0], [1]], dtype=dtypes.int32)
+  label = constant_op.constant([[1], [0], [0], [2], [2], [0], [1]],
+                               dtype=dtypes.int32)
   return features, label
 
 
@@ -77,6 +78,59 @@ def _infer_ranking_train_input_fn():
   return features, None
 
 
+_QUANTILE_REGRESSION_SIZE = 1000
+
+
+def _quantile_regression_input_fns(two_dimension=False):
+  # The data generation is taken from
+  # http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html
+  np.random.seed(1)
+
+  def f(x):
+    """The function to predict."""
+    return x * np.sin(x)
+
+  def g(x):
+    """The function to predict."""
+    return x * np.cos(x)
+
+  #  Training data.
+  x = np.atleast_2d(np.random.uniform(0, 10.0,
+                                      size=_QUANTILE_REGRESSION_SIZE)).T
+  x = x.astype(np.float32)
+
+  # Labels.
+  if not two_dimension:
+    y = f(x).ravel()
+  else:
+    y = np.column_stack((f(x).ravel(), g(x).ravel()))
+
+  # Add random noise.
+  dy = 1.5 + 1.0 * np.random.random(y.shape)
+  noise = np.random.normal(0, dy)
+  y += noise
+  y_original = y.astype(np.float32)
+  if not two_dimension:
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=x,
+      y=y,
+      batch_size=_QUANTILE_REGRESSION_SIZE,
+      num_epochs=None,
+      shuffle=True)
+
+  # Test on the training data to make sure the predictions are calibrated.
+  test_input_fn = numpy_io.numpy_input_fn(
+      x=x,
+      y=y,
+      batch_size=_QUANTILE_REGRESSION_SIZE,
+      num_epochs=1,
+      shuffle=False)
+
+  return train_input_fn, test_input_fn, y_original
+
+
 class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -341,6 +395,130 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     for prediction_dict in result_iter:
       self.assertTrue("classes" in prediction_dict)
 
+  # One dimensional quantile regression.
+  def testQuantileRegression(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns()
+
+    # 95% percentile.
+    model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["scores"])
+
+    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper >= 0.92)
+    self.assertTrue(frac_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
+    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["scores"])
+
+    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower >= 0.92)
+    self.assertTrue(frac_above_lower <= 0.98)
+
+  # Multi-dimensional quantile regression.
+  def testQuantileRegressionMultiDimLabel(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
+        two_dimension=True)
+
+    # 95% percentile.
+    model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        label_dimension=2,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["scores"])
+
+    count_below_upper = np.count_nonzero(upper > y, axis=0)
+    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
+    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
+    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
+    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper_0 >= 0.92)
+    self.assertTrue(frac_below_upper_0 <= 0.98)
+    self.assertTrue(frac_below_upper_1 >= 0.92)
+    self.assertTrue(frac_below_upper_1 <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.92)
+    self.assertTrue(frac_both_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
+        two_dimension=True)
+    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        label_dimension=2,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["scores"])
+
+    count_above_lower = np.count_nonzero(lower < y, axis=0)
+    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
+    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
+    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
+    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower_0 >= 0.92)
+    self.assertTrue(frac_above_lower_0 <= 0.98)
+    self.assertTrue(frac_above_lower_1 >= 0.92)
+    self.assertTrue(frac_above_lower_1 <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.92)
+    self.assertTrue(frac_both_above_lower <= 0.98)
+
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
@@ -489,8 +667,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
     feature_columns = [
         core_feature_column.weighted_categorical_column(
-            categorical_column=core_feature_column.
-            categorical_column_with_vocabulary_list(
+            categorical_column=core_feature_column
+            .categorical_column_with_vocabulary_list(
                 key="word", vocabulary_list=["the", "cat", "dog"]),
             weight_feature_key="weight")
     ]
@@ -509,8 +687,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
         # Weights for the words are 5 - cat, 6- dog and 1 -the.
         features_dict["word"] = sparse_tensor.SparseTensor(
             indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
-            values=constant_op.constant(
-                ["the", "cat", "dog", "the"], dtype=dtypes.string),
+            values=constant_op.constant(["the", "cat", "dog", "the"],
+                                        dtype=dtypes.string),
             dense_shape=[4, 3])
         features_dict["weight"] = sparse_tensor.SparseTensor(
             indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
@@ -534,6 +712,132 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     est.evaluate(input_fn=input_fn, steps=1)
     est.predict(input_fn=input_fn)
 
+  # One dimensional quantile regression.
+  def testQuantileRegression(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns()
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1)
+
+    # 95% percentile.
+    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["predictions"])
+
+    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper >= 0.92)
+    self.assertTrue(frac_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
+    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["predictions"])
+
+    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower >= 0.92)
+    self.assertTrue(frac_above_lower <= 0.98)
+
+  # Multi-dimensional quantile regression.
+  def testQuantileRegressionMultiDimLabel(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
+        two_dimension=True)
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2)
+
+    # 95% percentile.
+    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        label_dimension=2,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["predictions"])
+
+    count_below_upper = np.count_nonzero(upper > y, axis=0)
+    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
+    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
+    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
+    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper_0 >= 0.92)
+    self.assertTrue(frac_below_upper_0 <= 0.98)
+    self.assertTrue(frac_below_upper_1 >= 0.92)
+    self.assertTrue(frac_below_upper_1 <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.92)
+    self.assertTrue(frac_both_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
+        two_dimension=True)
+    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        label_dimension=2,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["predictions"])
+
+    count_above_lower = np.count_nonzero(lower < y, axis=0)
+    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
+    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
+    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
+    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower_0 >= 0.92)
+    self.assertTrue(frac_above_lower_0 <= 0.98)
+    self.assertTrue(frac_above_lower_1 >= 0.92)
+    self.assertTrue(frac_above_lower_1 <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.92)
+    self.assertTrue(frac_both_above_lower <= 0.98)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index f45010ec26ed25127ca78b97f4d6fd7ebd6467ae..1fffbb5f660c681e1dde11a2aaf1d0f1cf79d1d0 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -142,7 +142,7 @@ class InequalitySplitHandler(base_split_handler.BaseSplitHandler):
         name="StatsAccumulator/{}".format(self._name))
     # Allocate both stats accumulator and quantile accumulator on the same
     # device so that we can build splits with fewer RPCs.
-    with ops.colocate_with(self._stats_accumulator.resource()):
+    with ops.colocate_with(self._stats_accumulator.resource_handle):
       self._quantile_accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token,
           epsilon=epsilon,
@@ -268,8 +268,8 @@ class DenseSplitHandler(InequalitySplitHandler):
       handler = make_dense_split_tensor
 
     are_splits_ready, partition_ids, gains, split_infos = (
-        handler(self._quantile_accumulator.resource(),
-                self._stats_accumulator.resource(), stamp_token,
+        handler(self._quantile_accumulator.resource_handle,
+                self._stats_accumulator.resource_handle, stamp_token,
                 next_stamp_token, self._multiclass_strategy, class_id,
                 self._feature_column_group_id, self._l1_regularization,
                 self._l2_regularization, self._tree_complexity_regularization,
@@ -447,8 +447,8 @@ class SparseSplitHandler(InequalitySplitHandler):
       handler = make_sparse_split_tensor
 
     are_splits_ready, partition_ids, gains, split_infos = (
-        handler(self._quantile_accumulator.resource(),
-                self._stats_accumulator.resource(), stamp_token,
+        handler(self._quantile_accumulator.resource_handle,
+                self._stats_accumulator.resource_handle, stamp_token,
                 next_stamp_token, self._multiclass_strategy, class_id,
                 self._feature_column_group_id, self._l1_regularization,
                 self._l2_regularization, self._tree_complexity_regularization,
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
index 05ce0884ccfff53484fdc0c26e596e7fb6fcdfd6..356ae337685d580319da16a20bbab27ccaa73255 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
@@ -34,7 +34,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -62,7 +62,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2, 1],
@@ -91,7 +91,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -123,7 +123,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -133,7 +133,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
 
       with ops.control_dependencies([op1]):
         (stamp_token, num_updates, partition_1, feature_1, grads_1,
-         hessians_1) = accumulator.serialize()
+         hessians_1) = accumulator.saveable.serialize()
       # Make sure that the accumulator hasn't changed during serialization.
       with ops.control_dependencies([stamp_token]):
         num_updates_2, partition_2, feature_2, grads_2, hessians_2 = (
@@ -164,7 +164,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         # These will be deleted due to deserialize call.
         op1 = accumulator.add(
             stamp_token=0,
@@ -175,7 +175,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
 
       with ops.control_dependencies([op1]):
         deserialize = (
-            accumulator.deserialize(
+            accumulator.saveable.deserialize(
                 stamp_token=2,
                 num_updates=3,
                 partition_ids=[3, 4],
@@ -223,7 +223,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -261,7 +261,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -299,7 +299,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -336,7 +336,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -349,7 +349,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
 
       with ops.control_dependencies([op1]):
         (stamp_token, num_updates_1, partition_1, feature_1, grads_1,
-         hessians_1) = accumulator.serialize()
+         hessians_1) = accumulator.saveable.serialize()
       # Make sure that the accumulator hasn't changed during serialization.
       with ops.control_dependencies([stamp_token]):
         num_updates_2, partition_2, feature_2, grads_2, hessians_2 = (
@@ -386,7 +386,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         # These will be deleted due to deserialize call.
         op1 = accumulator.add(
             stamp_token=0,
@@ -399,7 +399,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
                                                                     0.08]]])
 
       with ops.control_dependencies([op1]):
-        deserialize = accumulator.deserialize(
+        deserialize = accumulator.saveable.deserialize(
             stamp_token=2,
             num_updates=3,
             partition_ids=[3, 4],
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index 25b2c9e2fd72bd018717e8a87fce726f26bad968..fca22c71a83459cb290eaebcf107cf1c14c222b7 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 # pylint: disable=unused-import
 from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader
 # pylint: enable=unused-import
@@ -31,6 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 ops.NotDifferentiable("TreeEnsembleVariable")
 ops.NotDifferentiable("TreeEnsembleSerialize")
@@ -82,6 +85,44 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject):
           tree_ensemble_config=restored_tensors[1])
 
 
+class TreeEnsembleVariable(tracking.TrackableResource):
+  """A Tree ensemble model."""
+
+  def __init__(self, stamp_token, tree_ensemble_config, name, container=None):
+    self._stamp_token = stamp_token
+    self._tree_ensemble_config = tree_ensemble_config
+    self._name = name
+    self._container = container
+    self._init_op = None
+    super(TreeEnsembleVariable, self).__init__()
+
+  def create_resource(self):
+    return gen_model_ops.decision_tree_ensemble_resource_handle_op(
+        self._container, shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return gen_model_ops.create_tree_ensemble_variable(
+        self.resource_handle, self._stamp_token, self._tree_ensemble_config)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_model_ops.tree_ensemble_is_initialized_op(self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    return {
+        "tree_ensemble_variable":
+            functools.partial(
+                TreeEnsembleVariableSavable,
+                tree_ensemble_handle=self.resource_handle,
+                create_op=self.initializer)
+    }
+
+
 def tree_ensemble_variable(stamp_token,
                            tree_ensemble_config,
                            name,
@@ -99,12 +140,11 @@ def tree_ensemble_variable(stamp_token,
     A `Tensor` of type mutable `string`. The handle to the tree ensemble.
   """
   with ops.name_scope(name, "TreeEnsembleVariable") as name:
-    resource_handle = gen_model_ops.decision_tree_ensemble_resource_handle_op(
-        container, shared_name=name, name=name)
-    create_op = gen_model_ops.create_tree_ensemble_variable(
-        resource_handle, stamp_token, tree_ensemble_config)
-    is_initialized_op = gen_model_ops.tree_ensemble_is_initialized_op(
-        resource_handle)
+    tree_ensemble_var = TreeEnsembleVariable(stamp_token, tree_ensemble_config,
+                                             name, container)
+    resource_handle = tree_ensemble_var.resource_handle
+    create_op = tree_ensemble_var.initializer
+    is_initialized_op = tree_ensemble_var.is_initialized()
     # Adds the variable to the savable list.
     saveable = TreeEnsembleVariableSavable(resource_handle, create_op,
                                            resource_handle.name)
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 19b6b3296db394b07f57a25dbde187eb9195af38..0c319cc9bd1f720eb404a9da05227c5807ec874f 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -33,59 +33,20 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
 
 
-class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
-  """A resource that allows distributed quantile computation."""
-
-  def __init__(self,
-               init_stamp_token,
-               epsilon,
-               num_quantiles,
-               max_elements=None,
-               name=None,
-               container=None,
-               generate_quantiles=False):
-    """Creates a QuantileAccumulator object.
-
-    Args:
-      init_stamp_token: The initial value for the stamp token.
-      epsilon: Error bound on the quantile computation.
-      num_quantiles: Number of quantiles to produce from the final summary.
-      max_elements: Maximum number of elements added to the accumulator.
-      name: the name to save the accumulator under.
-      container: An optional `string`. Defaults to `""`
-      generate_quantiles: Generate quantiles instead of approximate boundaries.
-        If true, exactly `num_quantiles` will be produced in the final summary.
-    """
-    self._epsilon = epsilon
-    self._generate_quantiles = generate_quantiles
+class QuantileAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for QuantileAccumulator."""
 
-    name = _PATTERN.sub("", name)
-    with ops.name_scope(name, "QuantileAccumulator") as name:
-      self._quantile_accumulator_handle = (
-          gen_quantile_ops.quantile_stream_resource_handle_op(
-              container=container, shared_name=name, name=name))
-      self._create_op = gen_quantile_ops.create_quantile_accumulator(
-          self._quantile_accumulator_handle,
-          init_stamp_token,
-          epsilon=epsilon,
-          max_elements=max_elements,
-          num_quantiles=num_quantiles,
-          generate_quantiles=generate_quantiles)
-      is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
-          self._quantile_accumulator_handle)
-    resources.register_resource(self._quantile_accumulator_handle,
-                                self._create_op, is_initialized_op)
-    self._make_savable(name)
-
-  def _make_savable(self, name):
+  def __init__(self, resource_handle, create_op, name):
+    self._resource_handle = resource_handle
+    self._create_op = create_op
     stamp_token, state, are_buckets_ready, buckets = (
-        gen_quantile_ops.quantile_accumulator_serialize(
-            self._quantile_accumulator_handle))
+        gen_quantile_ops.quantile_accumulator_serialize(resource_handle))
     # slice_spec is useful for saving a slice from a variable.
     # It's not meaningful in quantile accumulator.
     slice_spec = ""
@@ -96,9 +57,8 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     specs += [make_save_spec(state, "_state")]
     specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")]
     specs += [make_save_spec(buckets, "buckets")]
-    super(QuantileAccumulator,
-          self).__init__(self._quantile_accumulator_handle, specs, name)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+    super(QuantileAccumulatorSaveable, self).__init__(self._resource_handle,
+                                                      specs, name)
 
   def restore(self, restored_tensors, unused_restored_shapes):
     """Restores the associated quantile accumulator from 'restored_tensors'.
@@ -119,24 +79,94 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     buckets = restored_tensors[3]
     with ops.control_dependencies([self._create_op]):
       return gen_quantile_ops.quantile_accumulator_deserialize(
-          self._quantile_accumulator_handle,
+          self._resource_handle,
           stamp_token=stamp_token,
           stream_state=state,
           are_buckets_ready=are_buckets_ready,
           buckets=buckets)
 
+
+class QuantileAccumulator(tracking.TrackableResource):
+  """A resource that allows distributed quantile computation."""
+
+  def __init__(self,
+               init_stamp_token,
+               epsilon,
+               num_quantiles,
+               max_elements=None,
+               name=None,
+               container=None,
+               generate_quantiles=False):
+    """Creates a QuantileAccumulator object.
+
+    Args:
+      init_stamp_token: The initial value for the stamp token.
+      epsilon: Error bound on the quantile computation.
+      num_quantiles: Number of quantiles to produce from the final summary.
+      max_elements: Maximum number of elements added to the accumulator.
+      name: the name to save the accumulator under.
+      container: An optional `string`. Defaults to `""`
+      generate_quantiles: Generate quantiles instead of approximate boundaries.
+        If true, exactly `num_quantiles` will be produced in the final summary.
+    """
+    self._init_stamp_token = init_stamp_token
+    self._epsilon = epsilon
+    self._num_quantiles = num_quantiles
+    self._max_elements = max_elements
+    self._container = container
+    self._generate_quantiles = generate_quantiles
+    super(QuantileAccumulator, self).__init__()
+
+    name = _PATTERN.sub("", name)
+    with ops.name_scope(name, "QuantileAccumulator") as name:
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
+    resources.register_resource(self.resource_handle, self._init_op,
+                                is_initialized_op)
+    self._saveable = QuantileAccumulatorSaveable(self.resource_handle,
+                                                 self._init_op, name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
+
+  def create_resource(self):
+    return gen_quantile_ops.quantile_stream_resource_handle_op(
+        container=self._container, shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return gen_quantile_ops.create_quantile_accumulator(
+        self.resource_handle,
+        self._init_stamp_token,
+        epsilon=self._epsilon,
+        max_elements=self._max_elements,
+        num_quantiles=self._num_quantiles,
+        generate_quantiles=self._generate_quantiles)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_quantile_ops.quantile_accumulator_is_initialized(
+        self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    return {"quantile_accumulator", self.saveable}
+
   def get_buckets(self, stamp_token):
     """Returns quantile buckets created during previous flush."""
     are_buckets_ready, buckets = (
         gen_quantile_ops.quantile_accumulator_get_buckets(
-            quantile_accumulator_handles=[self._quantile_accumulator_handle],
+            quantile_accumulator_handles=[self.resource_handle],
             stamp_token=stamp_token))
     return are_buckets_ready[0], buckets[0]
 
   def schedule_get_buckets(self):
     """Returns a scheduled read of buckets created during previous flush."""
     return batch_ops_utils.ScheduledStampedResourceOp(
-        resource_handle=self._quantile_accumulator_handle,
+        resource_handle=self.resource_handle,
         op=gen_quantile_ops.quantile_accumulator_get_buckets)
 
   def _make_summary(self, column, example_weights):
@@ -161,14 +191,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     """Adds quantile summary to its stream in resource."""
     summary = self._make_summary(column, example_weights)
     return gen_quantile_ops.quantile_accumulator_add_summaries(
-        quantile_accumulator_handles=[self._quantile_accumulator_handle],
+        quantile_accumulator_handles=[self.resource_handle],
         stamp_token=stamp_token,
         summaries=[summary])
 
   def add_prebuilt_summary(self, stamp_token, summary):
     """Adds quantile summary to its stream in resource."""
     return gen_quantile_ops.quantile_accumulator_add_summaries(
-        quantile_accumulator_handles=[self._quantile_accumulator_handle],
+        quantile_accumulator_handles=[self.resource_handle],
         stamp_token=stamp_token,
         summaries=[summary])
 
@@ -177,7 +207,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     summary = self._make_summary(column, example_weights)
     return batch_ops_utils.ScheduledStampedResourceOp(
         op=gen_quantile_ops.quantile_accumulator_add_summaries,
-        resource_handle=self._quantile_accumulator_handle,
+        resource_handle=self.resource_handle,
         summaries=summary)
 
   def flush(self, stamp_token, next_stamp_token):
@@ -190,17 +220,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
       The flush operation.
     """
     return gen_quantile_ops.quantile_accumulator_flush(
-        quantile_accumulator_handle=self._quantile_accumulator_handle,
+        quantile_accumulator_handle=self.resource_handle,
         stamp_token=stamp_token,
         next_stamp_token=next_stamp_token)
 
   def flush_summary(self, stamp_token, next_stamp_token):
     """Finalizes quantile summary stream and resets it for next iteration."""
     result = gen_quantile_ops.quantile_accumulator_flush_summary(
-        quantile_accumulator_handle=self._quantile_accumulator_handle,
+        quantile_accumulator_handle=self.resource_handle,
         stamp_token=stamp_token,
         next_stamp_token=next_stamp_token)
     return result
-
-  def resource(self):
-    return self._quantile_accumulator_handle
diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
index 2e94e353f325f06eed2d290d3a7a461861820c39..ad1191d41236e71008bff8c8a7fbd42c16e3f9c5 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
@@ -26,12 +26,83 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
 
 
-class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
+class StatsAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for StatsAccumulator."""
+
+  def __init__(self, resource_handle, create_op, is_scalar, name):
+    self._create_op = create_op
+    self._resource_handle = resource_handle
+    self._is_scalar = is_scalar
+    slice_spec = ""
+    saver_name = self._resource_handle.name
+    (stamp_token, num_updates, partition_ids, feature_ids, gradients,
+     hessians) = self.serialize()
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        saver_name + "_stamp"),
+        saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec,
+                                        saver_name + "_num_updates"),
+        saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec,
+                                        saver_name + "_partition_ids"),
+        saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec,
+                                        saver_name + "_feature_ids"),
+        saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec,
+                                        saver_name + "_gradients"),
+        saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec,
+                                        saver_name + "hessians"),
+    ]
+    super(StatsAccumulatorSaveable, self).__init__(self._resource_handle, specs,
+                                                   name)
+
+  def serialize(self):
+    """Serializes the stats accumulator state."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize(
+          self._resource_handle)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize(
+          self._resource_handle)
+
+  def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids,
+                  gradients, hessians):
+    """Resets the stats accumulator with the serialized state."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize(
+          self._resource_handle, stamp_token, num_updates, partition_ids,
+          feature_ids, gradients, hessians)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize(
+          self._resource_handle, stamp_token, num_updates, partition_ids,
+          feature_ids, gradients, hessians)
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return self.deserialize(
+          stamp_token=restored_tensors[0],
+          num_updates=restored_tensors[1],
+          partition_ids=restored_tensors[2],
+          feature_ids=restored_tensors[3],
+          gradients=restored_tensors[4],
+          hessians=restored_tensors[5])
+
+
+class StatsAccumulator(tracking.TrackableResource):
   """A resource that allows to accumulate gradients and hessians.
 
   For consistency guarantees, we use read and write stamp tokens.
@@ -58,58 +129,69 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
     Returns:
       A `Tensor` of type mutable `string`. The handle to the stats accumulator.
     """
+    self._stamp_token = stamp_token
+    self._gradient_shape = gradient_shape
+    self._hessian_shape = hessian_shape
+    self._container = container
+
+    if (gradient_shape == tensor_shape.scalar() and
+        hessian_shape == tensor_shape.scalar()):
+      self._is_scalar = True
+    else:
+      self._is_scalar = False
+
     if name is not None:
       name = _PATTERN.sub("", name)
     with ops.name_scope(name, "StatsAccumulator") as name:
-      # Both values are scalars.
-      if (gradient_shape == tensor_shape.scalar() and
-          hessian_shape == tensor_shape.scalar()):
-        self._is_scalar = True
-        self._resource_handle = (gen_stats_accumulator_ops.
-                                 stats_accumulator_scalar_resource_handle_op(
-                                     container, name, name=name))
-
-        create_op = gen_stats_accumulator_ops.create_stats_accumulator_scalar(
-            self._resource_handle, stamp_token)
-        is_initialized_op = (
-            gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized(
-                self._resource_handle))
-      else:
-        self._is_scalar = False
-        self._resource_handle = (gen_stats_accumulator_ops.
-                                 stats_accumulator_tensor_resource_handle_op(
-                                     container, name, name=name))
-        create_op = gen_stats_accumulator_ops.create_stats_accumulator_tensor(
-            self._resource_handle, stamp_token, gradient_shape.as_list(),
-            hessian_shape.as_list())
-        is_initialized_op = (
-            gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized(
-                self._resource_handle))
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
+    resources.register_resource(self.resource_handle, self.initializer,
+                                is_initialized_op)
+    self._saveable = StatsAccumulatorSaveable(
+        self.resource_handle, self.initializer, self._is_scalar, name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
-    self._create_op = create_op
-    slice_spec = ""
-    saver_name = self._resource_handle.name
-    (stamp_token, num_updates, partition_ids, feature_ids, gradients,
-     hessians) = self.serialize()
-    specs = [
-        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
-                                        saver_name + "_stamp"),
-        saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec,
-                                        saver_name + "_num_updates"),
-        saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec,
-                                        saver_name + "_partition_ids"),
-        saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec,
-                                        saver_name + "_feature_ids"),
-        saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec,
-                                        saver_name + "_gradients"),
-        saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec,
-                                        saver_name + "hessians"),
-    ]
+  def create_resource(self):
+    if self._is_scalar:
+      return (
+          gen_stats_accumulator_ops.stats_accumulator_scalar_resource_handle_op(
+              self._container, self._name, name=self._name))
+    else:
+      return (
+          gen_stats_accumulator_ops.stats_accumulator_tensor_resource_handle_op(
+              self._container, self._name, name=self._name))
 
-    super(StatsAccumulator, self).__init__(self._resource_handle, specs, name)
-    resources.register_resource(self._resource_handle, create_op,
-                                is_initialized_op)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+  def initialize(self):
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.create_stats_accumulator_scalar(
+          self.resource_handle, self._stamp_token)
+    else:
+      return gen_stats_accumulator_ops.create_stats_accumulator_tensor(
+          self.resource_handle, self._stamp_token,
+          self._gradient_shape.as_list(), self._hessian_shape.as_list())
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized(
+          self.resource_handle)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized(
+          self.resource_handle)
+
+  @property
+  def saveable(self):
+    return self._saveable
+
+  def _gather_saveables_for_checkpoint(self):
+    return {"stats_accumulator", self.saveable}
 
   def add(self, stamp_token, partition_ids, feature_ids, gradients, hessians):
     """Updates the stats accumulator."""
@@ -117,11 +199,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
         partition_ids, feature_ids, gradients, hessians))
     if self._is_scalar:
       return gen_stats_accumulator_ops.stats_accumulator_scalar_add(
-          [self._resource_handle], stamp_token, [partition_ids], [feature_ids],
+          [self.resource_handle], stamp_token, [partition_ids], [feature_ids],
           [gradients], [hessians])
     else:
       return gen_stats_accumulator_ops.stats_accumulator_tensor_add(
-          [self._resource_handle], stamp_token, [partition_ids], [feature_ids],
+          [self.resource_handle], stamp_token, [partition_ids], [feature_ids],
           [gradients], [hessians])
 
   def schedule_add(self, partition_ids, feature_ids, gradients, hessians):
@@ -131,7 +213,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
     if self._is_scalar:
       return batch_ops_utils.ScheduledStampedResourceOp(
           op=gen_stats_accumulator_ops.stats_accumulator_scalar_add,
-          resource_handle=self._resource_handle,
+          resource_handle=self.resource_handle,
           partition_ids=partition_ids,
           feature_ids=feature_ids,
           gradients=gradients,
@@ -139,7 +221,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
     else:
       return batch_ops_utils.ScheduledStampedResourceOp(
           op=gen_stats_accumulator_ops.stats_accumulator_tensor_add,
-          resource_handle=self._resource_handle,
+          resource_handle=self.resource_handle,
           partition_ids=partition_ids,
           feature_ids=feature_ids,
           gradients=gradients,
@@ -153,55 +235,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
       return gen_stats_accumulator_ops.stats_accumulator_tensor_make_summary(
           partition_ids, feature_ids, gradients, hessians)
 
-  def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids,
-                  gradients, hessians):
-    """Resets the stats accumulator with the serialized state."""
-    if self._is_scalar:
-      return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize(
-          self._resource_handle, stamp_token, num_updates, partition_ids,
-          feature_ids, gradients, hessians)
-    else:
-      return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize(
-          self._resource_handle, stamp_token, num_updates, partition_ids,
-          feature_ids, gradients, hessians)
-
   def flush(self, stamp_token, next_stamp_token):
     """Flushes the stats accumulator."""
     if self._is_scalar:
       return gen_stats_accumulator_ops.stats_accumulator_scalar_flush(
-          self._resource_handle, stamp_token, next_stamp_token)
+          self.resource_handle, stamp_token, next_stamp_token)
     else:
       return gen_stats_accumulator_ops.stats_accumulator_tensor_flush(
-          self._resource_handle, stamp_token, next_stamp_token)
-
-  def serialize(self):
-    """Serializes the stats accumulator state."""
-    if self._is_scalar:
-      return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize(
-          self._resource_handle)
-    else:
-      return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize(
-          self._resource_handle)
-
-  def restore(self, restored_tensors, unused_restored_shapes):
-    """Restores the associated tree ensemble from 'restored_tensors'.
-
-    Args:
-      restored_tensors: the tensors that were loaded from a checkpoint.
-      unused_restored_shapes: the shapes this object should conform to after
-        restore. Not meaningful for trees.
-
-    Returns:
-      The operation that restores the state of the tree ensemble variable.
-    """
-    with ops.control_dependencies([self._create_op]):
-      return self.deserialize(
-          stamp_token=restored_tensors[0],
-          num_updates=restored_tensors[1],
-          partition_ids=restored_tensors[2],
-          feature_ids=restored_tensors[3],
-          gradients=restored_tensors[4],
-          hessians=restored_tensors[5])
-
-  def resource(self):
-    return self._resource_handle
+          self.resource_handle, stamp_token, next_stamp_token)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 1cf61a10ba25f206333bb78b7944e366bcd19b92..85020c5df293598e79de0e964f55af5231aa3622 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -992,7 +992,7 @@ class GradientBoostedDecisionTreeModel(object):
 
         # Get accumulated steps and examples for the current layer.
         _, _, _, _, acc_examples, acc_steps = (
-            steps_accumulator.serialize())
+            steps_accumulator.saveable.serialize())
         acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
         acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
         ensemble_update_ops.append(
@@ -1257,13 +1257,12 @@ class GradientBoostedDecisionTreeModel(object):
   def _get_replica_device_setter(self, worker_device):
     """Creates a replica device setter."""
     ps_tasks = self._num_ps_replicas
-    ps_ops = [
-        "Variable",
-        "VariableV2",
+    ps_ops = list(device_setter.STANDARD_PS_OPS)
+    ps_ops.extend([
         "DecisionTreeEnsembleResourceHandleOp",
         "StatsAccumulatorScalarResourceHandleOp",
         "StatsAccumulatorTensorResourceHandleOp",
-    ]
+    ])
     ps_strategy = _OpRoundRobinStrategy(ps_ops, ps_tasks)
     return device_setter.replica_device_setter(
         worker_device=worker_device,
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index b5ebaf1999519f65110e8164fa20bace5ecc3ef6..7a99dccdd1066354ee50dad0622a6fbda9c860ff 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -48,6 +48,47 @@ def per_example_logistic_loss(labels, weights, predictions):
       labels=labels, logits=predictions)
   return unweighted_loss * weights, control_flow_ops.no_op()
 
+# MUST USE WITH HESSIAN REGULARIZATION,
+# This loss can have zero hessian, so it must be used with l2 or min_node_weight
+# regularization.
+# An example config is
+# learner_config.constraints.min_node_weight = 1 / num_examples_per_layer
+# learner_config.regularization.l2 = 1.0 / num_examples_per_layer
+# TODO(nponomareva): make it multidimensional so we can estimate several
+# quantiles at once.
+def per_example_quantile_regression_loss(labels, weights, predictions,
+                                         quantile):
+  """Smoothed loss for quantile regression.
+
+  The standard quantile regression loss is quantile*(y-y') when y>y' and
+  (quantile-1)*(y-y') otherwise, y' is a prediction, y is a label. The impl
+  below is this loss but squared in the region where the loss value < 1.
+
+  Args:
+    labels: Rank 2 (N, D) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    predictions: Rank 2 (N, D) tensor of per-example predictions.
+    quantile: The quantile to use.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example quantile loss.
+    update_op: An update operation to update the loss's internal state.
+  """
+  labels = math_ops.to_float(labels)
+  error = labels - predictions
+  square_loss_right = array_ops.where(error * quantile < 1.0,
+                                      math_ops.square(quantile * error),
+                                      quantile * error)
+  square_loss_left = array_ops.where(error * (quantile - 1) < 1,
+                                     math_ops.square((quantile - 1) * error),
+                                     (quantile - 1) * error)
+
+  unweighted_loss = array_ops.where(error > 0, square_loss_right,
+                                    square_loss_left)
+  if weights is None:
+    return unweighted_loss, control_flow_ops.no_op()
+  else:
+    return unweighted_loss * weights, control_flow_ops.no_op()
 
 # This is classical form of Maximum entropy loss, that is twice differentiable
 # (sparse_softmax_cross_entropy which is what we go for is not twice
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
index 5ecd4f341831ce8d6f8eb04a763280c177ffe275..7774ac0e122a532e1e0280f185ead3022a0b89d6 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
@@ -25,6 +25,13 @@ import six
 from tensorflow.python.training.server_lib import ClusterSpec
 
 
+def format_master_url(master, rpc_layer=None):
+  if rpc_layer:
+    return '%s://%s' % (rpc_layer, master)
+  else:
+    return master
+
+
 @six.add_metaclass(abc.ABCMeta)
 class ClusterResolver(object):
   """Abstract class for all implementations of ClusterResolvers.
@@ -37,6 +44,17 @@ class ClusterResolver(object):
   automatically discover and resolve IP addresses for various TensorFlow
   workers. This will eventually allow us to automatically recover from
   underlying machine failures and scale TensorFlow worker clusters up and down.
+
+  Note to Implementors: In addition to these abstract methods, you must also
+  implement the task_type, task_index, and rpc_layer attributes. You may choose
+  to implement them either as properties with getters or setters or directly
+  set the attributes.
+
+  - task_type is the name of the server's current named job (e.g. 'worker',
+     'ps' in a distributed parameterized training job).
+  - task_index is the ordinal index of the server within the task type.
+  - rpc_layer is the protocol used by TensorFlow to communicate with other
+      TensorFlow servers in a distributed environment.
   """
 
   @abc.abstractmethod
@@ -53,16 +71,16 @@ class ClusterResolver(object):
     management system every time this function is invoked and reconstructing
     a cluster_spec, rather than attempting to cache anything.
     """
-    raise NotImplementedError(
-        'cluster_spec is not implemented for {}.'.format(self))
+    raise NotImplementedError()
 
   @abc.abstractmethod
-  def master(self, task_type=None, task_index=None):
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
     """Retrieves the name or URL of the session master.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
       task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
@@ -71,16 +89,44 @@ class ClusterResolver(object):
     returned is up-to-date at the time to calling this function. This usually
     means retrieving the master every time this function is invoked.
     """
-    raise NotImplementedError('master is not implemented for {}.'.format(self))
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  def num_accelerators_per_worker(self, session_config=None):
+    """Returns the number of accelerator cores per worker.
+
+    This returns the number of accelerator cores (such as GPUs and TPUs)
+    available per worker. If workers only has CPU cores available, then this
+    should return 0. This method will query the master for this information
+    if it is not otherwise known.
+
+    Args:
+      session_config: (Optional) Configuration for starting a new session to
+        query how many accelerator cores it has.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractproperty
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in."""
+    raise NotImplementedError()
 
 
 class SimpleClusterResolver(ClusterResolver):
   """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
 
-  def __init__(self, cluster_spec, master=''):
+  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
+               environment='', num_accelerators_per_worker=0,
+               rpc_layer=None):
     """Creates a SimpleClusterResolver from a ClusterSpec."""
     super(SimpleClusterResolver, self).__init__()
 
+    self._task_type = task_type
+    self._task_index = task_index
+    self._environment = environment
+    self._num_accelerators_per_worker = num_accelerators_per_worker
+    self._rpc_layer = rpc_layer
+
     if not isinstance(cluster_spec, ClusterSpec):
       raise TypeError('cluster_spec must be a ClusterSpec.')
     self._cluster_spec = cluster_spec
@@ -93,12 +139,13 @@ class SimpleClusterResolver(ClusterResolver):
     """Returns the ClusterSpec passed into the constructor."""
     return self._cluster_spec
 
-  def master(self, task_type=None, task_index=None):
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
       task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC used by distributed TensorFlow.
 
     Returns:
       The name or URL of the session master.
@@ -106,10 +153,52 @@ class SimpleClusterResolver(ClusterResolver):
     If a task_type and task_index is given, this will override the `master`
     string passed into the initialization function.
     """
-    if task_type and task_index:
-      return self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+    else:
+      master = self._master
+
+    return format_master_url(master, rpc_layer=rpc_layer or self._rpc_layer)
+
+  @property
+  def task_type(self):
+    return self._task_type
+
+  @property
+  def task_index(self):
+    return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._environment
+
+  def num_accelerators_per_worker(self, session_config=None):
+    """Returns the number of accelerator cores per worker.
+
+    Args:
+      session_config: Unused. The SimpleClusterResolver does not do automatic
+        detection of accelerators, so a TensorFlow session will never be
+        created, and thus a `session_config` is never necessary here, and will
+        be ignored.
+    """
+    del session_config
+    return self._num_accelerators_per_worker
 
-    return self._master
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
 
 
 class UnionClusterResolver(ClusterResolver):
@@ -119,13 +208,22 @@ class UnionClusterResolver(ClusterResolver):
   merges the underlying ClusterResolvers, and returns one unified ClusterSpec
   when cluster_spec is called. The details of the merge function is
   documented in the cluster_spec function.
+
+  For additional Cluster Resolver properties such as task type, task index,
+  rpc layer, environment, etc..., we will return the value from the first
+  ClusterResolver in the union.
   """
 
-  def __init__(self, *args):
+  def __init__(self, *args, **kwargs):
     """Initializes a UnionClusterResolver with other ClusterResolvers.
 
     Args:
       *args: `ClusterResolver` objects to be unionized.
+      **kwargs:
+        rpc_layer - (Optional) Override value for the RPC layer used by
+          TensorFlow.
+        task_type - (Optional) Override value for the current task type.
+        task_index - (Optional) Override value for the current task index.
 
     Raises:
       TypeError: If any argument is not a subclass of `ClusterResolvers`.
@@ -133,6 +231,13 @@ class UnionClusterResolver(ClusterResolver):
     """
     super(UnionClusterResolver, self).__init__()
 
+    self._rpc_layer = kwargs.pop('rpc_layer', None)
+    self._task_type = kwargs.pop('task_type', None)
+    self._task_index = kwargs.pop('task_index', None)
+
+    if kwargs:
+      raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
+
     if not args:
       raise ValueError('At least one ClusterResolver is required.')
 
@@ -216,7 +321,7 @@ class UnionClusterResolver(ClusterResolver):
 
     return ClusterSpec(merged_cluster)
 
-  def master(self, task_type=None, task_index=None):
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     This usually returns the master from the first ClusterResolver passed in,
@@ -225,11 +330,45 @@ class UnionClusterResolver(ClusterResolver):
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
       task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
-    if task_type and task_index:
-      return self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+      return format_master_url(master, rpc_layer or self._rpc_layer)
+
+    return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
+
+  @property
+  def task_type(self):
+    return self._task_type or self._cluster_resolvers[0].task_type
+
+  @property
+  def task_index(self):
+    return self._task_index or self._cluster_resolvers[0].task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._cluster_resolvers[0].environment
+
+  def num_accelerators_per_worker(self, session_config=None):
+    return self._cluster_resolvers[0].num_accelerators_per_worker(
+        session_config)
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer or self._cluster_resolvers[0].rpc_layer
 
-    return self._cluster_resolvers[0].master()
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
index c004b2e2d3bc6552a3ab10997ed44f24e611735a..b94c9612b5bd4d92e84319f22932ce5599ba4b36 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
@@ -57,6 +57,62 @@ class UnionClusterResolverTest(test.TestCase):
     actual_cluster_spec = union_resolver.cluster_spec()
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  def testInitSimpleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
+                                            task_index=1, environment="cloud",
+                                            num_accelerators_per_worker=8,
+                                            rpc_layer="grpc")
+
+    self.assertEqual(simple_resolver.task_type, "ps")
+    self.assertEqual(simple_resolver.task_index, 1)
+    self.assertEqual(simple_resolver.environment, "cloud")
+    self.assertEqual(simple_resolver.num_accelerators_per_worker(), 8)
+    self.assertEqual(simple_resolver.rpc_layer, "grpc")
+
+  def testOverrideSimpleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
+                                            task_index=1, environment="cloud",
+                                            num_accelerators_per_worker=8,
+                                            rpc_layer="grpc")
+
+    simple_resolver.task_type = "worker"
+    simple_resolver.task_index = 2
+    simple_resolver.rpc_layer = "http"
+
+    self.assertEqual(simple_resolver.task_type, "worker")
+    self.assertEqual(simple_resolver.task_index, 2)
+    self.assertEqual(simple_resolver.rpc_layer, "http")
+
+  def testSimpleOverrideMasterWithTaskIndexZero(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec)
+    actual_master = simple_resolver.master("worker", 0, rpc_layer="grpc")
+    self.assertEqual(actual_master, "grpc://worker0:2222")
+
+  def testSimpleOverrideMasterWithRpcLayer(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec)
+    actual_master = simple_resolver.master("worker", 2, rpc_layer="grpc")
+    self.assertEqual(actual_master, "grpc://worker2:2222")
+
   def testSimpleOverrideMaster(self):
     base_cluster_spec = server_lib.ClusterSpec({
         "ps": ["ps0:2222", "ps1:2222"],
@@ -65,7 +121,42 @@ class UnionClusterResolverTest(test.TestCase):
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec)
     actual_master = simple_resolver.master("worker", 2)
-    self.assertEquals(actual_master, "worker2:2222")
+    self.assertEqual(actual_master, "worker2:2222")
+
+  def testUnionClusterResolverGetProperties(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
+                                      task_index=1, environment="cloud",
+                                      num_accelerators_per_worker=8,
+                                      rpc_layer="grpc")
+
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "ps": ["ps2:2222", "ps3:2222"],
+        "worker": ["worker3:2222", "worker4:2222", "worker5:2222"]
+    })
+    resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
+                                      task_index=2, environment="local",
+                                      num_accelerators_per_worker=16,
+                                      rpc_layer="http")
+
+    union_resolver = UnionClusterResolver(resolver1, resolver2)
+
+    self.assertEqual(union_resolver.task_type, "ps")
+    self.assertEqual(union_resolver.task_index, 1)
+    self.assertEqual(union_resolver.environment, "cloud")
+    self.assertEqual(union_resolver.num_accelerators_per_worker(), 8)
+    self.assertEqual(union_resolver.rpc_layer, "grpc")
+
+    union_resolver.task_type = "worker"
+    union_resolver.task_index = 2
+    union_resolver.rpc_layer = "http"
+
+    self.assertEqual(union_resolver.task_type, "worker")
+    self.assertEqual(union_resolver.task_index, 2)
+    self.assertEqual(union_resolver.rpc_layer, "http")
 
   def testTwoNonOverlappingJobMergedClusterResolver(self):
     cluster_spec_1 = server_lib.ClusterSpec({
@@ -116,10 +207,13 @@ class UnionClusterResolverTest(test.TestCase):
     union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
 
     unspecified_master = union_cluster.master()
-    self.assertEquals(unspecified_master, "")
+    self.assertEqual(unspecified_master, "")
 
     specified_master = union_cluster.master("worker", 1)
-    self.assertEquals(specified_master, "worker1:2222")
+    self.assertEqual(specified_master, "worker1:2222")
+
+    rpc_master = union_cluster.master("worker", 1, rpc_layer="grpc")
+    self.assertEqual(rpc_master, "grpc://worker1:2222")
 
   def testOverlappingJobMergedClusterResolver(self):
     cluster_spec_1 = server_lib.ClusterSpec({
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
index 5083e4d10ba6ee2e1be8f373c099556b422ef5aa..195b68959b6d21ef674438a4a23a4dd07f45faa7 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -30,6 +30,10 @@ except ImportError:
   _GOOGLE_API_CLIENT_INSTALLED = False
 
 
+def _format_master_url(master, rpc_layer=None):
+  return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+
+
 class GceClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Compute Engine.
 
@@ -45,7 +49,10 @@ class GceClusterResolver(ClusterResolver):
                zone,
                instance_group,
                port,
-               job_name='worker',
+               task_type='worker',
+               task_index=0,
+               rpc_layer='grpc',
+               num_accelerators_per_worker=0,
                credentials='default',
                service=None):
     """Creates a new GceClusterResolver object.
@@ -55,13 +62,22 @@ class GceClusterResolver(ClusterResolver):
     each instance in the instance group.
 
     Args:
-      project: Name of the GCE project
-      zone: Zone of the GCE instance group
-      instance_group: Name of the GCE instance group
+      project: Name of the GCE project.
+      zone: Zone of the GCE instance group.
+      instance_group: Name of the GCE instance group.
       port: Port of the listening TensorFlow server (default: 8470)
-      job_name: Name of the TensorFlow job this set of instances belongs to
+      task_type: Name of the TensorFlow job this GCE instance group of VM
+        instances belong to.
+      task_index: The task index for this particular VM, within the GCE
+        instance group. In particular, every single instance should be assigned
+        a unique ordinal index within an instance group manually so that they
+        can be distinguished from each other.
+      rpc_layer: The RPC layer TensorFlow should use to communicate across
+        instances.
+      num_accelerators_per_worker: Number of accelerators (GPUs) present per
+        instance.
       credentials: GCE Credentials. If nothing is specified, this defaults to
-        GoogleCredentials.get_application_default()
+        GoogleCredentials.get_application_default().
       service: The GCE API object returned by the googleapiclient.discovery
         function. (Default: discovery.build('compute', 'v1')). If you specify a
         custom service object, then the credentials parameter will be ignored.
@@ -72,7 +88,9 @@ class GceClusterResolver(ClusterResolver):
     self._project = project
     self._zone = zone
     self._instance_group = instance_group
-    self._job_name = job_name
+    self._task_type = task_type
+    self._task_index = task_index
+    self._rpc_layer = rpc_layer
     self._port = port
     self._credentials = credentials
 
@@ -133,10 +151,58 @@ class GceClusterResolver(ClusterResolver):
           previous_response=response)
 
     worker_list.sort()
-    return ClusterSpec({self._job_name: worker_list})
+    return ClusterSpec({self._task_type: worker_list})
 
-  def master(self, task_type=None, task_index=None):
-    if task_type and task_index:
-      return self.cluster_spec().task_address(task_type, task_index)
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    task_type = task_type if task_type is not None else self._task_type
+    task_index = task_index if task_index is not None else self._task_index
+
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+      if rpc_layer or self._rpc_layer:
+        return '%s://%s' % (rpc_layer or self._rpc_layer, master)
+      else:
+        return master
 
     return ''
+
+  @property
+  def task_type(self):
+    return self._task_type
+
+  @property
+  def task_index(self):
+    return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    raise RuntimeError(
+        'You cannot reset the task_type of the GceClusterResolver after it has '
+        'been created.')
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the GCE environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+  def num_accelerators_per_worker(self, session_config=None):
+    del session_config  # Unused, since this is set manually in __init__.
+    return self._num_accelerators_per_worker
+
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
index 87b8303122498992dd24ae06824f7f769357d8f8..c691552e86025896e23891a3e8f7da5ed2f9da31 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
@@ -135,12 +135,86 @@ class GceClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  def testMasterRetrieval(self):
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        task_index=0,
+        port=8470,
+        credentials=None,
+        service=self.standard_mock_service_client())
+    self.assertEqual(gce_cluster_resolver.master(), 'grpc://10.123.45.67:8470')
+
+  def testMasterRetrievalWithCustomTasks(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        port=8470,
+        credentials=None,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    self.assertEqual(
+        gce_cluster_resolver.master('worker', 2, 'test'),
+        'test://10.3.4.5:8470')
+
+  def testOverrideParameters(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        task_type='testworker',
+        port=8470,
+        credentials=None,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    gce_cluster_resolver.task_index = 1
+    gce_cluster_resolver.rpc_layer = 'test'
+
+    self.assertEqual(gce_cluster_resolver.task_type, 'testworker')
+    self.assertEqual(gce_cluster_resolver.task_index, 1)
+    self.assertEqual(gce_cluster_resolver.rpc_layer, 'test')
+    self.assertEqual(gce_cluster_resolver.master(), 'test://10.2.3.4:8470')
+
+  def testOverrideParametersWithZeroOrEmpty(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        task_type='',
+        task_index=1,
+        port=8470,
+        credentials=None,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    self.assertEqual(gce_cluster_resolver.master(
+        task_type='', task_index=0), 'grpc://10.1.2.3:8470')
+
   def testCustomJobNameAndPortRetrieval(self):
     gce_cluster_resolver = GceClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='custom',
+        task_type='custom',
         port=2222,
         credentials=None,
         service=self.standard_mock_service_client())
@@ -196,7 +270,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='worker',
+        task_type='worker',
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(worker1_name_to_ip))
@@ -205,7 +279,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='worker',
+        task_type='worker',
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(worker2_name_to_ip))
@@ -214,7 +288,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='ps',
+        task_type='ps',
         port=2222,
         credentials=None,
         service=self.gen_standard_mock_service_client(ps_name_to_ip))
diff --git a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
index ddae64839f01b4f67fe4c0c0bc00199bb2e037aa..eab1359a5bdf0e15d630e209964fa46dce9b2d42 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
@@ -19,6 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
+from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import format_master_url
+from tensorflow.python.client import device_lib
 from tensorflow.python.training import server_lib
 
 _KUBERNETES_API_CLIENT_INSTALLED = True
@@ -41,6 +43,7 @@ class KubernetesClusterResolver(ClusterResolver):
   def __init__(self,
                job_to_label_mapping=None,
                tf_server_port=8470,
+               rpc_layer='grpc',
                override_client=None):
     """Initializes a new KubernetesClusterResolver.
 
@@ -58,6 +61,8 @@ class KubernetesClusterResolver(ClusterResolver):
          'ps': ['job-name=ps-1', 'job-name=ps-2']}
         ```
       tf_server_port: The port the TensorFlow server is listening on.
+      rpc_layer: (Optional) The RPC layer TensorFlow should use to communicate
+        between tasks in Kubernetes. Defaults to 'grpc'.
       override_client: The Kubernetes client (usually automatically retrieved
         using `from kubernetes import client as k8sclient`). If you pass this
         in, you are responsible for setting Kubernetes credentials manually.
@@ -65,6 +70,7 @@ class KubernetesClusterResolver(ClusterResolver):
     Raises:
       ImportError: If the Kubernetes Python client is not installed and no
         `override_client` is passed in.
+      RuntimeError: If autoresolve_task is not a boolean or a callable.
     """
     if _KUBERNETES_API_CLIENT_INSTALLED:
       k8sconfig.load_kube_config()
@@ -82,16 +88,37 @@ class KubernetesClusterResolver(ClusterResolver):
     self._tf_server_port = tf_server_port
     self._override_client = override_client
 
-  def master(self):
-    # TODO(frankchn): Figure out a standard way to pass in the current task type
-    # and task id via Kubernetes.
-    pass
+    self.task_type = None
+    self.task_index = None
+    self.rpc_layer = rpc_layer
 
-  def get_master(self):
-    return self.master()
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
 
-  def get_job_name(self):
-    return self._job_name
+    You must have set the task_type and task_index object properties before
+    calling this function, or pass in the `task_type` and `task_index`
+    parameters when using this function. If you do both, the function parameters
+    will override the object properties.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+    """
+    if task_type is not None and task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(task_type, task_index),
+          rpc_layer or self.rpc_layer)
+
+    if self.task_type is not None and self.task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(self.task_type, self.task_index),
+          rpc_layer or self.rpc_layer)
+
+    return ''
 
   def cluster_spec(self):
     """Returns a ClusterSpec object based on the latest info from Kubernetes.
@@ -130,3 +157,17 @@ class KubernetesClusterResolver(ClusterResolver):
       cluster_map[tf_job] = all_pods
 
     return server_lib.ClusterSpec(cluster_map)
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the Cloud environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  def num_accelerators_per_worker(self, session_config=None):
+    local_devices = device_lib.list_local_devices(session_config)
+    return len([d for d in local_devices if d.device_type == 'GPU'])
diff --git a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py
index fbb26e803d73c96decf57a040a05694a434500f2..c63a98af6c24efa22c49c9ba38abd243c17d478e 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py
@@ -109,6 +109,23 @@ class KubernetesClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
 
+  def testGetMasterWithOverrideParameters(self):
+    ret = _create_pod_list(
+        ('worker-0', 'Running', '10.1.2.3'),
+        ('worker-1', 'Running', '10.1.2.4'),
+        ('worker-2', 'Running', '10.1.2.5'))
+
+    cluster_resolver = KubernetesClusterResolver(
+        override_client=_mock_kubernetes_client(
+            {'job-name=tensorflow': ret}))
+    cluster_resolver.task_type = 'worker'
+    cluster_resolver.task_index = 0
+    self.assertEqual(cluster_resolver.task_type, 'worker')
+    self.assertEqual(cluster_resolver.task_index, 0)
+    self.assertEqual(cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    self.assertEqual(cluster_resolver.master('worker', 2),
+                     'grpc://10.1.2.5:8470')
+
   def testNonRunningPod(self):
     ret = _create_pod_list(('tensorflow-abc123', 'Failed', '10.1.2.3'),)
 
diff --git a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
index dabe2fe1d39db14c60e5437d636144f18c384cf1..f590ecead96565672af30c2f3702f1a21f4317be 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
@@ -53,7 +53,8 @@ class SlurmClusterResolver(ClusterResolver):
                gpus_per_node=1,
                gpus_per_task=1,
                tasks_per_node=None,
-               auto_set_gpu=True):
+               auto_set_gpu=True,
+               rpc_layer='grpc'):
     """Creates a new SlurmClusterResolver object.
 
     This takes in parameters and creates a SlurmClusterResolver object. It uses
@@ -74,6 +75,8 @@ class SlurmClusterResolver(ClusterResolver):
       auto_set_gpu: Set the visible CUDA devices automatically while resolving
         the cluster by setting CUDA_VISIBLE_DEVICES environment variable.
         Defaults to True.
+      rpc_layer: (Optional) The protocol TensorFlow uses to communicate between
+        nodes. Defaults to 'grpc'.
 
     Returns:
       A ClusterResolver object which can be used with distributed TensorFlow.
@@ -107,8 +110,9 @@ class SlurmClusterResolver(ClusterResolver):
     self._gpus_per_task = gpus_per_task
 
     self._auto_set_gpu = auto_set_gpu
-    self._job_name = None
-    self._task_index = None
+    self.task_type = None
+    self.task_index = None
+    self.rpc_layer = rpc_layer
 
     self._gpu_allocation = []
     self._cluster_allocation = {}
@@ -157,17 +161,15 @@ class SlurmClusterResolver(ClusterResolver):
     cluster_rank_offset_start = 0
     cluster_rank_offset_end = 0
 
-    for job_name, num_tasks in self._jobs.items():
+    for task_type, num_tasks in self._jobs.items():
       cluster_rank_offset_end = cluster_rank_offset_start + num_tasks
 
-      self._cluster_allocation[job_name] = \
-        task_list[cluster_rank_offset_start:cluster_rank_offset_end]
+      self._cluster_allocation[task_type] = (
+          task_list[cluster_rank_offset_start:cluster_rank_offset_end])
 
-      if self._rank >= cluster_rank_offset_start and \
-          self._rank < cluster_rank_offset_end:
-
-        self._job_name = job_name
-        self._task_index = self._rank - cluster_rank_offset_start
+      if cluster_rank_offset_start <= self._rank < cluster_rank_offset_end:
+        self.task_type = task_type
+        self.task_index = self._rank - cluster_rank_offset_start
 
       cluster_rank_offset_start = cluster_rank_offset_end
 
@@ -188,9 +190,37 @@ class SlurmClusterResolver(ClusterResolver):
       A string specifying job name the process belongs to and an integner
         specifying the task index the process belongs to in that job.
     """
-    return self._job_name, self._task_index
+    return self.task_type, self.task_index
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master string for connecting to a TensorFlow master.
+
+    Args:
+      task_type: (Optional) Overrides the default auto-selected task type.
+      task_index: (Optional) Overrides the default auto-slected task index.
+      rpc_layer: (Optional) Overrides the default RPC protocol TensorFlow uses
+        to communicate across nodes.
+
+    Returns:
+      A connection string for connecting to a TensorFlow master.
+    """
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+    rpc_layer = rpc_layer or self.rpc_layer
+    master = self.cluster_spec().task_address(task_type, task_index)
+
+    return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the Slurm environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
 
-  def master(self, task_type=None, task_index=None):
-    if task_type and task_index:
-      return self.cluster_spec().task_address(task_type, task_index)
-    return self._cluster_allocation[str(self._job_name)][self._task_index]
+  def num_accelerators_per_worker(self, session_config=None):
+    del session_config  # Unused, since this is set in __init__ manually.
+    return self._gpus_per_node
diff --git a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py
index 9aa7df745eb8e1c444011485687b213d87c37da5..7c76e133fe4762f3ea072ef4784cba00996b95cc 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py
@@ -67,6 +67,31 @@ class SlurmClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  @mock.patch.dict(os.environ, {'SLURM_PROCID': '0', 'SLURM_NTASKS': '3'})
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testSimpleMasterRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 2
+        },
+        port_base=8888,
+        tasks_per_node=1,
+        gpus_per_node=1,
+        gpus_per_task=1,
+        auto_set_gpu=False)
+
+    slurm_cluster_resolver.task_type = 'worker'
+    slurm_cluster_resolver.task_index = 1
+    self.assertEqual(slurm_cluster_resolver.master(), 'grpc://t02n43:8888')
+
+    slurm_cluster_resolver.rpc_layer = 'ab'
+    self.assertEqual(slurm_cluster_resolver.master('ps', 0), 'ab://t02n13:8888')
+    self.assertEqual(
+        slurm_cluster_resolver.master('ps', 0, rpc_layer='test'),
+        'test://t02n13:8888')
+
   @mock.patch.dict(os.environ, {
       'SLURM_PROCID': '0',
       'SLURM_NTASKS': '3',
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
index 7bbd189d03d9c96914d11948941916739f10d18f..95aad0de1378dbee47ba24ff903da31fdb18a1af 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
@@ -27,13 +27,98 @@ from tensorflow.python.training.server_lib import ClusterSpec
 
 _TF_CONFIG_ENV = 'TF_CONFIG'
 _SESSION_MASTER_KEY = 'session_master'
+_RPC_LAYER_KEY = 'rpc_layer'
+_TASK_KEY = 'task'
+
+
+def format_master_url(master, rpc_layer=None):
+  if rpc_layer:
+    return '%s://%s' % (rpc_layer, master)
+  else:
+    return master
+
+
+def _load_tf_config():
+  return json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+
+
+def _get_value_in_tfconfig(key, default=None):
+  tf_config = _load_tf_config()
+  return tf_config[key] if key in tf_config else default
 
 
 class TFConfigClusterResolver(ClusterResolver):
   """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
 
-  def _load_tf_config(self):
-    return json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+  def __init__(self,
+               task_type=None,
+               task_index=None,
+               rpc_layer=None,
+               environment=None,
+               num_accelerators_per_worker=0):
+    """Creates a new TFConfigClusterResolver.
+
+    Args:
+      task_type: (String, optional) Overrides the task type specified in the
+        TF_CONFIG environment variable.
+      task_index: (Integer, optional) Overrides the task index specified in the
+        TF_CONFIG environment variable.
+      rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
+      environment: (String, optional) Overrides the environment TensorFlow
+        operates in.
+      num_accelerators_per_worker: (Integer, optional) Specifies the number of
+        accelerators (e.g. GPUs, TPUs, others) that each node has.
+    """
+
+    self._task_type = task_type
+    self._task_index = task_index
+    self._rpc_layer = rpc_layer
+    self._environment = environment
+    self._num_accelerators_per_worker = num_accelerators_per_worker
+
+  @property
+  def task_type(self):
+    if self._task_type is None:
+      task_info = _get_value_in_tfconfig(_TASK_KEY, {})
+      return task_info['type'] if 'type' in task_info else None
+    else:
+      return self._task_type
+
+  @property
+  def task_index(self):
+    if self._task_type is None:
+      task_info = _get_value_in_tfconfig(_TASK_KEY, {})
+      return task_info['index'] if 'index' in task_info else None
+    else:
+      return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._environment
+
+  @property
+  def rpc_layer(self):
+    if self._rpc_layer is None:
+      return _get_value_in_tfconfig(_RPC_LAYER_KEY)
+    else:
+      return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+  def num_accelerators_per_worker(self, session_config=None):
+    # TODO(frankchn): Connect to server (w/ session_config) in the future.
+    del session_config  # Unused, we do not connect to another server here.
+    return self._num_accelerators_per_worker
 
   def cluster_spec(self):
     """Returns a ClusterSpec based on the TF_CONFIG environment variable.
@@ -41,12 +126,12 @@ class TFConfigClusterResolver(ClusterResolver):
     Returns:
       A ClusterSpec with information from the TF_CONFIG environment variable.
     """
-    tf_config = self._load_tf_config()
+    tf_config = _load_tf_config()
     if 'cluster' not in tf_config:
       return ClusterSpec({})
     return ClusterSpec(tf_config['cluster'])
 
-  def master(self, task_type=None, task_index=0):
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
     """Returns the master address to use when creating a TensorFlow session.
 
     Args:
@@ -54,6 +139,8 @@ class TFConfigClusterResolver(ClusterResolver):
         master.
       task_index: (Integer, optional) Overrides and sets the task id of the
         master.
+      rpc_layer: (String, optional) Overrides and sets the protocol over which
+        TensorFlow nodes communicate with each other.
 
     Returns:
       The address of the master.
@@ -64,14 +151,9 @@ class TFConfigClusterResolver(ClusterResolver):
     """
 
     # If `session_master` is set, just use that.
-    tf_config = self._load_tf_config()
-    if _SESSION_MASTER_KEY in tf_config:
-      return tf_config[_SESSION_MASTER_KEY]
-
-    if 'rpc_layer' in tf_config:
-      rpclayer = '%s://' % tf_config['rpc_layer']
-    else:
-      rpclayer = ''
+    session_master = _get_value_in_tfconfig(_SESSION_MASTER_KEY)
+    if session_master is not None:
+      return session_master
 
     # Return an empty string if we are the only job in the ClusterSpec.
     cluster_spec = self.cluster_spec()
@@ -82,11 +164,8 @@ class TFConfigClusterResolver(ClusterResolver):
 
     # We try to auto-detect the task type and id, but uses the user-supplied one
     # where available
-    if not task_type:
-      if 'task' not in tf_config:
-        raise RuntimeError('You must either specify a `task_type`, or your '
-                           'TF_CONFIG must contain a `task` section.')
-      task_type = tf_config['task']['type']
-      task_index = tf_config['task']['index']
-
-    return rpclayer + cluster_spec.task_address(task_type, task_index)
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+
+    return format_master_url(cluster_spec.task_address(task_type, task_index),
+                             self.rpc_layer)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py
index 468161d2aa49129f2ec960b1ccddf49c712f00a7..3db6d5447f5abab6936a2ab4b4a149715ec01394 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py
@@ -133,6 +133,58 @@ class TFConfigClusterResolverTest(test.TestCase):
     cluster_resolver = TFConfigClusterResolver()
     self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
 
+  def testTaskTypeIndexRpcRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('ps', cluster_resolver.task_type)
+    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual('grpc', cluster_resolver.rpc_layer)
+
+  def testParameterOverrides(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 1
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0,
+                                               num_accelerators_per_worker=8)
+
+    self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
+    self.assertEqual('ps', cluster_resolver.task_type)
+    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(8, cluster_resolver.num_accelerators_per_worker())
+
+    cluster_resolver.task_type = 'worker'
+    cluster_resolver.task_index = 1
+    cluster_resolver.rpc_layer = 'test'
+
+    self.assertEqual('test://worker1:2222', cluster_resolver.master())
+    self.assertEqual('worker', cluster_resolver.task_type)
+    self.assertEqual(1, cluster_resolver.task_index)
+    self.assertEqual('test', cluster_resolver.rpc_layer)
+
   def testZeroItemsInClusterSpecMasterRead(self):
     os.environ['TF_CONFIG'] = """
     {}
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index c4ac9d0700194da558820aabc28bf1c0857591e2..d5537a4100ddad19d2a9131b971f3d604d58f8f2 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -24,6 +24,7 @@ from six.moves.urllib.request import Request
 from six.moves.urllib.request import urlopen
 
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
+from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import format_master_url
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
@@ -50,6 +51,34 @@ class TPUClusterResolver(ClusterResolver):
   Cloud Platform project.
   """
 
+  def _tpuService(self):
+    """Creates a new Cloud TPU API object.
+
+    This works around an issue where the underlying HTTP connection sometimes
+    times out when the script has been running for too long. Other methods in
+    this object calls this method to get a new API object whenever they need
+    to communicate with the Cloud API.
+
+    Returns:
+      A Google Cloud TPU API object.
+    """
+    if self._service:
+      return self._service
+
+    credentials = self._credentials
+    if credentials is None or credentials == 'default':
+      credentials = GoogleCredentials.get_application_default()
+
+    if self._discovery_url:
+      return discovery.build(
+          'tpu', 'v1alpha1',
+          credentials=credentials,
+          discoveryServiceUrl=self._discovery_url)
+    else:
+      return discovery.build(
+          'tpu', 'v1alpha1',
+          credentials=credentials)
+
   def _requestComputeMetadata(self, path):
     req = Request('http://metadata/computeMetadata/v1/%s' % path,
                   headers={'Metadata-Flavor': 'Google'})
@@ -57,6 +86,8 @@ class TPUClusterResolver(ClusterResolver):
     return compat.as_bytes(resp.read())
 
   def _shouldResolve(self):
+    if isinstance(self._should_resolve_override, bool):
+      return self._should_resolve_override
     if (self._tpu == compat.as_bytes('') or
         self._tpu == compat.as_bytes('local') or
         self._tpu.startswith(compat.as_bytes('/bns')) or
@@ -81,7 +112,7 @@ class TPUClusterResolver(ClusterResolver):
     return None
 
   @staticmethod
-  def _discoveryUrl():
+  def _environmentDiscoveryUrl():
     return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
 
   def __init__(self,
@@ -153,55 +184,80 @@ class TPUClusterResolver(ClusterResolver):
       raise ValueError('Please provide a TPU Name to connect to.')
 
     self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
-    self._job_name = job_name
-    self._credentials = credentials
 
+    # By default the task_type is 'worker` and the task_index is 0 (which is the
+    # first worker in the task).
+    self.task_type = job_name
+    self.task_index = 0
+
+    if tpu.startswith('grpc://'):
+      # Cloud environment, where we are using GRPC to communicate to TPUs.
+      self._environment = ''
+    elif tpu == 'local' or not tpu:
+      # Google environment, where the TPU is attached to the host.
+      self._environment = 'google'
+    elif tpu.startswith('/bns'):
+      # Google environment, where we reach the TPU through BNS.
+      self._environment = 'google'
+
+    # If TPU is in the Google environment or exists locally, we don't use any
+    # RPC layer.
+    if tpu.startswith('/bns') or tpu == 'local' or not tpu:
+      self.rpc_layer = None
+    else:
+      self.rpc_layer = 'grpc'
+
+    # Setting this overrides the return value of self._shouldResolve()
+    self._should_resolve_override = None
+
+    # We strip out the protocol if it is included, and override the
+    # shouldResolve function to never resolve. We are adding the protocol back
+    # in later in self.master().
+    if self.rpc_layer is not None and tpu.startswith(self.rpc_layer + '://'):
+      tpu = tpu[len(self.rpc_layer + '://'):]
+      self._tpu = tpu
+      self._should_resolve_override = False
+
+    # Whether we should actually attempt to contact Cloud APIs
     should_resolve = self._shouldResolve()
 
+    # We error out if we are in a non-Cloud environment which cannot talk to the
+    # Cloud APIs using the standard class and a special object is not passed in.
+    self._service = service
+    if (self._service is None and should_resolve and
+        not _GOOGLE_API_CLIENT_INSTALLED):
+      raise ImportError('googleapiclient and oauth2client must be installed '
+                        'before using the TPU cluster resolver. Execute: '
+                        '`pip install --upgrade google-api-python-client` '
+                        'and `pip install --upgrade oauth2client` to '
+                        'install with pip.')
+
+    # We save user-passed credentials, unless the user didn't pass in anything.
+    self._credentials = credentials
+    if (credentials == 'default' and should_resolve and
+        _GOOGLE_API_CLIENT_INSTALLED):
+      self._credentials = None
+
+    # Automatically detect project and zone if unspecified.
     if not project and should_resolve:
       project = compat.as_str(
           self._requestComputeMetadata('project/project-id'))
-
     if not zone and should_resolve:
       zone_path = compat.as_str(self._requestComputeMetadata('instance/zone'))
       zone = zone_path.split('/')[-1]
-
     self._project = project
     self._zone = zone
 
-    if credentials == 'default' and should_resolve:
-      if _GOOGLE_API_CLIENT_INSTALLED:
-        self._credentials = GoogleCredentials.get_application_default()
-
-    if service is None and should_resolve:
-      if not _GOOGLE_API_CLIENT_INSTALLED:
-        raise ImportError('googleapiclient and oauth2client must be installed '
-                          'before using the TPU cluster resolver. Execute: '
-                          '`pip install --upgrade google-api-python-client` '
-                          'and `pip install --upgrade oauth2client` to '
-                          'install with pip.')
-
-      final_discovery_url = self._discoveryUrl() or discovery_url
-      if final_discovery_url:
-        self._service = discovery.build(
-            'tpu', 'v1alpha1',
-            credentials=self._credentials,
-            discoveryServiceUrl=final_discovery_url)
-      else:
-        self._service = discovery.build(
-            'tpu', 'v1alpha1',
-            credentials=self._credentials)
-    else:
-      self._service = service
+    self._discovery_url = self._environmentDiscoveryUrl() or discovery_url
 
     self._coordinator_name = coordinator_name
-    if coordinator_name and not coordinator_address and (should_resolve or
-                                                         in_gke):
+    if (coordinator_name and not coordinator_address and
+        (should_resolve or in_gke)):
       self._start_local_server()
     else:
       self._coordinator_address = coordinator_address
 
-  def master(self, task_type=None, task_index=None):
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
     """Get the Master string to be used for the session.
 
     In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
@@ -213,8 +269,12 @@ class TPUClusterResolver(ClusterResolver):
     'grpc://10.240.1.2:8470' will be returned).
 
     Args:
-      task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_type: (Optional, string) The type of the TensorFlow task of the
+        master.
+      task_index: (Optional, integer) The index of the TensorFlow task of the
+        master.
+      rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to
+        communicate with TPUs.
 
     Returns:
       string, the connection string to use when creating a session.
@@ -222,25 +282,34 @@ class TPUClusterResolver(ClusterResolver):
     Raises:
       ValueError: If none of the TPUs specified exists.
     """
-    if not self._shouldResolve():
-      return self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
-
-    cluster_spec = self.cluster_spec()
-    if task_type and task_index:
-      return cluster_spec.task_address(task_type, task_index)
-
-    job_tasks = cluster_spec.job_tasks(self._job_name)
-    if not job_tasks:
-      raise ValueError('No TPUs exists with the specified names exist.')
-
-    return 'grpc://' + job_tasks[0]
+    if self._shouldResolve():
+      # We are going to communicate with the Cloud TPU APIs to get a Cluster.
+      cluster_spec = self.cluster_spec()
+      if task_type is not None and task_index is not None:
+        # task_type and task_index is from the function parameter
+        master = cluster_spec.task_address(task_type, task_index)
+      elif self.task_type is not None and self.task_index is not None:
+        # task_type and task_index is from the object
+        master = cluster_spec.task_address(self.task_type, self.task_index)
+      else:
+        # by default we take the first item in the cluster with the right name
+        job_tasks = cluster_spec.job_tasks(self.task_type)
+        if not job_tasks:
+          raise ValueError('No TPUs with the specified names exist.')
+        master = job_tasks[0]
+    else:
+      if isinstance(self._tpu, (bytes, bytearray)):
+        master = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
+      else:
+        master = self._tpu.split(_ENDPOINTS_SEPARATOR)[0]
+    return format_master_url(master, rpc_layer or self.rpc_layer)
 
   def get_master(self):
     return self.master()
 
   def get_job_name(self):
     if self._shouldResolve():
-      return self._job_name
+      return self.task_type
 
   def cluster_spec(self):
     """Returns a ClusterSpec object based on the latest TPU information.
@@ -270,7 +339,8 @@ class TPUClusterResolver(ClusterResolver):
       # Case 1.
       full_name = 'projects/%s/locations/%s/nodes/%s' % (
           self._project, self._zone, compat.as_text(self._tpu))
-      request = self._service.projects().locations().nodes().get(name=full_name)
+      service = self._tpuService()
+      request = service.projects().locations().nodes().get(name=full_name)
       response = request.execute()
 
       if 'state' in response and response['state'] != 'READY':
@@ -291,18 +361,23 @@ class TPUClusterResolver(ClusterResolver):
         instance_url = '%s:%s' % (response['ipAddress'], response['port'])
         worker_list = [instance_url]
 
-      cluster_spec = {self._job_name: worker_list}
+      cluster_spec = {self.task_type: worker_list}
     else:
-      if not self._tpu.startswith(compat.as_bytes('grpc://')):
+      if self.rpc_layer is None:
         # Case 3.
         return None
       # Case 2.
-      cluster_spec = {
-          self._job_name: [
-              x[len(compat.as_bytes('grpc://')):]
-              for x in self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))
-          ]
-      }
+      tpus = []
+      for tpu in self._tpu.split(_ENDPOINTS_SEPARATOR):
+        # We are working around the fact that GKE environment variable that is
+        # supplied to us has the protocol string embedded in it, but we want
+        # to strip it out for the ClusterSpec.
+        if (self.rpc_layer is not None and
+            tpu.startswith(self.rpc_layer + '://')):
+          tpus.append(tpu[len(self.rpc_layer + '://'):])
+        else:
+          tpus.append(tpu)
+      cluster_spec = {self.task_type: tpus}
 
     if self._coordinator_address:
       # {1, 2}.a
@@ -310,6 +385,24 @@ class TPUClusterResolver(ClusterResolver):
 
     return server_lib.ClusterSpec(cluster_spec)
 
+  def num_accelerators_per_worker(self, session_config=None):
+    """Returns the number of TPU cores per worker.
+
+    This defaults to 8 for all current TPU configurations, and we do not need
+    to query any remote systems for this.
+
+    Args:
+      session_config: Unused. Not currently necessary to query anything as this
+        number is 8 for all TPU configurations.
+    """
+    del session_config  # Unused. Not necessary to query anything.
+    return 8
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in."""
+    return self._environment
+
   def _start_local_server(self):
     address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
     self._server = server_lib.Server(
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index ad4f6432630be44a7de6e778f55f1fb7fd66f307..365bd52ee254b38588b3dfb20d64f7839e720df4 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -132,6 +132,7 @@ class TPUClusterResolverTest(test.TestCase):
     }
     """ % tpu_cluster_resolver._coordinator_port
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
 
   @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
                      mock_request_compute_metadata)
@@ -157,6 +158,7 @@ class TPUClusterResolverTest(test.TestCase):
     job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
 
   @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
                      mock_request_compute_metadata)
@@ -226,6 +228,7 @@ class TPUClusterResolverTest(test.TestCase):
     job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
 
   def testNewNetworkEndpointFormat(self):
     tpu_map = {
@@ -304,6 +307,7 @@ class TPUClusterResolverTest(test.TestCase):
     }
     """ % tpu_cluster_resolver._coordinator_port
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.4:8470')
 
   def testPodResolutionNoCoordinator(self):
     tpu_map = {
@@ -350,6 +354,7 @@ class TPUClusterResolverTest(test.TestCase):
     }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.4:8470')
 
   def testGetMasterNoEntries(self):
     tpu_map = {}
@@ -459,10 +464,67 @@ class TPUClusterResolverTest(test.TestCase):
 
     del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
 
-  def testDiscoveryUrl(self):
+  def testEnvironmentDiscoveryUrl(self):
     os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
     self.assertEqual('https://{api}.internal/{apiVersion}',
-                     TPUClusterResolver._discoveryUrl())
+                     TPUClusterResolver._environmentDiscoveryUrl())
+
+  def testEnvironmentAndRpcDetectionForGoogle(self):
+    tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/ab/cd/ef')
+    self.assertEqual(tpu_cluster_resolver.environment, 'google')
+    self.assertEqual(tpu_cluster_resolver.rpc_layer, None)
+
+  def testEnvironmentAndRpcDetectionForGrpcString(self):
+    tpu_cluster_resolver = TPUClusterResolver(tpu='grpc://10.1.2.3:8470')
+    self.assertEqual(tpu_cluster_resolver.environment, '')
+    self.assertEqual(tpu_cluster_resolver.rpc_layer, 'grpc')
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+
+  def testOverrideTaskTypeAndIndexAndGetMaster(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.4:8470')
+
+    tpu_cluster_resolver.task_type = 'worker'
+    tpu_cluster_resolver.task_index = 3
+    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.7:8470')
+
+    self.assertEqual(
+        tpu_cluster_resolver.master(
+            task_type='worker', task_index=2, rpc_layer='test'),
+        'test://10.2.3.6:8470')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index fbdca497fcc3126d2086d289ebdb113370072d22..a63366e1361effe20787c197eddd66b5c0c96410 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -59,8 +59,6 @@ option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires M
 
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
-set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
-set(tensorflow_CUDNN_VERSION "7" CACHE STRING "cuDNN version to build against")
 
 if(HAIKU)
 	option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
@@ -72,25 +70,25 @@ endif()
 if (NOT WIN32)
   # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
   # for targets that link ${CMAKE_THREAD_LIBS_INIT}.
-  find_package (Threads)
+  find_package (Threads REQUIRED)
 
   # Options for linking CUDA/CUDNN libraries
-  option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
+  option(tensorflow_PATH_CUDA_LIB "Additional library search path for cudnn, nccl, culibos" /usr/local/cuda/lib64/)
   option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
   if (NOT tensorflow_CUDNN_INCLUDE)
     # option's default value is OFF. Fill it with real default values
     set(tensorflow_CUDNN_INCLUDE /usr/include)
   endif (NOT tensorflow_CUDNN_INCLUDE)
-  option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
-  if (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
+  option(tensorflow_PATH_CUDNN_LIB "Override PATH_CUDA_LIB for cudnn" ${tensorflow_PATH_CUDA_LIB})
+  if (NOT tensorflow_PATH_CUDNN_LIB)
     # option's default value is OFF. Fill it with real default values
-    set (tensorflow_PATH_CUDNN_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
-  endif (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
-  option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
-  if (NOT tensorflow_PATH_NCCL_STATIC_LIB)
+    set (tensorflow_PATH_CUDNN_LIB ${tensorflow_PATH_CUDA_LIB})
+  endif (NOT tensorflow_PATH_CUDNN_LIB)
+  option(tensorflow_PATH_NCCL_LIB "Override PATH_CUDA_LIB for nccl" ${tensorflow_PATH_CUDA_LIB})
+  if (NOT tensorflow_PATH_NCCL_LIB)
     # option's default value is OFF. Fill it with real default values
-    set (tensorflow_PATH_NCCL_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
-  endif (NOT tensorflow_PATH_NCCL_STATIC_LIB)
+    set (tensorflow_PATH_NCCL_LIB ${tensorflow_PATH_CUDA_LIB})
+  endif (NOT tensorflow_PATH_NCCL_LIB)
   option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
   if (NOT tensorflow_CUDA_LIBRARY_PATH)
     # option's default value is OFF. Fill it with real default values
@@ -210,14 +208,17 @@ endif()
 include(CheckCXXCompilerFlag)
 
 # OpenMP Support
-CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
-if (GCC_OPENMP_SUPPORT)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-endif()
-CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
-if (MSVC_OPENMP_SUPPORT)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
-endif()
+if (WIN32)
+  CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
+  if (MSVC_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+  endif()
+else (WIN32)
+  CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
+  if (GCC_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  endif()
+endif (WIN32)
 
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
@@ -377,29 +378,19 @@ if (tensorflow_ENABLE_GPU)
     list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
   endif (NOT WIN32)
 
-  # later command will make use of the value in tensorflow_CUDA_VERSION
-  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED EXACT)
-
-  # Test compatibility of compiler on CUDA
-  try_compile(CUDA_TEST_COMPILE_C
-    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
-    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.c
-    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
-  try_compile(CUDA_TEST_COMPILE_CXX
-    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
-    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.cc
-    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
-  if(NOT (CUDA_TEST_COMPILE_C AND CUDA_TEST_COMPILE_CXX))
-    message(FATAL_ERROR "Selected compiler (or version) is not supported for CUDA")
+  # minimum 9.1 in cuda version
+  find_package(CUDA 9.1 REQUIRED)
+  if(NOT CUDA_FOUND)
+    message(FATAL_ERROR "CUDA not found.")
   endif()
 
-  # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
-  # CUDA_NVCC_FLAGS and cuda_config.h below
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_37,code=\"sm_37,compute_37\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_52,code=\"sm_52,compute_52\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_60,code=\"sm_60,compute_60\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=\"sm_61,compute_61\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_70,code=\"sm_70,compute_70\")
+  # use cmake internal CUDA_ARCH_NAME switch
+  # e.g. CUDA_ARCH_NAME="Auto" will autodetect
+  #      CUDA_ARCH_NAME="All"  will use all arches
+  cuda_select_nvcc_arch_flags(NVCC_ARCH_FLAGS ${CUDA_ARCH_NAME})
+  list(APPEND CUDA_NVCC_FLAGS ${NVCC_ARCH_FLAGS})
+  message(STATUS "Using CUDA arch flags: ${NVCC_ARCH_FLAGS_readable}")
+
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true)  # Flush denormals to zero
   set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
@@ -423,43 +414,94 @@ if (tensorflow_ENABLE_GPU)
   else (WIN32)
     set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}")
 
-    find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
-    if (NOT nccl_STATIC_LIBRARY)
+    if (tensorflow_BUILD_SHARED_LIB)
+      find_library(nccl_LIBRARY NAMES libnccl.so PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    else (tensorflow_BUILD_SHARED_LIB)
+      find_library(nccl_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    endif (tensorflow_BUILD_SHARED_LIB)
+    if (NOT nccl_LIBRARY)
       message(FATAL_ERROR "NCCL is required for GPU-build")
-    else (NOT nccl_STATIC_LIBRARY)
-      message("nccl-static: ${nccl_STATIC_LIBRARY}")
+    else (NOT nccl_LIBRARY)
+      message("nccl: ${nccl_LIBRARY}")
       # something like /usr/lib64/libnccl_static.a
-    endif (NOT nccl_STATIC_LIBRARY)
-
-    find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
-    if (NOT cudnn_STATIC_LIBRARY)
+    endif (NOT nccl_LIBRARY)
+
+    if (tensorflow_BUILD_SHARED_LIB)
+      find_library(cudnn_LIBRARY NAMES libcudnn.so PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    else (tensorflow_BUILD_SHARED_LIB)
+      find_library(cudnn_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    endif (tensorflow_BUILD_SHARED_LIB)
+    if (NOT cudnn_LIBRARY)
       message(FATAL_ERROR "CUDNN is required for GPU-build")
-    else (NOT cudnn_STATIC_LIBRARY)
-      message("cudnn-static: ${cudnn_STATIC_LIBRARY}")
-    endif (NOT cudnn_STATIC_LIBRARY)
-
-    find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
-    if (NOT culibos_STATIC_LIBRARY)
+    else (NOT cudnn_LIBRARY)
+      file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+      # fetch cudnn version
+      string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+             CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}")
+      string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+             CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
+      string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+             CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}")
+      string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+             CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
+      string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+             CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}")
+      string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+             CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
+      if(NOT CUDNN_VERSION_MAJOR)
+        set(CUDNN_VERSION "???")
+      else()
+        set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
+      endif()
+      message(STATUS "cudnn library: ${cudnn_LIBRARY} (found version: \"${CUDNN_VERSION}\")")
+    endif (NOT cudnn_LIBRARY)
+
+    if (tensorflow_BUILD_SHARED_LIB)
+      # shared first (if exists) else static one
+      find_library(culibos_LIBRARY NAMES libculibos.so libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    else (tensorflow_BUILD_SHARED_LIB)
+      # only static version
+      find_library(culibos_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    endif (tensorflow_BUILD_SHARED_LIB)
+    if (NOT culibos_LIBRARY)
       message(FATAL_ERROR "CULIBOS is required for GPU-build")
-    else (NOT culibos_STATIC_LIBRARY)
-      message("culibos-static: ${culibos_STATIC_LIBRARY}")
-    endif (NOT culibos_STATIC_LIBRARY)
+    else (NOT culibos_LIBRARY)
+      message("culibos: ${culibos_LIBRARY}")
+    endif (NOT culibos_LIBRARY)
 
     set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
-      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
+      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_LIBRARY} ${culibos_LIBRARY} ${nccl_LIBRARY})
   endif (WIN32)
   include_directories(${CUDNN_INCLUDE})
 
   # Remove "." from CUDA version variable.
-  string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION})
+  string(REPLACE "." "" short_CUDA_VER ${CUDA_VERSION})
+
+  # List of enumerated CUDA caps
+  string(REPLACE " " ";" NVCC_ARCH_LIST "${NVCC_ARCH_FLAGS_readable}")
+  set(list ${NVCC_ARCH_LIST})
+
+  # Construct capability string
+  foreach(NVCC_ARCH ${NVCC_ARCH_LIST})
+    if (NVCC_ARCH MATCHES "sm_")
+      string(REGEX REPLACE "^.sm*" "" NVCC_ARCH ${NVCC_ARCH})
+      math(EXPR NVCC_ARCH_MAJOR "${NVCC_ARCH} / 10")
+      math(EXPR NVCC_ARCH_MINOR "(${NVCC_ARCH} - (${NVCC_ARCH_MAJOR}*10))")
+      if (TF_CUDA_CAP)
+        set(TF_CUDA_CAP "${TF_CUDA_CAP},CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")")
+      else (TF_CUDA_CAP)
+        set(TF_CUDA_CAP "CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")")
+      endif (TF_CUDA_CAP)
+    endif()
+  endforeach()
 
   # create cuda_config.h
   FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
     "#ifndef CUDA_CUDA_CONFIG_H_\n"
     "#define CUDA_CUDA_CONFIG_H_\n"
-    "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.7\"),CudaVersion(\"5.2\"),CudaVersion(\"6.0\"),CudaVersion(\"6.1\"),CudaVersion(\"7.0\")\n"
+    "#define TF_CUDA_CAPABILITIES ${TF_CUDA_CAP}\n"
     "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
-    "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
+    "#define TF_CUDNN_VERSION \"64_${CUDNN_VERSION}\"\n"
     "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
     "#endif  // CUDA_CUDA_CONFIG_H_\n"
   )
@@ -494,14 +536,14 @@ if (tensorflow_ENABLE_GPU)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
       msvcp_dll_name=msvcp140.dll
       cudart_dll_name=cudart64_${short_CUDA_VER}.dll
-      cuda_version_number=${tensorflow_CUDA_VERSION}
+      cuda_version_number=${CUDA_VERSION}
       nvcuda_dll_name=nvcuda.dll
       cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll
       cudnn_version_number=${tensorflow_CUDNN_VERSION})
   else(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
-	    cuda_version_number=${tensorflow_CUDA_VERSION}
-	    cudnn_version_number=${tensorflow_CUDNN_VERSION})
+      cuda_version_number=${CUDA_VERSION}
+      cudnn_version_number=${tensorflow_CUDNN_VERSION})
   endif(WIN32)
 else(tensorflow_ENABLE_GPU)
   set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
index c6c5021f60b38ed05a19f3e439c9810251841f76..4546dbdecc0dbc36f17cc727345e0762718b5165 100644
--- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -20,6 +20,7 @@ if (systemlib_ABSEIL_CPP)
                absl_dynamic_annotations
                absl_malloc_internal
                absl_throw_delegate
+               absl_int128
                absl_strings
                str_format_internal
                absl_bad_optional_access)
@@ -50,6 +51,7 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_malloc_internal.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_throw_delegate.lib
+          ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib
           ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib)
@@ -60,6 +62,7 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/base/absl_dynamic_annotations.lib
           ${abseil_cpp_BUILD}/absl/base/absl_malloc_internal.lib
           ${abseil_cpp_BUILD}/absl/base/absl_throw_delegate.lib
+          ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib
           ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib)
@@ -71,6 +74,7 @@ else (systemlib_ABSEIL_CPP)
         ${abseil_cpp_BUILD}/absl/base/libabsl_dynamic_annotations.a
         ${abseil_cpp_BUILD}/absl/base/libabsl_malloc_internal.a
         ${abseil_cpp_BUILD}/absl/base/libabsl_throw_delegate.a
+        ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a
         ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a
         ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a
         ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index d94b703700cfcd9ecae7f1d2718ba33ffd82c176..96160568fa79291a7b391761373e1eaf0f70974e 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -57,6 +57,7 @@ tensorflow/python/ops
 tensorflow/python/ops/distributions
 tensorflow/python/ops/linalg
 tensorflow/python/ops/losses
+tensorflow/python/ops/signal
 tensorflow/python/platform
 tensorflow/python/profiler
 tensorflow/python/profiler/internal
@@ -377,8 +378,6 @@ tensorflow/contrib/seq2seq/python/ops
 tensorflow/contrib/session_bundle
 tensorflow/contrib/session_bundle/example
 tensorflow/contrib/signal
-tensorflow/contrib/signal/python
-tensorflow/contrib/signal/python/ops
 tensorflow/contrib/slim
 tensorflow/contrib/slim/python
 tensorflow/contrib/slim/python/slim
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 88b4a6165c0f9171ec7cc169bc099c7db1549ee7..d66e39ac07c7b7c9423fa7e878a9cefd94b867bd 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -68,14 +68,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/csv_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/unique_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index ef337b3a15c4f58fe183af78d34376b3ed27099a..9cfa8b90749280b6aa815cc210941c75bd5e16c5 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -89,7 +89,6 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index ef487d3509bf3c9bfaf0b117998e6b121543c1c6..df7b854afcca1a0bed660624152f465d4bf3b25f 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -373,8 +373,6 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_coder_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/factorization/python/ops/gen_clustering_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_factorization_ops"
diff --git a/tensorflow/contrib/compiler/xla_test.py b/tensorflow/contrib/compiler/xla_test.py
index 8d13dc7316a693657f1b6e102830808d35372fe9..3b49755afcf0753d31c0ce506dce42709b1ee8bc 100644
--- a/tensorflow/contrib/compiler/xla_test.py
+++ b/tensorflow/contrib/compiler/xla_test.py
@@ -28,7 +28,6 @@ from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
@@ -49,7 +48,7 @@ class XLACompileContextTest(test.TestCase):
     histogram_summary = summary.histogram('histogram_summary', dummy_tensor)
     image_summary = summary.image('image_summary', dummy_tensor)
     scalar_summary = summary.scalar('scalar_summary', dummy_tensor)
-    tensor_summary = summary_ops.tensor_summary('tensor_summary', dummy_tensor)
+    tensor_summary = summary.tensor_summary('tensor_summary', dummy_tensor)
     summary.merge(
         [
             audio_summary, histogram_summary, image_summary, scalar_summary,
diff --git a/tensorflow/contrib/copy_graph/python/__init__.py b/tensorflow/contrib/copy_graph/python/__init__.py
index b9ff28eb0d7115ff5919c2f758f70ba388f5d4d2..5c1048e02a3104c958f7710ba97980d3353adbad 100644
--- a/tensorflow/contrib/copy_graph/python/__init__.py
+++ b/tensorflow/contrib/copy_graph/python/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/copy_graph/python/util/__init__.py b/tensorflow/contrib/copy_graph/python/util/__init__.py
index b9ff28eb0d7115ff5919c2f758f70ba388f5d4d2..5c1048e02a3104c958f7710ba97980d3353adbad 100644
--- a/tensorflow/contrib/copy_graph/python/util/__init__.py
+++ b/tensorflow/contrib/copy_graph/python/util/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index fe5e34d258fbc1508a0a85655f29c2c9bc8fa8b1..d53549048f33162ec89dfe957ca58a4bbb4e95c6 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Linear-chain CRF layer.
 
-See the [CRF](https://tensorflow.org/api_guides/python/contrib.crf) guide.
-
 @@crf_binary_score
 @@crf_decode
 @@crf_log_likelihood
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index 57ffaa87e45559a6ecf4c8059e5a6cdee8b8b664..8d35622e393e15a2f2dfea7c75ad2c9f48aa7150 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -42,10 +42,11 @@ tf_custom_op_py_library(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
     additional_deps = [
         ":cudnn_rnn_py",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python/ops/losses:losses",
@@ -61,10 +62,10 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-    shard_count = 6,
+    shard_count = 2,
     tags = [
-        "no_oss",  # b/117989214
         "noasan",  # http://b/62067814
+        "requires-gpu-sm35",
     ],
 )
 
diff --git a/tensorflow/contrib/cudnn_rnn/__init__.py b/tensorflow/contrib/cudnn_rnn/__init__.py
index 5d8c6191f8db9f96532aa78e4790a4665d3b4877..5320232268657fa73bcd3e86da49d6525e9b8db5 100644
--- a/tensorflow/contrib/cudnn_rnn/__init__.py
+++ b/tensorflow/contrib/cudnn_rnn/__init__.py
@@ -24,6 +24,10 @@
 @@CudnnGRUSaveable
 @@CudnnRNNReluSaveable
 @@CudnnRNNTanhSaveable
+@@CudnnParamsFormatConverterLSTM
+@@CudnnParamsFormatConverterGRU
+@@CudnnParamsFormatConverterTanh
+@@CudnnParamsFormatConverterRelu
 """
 
 from __future__ import absolute_import
@@ -48,6 +52,10 @@ _allowed_symbols = [
     "CudnnGRUSaveable",
     "CudnnRNNReluSaveable",
     "CudnnRNNTanhSaveable",
+    "CudnnParamsFormatConverterLSTM",
+    "CudnnParamsFormatConverterGRU",
+    "CudnnParamsFormatConverterTanh",
+    "CudnnParamsFormatConverterRelu",
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index c59d3682d404e032d9f4bf81ef54ab456341cefa..1e2c9121d63267692ee80f14299392e19ab95a88 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -18,24 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import itertools
 import os
 import unittest
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
@@ -56,710 +62,991 @@ CUDNN_RNN_TANH_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_TANH_PARAMS_PER_LAYER
 CUDNN_RNN_RELU_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
-def _CreateModel(rnn_mode,
-                 num_layers,
-                 num_units,
-                 input_size,
-                 input_mode="linear_input",
-                 direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-                 dtype=dtypes.float32,
-                 dropout=0.):
-  del input_mode
-  if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM:
-    model_fn = cudnn_rnn_ops.CudnnLSTM
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU:
-    model_fn = cudnn_rnn_ops.CudnnGRU
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH:
-    model_fn = cudnn_rnn_ops.CudnnRNNTanh
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU:
-    model_fn = cudnn_rnn_ops.CudnnRNNRelu
+def RunLSTM(sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers=1,
+            is_training=True,
+            dropout=0.,
+            num_dirs=True,
+            dtype=dtypes.float32):
+  # TODO(jamesqin): add multi-layer tests.
+  # TODO(jamesqin): add multi-dir tests
+  assert num_layers == 1
+  assert num_dirs == 1
+  if is_training and not np.isclose(dropout, 0):
+    raise ValueError("dropout can not be 0. when test training.")
+
+  # set graph level random seed and numpy random seed.
+  random_seed.set_random_seed(0)
+  np.random.seed(0)
+
+  inputs = variable_scope.get_variable(
+      "inputs",
+      initializer=np.random.rand(time, batch_size,
+                                 input_size).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_h_op = variable_scope.get_variable(
+      "initial_h_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_c_op = variable_scope.get_variable(
+      "initial_c_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+
+  initializer = init_ops.random_uniform_initializer(
+      -0.01, 0.01, dtype=dtype, seed=19980904)
+
+  with variable_scope.variable_scope("test", initializer=initializer):
+    w = variable_scope.get_variable(
+        "rnn/lstm_cell/kernel",
+        shape=[input_size + num_units, num_units * 4],
+        dtype=dtype)
+    b = variable_scope.get_variable(
+        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)
+
+    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
+    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
+    outputs_op, state_tuple_op = rnn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=rnn_cell_impl.LSTMStateTuple(
+            h=initial_h_op, c=initial_c_op),
+        dtype=dtype,
+        time_major=True,
+        scope=None)
+
+  # Convert to cudnn opaque param.
+  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
+      num_layers, num_units, input_size)
+  opaque_params = format_converter.tf_canonical_to_opaque([w, b])
+
+  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0)
+  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
+      inputs,
+      cu_initial_h_op,
+      cu_initial_c_op,
+      opaque_params,
+      dropout=dropout,
+      is_training=is_training,
+      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
+  # Remove the trivial 1st dimension.
+  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
+      c=array_ops.squeeze(cu_c_op, axis=0),
+      h=array_ops.squeeze(cu_h_op, axis=0))
+
+  if is_training:
+    (inp_grad_op, hgrad_op,
+     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
+         outputs_op, [inputs, initial_h_op, initial_c_op, w, b])
+
+    (cu_inp_grad_op, cu_hgrad_op,
+     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
+         cu_outputs_op,
+         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
+    # Remove the trivial 1st dimension
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+    # Remove the trivial 1st dimension
+    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0)
+
+    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
+        opaque_grad_op)
+    cu_wgrad_op = cu_wgrad_op[0]
+    cu_bgrad_op = cu_bgrad_op[0]
+    # cudnn lstm has 2 biases each gate. When converting to tf canonical format,
+    # the two biases are summed into one. Thus here bias gradient should be
+    # halved when comparing with tf lstm.
+    cu_bgrad_op *= 0.5
+
+  init_op = variables.global_variables_initializer()
+  sess.run(init_op)
+
+  if is_training:
+    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
+        outputs_op, state_tuple_op, inp_grad_op,
+        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
+    ])
+    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
+     cu_bgrad) = sess.run([
+         cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
+         (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
+     ])
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
+    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
+    logging.vlog(1, "inp_grad: %s" % inp_grad)
+    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
+    logging.vlog(1, "state_grad: %s" % str(state_grad))
+    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
+    logging.vlog(1, "wgrad: %s" % str(wgrad))
+    logging.vlog(1, "bgrad: %s" % str(bgrad))
+    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
+    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
+    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
+            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
+            cu_bgrad)
   else:
-    raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
-  return model_fn(
-      num_layers,
-      num_units,
-      input_size,
-      direction=direction,
-      dtype=dtype,
-      dropout=dropout)
-
-
-def _CreateParamsSavable(params,
-                         model,
-                         base_variable_scope=None,
-                         name="params_canonical"):
-  """Create a RNNParamsSaveable for the weight and bias parameters.
+    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
+    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op])
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
+    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
+  return outputs, cu_outputs, state_tuple, cu_state_tuple
+
+
+# Basic set of RNN configs to test. They can be further extended in relevant
+# test (e.g. adding num_dirs).
+NAMED_RNN_TESTCASES = ({
+    "testcase_name": "xsmall",
+    "num_units": 1,
+    "input_size": 1,
+    "batch_size": 1,
+    "time": 1,
+    "num_layers": 1,
+}, {
+    "testcase_name": "small",
+    "num_units": 4,
+    "input_size": 4,
+    "batch_size": 4,
+    "time": 4,
+    "num_layers": 1,
+}, {
+    "testcase_name": "medium",
+    "num_units": 128,
+    "input_size": 64,
+    "batch_size": 8,
+    "time": 16,
+    "num_layers": 1,
+}, {
+    "testcase_name": "large",
+    "num_units": 128,
+    "input_size": 128,
+    "batch_size": 16,
+    "time": 32,
+    "num_layers": 1,
+})
+
+
+def ExpandNamedTestCases(inputs, *remove_keys, **extra_configs):
+  """Expands testcase with new config dimensions.
+
+  Example:
+    inputs = (
+      {'testcase_name': 'test1', 'gender': 'male'}
+      {'testcase_name': 'test2', 'gender': 'female'}
+    )
+    remove_keys:  empty
+    extra_configs = {
+      'age': [40, 80]
+      'height': [5, 6]
+    }
+
+    Returns:
+      (
+        {'testcase_name': 'test1_age_40_height_5','gender': 'male', 'age':
+        40,'height': 5}
+        {'testcase_name': 'test1_age_40_height_6', 'gender': 'male', 'age': 40,
+        'height': 6}
+        {'testcase_name': 'test1_age_80_height_5', 'gender': 'male', 'age': 80,
+        'height': 5}
+        {'testcase_name': 'test1_age_80_height_6', 'gender': 'male', 'age': 80,
+        'height': 6}
+
+        {'testcase_name': 'test2_age_40_height_5', 'gender': 'female', 'age':
+        40,
+        'height': 5}
+        {'testcase_name': 'test2_age_40_height_6', 'gender': 'female', 'age':
+        40,
+        'height': 6}
+        {'testcase_name': 'test2_age_80_height_5', 'gender': 'female', 'age':
+        80,
+        'height': 5}
+        {'testcase_name': 'test2_age_80_height_6', 'gender': 'female', 'age':
+        80,
+        'height': 6}
+      )
 
   Args:
-    params: a Variable for weight and bias parameters.
-    model: a CudnnRNN model.
-    base_variable_scope: a string, prefix of names of saved variables.
-    name: a string, name of the RNNParamsSaveable object.
+    inputs: A list of dictionary, each being a testcase.
+    *remove_keys: A list of keys into testcase which are not needed in new
+      testcases.
+    **extra_configs: A dict of new test dimension and applicable values in that
+      dimension.
+
   Returns:
-    a RNNParamsSaveable object.
+    A list of dictionary with expanded test cases.
   """
-  if model._rnn_mode == CUDNN_LSTM:
-    fn = cudnn_rnn_ops.CudnnLSTMSaveable
-  elif model._rnn_mode == CUDNN_GRU:
-    fn = cudnn_rnn_ops.CudnnGRUSaveable
-  elif model._rnn_mode == CUDNN_RNN_TANH:
-    fn = cudnn_rnn_ops.CudnnRNNTanhSaveable
-  elif model._rnn_mode == CUDNN_RNN_RELU:
-    fn = cudnn_rnn_ops.CudnnRNNReluSaveable
-  params_saveable = fn(
-      params,
-      model.num_layers,
-      model.num_units,
-      model.input_size,
-      model.input_mode,
-      model.direction,
-      scope=base_variable_scope,
-      name=name)
-  ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
-  return params_saveable
-
-
-def _MinLSTMParamSize(num_layers,
-                      num_units,
-                      input_size,
-                      direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION):
-  if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION:
-    first_layer_weights = 4 * num_units * (num_units + input_size)
-    higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units
-    all_biases = 8 * num_layers * num_units
-    return first_layer_weights + higher_layer_weights + all_biases
-  elif direction == cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION:
-    first_layer_weights = 4 * num_units * (num_units + input_size)
-    higher_layer_weights = (num_layers - 1) * (
-        4 * 2 * num_units * num_units + 4 * num_units**2)
-    all_biases = 8 * num_layers * num_units
-    return 2 * (first_layer_weights + higher_layer_weights + all_biases)
-  else:
-    raise ValueError("%s direction is not supported.")
+  res = []
+  ordered_extra_configs = collections.OrderedDict(extra_configs)
+  keys = ordered_extra_configs.keys()
+  # A list of list of configs.
+  # The outer loop is iterating keys, the innner is values of one key.
+  combined_kv = [[(k, v) for v in ordered_extra_configs[k]] for k in keys]
+  logging.info("combined_kv: %s", combined_kv)
 
+  for inp in inputs:
+    # Each inp is a dict
+    for config in itertools.product(*combined_kv):
+      new_inp = dict(inp)
+      # config is a list in the form of [(k_i, v_j), (k_p, v_q), ...]
+      suffix = ["%s_%s" % (p[0], str(p[1])) for p in config]
+      suffix = "_".join(suffix)
+      new_inp["testcase_name"] += "_" + suffix
+      for k, v in config:
+        new_inp[k] = v
+      # Remove not used keys from the new test case.
+      if remove_keys:
+        if not isinstance(remove_keys, (list, tuple)):
+          remove_keys = [remove_keys]
+        for k in remove_keys:
+          new_inp.pop(k, None)
+      logging.info("new_inp: %s", new_inp)
+      res.append(new_inp)
+  # Dedup, necessary if `remove_keys` is set.
+  return [dict(t) for t in {tuple(d.items()) for d in res}]
 
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
 
-  def _CompareWeights(self, lhs, rhs):
-    self.assertEqual(len(lhs), len(rhs))
-    for lw, rw in zip(lhs, rhs):
-      self.assertAllEqual(lw, rw)
+class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
 
-  def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction):
-    self.assertEqual(len(lhs), len(rhs))
-    if rnn_mode == CUDNN_LSTM:
-      num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
-    elif rnn_mode == CUDNN_GRU:
-      num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
-    elif rnn_mode == CUDNN_RNN_TANH:
-      num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
-    else:
-      num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
-    num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2
-    num_params_per_layer *= num_dirs
-    self.assertEqual(num_params_per_layer * num_layers, len(lhs))
-
-    for i in range(num_layers):
-      layer_lhs = lhs[i * num_params_per_layer: (i+1) * num_params_per_layer]
-      layer_rhs = rhs[i * num_params_per_layer: (i+1) * num_params_per_layer]
-      if direction == CUDNN_RNN_UNIDIRECTION:
-        self._CompareSingleLayerBiases(layer_lhs, layer_rhs)
-      else:
-        size = len(layer_lhs)
-        fw_lhs, bw_lhs = layer_lhs[:size//2], layer_lhs[size//2:]
-        fw_rhs, bw_rhs = layer_rhs[:size//2], layer_rhs[size//2:]
-        self._CompareSingleLayerBiases(fw_lhs, fw_rhs)
-        self._CompareSingleLayerBiases(bw_lhs, bw_rhs)
-
-  def _CompareSingleLayerBiases(self, lhs, rhs):
-    self.assertEqual(len(lhs), len(rhs))
-
-    lf_lhs, rt_lhs = lhs[:len(lhs)//2], lhs[len(lhs)//2:]
-    lf_rhs, rt_rhs = rhs[:len(rhs)//2], rhs[len(rhs)//2:]
-    self.assertEqual(len(lf_lhs), len(rt_lhs))
-    self.assertEqual(len(lf_rhs), len(rt_rhs))
-
-    sum_lhs, sum_rhs = [], []
-    for lf, rt in zip(lf_lhs, rt_lhs):
-      sum_lhs.append(lf + rt)
-    for lf, rt in zip(lf_rhs, rt_rhs):
-      sum_rhs.append(lf + rt)
-    self.assertEqual(len(sum_lhs), len(sum_rhs))
-    for lf, rt in zip(sum_lhs, sum_rhs):
-      self.assertAllEqual(lf, rt)
+  def _test_training_helper(self,
+                            num_units,
+                            input_size,
+                            batch_size,
+                            time,
+                            num_layers,
+                            dtype,
+                            rtol=2e-6,
+                            atol=2e-6):
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad,
+       state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM(
+           sess, num_units, input_size, batch_size, time, num_layers)
 
-  def _testSaveRestoreVariable(self, rnn_mode, direction, dtype):
-    num_layers = 2
-    num_units = 7
-    input_size = 3
-    with ops.Graph().as_default():
-      model = _CreateModel(
-          rnn_mode,
-          num_layers=num_layers,
-          num_units=num_units,
-          input_size=input_size,
-          direction=direction,
-          dtype=dtype)
-      random_seed.set_random_seed(1234)
-      params_size_t = model.params_size()
-      params = variables.Variable(
-          random_ops.random_uniform([params_size_t], dtype=dtype),
-          dtype=dtype,
-          validate_shape=False)
-      saveable = _CreateParamsSavable(params, model)
-      weights, biases = saveable._OpaqueParamsToCanonical()
-      reset_params = state_ops.assign(
-          params,
-          array_ops.zeros([params_size_t], dtype=dtype),
-          validate_shape=False)
-      save_path = os.path.join(self.get_temp_dir(),
-                               "save-restore-variable-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      for s, cu_s in zip(state_tuple, cu_state_tuple):
+        self.assertAllClose(s, cu_s, rtol=rtol, atol=atol)
+      for sg, cu_sg in zip(state_grad, cu_state_grad):
+        self.assertAllClose(sg, cu_sg, rtol=rtol, atol=atol)
+      self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol)
+      self.assertAllClose(bgrad, cu_bgrad, rtol=rtol, atol=atol)
+      self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol)
 
-        weights_v, biases_v = sess.run([weights, biases])
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(num_units, input_size, batch_size, time,
+                               num_layers, dtypes.float32)
 
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        weights_v_restored, biases_v_restored = sess.run([weights, biases])
-
-        self._CompareWeights(weights_v, weights_v_restored)
-        self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers,
-                            direction)
-
-  def _testSaveRestoreTwoVariables(self, rnn_mode, direction, dtype):
-    num_layers = 2
-    num_units = 7
-    input_size = 3
-    with ops.Graph().as_default():
-      model = _CreateModel(
-          rnn_mode,
-          num_layers=num_layers,
-          num_units=num_units,
-          input_size=input_size,
-          direction=direction,
-          dtype=dtype)
-      random_seed.set_random_seed(1234)
-      params_size_t = model.params_size()
-      names = ["rnn_1", "rnn_2"]
-      param_vars = [
-          variables.Variable(
-              random_ops.random_uniform([params_size_t], dtype=dtype),
-              dtype=dtype,
-              validate_shape=False) for name in names
-      ]
-      saveables = []
-      for name, params in zip(names, param_vars):
-        saveables.append(_CreateParamsSavable(params, model, name, name))
-      weights1, biases1 = saveables[0]._OpaqueParamsToCanonical()
-      weights2, biases2 = saveables[1]._OpaqueParamsToCanonical()
-      reset_params = [
-          state_ops.assign(
-              params,
-              array_ops.zeros([params_size_t], dtype=dtype),
-              validate_shape=False) for params in param_vars
-      ]
-      save_path = os.path.join(self.get_temp_dir(),
-                               "save-restore-variable-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(use_gpu=True,
-                             graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
-        weights1_v, biases1_v = sess.run([weights1, biases1])
-        weights2_v, biases2_v = sess.run([weights2, biases2])
-
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        weights1_v_restored, biases1_v_restored = sess.run([weights1, biases1])
-        weights2_v_restored, biases2_v_restored = sess.run([weights2, biases2])
-
-        self._CompareWeights(weights1_v, weights1_v_restored)
-        self._CompareWeights(weights2_v, weights2_v_restored)
-        self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers,
-                            direction)
-        self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers,
-                            direction)
-
-  def _testSaveRestoreOutput(self, rnn_mode, direction, dtype):
-    with ops.Graph().as_default():
-      num_layers = 2
-      num_units = 7
-      input_size = 7
-      seq_length = 10
-      batch_size = 5
-      dir_count = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
-      model = _CreateModel(
-          rnn_mode,
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training_fp16(self, num_units, input_size, batch_size, time,
+                         num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(
+        num_units,
+        input_size,
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float16,
+        rtol=5e-3,
+        atol=5e-4)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
           num_layers,
+          is_training=False)
+
+      self.assertAllClose(outputs, cu_outputs)
+      # h
+      self.assertAllClose(state_tuple.h, cu_state_tuple.h)
+      # c
+      self.assertAllClose(state_tuple.c, cu_state_tuple.c)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference_fp16(self, num_units, input_size, batch_size, time,
+                          num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM(
+          sess,
           num_units,
           input_size,
-          direction=direction,
-          dtype=dtype)
-      params_size_t = model.params_size()
-      params = variables.Variable(
-          array_ops.ones([params_size_t], dtype=dtype),
-          validate_shape=False,
-          dtype=dtype)
-      _CreateParamsSavable(params, model)
-      save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+          batch_size,
+          time,
+          num_layers,
+          is_training=False,
+          dtype=dtypes.float16)
 
-      np.random.seed(1234)
-      has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-      input_data = constant_op.constant(
-          np.random.randn(seq_length, batch_size, input_size), dtype=dtype)
-      input_h = constant_op.constant(
-          np.random.randn(num_layers * dir_count, batch_size, num_units),
-          dtype=dtype)
-      if has_input_c:
-        input_c = constant_op.constant(
-            np.random.randn(num_layers * dir_count, batch_size, num_units),
-            dtype=dtype)
-        outputs = model(
-            input_data=input_data,
-            input_h=input_h,
-            input_c=input_c,
-            params=params,
-            is_training=False)
-      else:
-        outputs = model(
-            input_data=input_data,
-            input_h=input_h,
-            params=params,
-            is_training=False)
-      total_sum = sum(map(math_ops.reduce_sum, outputs))
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        total_sum_v = sess.run(total_sum)
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        reset_params = state_ops.assign(
-            params,
-            array_ops.zeros([params_size_t], dtype=dtype),
-            validate_shape=False)
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        total_sum_v_restored = sess.run(total_sum)
-        self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5)
+      rtol, atol = 5e-3, 5e-4
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      # h
+      self.assertAllClose(
+          state_tuple.h, cu_state_tuple.h, rtol=rtol, atol=atol)
+      # c
+      self.assertAllClose(
+          state_tuple.c, cu_state_tuple.c, rtol=rtol, atol=atol)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSaveRestore(self):
-    rnn_modes = [
-        cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU,
-        cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU
-    ]
-    directions = [
-        cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
-    ]
-    dtype_list = [dtypes.float32, dtypes.float64]
-    for rnn_mode, direction, dtype in itertools.product(rnn_modes, directions,
-                                                        dtype_list):
-      self._testSaveRestoreVariable(rnn_mode, direction, dtype)
-      self._testSaveRestoreTwoVariables(rnn_mode, direction, dtype)
-      self._testSaveRestoreOutput(rnn_mode, direction, dtype)
-
-
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
-
-  def _testOneLSTMParamsSize(self, num_layers, num_units, input_size,
-                             direction):
-    logging.info("Testing one lstm param size with config: %s", locals())
-    min_params_size = _MinLSTMParamSize(num_layers, num_units, input_size,
-                                        direction)
-    model = _CreateModel(
-        cudnn_rnn_ops.CUDNN_LSTM,
-        num_layers,
+  def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
+                                  num_layers):
+    """Validates that dropout does not affect Cudnn Rnn inference."""
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    # Hand-picked dropouts are used below (0. and 1.)
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        # 1st time w/o dropout.
+        (_, cu_outputs, _, cu_state_tuple) = RunLSTM(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=0.)
+
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        (_, cu_outputs2, _, cu_state_tuple2) = RunLSTM(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=1.)
+
+    self.assertAllClose(cu_outputs, cu_outputs2)
+    # h
+    self.assertAllClose(cu_state_tuple.h, cu_state_tuple2.h)
+    # c
+    self.assertAllClose(cu_state_tuple.c, cu_state_tuple2.c)
+
+
+def RunGRU(sess,
+           num_units,
+           input_size,
+           batch_size,
+           time,
+           num_layers=1,
+           is_training=True,
+           dropout=0.,
+           num_dirs=True,
+           dtype=dtypes.float32):
+  # TODO(jamesqin): add multi-layer tests.
+  # TODO(jamesqin): add multi-dir tests
+  assert num_layers == 1
+  assert num_dirs == 1
+  if is_training and not np.isclose(dropout, 0):
+    raise ValueError("dropout can not be 0. when test training.")
+
+  # set graph level random seed and numpy random seed.
+  random_seed.set_random_seed(0)
+  np.random.seed(0)
+
+  inputs = variable_scope.get_variable(
+      "inputs",
+      initializer=np.random.rand(time, batch_size,
+                                 input_size).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_h_op = variable_scope.get_variable(
+      "initial_h_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+
+  initializer = init_ops.random_uniform_initializer(
+      -0.01, 0.01, dtype=dtype, seed=19980904)
+  with variable_scope.variable_scope("test", initializer=initializer):
+    gate_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/gates/kernel",
+        shape=[input_size + num_units, num_units * 2],
+        dtype=dtype)
+    gate_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/gates/bias",
+        shape=[num_units * 2],
+        dtype=dtype)
+    candidate_inp_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
+        shape=[input_size, num_units],
+        dtype=dtype)
+    candidate_inp_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
+        shape=[num_units],
+        dtype=dtype)
+    candidate_hid_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
+        shape=[num_units, num_units],
+        dtype=dtype)
+    candidate_hid_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
+        shape=[num_units],
+        dtype=dtype)
+
+    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
+    outputs_op, h_op = rnn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=initial_h_op,
+        dtype=dtype,
+        time_major=True,
+        scope=None)
+
+  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
+  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
+  # Convert to cudnn opaque param.
+  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
+      num_layers, num_units, input_size)
+  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+
+  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
+      inputs,
+      cu_initial_h_op,
+      array_ops.zeros_like(cu_initial_h_op),  # not used
+      opaque_params,
+      dropout=dropout,
+      is_training=is_training,
+      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)
+
+  if is_training:
+    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
+     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
+         outputs_op, [inputs, initial_h_op] + ws + bs)
+
+    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
+        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
+    # Remove the trivial 1st dimension
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+
+    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
+        opaque_grad_op)
+    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
+    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
+    # cudnn gru has 2 biases for reset and update gates. When converting to tf
+    # canonical format, the two biases are summed into one.  Thus here relevant
+    # bias gradient should be halved before comparing with tf gru.
+    cu_gb_grad_op *= 0.5
+
+  init_op = variables.global_variables_initializer()
+  sess.run(init_op)
+
+  if is_training:
+    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
+        outputs_op, h_op, inp_grad_op, hgrad_op,
+        (gk_grad_op, cik_grad_op, chk_grad_op),
+        (gb_grad_op, cib_grad_op, chb_grad_op)
+    ])
+    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run([
+        cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
+        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
+        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
+    ])
+    # Remove the trivial 1st dimension
+    cu_h = np.squeeze(cu_h, axis=0)
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "h: %s" % h)
+    logging.vlog(1, "cu_h: %s" % h)
+    logging.vlog(1, "inp_grad: %s" % inp_grad)
+    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
+    logging.vlog(1, "hgrad: %s" % hgrad)
+    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
+    logging.vlog(1, "wgrad: %s" % str(wgrad))
+    logging.vlog(1, "bgrad: %s" % str(bgrad))
+    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
+    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
+    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
+            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
+  else:
+    outputs, h = sess.run([outputs_op, h_op])
+    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op])
+    # Remove the trivial 1st dimension.
+    cu_h = np.squeeze(cu_h, axis=0)
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "h: %s" % h)
+    logging.vlog(1, "cu_h: %s" % h)
+  return outputs, cu_outputs, h, cu_h
+
+
+class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
+
+  def _test_training_helper(self,
+                            num_units,
+                            input_size,
+                            batch_size,
+                            time,
+                            num_layers,
+                            dtype,
+                            rtol=2e-6,
+                            atol=2e-6):
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
+       cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU(
+           sess, num_units, input_size, batch_size, time, num_layers)
+
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
+      self.assertAllClose(hgrad, cu_hgrad, rtol=rtol, atol=atol)
+      self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol)
+      for bg, cu_bg in zip(bgrad, cu_bgrad):
+        self.assertAllClose(bg, cu_bg, rtol=rtol, atol=atol)
+      for wg, cu_wg in zip(wgrad, cu_wgrad):
+        self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(num_units, input_size, batch_size, time,
+                               num_layers, dtypes.float32)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training_fp16(self, num_units, input_size, batch_size, time,
+                         num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(
         num_units,
         input_size,
-        direction=direction)
-    params_size = model.params_size()
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      params_size_v = sess.run(params_size)
-      self.assertLessEqual(min_params_size, params_size_v)
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float16,
+        rtol=5e-3,
+        atol=5e-4)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testLSTMParamsSize(self):
-    test_configs = [
-        [4, 200, 200],
-        [4, 200, 300],
-        [4, 200, 100],
-        [1, 100, 200],
-        [2, 200, 100],
-        [3, 200, 400],
-    ]
-    directions = [
-        cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
-    ]
-    for (config, direction) in itertools.product(test_configs, directions):
-      num_layers, num_units, input_size = config
-      with ops.Graph().as_default():
-        self._testOneLSTMParamsSize(num_layers, num_units, input_size,
-                                    direction)
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h) = RunGRU(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
+          num_layers,
+          is_training=False)
+      self.assertAllClose(outputs, cu_outputs)
+      self.assertAllClose(h, cu_h)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testLSTMParamsSizeShape(self):
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
-          cudnn_rnn_ops.CUDNN_LSTM,
-          constant_op.constant([4]), 200, 200,
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      params_size = model.params_size()
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
-          cudnn_rnn_ops.CUDNN_LSTM,
-          4, constant_op.constant([200]), 200,
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      params_size = model.params_size()
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
+  def test_inference_fp16(self, num_units, input_size, batch_size, time,
+                          num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h) = RunGRU(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
+          num_layers,
+          is_training=False,
+          dtype=dtypes.float16)
+
+      rtol, atol = 5e-3, 5e-4
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
+                                  num_layers):
+    """Validates that dropout does not affect Cudnn Rnn inference."""
+    # Hand-picked dropouts are used below (0. and 1.)
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        # 1st time w/o dropout.
+        (_, cu_outputs, _, cu_h) = RunGRU(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=0.)
+
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        (_, cu_outputs2, _, cu_h2) = RunGRU(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=1.)
+
+    self.assertAllClose(cu_outputs, cu_outputs2)
+    self.assertAllClose(cu_h[0], cu_h2[0])
+
+
+class CudnnParamsFormatConverterTest(TensorFlowTestCase,
+                                     parameterized.TestCase):
+  """Class for testing various format converters."""
+
+  def _test_lstm_helper(self, num_units, input_size, num_layers, direction):
+    with self.session(use_gpu=True) as sess:
+      random_seed.set_random_seed(0)
+      np.random.seed(0)
+
+      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
+      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
+          num_layers, num_units, input_size, direction=direction)
+
+      ws, bs = [], []
+      for _ in range(num_layers * num_dirs):
+        w = constant_op.constant(
+            np.random.rand(input_size + num_units, 4 * num_units),
+            dtype=dtypes.float32)
+        b = constant_op.constant(
+            np.random.rand(4 * num_units), dtype=dtypes.float32)
+        ws.append(w)
+        bs.append(b)
+
+      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
           cudnn_rnn_ops.CUDNN_LSTM,
-          4, 200, constant_op.constant([200]),
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      params_size = model.params_size()
+          num_layers,
+          num_units,
+          input_size,
+          direction=direction)
 
+      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)
 
-class CudnnRNNTestInference(TensorFlowTestCase):
+      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
+      # returns the original input.
+      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
+      for w, w_r in zip(ws, ws_r):
+        self.assertAllClose(w, w_r)
+      for b, b_r in zip(bs, bs_r):
+        self.assertAllClose(b, b_r)
 
-  def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
-                              batch_size, seq_length, dir_count, dropout,
-                              expected, tolerance):
-    random_seed.set_random_seed(5678)
-    model = _CreateModel(
-        rnn_mode,
-        num_layers,
-        num_units,
-        input_size,
-        input_mode="auto_select",
-        direction=(cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1
-                   else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION),
-        dropout=dropout)
-    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-    params_size_t = model.params_size()
-    input_data = array_ops.ones([seq_length, batch_size, input_size])
-    input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units])
-    params = variables.Variable(
-        array_ops.ones([params_size_t]), validate_shape=False)
-    if has_input_c:
-      input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units])
-      output, output_h, output_c = model(
-          input_data=input_data,
-          input_h=input_h,
-          input_c=input_c,
-          params=params,
-          is_training=False)
-    else:
-      output, output_h = model(
-          input_data=input_data,
-          input_h=input_h,
-          params=params,
-          is_training=False)
-    output_sum = math_ops.reduce_sum(output)
-    output_h_sum = math_ops.reduce_sum(output_h)
-    total_sum = output_sum + output_h_sum
-    if has_input_c:
-      output_c_sum = math_ops.reduce_sum(output_c)
-      total_sum += output_c_sum
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      sess.run(variables.global_variables_initializer())
-      total_sum_v = sess.run([total_sum])
+      # Test opaque_params size lower bound
+      opaque_params_size_v = sess.run(opaque_params_size)
+      min_params_size = (
+          np.sum([x.size for x in ws]) + np.sum([x.size for x in bs]))
+      logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
+                   min_params_size, opaque_params_size_v)
+      self.assertLessEqual(min_params_size, opaque_params_size_v)
 
-      self.assertAllClose(
-          total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_lstm(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_lstm_helper(num_units, input_size, num_layers,
+                           cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
 
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleInference(self):
-    test_configs = [
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "expected": 231833.22,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "expected": 56000,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "expected": 56000,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "expected": 130688,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 8,
-                "input_size": 4,
-                "batch_size": 4,
-                "seq_length": 2,
-                "dir_count": 1,
-            },
-        },
-    ]
-    # Cudnn scales result for dropout during training, therefore dropout has no
-    # impact for inference results.
-    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
-    # demonstrative of the dropout-invariant nature of CudnnRnn.)
-    dropouts = [0., 0.5, 1.]
-    for (config, dropout) in itertools.product(test_configs, dropouts):
-      rnn_mode = config["rnn_mode"]
-      expected = config["expected"]
-      tolerance = config["tolerance"]
-      shape = config["shape"]
-      with ops.Graph().as_default():
-        self._testOneSimpleInference(
-            rnn_mode, shape["num_layers"], shape["num_units"],
-            shape["input_size"], shape["batch_size"], shape["seq_length"],
-            shape["dir_count"], dropout, expected, tolerance)
-
-
-class CudnnRNNTestTraining(TensorFlowTestCase):
-
-  def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, dropout, dtype,
-                             delta, tolerance):
-    # Gradient checking runs two forward ops with almost the same input. Need to
-    # make sure the drop patterns across the two runs are the same.
-    logging.info("Training test with config: %s", locals())
-    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
-    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
-    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-    random_seed.set_random_seed(5678)
-    direction = (cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1
-                 else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
-    model = _CreateModel(
-        rnn_mode,
-        num_layers,
-        num_units,
-        input_size,
-        direction=direction,
-        dtype=dtype,
-        dropout=dropout)
-    params_size_t = model.params_size()
-    input_data = variables.Variable(
-        random_ops.random_uniform(
-            [seq_length, batch_size, input_size], dtype=dtype),
-        dtype=dtype)
-    input_h = variables.Variable(
-        random_ops.random_uniform(
-            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-        dtype=dtype)
-    params = variables.Variable(
-        random_ops.random_uniform([params_size_t], dtype=dtype),
-        validate_shape=False,
-        dtype=dtype)
-    if has_input_c:
-      input_c = variables.Variable(
-          random_ops.random_uniform(
-              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-          dtype=dtype)
-
-      output, output_h, output_c = model(
-          input_data=input_data,
-          input_h=input_h,
-          input_c=input_c,
-          params=params)
-    else:
-      output, output_h = model(
-          input_data=input_data, input_h=input_h, params=params)
-    output_sum = math_ops.reduce_sum(output)
-    output_h_sum = math_ops.reduce_sum(output_h)
-    total_sum = output_sum + output_h_sum
-    if has_input_c:
-      output_c_sum = math_ops.reduce_sum(output_c)
-      total_sum += output_c_sum
-
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      params_size_v = sess.run(params_size_t)
-      inputs_and_shapes = [
-          (input_data, [seq_length, batch_size, input_size]),
-          (input_h, [num_layers * dir_count, batch_size, num_units]),
-          (params, [params_size_v]),
-      ]
-      if has_input_c:
-        inputs_and_shapes.append(
-            (input_c, [num_layers * dir_count, batch_size, num_units]),)
-      sess.run(variables.global_variables_initializer())
-      all_inputs = [entry[0] for entry in inputs_and_shapes]
-      all_shapes = [entry[1] for entry in inputs_and_shapes]
-
-      err = gradient_checker.compute_gradient_error(
-          all_inputs, all_shapes, total_sum, [1], delta=delta)
-
-      self.assertLess(err, tolerance)
-      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
+  def test_lstm_bidi(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_lstm_helper(num_units, input_size, num_layers,
+                           cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
+
+  def _test_gru_helper(self, num_units, input_size, num_layers, direction):
+    with self.session(use_gpu=True) as sess:
+      random_seed.set_random_seed(0)
+      np.random.seed(0)
+
+      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
+      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
+          num_layers, num_units, input_size, direction=direction)
 
+      ws, bs = [], []
+      for _ in range(num_layers * num_dirs):
+        gate_kernel = constant_op.constant(
+            np.random.rand(input_size + num_units, num_units * 2),
+            dtype=dtypes.float32)
+        gate_bias = constant_op.constant(
+            np.random.rand(num_units * 2), dtype=dtypes.float32)
+        candidate_inp_kernel = constant_op.constant(
+            np.random.rand(input_size, num_units), dtype=dtypes.float32)
+        candidate_inp_bias = constant_op.constant(
+            np.random.rand(num_units), dtype=dtypes.float32)
+        candidate_hid_kernel = constant_op.constant(
+            np.random.rand(num_units, num_units), dtype=dtypes.float32)
+        candidate_hid_bias = constant_op.constant(
+            np.random.rand(num_units), dtype=dtypes.float32)
+        ws.extend([gate_kernel, candidate_inp_kernel, candidate_hid_kernel])
+        bs.extend([gate_bias, candidate_inp_bias, candidate_hid_bias])
+
+      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
+          cudnn_rnn_ops.CUDNN_GRU,
+          num_layers,
+          num_units,
+          input_size,
+          direction=direction)
+
+      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)
+
+      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
+      # returns the original input.
+      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
+      for w, w_r in zip(ws, ws_r):
+        self.assertAllClose(w, w_r)
+      for b, b_r in zip(bs, bs_r):
+        self.assertAllClose(b, b_r)
+
+      # Test opaque_params size lower bound
+      opaque_params_size_v = sess.run(opaque_params_size)
+      min_params_size = (
+          np.sum([x.size for x in ws]) + np.sum([x.size for x in bs]))
+      logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
+                   min_params_size, opaque_params_size_v)
+      self.assertLessEqual(min_params_size, opaque_params_size_v)
+
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_gru(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_gru_helper(num_units, input_size, num_layers,
+                          cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
+
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTraining(self):
-    test_configs = [
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "dtype": dtypes.float32,
-            "tolerance": 1.5e-2,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "dtype": dtypes.float32,
-            "tolerance": 4e-3,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "dtype": dtypes.float32,
-            "tolerance": 5e-3,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "dtype": dtypes.float32,
-            "tolerance": 5e-1,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-    ]
-    dropouts = [0., 0.5, 1.]
-    dir_counts = [1]
-    for config, dropout, dir_count in itertools.product(test_configs, dropouts,
-                                                        dir_counts):
-      rnn_mode = config["rnn_mode"]
-      dtype = config.get("dtype", dtypes.float32)
-      delta = config.get("delta", 1e-3)
-      tolerance = config["tolerance"]
-      shape = config["shape"]
-      with ops.Graph().as_default():
-        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    dir_count, dropout, dtype, delta, tolerance)
+  def test_gru_bidi(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_gru_helper(num_units, input_size, num_layers,
+                          cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
+
+
+class CudnnRnnSaveRestoreTest(TensorFlowTestCase, parameterized.TestCase):
+  """Class for testing various Cudnn Rnn SaveableObjects."""
+
+  def _create_opaque_param(self,
+                           rnn_mode,
+                           num_units,
+                           input_size,
+                           num_layers,
+                           direction,
+                           name=None):
+    param_size_t = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
+        rnn_mode, num_layers, num_units, input_size, direction=direction)
+    init_val = random_ops.random_uniform([param_size_t])
+    return variable_scope.get_variable(
+        name or "opaque_param", initializer=init_val, validate_shape=False)
+
+  def _create_saveable(self, opaque_param, rnn_mode, num_units, input_size,
+                       num_layers, direction):
+    if rnn_mode == CUDNN_LSTM:
+      fn = cudnn_rnn_ops.CudnnLSTMSaveable
+    elif rnn_mode == CUDNN_GRU:
+      fn = cudnn_rnn_ops.CudnnGRUSaveable
+    elif rnn_mode == CUDNN_RNN_TANH:
+      fn = cudnn_rnn_ops.CudnnRNNTanhSaveable
+    elif rnn_mode == CUDNN_RNN_RELU:
+      fn = cudnn_rnn_ops.CudnnRNNReluSaveable
+    saveable = fn(
+        opaque_param, num_layers, num_units, input_size, direction=direction)
+    return saveable
+
+  def _compare_weights(self, lhs, rhs):
+    self.assertLen(rhs, len(lhs))
+    for lw, rw in zip(lhs, rhs):
+      self.assertAllEqual(lw, rw)
+
+  def _compare_biases(self, lhs, rhs):
+    self.assertLen(rhs, len(lhs))
+    for lf, rt in zip(lhs, rhs):
+      self.assertAllEqual(lf, rt)
+
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, "time", "batch_size", **{
+              "rnn_mode": [
+                  CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH
+              ],
+              "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+          }))
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_save_restore_variable(self, rnn_mode, num_units, input_size,
+                                 num_layers, direction):
+    # Verify the restored opaque param, once converted to tf_canonical format,
+    # is the same as the tf canonicals of the pre-restored param.
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      opaque_param = self._create_opaque_param(rnn_mode, num_units, input_size,
+                                               num_layers, direction)
+      saveable = self._create_saveable(opaque_param, rnn_mode, num_units,
+                                       input_size, num_layers, direction)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      weights_op, biases_op = saveable.format_converter.opaque_to_tf_canonical(
+          saveable._variables)
+
+      save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test")
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      init_op = variables.global_variables_initializer()
+      reset_op = state_ops.assign(opaque_param,
+                                  array_ops.zeros_like(opaque_param))
+      sess.run(init_op)
+      self.assertEqual(save_path, saver.save(sess, save_path))
+
+      # Get the tf canonical vals before reset-restore
+      weights, biases = sess.run([weights_op, biases_op])
+
+      # Reset the opaque param value
+      sess.run(reset_op)
+      # Assert reset happened.
+      weights_z, biases_z = sess.run([weights_op, biases_op])
+      for w in weights_z:
+        self.assertAllClose(w, np.zeros_like(w))
+      for b in biases_z:
+        self.assertAllClose(b, np.zeros_like(b))
+
+      # Restore opaque param value from checkpoint.
+      saver.restore(sess, save_path)
+      weights_r, biases_r = sess.run([weights_op, biases_op])
+      self._compare_weights(weights, weights_r)
+      self._compare_biases(biases, biases_r)
+
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, "time", "batch_size", **{
+              "rnn_mode": [
+                  CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH
+              ],
+              "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+          }))
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_save_restore_multi_variables(self, rnn_mode, num_units, input_size,
+                                        num_layers, direction):
+    # Verify the restored opaque param, once converted to tf_canonical format,
+    # is the same as the tf canonicals of the pre-restored param.
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      opaque_params = []
+      saveables = []
+      num_opaque_params = 2
+      for i in range(num_opaque_params):
+        opaque_params.append(
+            self._create_opaque_param(
+                rnn_mode,
+                num_units,
+                input_size,
+                num_layers,
+                direction,
+                name="opaque_param_%d" % i))
+        saveable = self._create_saveable(opaque_params[i], rnn_mode, num_units,
+                                         input_size, num_layers, direction)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+        saveables.append(saveable)
+
+      weights_ops, biases_ops = [], []
+      for i in range(num_opaque_params):
+        weights_op, biases_op = (
+            saveables[i].format_converter.opaque_to_tf_canonical(
+                saveables[i]._variables))
+        weights_ops.append(weights_op)
+        biases_ops.append(biases_op)
+
+      save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test")
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      init_op = variables.global_variables_initializer()
+      reset_ops = []
+      for i in range(num_opaque_params):
+        reset_ops.append(
+            state_ops.assign(opaque_params[i],
+                             array_ops.zeros_like(opaque_params[i])))
+      sess.run(init_op)
+      self.assertEqual(save_path, saver.save(sess, save_path))
+
+      # Get the tf canonical vals before reset-restore
+      for i in range(num_opaque_params):
+        weights, biases = sess.run([weights_ops[i], biases_ops[i]])
+
+        # Reset the opaque param value
+        sess.run(reset_ops[i])
+
+        # Assert reset happened.
+        weights_z, biases_z = sess.run([weights_ops[i], biases_ops[i]])
+        for w in weights_z:
+          self.assertAllClose(w, np.zeros_like(w))
+        for b in biases_z:
+          self.assertAllClose(b, np.zeros_like(b))
+
+        # Restore opaque param value from checkpoint.
+        saver.restore(sess, save_path)
+        weights_r, biases_r = sess.run([weights_ops[i], biases_ops[i]])
+        self._compare_weights(weights, weights_r)
+        self._compare_biases(biases, biases_r)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 1954f6717bbebd803b0ec45992b43cf68f5d72a0..6cc93dccb004687a2d583a5d1925ea6b98c98979 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -536,7 +536,9 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
       save_path = os.path.join(self.get_temp_dir(),
                                "save-restore-variable-test")
       saver = saver_lib.Saver()
-      weights, biases = model.rnn.saveable._OpaqueParamsToCanonical()
+      weights, biases = (
+          model.rnn.saveable.format_converter._opaque_to_cu_canonical(
+              model.rnn.saveable._variables))
       opaque_params = rnn.trainable_variables[0]
       # CudnnTestModel() creates CudnnOpaqueParamsSaveable that helps saver save
       # Cudnn vars in canonical format.
@@ -583,8 +585,12 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
             dtype=dtype)
       opaque_params = (model1.rnn.trainable_variables[0],
                        model2.rnn.trainable_variables[0])
-      weights1, biases1 = model1.rnn.saveable._OpaqueParamsToCanonical()
-      weights2, biases2 = model2.rnn.saveable._OpaqueParamsToCanonical()
+      saveable1 = model1.rnn.saveable
+      weights1, biases1 = saveable1.format_converter._opaque_to_cu_canonical(
+          saveable1._variables)
+      saveable2 = model1.rnn.saveable
+      weights2, biases2 = saveable2.format_converter._opaque_to_cu_canonical(
+          saveable2._variables)
       reset_params = [
           state_ops.assign(params,
                            array_ops.zeros_like(params, dtype=dtype))
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
index f09466b631f69d6234573dd5eafada650421c117..60229af374be869005139921483793156e5e7a05 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
@@ -27,5 +27,10 @@ from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibl
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleLSTMCell
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnGRUSaveable
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTMSaveable
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterGRU
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterLSTM
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterRelu
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterTanh
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNReluSaveable
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNTanhSaveable
+
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index a324c6e7d76223aaa6514e695e4ff8444db455d0..8e25637ed91a1559b321ea96efbfaa2910f67158 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -21,6 +21,7 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -322,7 +323,7 @@ class _CudnnRNN(base_layer.Layer):
       raise ValueError("The last dimension of the inputs to `CudnnRNN` "
                        "should be defined. Found `None`.")
     self._input_size = input_shape[-1].value
-    self.input_spec = base_layer.InputSpec(ndim=3, axes={-1: self._input_size})
+    self.input_spec = input_spec.InputSpec(ndim=3, axes={-1: self._input_size})
 
     self._set_scope(None)
 
@@ -388,11 +389,11 @@ class _CudnnRNN(base_layer.Layer):
       output_states: a tuple of tensor(s) of the same shape and structure as
         `initial_state`.
     Raises:
-      ValueError: initial_state is not a tuple.
+      TypeError: initial_state is not a tuple.
     """
     if initial_state is not None and not isinstance(initial_state, tuple):
-      raise ValueError("Invalid initial_state type: %s, expecting tuple.",
-                       type(initial_state))
+      raise TypeError("Invalid initial_state type: %s, expecting tuple." %
+                      initial_state)
     dtype = self.dtype
     inputs = ops.convert_to_tensor(inputs, dtype=dtype)
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 2c92f31788378c2a9f01183bc04b035668b59b59..1ce29b42d52ff67477161278ed11016c2e73041d 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -74,7 +74,7 @@ class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
 
 
 class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
-  """Cudnn Compatible GRUCell.
+  r"""Cudnn Compatible GRUCell.
 
   A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with
   `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by
@@ -177,172 +177,60 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
     return new_h, new_h
 
 
-# TODO(yaozhang): make sure we only save the canonical version of params and
-# don't save the platform-specific version to avoid potential race
-# conditions where params is updated by both versions when being restored.
-# Currently, checkpointing will function properly, despite that we save both
-# versions, because Saver restores customized savables after Variables.
-# However, it is good to not rely on this restoring order of Saver and to
-# avoid unnecessary storage. Add a test to check only the canonical version is
-# saved.
-class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
-  """Abstract SaveableObject implementation handling Cudnn opaque params."""
+class CudnnParamsFormatConverter(object):
+  """Abstract class that converts between params of Cudnn Rnn and TF Rnn."""
 
   def __init__(self,
-               opaque_params,
                num_layers,
                num_units,
                input_size,
                input_mode=CUDNN_INPUT_LINEAR_MODE,
-               direction=CUDNN_RNN_UNIDIRECTION,
-               scope=None,
-               name="cudnn_rnn_saveable"):
-    """Creates a CudnnOpaqueParamsSaveable object.
-
-       CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file
-       and is used to save/restore the weights and biases parameters in a
-       canonical format which is directly consumable by platform-independent tf
-       RNN cells. Parameters are saved as tensors layer by layer with weight
-       tensors followed by bias tensors, and forward direction followed by
-       backward direction (if applicable). When restoring, a user could name
-       param_variables as desired, and restore weight and bias tensors to these
-       variables.
-
-       For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per
-       bias for each layer: tensor 0 is applied to the input from the previous
-       layer and tensor 1 to the recurrent input.
-
-       For CudnnLSTM, there are 8 tensors per weight and per bias for each
-       layer: tensor 0-3 are applied to the input from the previous layer and
-       tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
-       tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
-       tensor 3 and 7 the output gate.
-
-       For CudnnGRU, there are 6 tensors per weight and per bias for each layer:
-       tensor 0-2 are applied to the input from the previous layer and
-       tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
-       tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
+               direction=CUDNN_RNN_UNIDIRECTION):
+    """Constructor.
 
     Args:
-      opaque_params: a variable, Cudnn RNN opaque params.
       num_layers: the number of layers for the RNN model.
       num_units: the number of units within the RNN model.
       input_size: the size of the input, it could be different from the
-          num_units.
+        num_units.
       input_mode: indicate whether there is a linear projection between the
-          input and the actual computation before the first layer. It could be
-          'linear_input', 'skip_input' or 'auto_select'.
-          'linear_input' (default) always applies a linear projection of input
-          onto RNN hidden state. (standard RNN behavior).
-          'skip_input' is only allowed when input_size == num_units;
-          'auto_select' implies 'skip_input' when input_size == num_units;
-          otherwise, it implies 'linear_input'.
+        input and the actual computation before the first layer. It could be one
+        of 'linear_input', 'skip_input' or 'auto_select'. * 'linear_input'
+        (default) always applies a linear projection of input onto RNN hidden
+        state. (standard RNN behavior). * 'skip_input' is only allowed when
+        input_size == num_units; * 'auto_select' implies 'skip_input' when
+        input_size == num_units; otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
-          'unidirectional' or 'bidirectional'
-      scope: string of VariableScope, the scope of equivalent subgraph
-          consisting only platform-independent tf RNN cells.
-      name: the name of the CudnnOpaqueParamsSaveable object.
+        'unidirectional' or 'bidirectional'
     """
-    # Define in subclasses.
     self._num_layers = num_layers
     self._input_size = input_size
     self._num_units = num_units
     self._input_mode = input_mode
     self._direction = direction
-    if scope is not None:
-      scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope
-      self._scope = scope_name or None
-    else:
-      self._scope = None
-
-    self._variables = opaque_params
     self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
     self._num_params = (
         self._num_params_per_layer * self._num_layers * self._num_dirs)
 
-    weights, biases = self._OpaqueParamsToCanonical()
-    (weights, weight_names), (biases, bias_names) = self._TransformCanonical(
-        weights, biases)
-    # We currently don't use slice_spec. It might be useful in a distributed
-    # setting where each parameter server node stores a slice of variable,
-    # instead of having the master pull all slices and then save them.
-    slice_spec = ""
-    params = weights + biases
-    self._weight_names = weight_names
-    self._bias_names = bias_names
-    self._param_names = weight_names + bias_names
-    prefixed_param_names = weight_names + bias_names
-    if self._scope:
-      prefixed_param_names = [
-          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names]
-    specs = [
-        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
-        for param, param_name in zip(params, prefixed_param_names)
-    ]
-    super(CudnnOpaqueParamsSaveable, self).__init__(
-        array_ops.identity(self._variables), specs, name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    weights, biases = self._ReverseTransformCanonical(restored_tensors)
-    weights = [array_ops.reshape(w, [-1]) for w in weights]
-    opaque_params = self._CanonicalToOpaqueParams(weights, biases)
-
-    return state_ops.assign(
-        self._variables, opaque_params, validate_shape=False)
+  def tf_canonical_to_opaque(self, tf_canonicals):
+    r"""Converts tf canonical weights to cudnn opaque param."""
+    cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals)
+    cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights]
+    opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases)
+    return opaque_params
 
-  def _checkpointable_save(self, save_buffer):
-    weights, biases = self._OpaqueParamsToCanonical()
-    with ops.device("gpu:0"):
-      (weights, _), (biases, _) = self._TransformCanonical(
-          weights, biases)
-    for name, tensor in zip(self._param_names, weights + biases):
-      save_buffer[name] = array_ops.identity(tensor)
+  def opaque_to_tf_canonical(self, opaque_param):
+    r"""Converts cudnn opaque param to tf canonical weights."""
+    cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param)
+    weights, biases = self._cu_canonical_to_tf_canonical(cu_weights, cu_biases)
+    return weights, biases
 
-  def _checkpointable_restore(self, restore_buffer):
-    tensors = [array_ops.identity(restore_buffer[name])
-               for name in self._param_names]
-    return self.restore(
-        restored_tensors=tensors,
-        restored_shapes=None  # Unused
-    )
-
-  def _add_checkpointable_dependencies(self, checkpointable, dtype):
-    """Add canonical weight dependencies to `checkpointable`.
-
-    When saving or restoring, converts to or from the opaque buffer
-    format. Weights are saved and loaded in the configuration expected by
-    cuDNN-compatible cells.
-
-    Args:
-      checkpointable: An object inheriting from `CheckpointableBase` to add
-        dependencies too (typically the cuDNN `Layer`).
-      dtype: The dtype for the canonical parameter Tensors.
-    """
-    split_dependencies = split_dependency.split_dependency(
-        component_names=self._param_names,
-        component_dtypes=(dtype,) * len(self._param_names),
-        fill_save_buffer_fn=self._checkpointable_save,
-        consume_restore_buffer_fn=self._checkpointable_restore)
-    self._checkpointable_track_params(checkpointable, split_dependencies)
-
-  def _checkpointable_track_params(self, checkpointable, params):
-    """Tracks parameters in a canonical configuration."""
-    return  # NotImplementedError raised by the Layer.
-
-  def _TFCanonicalNamePrefix(self, layer, is_fwd=True):
-    if self._direction == CUDNN_RNN_UNIDIRECTION:
-      return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
-    else:
-      if is_fwd:
-        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" %
-                (layer, self._rnn_cell_name))
-      else:
-        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" %
-                (layer, self._rnn_cell_name))
-
-  def _OpaqueParamsToCanonical(self):
+  def _opaque_to_cu_canonical(self, opaque_param):
     """Converts opaque params to Cudnn canonical format.
 
+    Args:
+      opaque_param: An opaque tensor storing cudnn rnn params (weights and
+        biases).
     Returns:
       2 list for weights and biases respectively.
     """
@@ -351,14 +239,14 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           num_layers=self._num_layers,
           num_units=self._num_units,
           input_size=self._input_size,
-          params=self._variables,
+          params=opaque_param,
           num_params=self._num_params,
           rnn_mode=self._rnn_mode,
           input_mode=self._input_mode,
           direction=self._direction)
       return (weights, biases)
 
-  def _CanonicalToOpaqueParams(self, cu_weights, cu_biases):
+  def _cu_canonical_to_opaque(self, cu_weights, cu_biases):
     """Converts from Cudnn canonical format to opaque params.
 
     Args:
@@ -378,7 +266,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           input_mode=self._input_mode,
           direction=self._direction)
 
-  def _TransformCanonical(self, cu_weights, cu_biases):
+  def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases):
     r"""Transform from Cudnn canonical to tf canonical.
 
     The elements of argument lists are laid out in the following format:
@@ -398,46 +286,43 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
       cu_weights: a list of tensors of Cudnn canonical weights.
       cu_biases: a list of tensors of Cudnn canonical biases.
     Returns:
-      2 tuples, one for weights and the other for bias.
-      Each tuple has two lists: the 1st for transformed tf canonical tensors
-      and the 2nd for the names of the tensors under which they are saved.
+      1 tuple, tf canonical weights and biases.
     """
     tf_weights, tf_biases = [], []
-    tf_weights_names, tf_bias_names = [], []
 
     layer_weights_num = self._num_params_per_layer * self._num_dirs
     layer_biases_num = layer_weights_num
 
     for i in range(self._num_layers):
-      layer_weights = cu_weights[i * layer_weights_num:
-                                 (i + 1) * layer_weights_num]
+      layer_weights = cu_weights[i * layer_weights_num:(i + 1) *
+                                 layer_weights_num]
       layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num]
       if self._direction == CUDNN_RNN_UNIDIRECTION:
-        prefix = self._TFCanonicalNamePrefix(i)
-        self._TransformSingleLayerCanonical(layer_weights, layer_biases, prefix,
-                                            tf_weights, tf_weights_names,
-                                            tf_biases, tf_bias_names)
+        self._cu_canonical_to_tf_canonical_single_layer(
+            layer_weights, layer_biases, tf_weights, tf_biases)
       else:
-        fw_prefix = self._TFCanonicalNamePrefix(i, is_fwd=True)
-        bw_prefix = self._TFCanonicalNamePrefix(i, is_fwd=False)
-
         fw_weights = layer_weights[:len(layer_weights) // 2]
         bw_weights = layer_weights[len(layer_weights) // 2:]
         fw_biases = layer_biases[:len(layer_biases) // 2]
         bw_biases = layer_biases[len(layer_biases) // 2:]
 
-        self._TransformSingleLayerCanonical(fw_weights, fw_biases, fw_prefix,
-                                            tf_weights, tf_weights_names,
-                                            tf_biases, tf_bias_names)
-
-        self._TransformSingleLayerCanonical(bw_weights, bw_biases, bw_prefix,
-                                            tf_weights, tf_weights_names,
-                                            tf_biases, tf_bias_names)
-    return (tf_weights, tf_weights_names), (tf_biases, tf_bias_names)
-
-  def _TransformSingleLayerCanonical(self, cu_weights, cu_biases, prefix,
-                                     tf_weights, tf_weights_names, tf_biases,
-                                     tf_bias_names):
+        self._cu_canonical_to_tf_canonical_single_layer(
+            fw_weights,
+            fw_biases,
+            tf_weights,
+            tf_biases,
+        )
+
+        self._cu_canonical_to_tf_canonical_single_layer(
+            bw_weights,
+            bw_biases,
+            tf_weights,
+            tf_biases,
+        )
+    return (tf_weights, tf_biases)
+
+  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
+                                                 tf_weights, tf_biases):
     r"""Transform single layer Cudnn canonicals to tf canonicals.
 
     The elements of cu_weights, cu_biases are laid out in the following format:
@@ -447,15 +332,12 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     Args:
       cu_weights: a list of tensors, single layer weights.
       cu_biases: a list of tensors, single layer biases.
-      prefix: the shared prefix of all tensor names.
       tf_weights: a list where transformed weights are stored.
-      tf_weights_names: a list where names of transformed weights are stored.
       tf_biases: a list where transformed biases are stored.
-      tf_bias_names: a list where names of transformed biases are stored.
     """
     raise NotImplementedError("Abstract method")
 
-  def _ReverseTransformCanonical(self, tf_canonicals):
+  def _tf_canonical_to_cu_canonical(self, tf_canonicals):
     r"""Transform from tf canonical to Cudnn canonical.
 
     This is the reverse routine of _TransformCanonical().
@@ -502,30 +384,27 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return cu_weights, cu_biases
 
   def _cudnn_to_tf_weights(self, *cu_weights):
-    r"""Stitching cudnn canonical weights to generate tf canonical weights."""
+    r"""Stitches cudnn canonical weights to generate tf canonical weights."""
     raise NotImplementedError("Abstract method")
 
   def _tf_to_cudnn_weights(self, layer, *tf_weights):
-    r"""Reverse the operations in StitchWeights()."""
+    r"""Reverses the operations in StitchWeights()."""
     raise NotImplementedError("Abstract method")
 
   def _cudnn_to_tf_biases(self, *biases):
-    r"""Stitching cudnn canonical biases to generate tf canonical biases."""
+    r"""Stitches cudnn canonical biases to generate tf canonical biases."""
     raise NotImplementedError("Abstract method")
 
   def _tf_to_cudnn_biases(self, *tf_biases):
-    r"""Reverse the operations in StitchBiases()."""
+    r"""Reverses the operations in StitchBiases()."""
     raise NotImplementedError("Abstract method")
 
 
-class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
-  """SaveableObject implementation handling Cudnn LSTM opaque params."""
-
+class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter):
+  """Helper class that converts between params of Cudnn and TF LSTM."""
   _rnn_mode = CUDNN_LSTM
   _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
 
-  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
-
   def _cudnn_to_tf_gate_params(self, *cu_gate_order):
     i_g, f_g, c_g, o_g = cu_gate_order
     return [i_g, c_g, f_g, o_g]
@@ -603,44 +482,16 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     # Return ifco order for Cudnn LSTM.
     return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro
 
-  def _TransformSingleLayerCanonical(self, weights, biases, prefix, tf_weights,
-                                     tf_weights_names, tf_biases,
-                                     tf_bias_names):
-    (w,) = self._cudnn_to_tf_weights(*weights)
-    (b,) = self._cudnn_to_tf_biases(*biases)
-
+  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
+                                                 tf_weights, tf_biases):
+    (w,) = self._cudnn_to_tf_weights(*cu_weights)
+    (b,) = self._cudnn_to_tf_biases(*cu_biases)
     tf_weights.append(w)
-    tf_weights_names.append(prefix + "/kernel")
-
     tf_biases.append(b)
-    tf_bias_names.append(prefix + "/bias")
-
-  def _checkpointable_track_params(self, checkpointable, params):
-    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
-    biases = []
-    weights = []
-    for name in self._weight_names:
-      weights.append(params[name])
-    for name in self._bias_names:
-      biases.append(params[name])
-    assert len(params) == len(weights) + len(biases)
-    if len(weights) == 1 and len(biases) == 1:
-      # For single-layer cells, allow substituting a cell with no MultiRNNCell
-      # wrapping.
-      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
-      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
-      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
-      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
-    assert len(biases) == len(weights)
-    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
-      cell = checkpointable_lib.Checkpointable()
-      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
-      cell.bias = bias
-      cell.kernel = kernel
 
 
-class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
-  """SaveableObject implementation handling Cudnn GRU opaque params."""
+class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter):
+  """Helper class that converts between params of Cudnn and TF GRU."""
 
   _rnn_mode = CUDNN_GRU
   _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
@@ -702,29 +553,18 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
     b_ri, b_rr = array_ops.split(br, 2, axis=0)
     return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh
 
-  def _TransformSingleLayerCanonical(self, weights, biases, prefix, tf_weights,
-                                     tf_weights_names, tf_biases,
-                                     tf_bias_names):
+  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
+                                                 tf_weights, tf_biases):
     # pylint: disable=invalid-name
-    W_ir, w_h, r_h = self._cudnn_to_tf_weights(*weights)
-    b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*biases)
+    W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights)
+    b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases)
     # pylint: enable=invalid-name
-
     tf_weights.extend([W_ir, w_h, r_h])
-    tf_weights_names.append(prefix + "/gates/kernel")
-    tf_weights_names.append(prefix + "/candidate/input_projection/kernel")
-    tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel")
-
     tf_biases.extend([b_ir, b_wh, b_rh])
-    tf_bias_names.append(prefix + "/gates/bias")
-    tf_bias_names.append(prefix + "/candidate/input_projection/bias")
-    tf_bias_names.append(prefix + "/candidate/hidden_projection/bias")
-
 
-class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
-  """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
 
-  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
+class CudnnParamsFormatConverterBasic(CudnnParamsFormatConverterLSTM):
+  """Helper class that converts between params of Cudnn and TF Relu/Tanh RNN."""
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -766,18 +606,270 @@ class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
     return b_i, b_h
 
 
-class CudnnRNNTanhSaveable(CudnnRNNSimpleSaveable):
-  """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
+class CudnnParamsFormatConverterTanh(CudnnParamsFormatConverterBasic):
+  """Helper class that converts between params of Cudnn and TF Tanh RNN."""
   _rnn_mode = CUDNN_RNN_TANH
   _num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
 
 
-class CudnnRNNReluSaveable(CudnnRNNSimpleSaveable):
-  """SaveableObject implementation handling Cudnn RNN Relu opaque params."""
+class CudnnParamsFormatConverterRelu(CudnnParamsFormatConverterBasic):
+  """Helper class that converts between params of Cudnn and TF Relu RNN."""
   _rnn_mode = CUDNN_RNN_RELU
   _num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
+# TODO(yaozhang): make sure we only save the canonical version of params and
+# don't save the platform-specific version to avoid potential race
+# conditions where params is updated by both versions when being restored.
+# Currently, checkpointing will function properly, despite that we save both
+# versions, because Saver restores customized savables after Variables.
+# However, it is good to not rely on this restoring order of Saver and to
+# avoid unnecessary storage. Add a test to check only the canonical version is
+# saved.
+class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Abstract SaveableObject implementation handling Cudnn opaque params."""
+
+  def __init__(self,
+               opaque_params,
+               num_layers,
+               num_units,
+               input_size,
+               input_mode=CUDNN_INPUT_LINEAR_MODE,
+               direction=CUDNN_RNN_UNIDIRECTION,
+               scope=None,
+               name="cudnn_rnn_saveable"):
+    """Creates a CudnnOpaqueParamsSaveable object.
+
+       CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file
+       and is used to save/restore the weights and biases parameters in a
+       canonical format which is directly consumable by platform-independent tf
+       RNN cells. Parameters are saved as tensors layer by layer with weight
+       tensors followed by bias tensors, and forward direction followed by
+       backward direction (if applicable). When restoring, a user could name
+       param_variables as desired, and restore weight and bias tensors to these
+       variables.
+
+       For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per
+       bias for each layer: tensor 0 is applied to the input from the previous
+       layer and tensor 1 to the recurrent input.
+
+       For CudnnLSTM, there are 8 tensors per weight and per bias for each
+       layer: tensor 0-3 are applied to the input from the previous layer and
+       tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
+       tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
+       tensor 3 and 7 the output gate.
+
+       For CudnnGRU, there are 6 tensors per weight and per bias for each layer:
+       tensor 0-2 are applied to the input from the previous layer and
+       tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
+       tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
+
+    Args:
+      opaque_params: a variable, Cudnn RNN opaque params.
+      num_layers: the number of layers for the RNN model.
+      num_units: the number of units within the RNN model.
+      input_size: the size of the input, it could be different from the
+        num_units.
+      input_mode: indicate whether there is a linear projection between the
+        input and the actual computation before the first layer. It could be
+        'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default)
+        always applies a linear projection of input onto RNN hidden state.
+        (standard RNN behavior). 'skip_input' is only allowed when input_size ==
+        num_units; 'auto_select' implies 'skip_input' when input_size ==
+        num_units; otherwise, it implies 'linear_input'.
+      direction: the direction model that the model operates. Could be either
+        'unidirectional' or 'bidirectional'
+      scope: string of VariableScope, the scope of equivalent subgraph
+        consisting only platform-independent tf RNN cells.
+      name: the name of the CudnnOpaqueParamsSaveable object.
+    """
+    # Define in subclasses.
+    self._num_layers = num_layers
+    self._input_size = input_size
+    self._num_units = num_units
+    self._input_mode = input_mode
+    self._direction = direction
+    if scope is not None:
+      scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope
+      self._scope = scope_name or None
+    else:
+      self._scope = None
+
+    self._variables = opaque_params
+    self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
+    # Defined in subclasses.
+    self._format_converter = None
+
+    tf_weights, tf_biases = (
+        self.format_converter.opaque_to_tf_canonical(self._variables))
+    tf_weight_names, tf_bias_names = self._tf_canonical_names()
+    # We currently don't use slice_spec. It might be useful in a distributed
+    # setting where each parameter server node stores a slice of variable,
+    # instead of having the master pull all slices and then save them.
+    slice_spec = ""
+    params = tf_weights + tf_biases
+    self._weight_names = tf_weight_names
+    self._bias_names = tf_bias_names
+    self._param_names = tf_weight_names + tf_bias_names
+    prefixed_param_names = tf_weight_names + tf_bias_names
+    if self._scope:
+      prefixed_param_names = [
+          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names
+      ]
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
+        for param, param_name in zip(params, prefixed_param_names)
+    ]
+    super(CudnnOpaqueParamsSaveable, self).__init__(
+        array_ops.identity(self._variables), specs, name)
+
+  @property
+  def format_converter(self):
+    if self._format_converter is None:
+      self._format_converter = self._format_converter_cls(
+          self._num_layers, self._num_units, self._input_size, self._input_mode,
+          self._direction)
+    return self._format_converter
+
+  def restore(self, restored_tensors, restored_shapes):
+    opaque_params = self.format_converter.tf_canonical_to_opaque(
+        restored_tensors)
+    return state_ops.assign(
+        self._variables, opaque_params, validate_shape=False)
+
+  def _checkpointable_save(self, save_buffer):
+    weights, biases = self.format_converter.opaque_to_tf_canonical(
+        self._variables)
+    for name, tensor in zip(self._param_names, weights + biases):
+      save_buffer[name] = array_ops.identity(tensor)
+
+  def _checkpointable_restore(self, restore_buffer):
+    tensors = [
+        array_ops.identity(restore_buffer[name]) for name in self._param_names
+    ]
+    return self.restore(
+        restored_tensors=tensors,
+        restored_shapes=None  # Unused
+    )
+
+  def _add_checkpointable_dependencies(self, checkpointable, dtype):
+    """Add canonical weight dependencies to `checkpointable`.
+
+    When saving or restoring, converts to or from the opaque buffer
+    format. Weights are saved and loaded in the configuration expected by
+    cuDNN-compatible cells.
+
+    Args:
+      checkpointable: An object inheriting from `CheckpointableBase` to add
+        dependencies too (typically the cuDNN `Layer`).
+      dtype: The dtype for the canonical parameter Tensors.
+    """
+    split_dependencies = split_dependency.split_dependency(
+        component_names=self._param_names,
+        component_dtypes=(dtype,) * len(self._param_names),
+        fill_save_buffer_fn=self._checkpointable_save,
+        consume_restore_buffer_fn=self._checkpointable_restore)
+    self._checkpointable_track_params(checkpointable, split_dependencies)
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Tracks parameters in a canonical configuration."""
+    return  # NotImplementedError raised by the Layer.
+
+  def _tf_canonical_names(self):
+    tf_weights_names, tf_biases_names = [], []
+    for i in range(self._num_layers):
+      if self._direction == CUDNN_RNN_UNIDIRECTION:
+        prefix = self._tf_canonical_name_prefix(i)
+        self._tf_canonical_names_single_layer(prefix, tf_weights_names,
+                                              tf_biases_names)
+      else:
+        fwd_prefix = self._tf_canonical_name_prefix(i, is_fwd=True)
+        bak_prefix = self._tf_canonical_name_prefix(i, is_fwd=False)
+
+        self._tf_canonical_names_single_layer(fwd_prefix, tf_weights_names,
+                                              tf_biases_names)
+        self._tf_canonical_names_single_layer(bak_prefix, tf_weights_names,
+                                              tf_biases_names)
+    return tf_weights_names, tf_biases_names
+
+  def _tf_canonical_name_prefix(self, layer, is_fwd=True):
+    if self._direction == CUDNN_RNN_UNIDIRECTION:
+      return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
+    else:
+      if is_fwd:
+        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" %
+                (layer, self._rnn_cell_name))
+      else:
+        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" %
+                (layer, self._rnn_cell_name))
+
+  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
+                                       tf_biases_names):
+    raise NotImplementedError("Abstract method")
+
+
+class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
+  """SaveableObject implementation handling Cudnn LSTM opaque params."""
+
+  _format_converter_cls = CudnnParamsFormatConverterLSTM
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
+
+  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
+                                       tf_bias_names):
+    tf_weights_names.append(prefix + "/kernel")
+    tf_bias_names.append(prefix + "/bias")
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
+    biases = []
+    weights = []
+    for name in self._weight_names:
+      weights.append(params[name])
+    for name in self._bias_names:
+      biases.append(params[name])
+    assert len(params) == len(weights) + len(biases)
+    if len(weights) == 1 and len(biases) == 1:
+      # For single-layer cells, allow substituting a cell with no MultiRNNCell
+      # wrapping.
+      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
+      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
+      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
+      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+    assert len(biases) == len(weights)
+    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
+      cell = checkpointable_lib.Checkpointable()
+      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell.bias = bias
+      cell.kernel = kernel
+
+
+class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
+  """SaveableObject implementation handling Cudnn GRU opaque params."""
+
+  _format_converter_cls = CudnnParamsFormatConverterGRU
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
+
+  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
+                                       tf_bias_names):
+    tf_weights_names.append(prefix + "/gates/kernel")
+    tf_weights_names.append(prefix + "/candidate/input_projection/kernel")
+    tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel")
+
+    tf_bias_names.append(prefix + "/gates/bias")
+    tf_bias_names.append(prefix + "/candidate/input_projection/bias")
+    tf_bias_names.append(prefix + "/candidate/hidden_projection/bias")
+
+
+class CudnnRNNTanhSaveable(CudnnLSTMSaveable):
+  _format_converter_cls = CudnnParamsFormatConverterTanh
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
+
+
+class CudnnRNNReluSaveable(CudnnLSTMSaveable):
+  _format_converter_cls = CudnnParamsFormatConverterRelu
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
+
+
 _cudnn_rnn_common_doc_string = """
   Cudnn RNN has an opaque parameter buffer that can be used for inference and
   training. But it is possible that the layout of the parameter buffers
@@ -850,7 +942,7 @@ def _get_num_params(rnn_mode, num_layers, direction):
   elif rnn_mode == CUDNN_RNN_TANH:
     num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
   else:
-    raise ValueError("Invalid \'rnn_mode\': %s", rnn_mode)
+    raise ValueError("Invalid \'rnn_mode\': %s" % rnn_mode)
   num_params = num_layers * num_params_per_layer
   if direction != CUDNN_RNN_UNIDIRECTION:
     num_params *= 2
@@ -918,7 +1010,7 @@ def _cudnn_rnn(inputs,
       "seed2": seed2,
       "name": name
   }
-  if use_cudnn_v2 is not "1":
+  if use_cudnn_v2 != "1":
     outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
   else:
     outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args)
@@ -1582,7 +1674,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
     """
 
     if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
-      raise ValueError("Invalid direction: %s", direction)
+      raise ValueError("Invalid direction: %s" % direction)
 
     super(_CudnnRNNNoInputC, self).__init__(
         self._rnn_mode,
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
index a87a5624c88d1d0af10055261dad55937ed6aeb0..3ecd755d86f6be47910aebbdb46d335d165427d8 100644
--- a/tensorflow/contrib/distribute/BUILD
+++ b/tensorflow/contrib/distribute/BUILD
@@ -26,7 +26,6 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy",
-        "//tensorflow/contrib/distribute/python:cross_tower_ops",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
         "//tensorflow/contrib/distribute/python:monitor",
         "//tensorflow/contrib/distribute/python:one_device_strategy",
@@ -35,6 +34,7 @@ py_library(
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:distribute_config",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index f82453f3b5ea01b8bb64a70bd49f5e3e831bb4e2..8a8dc159ade6f2a4a9b5ec29055ea4848492b29f 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -46,6 +46,9 @@ Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy`
 Take a very simple model consisting of a single layer:
 
 ```python
+import tensorflow as tf
+from tensorflow import keras
+
 inputs = tf.keras.layers.Input(shape=(1,))
 predictions = tf.keras.layers.Dense(1)(inputs)
 model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
@@ -90,8 +93,8 @@ Similarly, we can also call `evaluate` and `predict` as before using appropriate
 datasets.
 
 ```python
-model.evaluate(eval_dataset)
-model.predict(predict_dataset)
+model.evaluate(eval_dataset, steps=1)
+model.predict(predict_dataset, steps=1)
 ```
 
 That's all you need to train your model with Keras on multiple GPUs with
@@ -131,7 +134,7 @@ def model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, loss=loss)
 
   if mode == tf.estimator.ModeKeys.TRAIN:
-    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn())
+    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 ```
 
@@ -245,19 +248,17 @@ Let's use the same example for multi-worker. We'll start a cluster with 3
 workers doing synchronous all-reduce training. In the following code snippet, we
 start multi-worker training using `tf.estimator.train_and_evaluate`:
 
-
 ```python
 def model_main():
-  estimator = ...
   distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
       num_gpus_per_worker=2)
   config = tf.estimator.RunConfig(train_distribute=distribution)
+  estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
   train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
   eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
   tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
 ```
 
-
 **Note**: You don't have to set "TF\_CONFIG" manually if you use our provided
 Kubernetes template.
 
@@ -324,13 +325,13 @@ start training.
 On your laptop, you can run
 
 ```python
-estimator = ...
 distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
     num_gpus_per_worker=2)
 config = tf.estimator.RunConfig(
     experimental_distribute=tf.contrib.distribute.DistributeConfig(
         train_distribute=distribution,
         remote_cluster={"worker": ["host1:port", "host2:port", "host3:port"]}))
+estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
 train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
 eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
 tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index ab2f221dc6486666e914deb19dd56c7687606e2f..8ec73654e30e4967f318c558ba94301e84a206e4 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -25,13 +25,13 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy
-from tensorflow.contrib.distribute.python.cross_tower_ops import *
 from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
 from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
+from tensorflow.python.distribute.cross_device_ops import *
 from tensorflow.python.distribute.distribute_config import DistributeConfig
 from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server
 from tensorflow.python.training.distribute import *
@@ -46,6 +46,7 @@ _allowed_symbols = [
     'CrossDeviceOps',
     'DistributeConfig',
     'DistributionStrategy',
+    'DistributionStrategyExtended',
     'MirroredStrategy',
     'Monitor',
     'MultiWorkerAllReduce',
@@ -62,6 +63,7 @@ _allowed_symbols = [
     'get_loss_reduction',
     'get_replica_context',
     'has_distribution_strategy',
+    'in_cross_replica_context',
     'require_replica_context',
     'run_standard_tensorflow_server',
     'UpdateContext',
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 22736c799d276033c0ddc112d17e898be944c933..91282a8c1dab051da7894956d202c88c90e2fe39 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -16,45 +16,26 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 # TODO(priyag): Figure out testonly issues that are preventing us from
 # including our tests in pip for now.
 
-py_library(
-    name = "values",
-    srcs = ["values.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":input_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device_util",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:multi_device_iterator_ops",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
-        "@six_archive//:six",
-    ],
-)
-
 cuda_py_test(
     name = "values_test",
     srcs = ["values_test.py"],
     additional_deps = [
+        ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
-        ":values",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python:errors",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python:device_util",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -68,25 +49,9 @@ py_library(
     srcs = ["mirrored_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
-        ":shared_variable_creator",
-        ":values",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:device_util",
         "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:tape",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -95,16 +60,17 @@ py_library(
     srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
         ":mirrored_strategy",
-        ":values",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -116,7 +82,7 @@ cuda_py_test(
         ":combinations",
         ":multi_worker_test_base",
         ":parameter_server_strategy",
-        ":values",
+        ":strategy_test_lib",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -127,10 +93,12 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:layers",
         "//tensorflow/python:session",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -145,12 +113,13 @@ py_library(
     srcs = ["one_device_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":values",
-        "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -161,16 +130,16 @@ py_library(
     srcs = ["collective_all_reduce_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
-        ":cross_tower_utils",
         ":mirrored_strategy",
-        ":values",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:collective_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -233,28 +202,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "mirrored_strategy_test",
-    srcs = ["mirrored_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        ":mirrored_strategy",
-        ":multi_worker_test_base",
-        ":strategy_test_lib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 py_test(
     name = "one_device_strategy_test",
     srcs = ["one_device_strategy_test.py"],
@@ -270,35 +217,32 @@ py_test(
     ],
 )
 
+# TODO(priyag): Rename this test to mirrored_strategy_test
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
     srcs = ["mirrored_strategy_multigpu_test.py"],
     additional_deps = [
+        ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
-        ":values",
         ":strategy_test_lib",
-        "//tensorflow/python:distribute",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:layers",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
+    shard_count = 5,
     tags = [
         "guitar",
-        "no_pip",
         "multi_and_single_gpu",
-        # Do not perform the extra analysis on this test, because it is already
-        # performed for the `:mirrored_strategy_test` target.
-        "no_oss",
-        "noasan",
-        "notap",
-        "notsan",
+        "no_pip",
     ],
 )
 
@@ -337,12 +281,15 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":one_device_strategy",
-        ":values",
         "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -352,7 +299,6 @@ cuda_py_test(
     additional_deps = [
         ":collective_all_reduce_strategy",
         ":combinations",
-        ":cross_tower_utils",
         ":multi_worker_test_base",
         ":strategy_test_lib",
         "@absl_py//absl/testing:parameterized",
@@ -368,15 +314,13 @@ cuda_py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
     ],
     tags = [
         "multi_and_single_gpu",
         "no_pip",
-        # TODO(b/118820960): Re-enable this test in guitar.
-        "manual",
-        "noguitar",
     ],
 )
 
@@ -470,6 +414,7 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
+        "no_oss",  # http://b/119349471
         "no_pip",
     ],
 )
@@ -478,20 +423,11 @@ cuda_py_test(
     name = "keras_optimizer_v2_test",
     srcs = ["keras_optimizer_v2_test.py"],
     additional_deps = [
-        ":combinations",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/optimizer_v2:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/eager:test",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
+        ":keras_test_lib",
     ],
     tags = [
         "multi_and_single_gpu",
+        "no_oss",  # http://b/119349471
         "no_pip",
     ],
 )
@@ -509,7 +445,9 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/contrib/optimizer_v2:training",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/distribute",
+        "//tensorflow/python/distribute:distribute_config",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column",
@@ -517,7 +455,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
     ],
-    shard_count = 5,
+    shard_count = 48,
     tags = [
         "multi_and_single_gpu",
         "no_pip",
@@ -525,6 +463,7 @@ cuda_py_test(
         "noasan",
         "nomsan",
         "notsan",
+        "no_oss",  # http://b/119349471
     ],
 )
 
@@ -600,52 +539,16 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "shared_variable_creator",
-    srcs = ["shared_variable_creator.py"],
-    visibility = ["//tensorflow:internal"],
-)
-
-py_test(
-    name = "shared_variable_creator_test",
-    srcs = ["shared_variable_creator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":shared_variable_creator",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
-py_library(
-    name = "cross_tower_utils",
-    srcs = ["cross_tower_utils.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":values",
-        "//tensorflow/contrib/all_reduce:all_reduce_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:collective_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nccl_ops",
-    ],
-)
-
 cuda_py_test(
-    name = "cross_tower_utils_test",
-    srcs = ["cross_tower_utils_test.py"],
+    name = "cross_device_utils_test",
+    srcs = ["cross_device_utils_test.py"],
     additional_deps = [
         ":combinations",
-        ":cross_tower_utils",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
@@ -654,40 +557,20 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "cross_tower_ops",
-    srcs = ["cross_tower_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":cross_tower_utils",
-        ":values",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:device_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
-    ],
-)
-
 cuda_py_test(
-    name = "cross_tower_ops_test",
-    srcs = ["cross_tower_ops_test.py"],
+    name = "cross_device_ops_test",
+    srcs = ["cross_device_ops_test.py"],
     additional_deps = [
         ":combinations",
-        ":cross_tower_ops",
         ":multi_worker_test_base",
         ":mirrored_strategy",
-        ":values",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
@@ -697,37 +580,6 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "input_ops",
-    srcs = ["input_ops.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-cuda_py_test(
-    name = "input_ops_test",
-    srcs = ["input_ops_test.py"],
-    additional_deps = [
-        ":input_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python:util",
-    ],
-    tags = [
-        "no_pip",
-    ],
-)
-
 py_library(
     name = "keras_test_lib",
     testonly = 1,
@@ -757,8 +609,6 @@ cuda_py_test(
         "no_oss",  # TODO(b/117919883): Fix python error.
         "no_pip",
         "no_windows_gpu",
-        # TODO(b/118815591): Re-enable this test in guitar.)
-        "noguitar",
         "notsan",
     ],
 )
@@ -769,7 +619,6 @@ py_library(
     srcs = ["metrics_v1_test.py"],
     deps = [
         ":combinations",
-        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index b311644cb22898df082b0c803d1a8960fe159c98..31bd0e996a247a2fc01405fb3b8172a40853d698 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -43,7 +43,9 @@ class CheckpointUtilsWithDistributionStrategyTest(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       in_replica_mode=[True, False],
       mode=["graph"]))
   def testInitFromCheckpoint(self, distribution, in_replica_mode):
@@ -69,7 +71,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
 
     with ops.Graph().as_default() as g, distribution.scope():
       if in_replica_mode:
-        distribution.call_for_each_replica(init_and_verify, g)
+        distribution.call_for_each_replica(init_and_verify, args=[g])
       else:
         init_and_verify(g)
 
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index d9339f8f75acda3695d33c55409e921a9627bac7..906377b7395a520780e485461b83298320ebdcb3 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -18,21 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribute as distribute_lib
 
 
 # TODO(yuefengz): support in-graph replication.
-class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
+class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
   It is similar to the MirroredStrategy but it uses collective ops for
@@ -53,10 +54,20 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
       num_gpus_per_worker: number of local GPUs or GPUs per worker, the default
         is 0 meaning CPU only.
     """
+    super(CollectiveAllReduceStrategy, self).__init__(
+        CollectiveAllReduceExtended(self, num_gpus_per_worker))
+
+
+class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+  """Implementation of CollectiveAllReduceStrategy."""
+
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    distribute_lib.DistributionStrategyExtended.__init__(
+        self, container_strategy)
     self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local_worker(num_gpus_per_worker)
+    self._initialize_local_worker(container_strategy, num_gpus_per_worker)
 
-  def _initialize_local_worker(self, num_gpus_per_worker):
+  def _initialize_local_worker(self, container_strategy, num_gpus_per_worker):
     """Initializes the object for local training."""
     self._is_chief = True
     self._num_workers = 1
@@ -68,10 +79,11 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     else:
       local_devices = ["/device:CPU:0"]
 
-    self._collective_keys = cross_tower_utils.CollectiveKeys()
-    super(CollectiveAllReduceStrategy, self).__init__(
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    super(CollectiveAllReduceExtended, self).__init__(
+        container_strategy,
         devices=local_devices,
-        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
+        cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
             num_workers=1,
             num_gpus_per_worker=num_gpus_per_worker,
             collective_keys=self._collective_keys))
@@ -83,8 +95,8 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                  local_devices)
 
-  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
-                               task_type, task_id):
+  def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
+                               cluster_spec, task_type, task_id):
     """Initializes the object for multi-worker training."""
     if task_type is None or task_id is None:
       raise ValueError("When `cluster_spec` is given, you must also specify "
@@ -94,8 +106,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
           "Unrecognized task_type: %r, valid task types are: \"chief\", "
           "\"worker\"." % task_type)
     cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._num_workers = len(cluster_spec.as_dict().get("worker", [])) + len(
-        cluster_spec.as_dict().get("chief", []))
+    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
     if not self._num_workers:
       raise ValueError("No `worker` or `chief` tasks can be found in "
                        "`cluster_spec`.")
@@ -112,10 +123,11 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     else:
       local_devices = [worker_device]
 
-    self._collective_keys = cross_tower_utils.CollectiveKeys()
-    super(CollectiveAllReduceStrategy, self).__init__(
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    super(CollectiveAllReduceExtended, self).__init__(
+        container_strategy,
         devices=local_devices,
-        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
+        cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
             num_workers=self._num_workers,
             num_gpus_per_worker=num_gpus_per_worker,
             collective_keys=self._collective_keys))
@@ -202,17 +214,35 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     return mirrored_strategy._create_mirrored_variable(
         devices, _real_mirrored_creator, *args, **kwargs)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
     # TODO(yuefengz): shard the dataset.
-    return values.PerDeviceDataset(
+    return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._devices, True)
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec is None:
+      input_pipeline_id = 0
+    else:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=self._num_workers,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+
+    return values.InputFunctionIterator(
+        input_fn, [(self._default_device, self._devices)], [input_context])
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the object.
 
     Args:
@@ -229,8 +259,9 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
       # If a `cluster_spec` is already passed in, do nothing here.
       # TODO(yuefengz): check `cluster_spec` is the same if this object has
       # already been initialized with a `cluster_spec`.
-      self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec,
-                                    task_type, task_id)
+      self._initialize_multi_worker(
+          self._container_strategy(), self._num_gpus_per_worker, cluster_spec,
+          task_type, task_id)
 
     if not session_config:
       return
@@ -271,11 +302,11 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
         "/job:%s/task:%d" % (self._task_type, self._task_id))
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
     return True
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return True
 
   @property
@@ -287,6 +318,10 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     return self._is_chief
 
   @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     return len(self._devices) * self._num_workers
 
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 19b59513d81c5b0cebf5e44aa66b110db86a91c8..eb2b859aa559dd0c72351a009149ffdcb3c96b7c 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -23,13 +23,18 @@ import numpy as np
 
 from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
@@ -54,8 +59,6 @@ class CollectiveAllReduceStrategyTestBase(
     self._run_options = config_pb2.RunOptions()
     self._run_options.experimental.collective_graph_key = 6
 
-    self._sess_config = config_pb2.ConfigProto()
-
     # We use a different key_base for each test so that collective keys won't be
     # reused.
     # TODO(yuefengz, tucker): enable it to reuse collective keys in different
@@ -66,33 +69,37 @@ class CollectiveAllReduceStrategyTestBase(
   def _get_test_object(self, task_type, task_id, num_gpus=0):
     distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         num_gpus_per_worker=num_gpus)
+    session_config = config_pb2.ConfigProto()
     if task_type and task_id is not None:
       distribution.configure(
-          session_config=self._sess_config,
+          session_config=session_config,
           cluster_spec=self._cluster_spec,
           task_type=task_type,
           task_id=task_id)
-    collective_keys = cross_tower_utils.CollectiveKeys(
+    collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_start=num_gpus * 100 +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
-    distribution._collective_keys = collective_keys
-    distribution._cross_tower_ops._collective_keys = collective_keys
+    distribution.extended._collective_keys = collective_keys
+    distribution.extended._cross_device_ops._collective_keys = collective_keys
     if task_type and task_id is not None:
-      return distribution, 'grpc://' + self._cluster_spec[task_type][task_id]
+      return distribution, 'grpc://' + self._cluster_spec[task_type][
+          task_id], session_config
     else:
-      return distribution, ''
+      return distribution, '', session_config
 
   def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
-    d, master_target = self._get_test_object(task_type, task_id, num_gpus)
+    d, master_target, config = self._get_test_object(task_type, task_id,
+                                                     num_gpus)
     with ops.Graph().as_default(), \
-         self.cached_session(config=self._sess_config,
+         self.cached_session(config=config,
                              target=master_target) as sess, \
          d.scope():
-      l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker)
+      l = core.Dense(1, use_bias=False,
+                     name='gpu_%d' % d.extended._num_gpus_per_worker)
 
       def loss_fn(x):
         y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
@@ -117,7 +124,7 @@ class CollectiveAllReduceStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, one)
+        g_v = d.call_for_each_replica(grad_fn, args=[one])
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
@@ -127,7 +134,7 @@ class CollectiveAllReduceStrategyTestBase(
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
             g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
                 d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -135,7 +142,7 @@ class CollectiveAllReduceStrategyTestBase(
 
       before_out, after_out = step()
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
       sess.run(
@@ -154,7 +161,8 @@ class CollectiveAllReduceStrategyTestBase(
       return error_after < error_before
 
   def _test_complex_model(self, task_type, task_id, num_gpus):
-    d, master_target = self._get_test_object(task_type, task_id, num_gpus)
+    d, master_target, config = self._get_test_object(task_type, task_id,
+                                                     num_gpus)
 
     def model_fn():
       """Mnist model with synthetic input."""
@@ -193,7 +201,7 @@ class CollectiveAllReduceStrategyTestBase(
       return train_op
 
     with ops.Graph().as_default(), \
-         self.cached_session(config=self._sess_config,
+         self.cached_session(config=config,
                              target=master_target) as sess:
       with d.scope():
         train_op = d.call_for_each_replica(model_fn)
@@ -204,10 +212,10 @@ class CollectiveAllReduceStrategyTestBase(
       return True
 
   def _test_variable_initialization(self, task_type, task_id, num_gpus):
-    distribution, master_target = self._get_test_object(task_type, task_id,
-                                                        num_gpus)
+    distribution, master_target, config = self._get_test_object(
+        task_type, task_id, num_gpus)
     with ops.Graph().as_default(), \
-         self.cached_session(config=self._sess_config,
+         self.cached_session(config=config,
                              target=master_target) as sess, \
          distribution.scope():
 
@@ -222,7 +230,7 @@ class CollectiveAllReduceStrategyTestBase(
       x = distribution.call_for_each_replica(model_fn)
       reduced_x = distribution.unwrap(
           distribution.reduce(
-              variable_scope.VariableAggregation.MEAN, x,
+              reduce_util.ReduceOp.MEAN, x,
               destinations='/cpu:0'))[0]
       x = distribution.unwrap(x)[0]
 
@@ -237,9 +245,42 @@ class CollectiveAllReduceStrategyTestBase(
                                                        reduced_x_value)))
     return np.allclose(x_value, reduced_x_value, atol=1e-5)
 
+  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
+                              expected_values):
+    distribution, master_target, config = self._get_test_object(
+        task_type, task_id, num_gpus)
+    devices = distribution.extended.worker_devices
+
+    with ops.Graph().as_default(), \
+         self.cached_session(config=config,
+                             target=master_target) as sess:
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        sess.run([values.select_device(d, next_element) for d in devices])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
 
 class DistributedCollectiveAllReduceStrategyTest(
-    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
+    CollectiveAllReduceStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -267,7 +308,7 @@ class DistributedCollectiveAllReduceStrategyTest(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testVariableInitialization(self, num_gpus):
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_variable_initialization,
         self._cluster_spec,
@@ -277,10 +318,30 @@ class DistributedCollectiveAllReduceStrategyTest(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
+  # TODO(yuefengz): Update how we use num_gpus and required_gpus
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
+  def testMakeInputFnIterator(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    # We use CPU as the device when num_gpus = 0
+    devices_per_worker = max(1, num_gpus)
+    expected_values = [[i+j for j in range(devices_per_worker)]
+                       for i in range(0, 100, devices_per_worker)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=3*devices_per_worker,
+        expected_num_input_pipelines=3,
+        expected_input_pipeline_id=1)  # because task_id = 1
+    self._test_input_fn_iterator('worker', 1, num_gpus,
+                                 input_fn, expected_values)
+
 
 class DistributedCollectiveAllReduceStrategyTestWithChief(
     CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
@@ -321,20 +382,36 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
 
 
 class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
+                                       strategy_test_lib.DistributionTestBase,
                                        parameterized.TestCase):
 
   def testMinimizeLossGraph(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._test_minimize_loss_graph(None, None, num_gpus)
 
   def testComplexModel(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._test_complex_model(None, None, num_gpus)
 
+  def testMakeInputFnIterator(self, num_gpus=2):
+    # Collective ops doesn't support strategy with one device.
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    self._test_input_fn_iterator(None, None, num_gpus,
+                                 input_fn, expected_values)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 63a163e76cdd99c73399c657cbe9bc3d010369d2..f3ce547f4d0ffc8d507c77adb22293edf7c54373 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -168,6 +168,8 @@ def _augment_with_special_arguments(test_method):
       if GPU_TEST:
         self.skipTest("Test that doesn't require GPUs.")
     elif context.num_gpus() < required_gpus:
+      # TODO(priyag): Consider allowing tests in graph mode using soft
+      # placement.
       self.skipTest(
           "{} GPUs are not available for this test. {} GPUs are available".
           format(required_gpus, context.num_gpus()))
@@ -335,17 +337,35 @@ tpu_strategy_one_step = NamedDistribution(
     "TPUOneStep", lambda: tpu_lib.TPUStrategy(
         TPUClusterResolver(""), steps_per_run=1),
     required_tpu=True)
-# Note that we disable prefetching for testing since prefetching makes
-# the input non-deterministic.
+mirrored_strategy_with_one_cpu = NamedDistribution(
+    "Mirrored1CPU",
+    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
+mirrored_strategy_with_one_gpu = NamedDistribution(
+    "Mirrored1GPU",
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
+    required_gpus=1)
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
-    lambda: mirrored_lib.MirroredStrategy(
-        ["/gpu:0", "/cpu:0"], prefetch_on_device=False),
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
     required_gpus=1)
 mirrored_strategy_with_two_gpus = NamedDistribution(
     "Mirrored2GPUs",
-    lambda: mirrored_lib.MirroredStrategy(
-        ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
+    required_gpus=2)
+core_mirrored_strategy_with_one_cpu = NamedDistribution(
+    "CoreMirrored1CPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"]))
+core_mirrored_strategy_with_one_gpu = NamedDistribution(
+    "CoreMirrored1GPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]),
+    required_gpus=1)
+core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
+    "CoreMirroredCPUAndGPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]),
+    required_gpus=1)
+core_mirrored_strategy_with_two_gpus = NamedDistribution(
+    "CoreMirrored2GPUs",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]),
     required_gpus=2)
 
 
@@ -377,8 +397,11 @@ def distributions_and_v1_optimizers():
   """A common set of combination with DistributionStrategies and Optimizers."""
   return combine(
       distribution=[
-          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
-          mirrored_strategy_with_two_gpus
+          one_device_strategy,
+          mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus,
+          core_mirrored_strategy_with_gpu_and_cpu,
+          core_mirrored_strategy_with_two_gpus,
       ],
       optimizer_fn=optimizers_v1)
 
@@ -387,7 +410,10 @@ def distributions_and_v2_optimizers():
   """DistributionStrategies and V2 Optimizers."""
   return combine(
       distribution=[
-          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
-          mirrored_strategy_with_two_gpus
+          one_device_strategy,
+          mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus,
+          core_mirrored_strategy_with_gpu_and_cpu,
+          core_mirrored_strategy_with_two_gpus,
       ],
       optimizer_fn=optimizers_v2)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
similarity index 74%
rename from tensorflow/contrib/distribute/python/cross_tower_ops_test.py
rename to tensorflow/contrib/distribute/python/cross_device_ops_test.py
index 6a9e8e00c02411d6486f30146f7f7d86ecd2fa9c..40410b90be7d9d9ed20fb4e696565cf79c044553 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -24,28 +24,28 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import device_util
 
 
-def _make_per_device(values, devices, regroup=False):
-  devices = cross_tower_ops_lib.get_devices_from(devices)
+def _make_per_replica(values, devices, regroup=False):
+  devices = cross_device_ops_lib.get_devices_from(devices)
   assert len(values) == len(devices)
 
-  # We simulate the result of regroup called on PerDevice which strips the
-  # PerDevice wrapper if it has only one value.
+  # We simulate the result of regroup called on PerReplica which strips the
+  # PerReplica wrapper if it has only one value.
   if len(values) == 1 and regroup:
     with ops.device(devices[0]):
       placed_v = array_ops.identity(values[0])
@@ -56,7 +56,7 @@ def _make_per_device(values, devices, regroup=False):
     with ops.device(d):
       placed_v = array_ops.identity(v)
     index[d] = placed_v
-  return value_lib.PerDevice(index)
+  return value_lib.PerReplica(index)
 
 
 # pylint: disable=g-doc-args,g-doc-return-or-yield
@@ -66,7 +66,7 @@ def _fake_mirrored(value, devices):
   All components of the returned Mirrored have the same objects, which is not
   true in reality.
   """
-  devices = cross_tower_ops_lib.get_devices_from(devices)
+  devices = cross_device_ops_lib.get_devices_from(devices)
   return value_lib.Mirrored(
       {d: v for d, v in zip(devices, [value] * len(devices))})
 
@@ -118,15 +118,15 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
           self.assertEqual(
               sess.run(list(left._index.values())), list(right._index.values()))
 
-  def _testReductionAndBroadcast(self, cross_tower_ops, distribution):
-    devices = distribution.worker_devices
+  def _testReductionAndBroadcast(self, cross_device_ops, distribution):
+    devices = distribution.extended.worker_devices
 
     values = [constant_op.constant(float(d)) for d in range(len(devices))]
-    per_device = _make_per_device(values, devices)
+    per_replica = _make_per_replica(values, devices)
     mean = (len(devices) - 1.) / 2.
 
     values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
-    per_device_2 = _make_per_device(values_2, devices)
+    per_replica_2 = _make_per_replica(values_2, devices)
     mean_2 = mean + 1.
 
     destination_mirrored = _fake_mirrored(1., devices)
@@ -142,41 +142,43 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
     # test reduce()
     for destinations in all_destinations:
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.MEAN,
-              per_device,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.MEAN,
+              per_replica,
               destinations=destinations),
           _fake_mirrored(mean, destinations))
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.MEAN,
-              per_device_2,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.MEAN,
+              per_replica_2,
               destinations=destinations),
           _fake_mirrored(mean_2, destinations))
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.SUM, per_device,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.SUM, per_replica,
               destinations=destinations),
           _fake_mirrored(mean * len(devices), destinations))
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.SUM,
-              per_device_2,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.SUM,
+              per_replica_2,
               destinations=destinations),
           _fake_mirrored(mean_2 * len(devices), destinations))
 
     # test batch_reduce()
     for d1, d2 in itertools.product(all_destinations, all_destinations):
       self._assert_values_equal(
-          cross_tower_ops.batch_reduce(vs.VariableAggregation.MEAN,
-                                       [(per_device, d1), (per_device_2, d2)]),
+          cross_device_ops.batch_reduce(
+              reduce_util.ReduceOp.MEAN,
+              [(per_replica, d1), (per_replica_2, d2)]),
           [
               _fake_mirrored(mean, d1),
               _fake_mirrored(mean_2, d2)
           ])
       self._assert_values_equal(
-          cross_tower_ops.batch_reduce(vs.VariableAggregation.SUM,
-                                       [(per_device, d1), (per_device_2, d2)]),
+          cross_device_ops.batch_reduce(
+              reduce_util.ReduceOp.SUM,
+              [(per_replica, d1), (per_replica_2, d2)]),
           [
               _fake_mirrored(mean * len(devices), d1),
               _fake_mirrored(mean_2 * len(devices), d2)
@@ -185,7 +187,7 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
     # test broadcast()
     for destinations in all_destinations:
       self._assert_values_equal(
-          cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
+          cross_device_ops.broadcast(constant_op.constant(1.), destinations),
           _fake_mirrored(1., destinations))
 
 
@@ -194,62 +196,65 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
   # combinations module so that we can pass in devices instead of a distribution
   # strategy.
   reduction_to_one_combinations = combinations.combine(
-      cross_tower_ops=[
+      cross_device_ops=[
           combinations.NamedObject(
               "DefaultReductionToOneDeviceCrossDeviceOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
           combinations.NamedObject(
               "ReductionToCPUDeviceCrossDeviceOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
                   reduce_to_device=_cpu_device)),
           combinations.NamedObject(
               "AccumulateNCrossDeviceOp",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
                   accumulation_fn=math_ops.accumulate_n)),
       ],
       distribution=[
           combinations.one_device_strategy,
           combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.mirrored_strategy_with_two_gpus
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus
       ],
       mode=["graph", "eager"])
   allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
+      cross_device_ops=[
           combinations.NamedObject(
               "AllReduce",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
+              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
           combinations.NamedObject(
               "HierarchicalCopy",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps(
+              cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 8, 0, 0)),
           combinations.NamedObject(
               "AllReduceNoGradientRepacking",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
+              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
           combinations.NamedObject(
               "HierarchicalCopyAggregateSmallTensors",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps(
+              cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 0, 100, 10))
       ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      distribution=[combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       mode=["graph", "eager"])
 
   @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+  def testReductionAndBroadcast(self, cross_device_ops, distribution):
     with distribution.scope():
-      self._testReductionAndBroadcast(cross_tower_ops, distribution)
+      self._testReductionAndBroadcast(cross_device_ops, distribution)
 
   def testChooseAlgorithm(self):
     device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                     [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
     self.assertEqual(result._num_packs, 8)
 
     # if there are only 4 devices
     device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "nccl")
     self.assertEqual(result._num_packs, 1)
 
@@ -257,16 +262,16 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                     [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                     [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
     self.assertEqual(result._num_packs, 8)
 
     # if not dgx1-like links
     device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                     [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "nccl")
     self.assertEqual(result._num_packs, 1)
 
@@ -277,9 +282,9 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     devices = ["/cpu:0", "/gpu:0"]
     t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
     t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
-    per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1})
-    result = cross_tower_ops_lib._simple_reduce(
-        per_device, devices[0], math_ops.add_n, vs.VariableAggregation.SUM)
+    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
+    result = cross_device_ops_lib._simple_reduce(
+        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)
 
     # Test that the result is semantically equal to both the concatenated
     # IndexedSlices with and without duplicate indices.
@@ -292,41 +297,42 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
 
   @combinations.generate(
       combinations.combine(
-          cross_tower_ops_instance=[
+          cross_device_ops_instance=[
               combinations.NamedObject(
                   "ReductionToOneDeviceCrossDeviceOps",
-                  cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+                  cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
               combinations.NamedObject(
                   "AllReduceCrossDeviceOps",
-                  cross_tower_ops_lib.AllReduceCrossDeviceOps())
+                  cross_device_ops_lib.AllReduceCrossDeviceOps())
           ],
-          aggregation=[vs.VariableAggregation.SUM, vs.VariableAggregation.MEAN],
+          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
           batch_reduce=[True, False],
           mode=["graph", "eager"],
           required_gpus=1))
-  def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, aggregation,
+  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                  batch_reduce):
     devices = ["/cpu:0", "/gpu:0"]
     dense_shape = [5, 2]
     t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
     t1 = _make_indexed_slices(
         [[3., 4.], [5., 6.]], [1, 3], dense_shape, devices[1])
-    per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1})
+    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
 
     if batch_reduce:
-      result = cross_tower_ops_instance.batch_reduce(aggregation,
-                                                     [(per_device, devices)])
+      result = cross_device_ops_instance.batch_reduce(
+          reduce_op, [(per_replica, devices)])
     else:
-      result = cross_tower_ops_instance.reduce(aggregation, per_device, devices)
+      result = cross_device_ops_instance.reduce(
+          reduce_op, per_replica, devices)
 
     total_indices_with_dups = [1, 1, 3]
     total_indices_without_dups = [1, 3]
 
-    if aggregation == vs.VariableAggregation.SUM:
+    if reduce_op == reduce_util.ReduceOp.SUM:
       total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
       total_values_without_dups = [[4., 6.], [5., 6.]]
     else:
-      assert aggregation == vs.VariableAggregation.MEAN
+      assert reduce_op == reduce_util.ReduceOp.MEAN
       total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
       total_values_without_dups = [[2., 3.], [2.5, 3.]]
 
@@ -353,49 +359,65 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
       "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
   ]
   multi_worker_allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
+      cross_device_ops=[
           combinations.NamedObject(
               "MultiWorkerAllReduce",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
           combinations.NamedObject(
               "MultiWorkerAllReducePack",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
           combinations.NamedObject(
               "MultiWorkerAllReduceAggregation",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
           combinations.NamedObject(
               "MultiWorkerAllReduceMultipleSpecs",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                       ("xring", 2, -1)], 0, 0, 0)),
       ],
       distribution=[
           combinations.NamedDistribution(
               "MirroredCPU",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=0),
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=0),
               required_gpus=0),
           combinations.NamedDistribution(
               "Mirrored1GPU",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=1),
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=1),
               required_gpus=1),
           combinations.NamedDistribution(
               "Mirrored2GPUs",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=2),
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2),
+              required_gpus=2),
+          # pylint: disable=g-long-lambda
+          combinations.NamedDistribution(
+              "CoreMirroredCPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=0),
+              required_gpus=0),
+          combinations.NamedDistribution(
+              "CoreMirrored1GPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=1),
+              required_gpus=1),
+          combinations.NamedDistribution(
+              "CoreMirrored2GPUs",
+              lambda: mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=2),
               required_gpus=2),
       ],
       mode=["graph"])
 
   @combinations.generate(multi_worker_allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+  def testReductionAndBroadcast(self, cross_device_ops, distribution):
     distribution.configure(cluster_spec={
         "worker":
             ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"]
     })
     with distribution.scope():
-      self._testReductionAndBroadcast(cross_tower_ops, distribution)
+      self._testReductionAndBroadcast(cross_device_ops, distribution)
 
 
 class MultiWorkerCollectiveAllReduceTest(
@@ -416,7 +438,7 @@ class MultiWorkerCollectiveAllReduceTest(
     MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
 
   def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
-    collective_keys = cross_tower_utils.CollectiveKeys(
+    collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
         instance_key_start=num_gpus * 100 +
@@ -424,7 +446,7 @@ class MultiWorkerCollectiveAllReduceTest(
         instance_key_with_id_start=num_gpus * 10000 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base)
     if local_mode:
-      collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
+      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
           1, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
@@ -432,7 +454,7 @@ class MultiWorkerCollectiveAllReduceTest(
         devices = ["/device:CPU:0"]
       return collective_all_reduce_ops, devices, ""
     else:
-      collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
+      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
           3, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = [
@@ -478,11 +500,11 @@ class MultiWorkerCollectiveAllReduceTest(
       # Collective ops doesn't support scalar tensors, so we have to construct
       # 1-d tensors.
       values = [constant_op.constant([float(d)]) for d in range(len(devices))]
-      per_device = _make_per_device(values, devices, regroup=True)
+      per_replica = _make_per_replica(values, devices, regroup=True)
       mean = np.array([(len(devices) - 1.) / 2.])
 
       values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))]
-      per_device_2 = _make_per_device(values_2, devices)
+      per_replica_2 = _make_per_replica(values_2, devices)
       mean_2 = np.array([mean[0] + 1.])
 
       destination_mirrored = _fake_mirrored(1., devices)
@@ -499,27 +521,27 @@ class MultiWorkerCollectiveAllReduceTest(
       for destinations in all_destinations:
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.MEAN,
-                per_device,
+                reduce_util.ReduceOp.MEAN,
+                per_replica,
                 destinations=destinations),
             _fake_mirrored(mean, destinations), sess)
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.MEAN,
-                per_device_2,
+                reduce_util.ReduceOp.MEAN,
+                per_replica_2,
                 destinations=destinations),
             _fake_mirrored(mean_2, destinations), sess)
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.SUM,
-                per_device,
+                reduce_util.ReduceOp.SUM,
+                per_replica,
                 destinations=destinations),
             _fake_mirrored(mean * len(devices) * num_workers, destinations),
             sess)
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.SUM,
-                per_device_2,
+                reduce_util.ReduceOp.SUM,
+                per_replica_2,
                 destinations=destinations),
             _fake_mirrored(mean_2 * len(devices) * num_workers, destinations),
             sess)
@@ -527,17 +549,17 @@ class MultiWorkerCollectiveAllReduceTest(
       # test batch_reduce()
       for d1, d2 in itertools.product(all_destinations, all_destinations):
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(vs.VariableAggregation.MEAN,
-                                               [(per_device, d1),
-                                                (per_device_2, d2)]),
+            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN,
+                                               [(per_replica, d1),
+                                                (per_replica_2, d2)]),
             [
                 _fake_mirrored(mean, d1),
                 _fake_mirrored(mean_2, d2)
             ], sess)
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(vs.VariableAggregation.SUM,
-                                               [(per_device, d1),
-                                                (per_device_2, d2)]),
+            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM,
+                                               [(per_replica, d1),
+                                                (per_replica_2, d2)]),
             [
                 _fake_mirrored(mean * len(devices) * num_workers, d1),
                 _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
similarity index 75%
rename from tensorflow/contrib/distribute/python/cross_tower_utils_test.py
rename to tensorflow/contrib/distribute/python/cross_device_utils_test.py
index d25964fa41adc7b1c9164a4ffe49c4c5532f76ac..6086eba0984782f5e85235142817569bee135df0 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for cross_tower_utils."""
+"""Tests for cross_device_utils."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,8 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
@@ -43,7 +43,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
     t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]])
     total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
-    result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1])
+    result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
     self._assert_values_equal(total, result)
 
   @test_util.run_in_graph_and_eager_modes
@@ -53,7 +53,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
     total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
-    result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1])
+    result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
     self.assertIsInstance(result, ops.IndexedSlices)
     self._assert_values_equal(total, result)
 
@@ -62,7 +62,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
     n = 2
     expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
-    result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n)
+    result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n)
     self._assert_values_equal(expected, result)
 
   @test_util.run_in_graph_and_eager_modes
@@ -71,7 +71,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     n = 2
     expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
-    result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n)
+    result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n)
     self.assertIsInstance(result, ops.IndexedSlices)
     self._assert_values_equal(expected, result)
 
@@ -79,7 +79,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
   def testIsIndexedSlices(self):
     t = math_ops._as_indexed_slices(
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(t))
+    self.assertTrue(cross_device_utils.contains_indexed_slices(t))
 
   @test_util.run_in_graph_and_eager_modes
   def testContainsIndexedSlices_List(self):
@@ -87,7 +87,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices([t0, t1]))
+    self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1]))
 
   @test_util.run_in_graph_and_eager_modes
   def testContainsIndexedSlices_Tuple(self):
@@ -95,27 +95,16 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1)))
+    self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1)))
 
   @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_PerDevice(self):
+  def testContainsIndexedSlices_PerReplica(self):
     t0 = math_ops._as_indexed_slices(
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_device = value_lib.PerDevice({"/gpu:0": t0, "/cpu:0": t1})
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_PerDeviceMapOutput(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_device = value_lib.PerDevice({
-        "/gpu:0": value_lib.MapOutput([t0]),
-        "/cpu:0": value_lib.MapOutput([t1])})
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device))
+    per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1})
+    self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
 
   @combinations.generate(combinations.combine(
       mode=["graph", "eager"],
@@ -124,7 +113,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     with ops.device("/cpu:0"):
       t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
     destination = "/gpu:0"
-    result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
+    result = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
         t, destination)
 
     self._assert_values_equal(t, result)
@@ -139,7 +128,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
       t = math_ops._as_indexed_slices(
           constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     destination = "/gpu:0"
-    result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
+    result = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
         t, destination)
 
     self.assertIsInstance(result, ops.IndexedSlices)
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index a1355c0b09e51c18cc4f8967dfc2c472d63593b9..e17085628ba6d1dfc79839fd824801723f07a518 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary.writer import writer_cache
@@ -63,7 +63,9 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           distribution=[
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
           ],
           use_train_and_evaluate=[True, False]))
   def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
@@ -75,12 +77,12 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
     train_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices),
+        batch_size=batch_size // distribution.num_replicas_in_sync,
         shuffle=True)
     eval_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices),
+        batch_size=batch_size // distribution.num_replicas_in_sync,
         shuffle=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x={'x': data}, batch_size=batch_size, shuffle=False)
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
index 018512ae5a22eaa7fb78a8c4e5918fec22eb8178..0f35657a8099523b6ba5b8f0a1a2f289c06b531a 100644
--- a/tensorflow/contrib/distribute/python/estimator_training_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -45,11 +45,13 @@ from tensorflow.python.estimator import training as estimator_training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export as export_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import session_manager
+
 
 BATCH_SIZE = 10
 LABEL_DIMENSION = 2
@@ -202,10 +204,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     train_input_fn = self.dataset_input_fn(
         x={"x": DATA},
         y=DATA,
-        batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
+        batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync,
         shuffle=True)
     if eval_distribute:
-      eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
+      eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync
     else:
       eval_batch_size = BATCH_SIZE
     eval_input_fn = self.dataset_input_fn(
@@ -291,19 +293,20 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           train_distribute_cls=[
               collective_all_reduce_strategy.CollectiveAllReduceStrategy,
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy
           ],
           eval_distribute_cls=[
-              None, mirrored_strategy.MirroredStrategy,
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy,
           ],
           required_gpus=[0, 1]))
   def test_complete_flow_standalone_client(self, train_distribute_cls,
                                            eval_distribute_cls):
-    try:
-      train_distribute = train_distribute_cls(num_gpus=context.num_gpus())
-    except TypeError:
-      train_distribute = train_distribute_cls(num_gpus_per_worker=2)
+    train_distribute = train_distribute_cls(
+        num_gpus_per_worker=context.num_gpus())
 
     if eval_distribute_cls:
       eval_distribute = eval_distribute_cls(
@@ -324,10 +327,12 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           mode=["graph"],
           train_distribute_cls=[
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
           ],
           eval_distribute_cls=[
               None,
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
           ],
           required_gpus=[0, 1]))
   def test_estimator_standalone_client(self, train_distribute_cls,
@@ -407,6 +412,7 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           ],
           eval_distribute_cls=[
               None, mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy,
           ],
           required_gpus=[0, 1]))
@@ -451,8 +457,15 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
-          train_distribute_cls=[mirrored_strategy.MirroredStrategy],
-          eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy],
+          train_distribute_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy
+          ],
+          eval_distribute_cls=[
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy
+          ],
           required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
@@ -508,7 +521,8 @@ class RunConfigTest(test.TestCase):
         "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}):
       run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=2)))
 
   def test_should_run_distribute_coordinator(self):
     """Tests that should_run_distribute_coordinator return a correct value."""
@@ -531,10 +545,12 @@ class RunConfigTest(test.TestCase):
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
       config_with_train_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=2)))
       config_with_eval_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              eval_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=2)))
     self.assertTrue(
         dc_training.should_run_distribute_coordinator(
             config_with_train_distribute))
@@ -547,26 +563,27 @@ class RunConfigTest(test.TestCase):
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
       config = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  num_gpus_per_worker=2)))
     self.assertFalse(dc_training.should_run_distribute_coordinator(config))
 
   def test_init_run_config_duplicate_distribute(self):
     with self.assertRaises(ValueError):
       run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy(),
+          train_distribute=mirrored_strategy.CoreMirroredStrategy(),
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy()))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy()))
 
     with self.assertRaises(ValueError):
       run_config_lib.RunConfig(
-          eval_distribute=mirrored_strategy.MirroredStrategy(),
+          eval_distribute=mirrored_strategy.CoreMirroredStrategy(),
           experimental_distribute=DistributeConfig(
-              eval_distribute=mirrored_strategy.MirroredStrategy()))
+              eval_distribute=mirrored_strategy.CoreMirroredStrategy()))
 
   def test_init_run_config_none_distribute_coordinator_mode(self):
     # We don't use distribute coordinator for local training.
     config = run_config_lib.RunConfig(
-        train_distribute=mirrored_strategy.MirroredStrategy())
+        train_distribute=mirrored_strategy.CoreMirroredStrategy())
     dc_training.init_run_config(config, {})
     self.assertIsNone(config._distribute_coordinator_mode)
 
@@ -574,7 +591,7 @@ class RunConfigTest(test.TestCase):
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
       config = run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy())
+          train_distribute=mirrored_strategy.CoreMirroredStrategy())
       self.assertIsNone(config._distribute_coordinator_mode)
 
     # When `train_distribute` is not specified, don't use distribute
@@ -590,7 +607,7 @@ class RunConfigTest(test.TestCase):
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
       config = run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy())
+          train_distribute=mirrored_strategy.CoreMirroredStrategy())
     self.assertEqual(config._distribute_coordinator_mode,
                      dc.CoordinatorMode.INDEPENDENT_WORKER)
 
@@ -599,7 +616,7 @@ class RunConfigTest(test.TestCase):
     # `experimental.remote_cluster` is set use distribute coordinator with
     # STANDALONE_CLIENT mode.
     config = run_config_lib.RunConfig(
-        train_distribute=mirrored_strategy.MirroredStrategy(),
+        train_distribute=mirrored_strategy.CoreMirroredStrategy(),
         experimental_distribute=DistributeConfig(
             remote_cluster={"chief": ["fake_worker"]}))
     self.assertEqual(config._distribute_coordinator_mode,
@@ -607,5 +624,15 @@ class RunConfigTest(test.TestCase):
 
 
 if __name__ == "__main__":
+  # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly.
+  orig_init = session_manager.SessionManager.__init__
+
+  def new_init(*args, **kwargs):
+    kwargs.pop("recovery_wait_secs", None)
+    kwargs["recovery_wait_secs"] = 0.5
+    orig_init(*args, **kwargs)
+
+  session_manager.SessionManager.__init__ = new_init
+
   with test.mock.patch.object(sys, "exit", os._exit):
     test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index c7036daa3e3321a29e4fc9ae30449fbf15b69b1b..0fd3acd045170c04ebdaa9c84d0cb7267a4bc68a 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -61,7 +61,6 @@ def get_input_datasets(use_bfloat16=False):
   # train dataset
   train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
   train_ds = train_ds.repeat()
-  train_ds = train_ds.shuffle(100)
   train_ds = train_ds.map(lambda x, y: (tf.cast(x, cast_dtype), y))
   train_ds = train_ds.batch(64, drop_remainder=True)
 
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index f4c222f26c3f6501cd78a69dd6a6d9a442a6bd24..fba06283ce560390b9a408ac7ceb30bbe17a754b 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -25,23 +25,28 @@ import numpy as np
 import six
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import distribution_strategy_context as ds_context
 
 
 class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
@@ -64,7 +69,9 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
           distribution=[
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
           ],
           use_train_and_evaluate=[True, False]))
   def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
@@ -76,11 +83,11 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
     train_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices))
+        batch_size=batch_size // distribution.num_replicas_in_sync)
     eval_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices))
+        batch_size=batch_size // distribution.num_replicas_in_sync)
     predict_input_fn = numpy_io.numpy_input_fn(
         x={'x': data}, batch_size=batch_size, shuffle=False)
 
@@ -136,44 +143,51 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
       shutil.rmtree(self._model_dir)
 
 
-class MirroredStrategyOptimizerV2Test(test.TestCase):
+def get_model():
+  x = keras.layers.Input(shape=(3,), name='input')
+  y = keras.layers.Dense(4, name='dense')(x)
+  model = keras.Model(x, y)
+  return model
 
-  def testKerasOptimizerWithUnequalInput(self):
-    if context.num_gpus() < 1:
-      self.skipTest('Not enough GPUs.')
 
-    def create_fn(device_id):
+class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def testKerasOptimizerWithUnequalInput(self, distribution):
+    def create_fn():
       var = variables.Variable(
           2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
       # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
-      loss = (device_id + 1) * var
+      loss = math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
       optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
       train_op = optimizer.minimize(loss, var_list=[var])
       m = optimizer.get_slot(var, 'm')
       v = optimizer.get_slot(var, 'v')
-      return (var, m, v, train_op, optimizer.iteration)
+      return (var, m, v, train_op, optimizer.iterations)
 
     devices = ['/device:GPU:0', '/device:CPU:0']
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      (var, m, v, op, counter) = dist.call_for_each_replica(
-          create_fn, dist.worker_device_index, run_concurrently=False)
+    with distribution.scope():
+      (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn)
       self.evaluate(variables.global_variables_initializer())
       var_val = [2.0, 2.0, 2.0]
       self.assertAllClose(
           var_val,
           self.evaluate(
-              [dist.read_var(var),
+              [distribution.read_var(var),
                var.get(devices[0]),
                var.get(devices[1])]))
       self.assertAllClose([0, 0, 0],
                           self.evaluate([
-                              dist.read_var(counter),
+                              distribution.read_var(counter),
                               counter.get(devices[0]),
                               counter.get(devices[1])
                           ]))
 
-      train_op = dist.unwrap(op)
+      train_op = distribution.unwrap(op)
       self.evaluate(train_op)
       # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
       m_val = [1.2, 1.2, 1.2]
@@ -181,7 +195,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           m_val,
           self.evaluate(
-              [dist.read_var(m),
+              [distribution.read_var(m),
                m.get(devices[0]),
                m.get(devices[1])]))
       # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
@@ -189,7 +203,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           v_val,
           self.evaluate(
-              [dist.read_var(v),
+              [distribution.read_var(v),
                v.get(devices[0]),
                v.get(devices[1])]))
       # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
@@ -198,12 +212,12 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           var_val,
           self.evaluate(
-              [dist.read_var(var),
+              [distribution.read_var(var),
                var.get(devices[0]),
                var.get(devices[1])]))
       self.assertAllClose([1, 1, 1],
                           self.evaluate([
-                              dist.read_var(counter),
+                              distribution.read_var(counter),
                               counter.get(devices[0]),
                               counter.get(devices[1])
                           ]))
@@ -214,7 +228,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           m_val,
           self.evaluate(
-              [dist.read_var(m),
+              [distribution.read_var(m),
                m.get(devices[0]),
                m.get(devices[1])]))
       # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
@@ -222,16 +236,50 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           v_val,
           self.evaluate(
-              [dist.read_var(v),
+              [distribution.read_var(v),
                v.get(devices[0]),
                v.get(devices[1])]))
       self.assertAllClose([2, 2, 2],
                           self.evaluate([
-                              dist.read_var(counter),
+                              distribution.read_var(counter),
                               counter.get(devices[0]),
                               counter.get(devices[1])
                           ]))
 
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):
+
+    with self.cached_session():
+      model = get_model()
+      optimizer = gradient_descent.SGD(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
+
+      model.fit(
+          inputs,
+          targets,
+          epochs=1,
+          batch_size=2,
+          verbose=0,
+          validation_data=(inputs, targets))
+      model.evaluate(inputs, targets)
+      model.predict(inputs)
+
+
+def _replica_id():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if not isinstance(replica_id, ops.Tensor):
+    replica_id = constant_op.constant(replica_id)
+  return replica_id
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index eccff1d9f57a6753a9c4ed745931b3108329b2a6..29d85fe971ff291df9e9ddf74c0082393bf55ba6 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -24,9 +24,9 @@ import numpy as np
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy
-from tensorflow.contrib.distribute.python import values
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
 from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.framework import constant_op
@@ -47,7 +47,6 @@ _RANDOM_SEED = 1337
 _TRAIN_SIZE = 200
 _INPUT_SIZE = (10,)
 _NUM_CLASS = 2
-_TOLERANCE = 1e-5
 
 
 # TODO(anjalisridhar): Add a decorator that will allow us to run these tests as
@@ -213,10 +212,77 @@ def multi_input_output_model():
   return model
 
 
+def get_correctness_test_inputs(use_numpy, with_distribution,
+                                x_train, y_train, x_predict):
+  """Generates the inputs for correctness check when enable Keras with DS."""
+  global_batch_size = 64
+  batch_size = global_batch_size
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  use_per_core_batch_size = (
+      with_distribution and
+      not distributed_training_utils.global_batch_size_supported(
+          with_distribution))
+  if use_per_core_batch_size:
+    batch_size //= with_distribution.num_replicas_in_sync
+
+  if use_numpy:
+    training_inputs = {
+        'batch_size': batch_size,
+        'x': x_train,
+        'y': y_train,
+        'epochs': 1,
+        'shuffle': False,
+    }
+    eval_inputs = {
+        'batch_size': batch_size,
+        'x': x_train,
+        'y': y_train,
+    }
+    predict_inputs = {
+        'x': np.array(x_predict, dtype=np.float32),
+    }
+  else:
+    # For dataset inputs, we do not pass batch_size to
+    # keras.fit/evaluate/predict. The batch size is part of the dataset.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (x_train, y_train))
+    x = batch_wrapper(train_dataset, batch_size, with_distribution)
+
+    training_inputs = {
+        'batch_size': None,
+        'x': x,
+        'y': None,
+        'epochs': 1,
+        'shuffle': False,
+        'steps_per_epoch': len(x_train) // global_batch_size,
+    }
+    eval_inputs = {
+        'batch_size': None,
+        'x': x,
+        'y': None,
+        'steps': 20,
+    }
+    predict_batch_size = len(x_predict)
+    if use_per_core_batch_size:
+      predict_batch_size //= with_distribution.num_replicas_in_sync
+    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
+    predict_dataset = batch_wrapper(predict_dataset,
+                                    predict_batch_size, with_distribution)
+    predict_inputs = {
+        'steps': 1,
+        'x': predict_dataset,
+    }
+
+  return training_inputs, eval_inputs, predict_inputs
+
+
 strategies = [combinations.default_strategy,
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
               combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus,
+              combinations.tpu_strategy,  # steps_per_run=2
               combinations.tpu_strategy_one_step]
 
 
@@ -225,7 +291,9 @@ def strategy_minus_tpu_combinations():
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph'])
 
 
@@ -245,7 +313,15 @@ def strategy_and_optimizer_combinations():
       mode=['graph'])
 
 
-class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
+def strategy_and_inputs():
+  return combinations.combine(
+      distribution=strategies,
+      use_numpy=[True, False],
+      mode=['graph'])
+
+
+class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
+                                        parameterized.TestCase):
 
   def setUp(self):
     self._base_dir = os.path.join(self.get_temp_dir(),
@@ -253,17 +329,18 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     gfile.MakeDirs(self._base_dir)
     self._config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
-    self._dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
 
   def tearDown(self):
     writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
 
-  def test_train_functional_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_train_functional_with_distribution_strategy(self, distribution):
     keras_model = simple_functional_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -271,8 +348,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist,
-                                      eval_distribute=dist)
+                                      train_distribute=distribution,
+                                      eval_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
@@ -286,9 +363,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     writer_cache.FileWriterCache.clear()
     gfile.DeleteRecursively(self._config.model_dir)
 
-  def test_train_sequential_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_train_sequential_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -296,7 +376,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist)
+                                      train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
@@ -310,7 +390,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     writer_cache.FileWriterCache.clear()
     gfile.DeleteRecursively(self._config.model_dir)
 
-  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution):
     train_data, test_data = get_multi_inputs_multi_outputs_data()
 
     def train_input_fn():
@@ -340,14 +425,14 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
                                                      output_dict)).batch(16)
 
     self.do_test_multi_inputs_multi_outputs_with_input_fn(
-        train_input_fn, eval_input_fn)
+        distribution, train_input_fn, eval_input_fn)
 
-  def do_test_multi_inputs_multi_outputs_with_input_fn(self, train_input_fn,
-                                                       eval_input_fn):
+  def do_test_multi_inputs_multi_outputs_with_input_fn(
+      self, distribution, train_input_fn, eval_input_fn):
     config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED,
         model_dir=self._base_dir,
-        train_distribute=self._dist)
+        train_distribute=distribution)
     with self.cached_session():
       model = multi_inputs_multi_outputs_model()
       est_keras = keras_lib.model_to_estimator(keras_model=model, config=config)
@@ -357,9 +442,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
       eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
       self.assertLess(eval_results['loss'], baseline_eval_results['loss'])
 
-  def test_keras_optimizer_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_keras_optimizer_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -367,7 +455,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
 
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist)
+                                      train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
                                                config=config)
@@ -392,82 +480,133 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # Verify that the numpy value is copied to the variable.
       self.assertAllEqual(x, val)
 
-  def test_calculating_batch_params(self):
-    # This verifies that we calculate the number of steps when the batch size
-    # is specified.
+  @combinations.generate(strategy_combinations())
+  def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
     with self.cached_session():
-      # 64 is the number of input samples.
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      # The number of replicas is equal to 3.
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0',
-                                                     '/device:GPU:1'])
-
-      with self.assertRaisesRegexp(ValueError, 'Please specify a batch_size '
-                                               'that is smaller than'):
-        # The batch size(128) is larger than the number of input
-        # samples(64).
-        distributed_training_utils.get_input_batch_params(inputs,
-                                                          128,
-                                                          strategy)
-
-      with self.assertRaisesRegexp(ValueError, 'is smaller than the number '
-                                               'of replicas'):
-        # The batch size(32) * num_replicas(3) is 96 which is greater than the
-        # number of input samples(64).
-        distributed_training_utils.get_input_batch_params(inputs,
-                                                          32,
-                                                          strategy)
-
-      # The number of replicas now is equal to 2.
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      # 32 is the batch size per replica.
-      steps = distributed_training_utils.get_input_batch_params(inputs,
-                                                                32,
-                                                                strategy)
-      # The number of batches is the ratio of input samples(64) to
-      # batch size(32) which is 2. The number of steps(1) is the ratio of
-      # number of batches(2) to the number of replicas(2).
+      # Input samples of different sizes
+      input_20_samples = np.zeros((20, 3), dtype=np.float32)
+      input_63_samples = np.zeros((63, 3), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Default global batch size 32 for input with 64 samples run in 2 steps
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=None)
+      self.assertEqual(batch_size, 32 // replica_scale_factor)
+      self.assertEqual(steps, 2)
+
+      # Computed global batch size 20 is lower than 32 if we pass less samples.
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_20_samples, steps=None, batch_size=None)
+      self.assertEqual(batch_size, 20 // replica_scale_factor)
       self.assertEqual(steps, 1)
 
-      # 16 is the batch size per replica.
-      steps = distributed_training_utils.get_input_batch_params(inputs,
-                                                                16,
-                                                                strategy)
-      # The number of batches is the ratio of input samples(64) to
-      # batch size(16) which is 4. The number of steps(2) is the ratio of
-      # number of batches(4) to the number of replicas(2).
+      #  Default global batch size 32 cannot be used with 63 samples.
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=None, batch_size=None)
+
+  @combinations.generate(strategy_combinations())
+  def test_calculating_input_params_with_steps_no_batch_size(self,
+                                                             distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
+    with self.cached_session():
+      # Input samples of different sizes
+      input_63_samples = np.zeros((63, 3), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Computed global batch size is correct for number of specified 1 step
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=1, batch_size=None)
+      self.assertEqual(batch_size, 64 // replica_scale_factor)
+      self.assertEqual(steps, 1)
+
+      # Computed global batch size is correct for number of specified 2 steps
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=2, batch_size=None)
+      self.assertEqual(batch_size, 32 // replica_scale_factor)
       self.assertEqual(steps, 2)
 
-  def test_calculating_batch_size(self):
+      # All samples can not be consumed in specified number of steps
+      with self.assertRaisesRegexp(ValueError, 'not divisible by steps'):
+        distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=2, batch_size=None)
+
+      # This cases is different for different strategies due to the
+      # difference in supported batch size being global or per-replica.
+      if replica_scale_factor == 1:
+        # Computed global batch size is correct even if not sharadable
+        steps, batch_size = distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=3, batch_size=None)
+        self.assertEqual(batch_size, 21)
+        self.assertEqual(steps, 3)
+      else:
+        # Computed global batch size can not be sharded across replicas
+        with self.assertRaisesRegexp(ValueError, 'could not be sharded evenly '
+                                     'across the sync replicas'):
+          distributed_training_utils.get_input_params(
+              distribution, input_63_samples, steps=1, batch_size=None)
+
+  @combinations.generate(strategy_combinations())
+  def test_calculating_input_params_no_steps_with_batch_size(self,
+                                                             distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
     with self.cached_session():
-      # 64 is the number of input samples.
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      targets = np.zeros((64, 4), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Computed steps is correct for specified batch size
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=16)
+      self.assertEqual(batch_size, 16)
+      self.assertEqual(steps, 4 // replica_scale_factor)
+
+      # Computed steps is correct for specified batch size
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=32)
+      self.assertEqual(batch_size, 32)
+      self.assertEqual(steps, 2 // replica_scale_factor)
+
+      # Number of samples is not divisible by the global batch size
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=None, batch_size=20)
+
+      # Number of samples is not divisible by the global batch size
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=None, batch_size=3)
 
-      model = get_model()
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      strategy._require_static_shapes = True
-
-      model.compile(optimizer, loss, distribute=strategy)
-      iterator = model._distribution_standardize_user_data(inputs,
-                                                           targets,
-                                                           batch_size=None,
-                                                           check_steps=True,
-                                                           steps_name='steps',
-                                                           steps=3)
-
-      # The global batch size(21) across all replicas is the ratio of the input
-      # samples(64) to the steps(3).
-      # The batch size(10) per device is the ratio of the global batch size(21)
-      # to the number of replicas(2).
-      # The global batch size and batch size are rounded integer values.
-      self.assertEqual(10, distributed_training_utils.get_batch_dimension(
-          iterator._iterator))
+  @combinations.generate(strategy_combinations())
+  def test_calculating_input_params_with_steps_with_batch_size(self,
+                                                               distribution):
+    with self.cached_session():
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # No change to steps and batch size if both specified and feasible
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=5, batch_size=3)
+      self.assertEqual(batch_size, 3)
+      self.assertEqual(steps, 5)
+
+      # Number of samples is less than global batch size * steps
+      with self.assertRaisesRegexp(ValueError, 'less than samples required'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=10, batch_size=13)
 
   @combinations.generate(strategy_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
@@ -541,9 +680,9 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
     loss = 'mse'
     model.compile(optimizer, loss, distribute=distribution)
 
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    sample_weights = np.ones((10), np.float32)
+    inputs = np.zeros((20, 3), np.float32)
+    targets = np.zeros((20, 4), np.float32)
+    sample_weights = np.ones((20), np.float32)
 
     model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
               steps_per_epoch=2, verbose=1)
@@ -566,7 +705,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # `predict` a list that is equal in length to the number of model outputs.
       # In this test our model has two outputs and each element of `outs`
       # corresponds to all the samples of one of the model outputs.
-      self.assertEqual(2, len(outs))
+      self.assertLen(outs, 2)
       # Each of the output samples have a dimension of 7. We should process all
       # the available input samples(6).
       self.assertAllEqual([6, 7], outs[0].shape)
@@ -598,36 +737,33 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   @combinations.generate(strategy_combinations())
   def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
     with self.cached_session():
-      loss = 'mse'
-
       user_controlled_model = get_model()
-      user_controlled_optimizer = gradient_descent.GradientDescentOptimizer(
-          0.001)
-      user_controlled_metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      user_controlled_model.compile(user_controlled_optimizer, loss,
-                                    metrics=user_controlled_metrics,
-                                    distribute=distribution)
+      user_controlled_model.compile(
+          gradient_descent.GradientDescentOptimizer(0.001),
+          loss='mse',
+          metrics=['mae', keras.metrics.CategoricalAccuracy()],
+          distribute=distribution)
 
       interleaved_model = get_model()
-      interleaved_optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      interleaved_metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      interleaved_model.compile(interleaved_optimizer, loss,
-                                metrics=interleaved_metrics,
-                                distribute=distribution)
+      interleaved_model.set_weights(user_controlled_model.get_weights())
+      interleaved_model.compile(
+          gradient_descent.GradientDescentOptimizer(0.001),
+          loss='mse',
+          metrics=['mae', keras.metrics.CategoricalAccuracy()],
+          distribute=distribution)
 
       dataset = get_dataset(distribution)
 
       # Call fit with validation interleaved
-      interleaved_output = interleaved_model.fit(dataset, epochs=2,
-                                                 steps_per_epoch=2, verbose=0,
-                                                 validation_data=dataset,
-                                                 validation_steps=2)
+      interleaved_output = interleaved_model.fit(
+          dataset, epochs=2, steps_per_epoch=2, verbose=1,
+          validation_data=dataset, validation_steps=2, shuffle=False)
 
       # Manually control the validation running after each epoch.
       user_controlled_output = []
       for _ in range(2):
         user_controlled_model.fit(
-            dataset, epochs=1, steps_per_epoch=2, verbose=0)
+            dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False)
         user_controlled_output.append(
             user_controlled_model.evaluate(dataset, steps=2))
 
@@ -641,16 +777,20 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work
   # as clone_model's input_tensors argument only seems to accept list and not
   # tuples or dict.
-  def test_fit_with_tuple_and_dict_dataset_inputs(self):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
       loss = 'mse'
       metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
       input_a_np = np.random.random((10, 3))
       input_b_np = np.random.random((10, 5))
@@ -723,35 +863,48 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
     model.evaluate(dataset, steps=2, verbose=1)
     model.predict(dataset, steps=2)
 
-  def test_dataset_input_shape_validation(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_dataset_wrong_input_shape(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, distribute=strategy)
+      model.compile(optimizer, loss, distribute=distribution)
 
-      # User forgets to batch the dataset
-      inputs = np.zeros((10, 3), dtype=np.float32)
+      # Wrong input shape
+      inputs = np.zeros((10, 5), dtype=np.float32)
       targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
 
-      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
+      with self.assertRaisesRegexp(ValueError,
+                                   'expected input to have shape'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
-      # Wrong input shape
-      inputs = np.zeros((10, 5), dtype=np.float32)
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_dataset_no_batch_input_validation(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3), dtype=np.float32)
       targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'expected input to have shape'):
+      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
   @combinations.generate(combinations.combine(
@@ -773,7 +926,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
-  def test_learning_phase_value(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_learning_phase_value(self, distribution):
     # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
     # meaningful values. Currently we don't pass the learning phase if the
     # Lambda layer uses the learning phase.
@@ -787,42 +945,50 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       optimizer = gradient_descent.GradientDescentOptimizer(0.005)
       loss = 'mse'
       metrics = ['acc']
-      strategy = mirrored_strategy.MirroredStrategy(
-          ['/device:GPU:0', '/device:GPU:1'])
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      batch_size = 8
+      if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy):
+        # CoreMirroredStrategy uses global batch size.
+        batch_size = 8 * distribution.num_replicas_in_sync
 
       inputs = np.ones((10, 1), dtype=np.float32)
       targets = np.ones((10, 1), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat().batch(8)
+      dataset = dataset.repeat().batch(batch_size)
       hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1)
       self.assertAlmostEqual(hist.history['acc'][0], 0, 0)
 
       model.set_weights(initial_weights)
-      evaluate_output = model.evaluate(dataset, steps=20)
-      self.assertAlmostEqual(evaluate_output[1], 1, 0)
+      # TODO(psv/anjalisridhar): Enable these lines after we fix b/117431185.
+      # evaluate_output = model.evaluate(dataset, steps=20)
+      # self.assertAlmostEqual(evaluate_output[1], 1, 0)
 
       inputs = np.ones((10, 1), dtype=np.float32)
       predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
-      predict_dataset = predict_dataset.repeat().batch(5)
+
+      predict_dataset = predict_dataset.repeat().batch(batch_size)
       output = model.predict(predict_dataset, steps=10)
-      # `predict` runs for 10 steps and in each step you process 100 samples.
-      ref_output = np.ones((100, 1), dtype=np.float32)
+      # `predict` runs for 10 steps
+      ref_output = np.ones((160, 1), dtype=np.float32)
       self.assertArrayNear(output, ref_output, 1e-1)
 
 
 class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
+                                                                distribution):
     with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2))
       b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
       x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
       y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
+      with distribution.scope():
         # Removed device and input tensor shape details from the error message
         # since the order of the device and the corresponding input tensor shape
         # is not deterministic over different runs.
@@ -831,17 +997,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                      'distributed tensor inputs '
                                      'DistributedValues:.+'):
           distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+              distribution, x, y)
 
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
+                                                                distribution):
     with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
       b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
       x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
       y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
+      with distribution.scope():
         # Removed device and input tensor dtype details from the error message
         # since the order of the device and the corresponding input tensor dtype
         # is not deterministic over different runs.
@@ -850,21 +1020,23 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                      'distributed tensor inputs '
                                      'DistributedValues:.+'):
           distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+              distribution, x, y)
 
-  def test_unsupported_features(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_unsupported_features(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      dataset = get_dataset(strategy)
+      dataset = get_dataset(distribution)
 
       # Test with validation split
       with self.assertRaisesRegexp(
@@ -899,18 +1071,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                    'you should specify the `steps` argument'):
         model.predict(dataset, verbose=0)
 
-  def test_calling_with_unsupported_predefined_callbacks(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      dataset = get_dataset(strategy)
+      dataset = get_dataset(distribution)
 
       def schedule(_):
         return 0.001
@@ -933,11 +1108,17 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                   callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
 
 
-class TestDistributionStrategyWithLossMasking(test.TestCase):
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
 
   # TODO(priyag): Enable all strategies for this test. Currently it does not
   # work for TPU due to some invalid datatype.
-  def test_masking(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_masking(self, distribution):
     with self.cached_session():
       np.random.seed(1337)
       x = np.array([[[1], [1]], [[0], [0]]])
@@ -946,12 +1127,9 @@ class TestDistributionStrategyWithLossMasking(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
       model.compile(loss='mse',
                     optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    distribute=strategy)
+                    distribute=distribution)
       y = np.array([[[1], [1]], [[1], [1]]])
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
@@ -1018,26 +1196,39 @@ class TestDistributionStrategyCorrectness(test.TestCase,
           distribute=distribution)
 
       batch_size = 64
-      batch_size //= distribution.num_replicas
+      if not distributed_training_utils.global_batch_size_supported(
+          distribution):
+        batch_size //= distribution.num_replicas_in_sync
       train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
       train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
 
       history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
       self.assertEqual(history.history['binary_accuracy'], [1.0])
 
-  @combinations.generate(strategy_combinations())
-  def test_correctness(self, distribution):
+  @combinations.generate(strategy_and_inputs())
+  def test_correctness(self, distribution, use_numpy):
     with self.cached_session():
+      tolerance = 1e-5
+
+      if isinstance(distribution, (mirrored_strategy.MirroredStrategy,
+                                   mirrored_strategy.CoreMirroredStrategy)):
+        # TODO(b/119257215): use the default one once the flakyness is fixed.
+        tolerance = 1e-4
+
       keras.backend.set_image_data_format('channels_last')
-      num_samples = 10000
       np.random.seed(_RANDOM_SEED)
       random_seed.set_random_seed(_RANDOM_SEED)
 
-      # Train and predict datasets are created with the same input numpy arrays.
+      # Train, eval, and predict datasets are created with the same input numpy
+      # arrays.
+      # TODO(xiejw): Change this back to 10000, once we support final partial
+      # batch.
+      num_samples = 9984
       x_train = np.random.rand(num_samples, 1)
       y_train = 3 * x_train
       x_train = x_train.astype('float32')
       y_train = y_train.astype('float32')
+      x_predict = [[1.], [2.], [3.], [4.]]
 
       # The model is built once and the initial weights are saved.
       # This is used to initialize the model for both the distribution and
@@ -1051,52 +1242,38 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       initial_weights = model.get_weights()
 
       def fit_and_predict(with_distribution=None):
+        # We have initialized the model to the same weight for the distribution
+        # and non-distribution run.
         model.set_weights(initial_weights)
         model.compile(
             loss=keras.losses.mean_squared_error,
             optimizer=gradient_descent.GradientDescentOptimizer(0.5),
             distribute=with_distribution)
 
-        batch_size = 64
-        if with_distribution:
-          batch_size //= with_distribution.num_replicas
-        train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train,
-                                                                y_train))
-        train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
-        # We have initialized the model to the same weight for the distribution
-        # and non-distribution run. If you want to initialize the model to
-        # random weights for each run, you need to run the model through the
-        # entire dataset at least once to ensure that the weights converge to
-        # the same value.
-        model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
+        training_inputs, eval_inputs, predict_inputs = (
+            get_correctness_test_inputs(use_numpy, with_distribution,
+                                        x_train, y_train, x_predict))
 
+        model.fit(**training_inputs)
+        eval_result = model.evaluate(**eval_inputs)
         weights = model.get_weights()
-        x_predict = [[1.], [2.], [3.], [4.]]
-        predict_batch_size = 4
-        if with_distribution:
-          predict_batch_size //= with_distribution.num_replicas
-        predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
-        predict_dataset = batch_wrapper(predict_dataset,
-                                        predict_batch_size, distribution)
-        predict_result = model.predict(predict_dataset, steps=1)
-
-        return weights, predict_result
-
-      wts_with_ds, predict_with_ds = fit_and_predict(
+        predict_result = model.predict(**predict_inputs)
+
+        return weights, eval_result, predict_result
+
+      wts_with_ds, eval_with_ds, predict_with_ds = fit_and_predict(
           with_distribution=distribution)
-      wts_without_ds, predict_without_ds = fit_and_predict(
+      wts_without_ds, eval_without_ds, predict_without_ds = fit_and_predict(
           with_distribution=None)
 
-      # Verify that the weights are the same within some limits of tolerance.
+      # Verify that the weights, eval results, predict outputs  are the same
+      # within some limits of tolerance.
       self.assertAllClose(
-          wts_with_ds, wts_without_ds, atol=_TOLERANCE, rtol=_TOLERANCE)
-      # Verify that the predicted outputs are the same within some limits of
-      # tolerance.
+          wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance)
       self.assertAllClose(
-          predict_with_ds, predict_without_ds, atol=_TOLERANCE, rtol=_TOLERANCE)
-
-
-# TODO(priyag): Add a test for TPUStrategy with steps_per_run > 1.
+          eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance)
+      self.assertAllClose(
+          predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 9e1a7ad3932e3e8b79c70f1c07a241dcf52564f1..8ac659abe96370b751ed1556cc699fe20788a0fd 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -72,14 +72,14 @@ def _regression_dataset_fn():
       "predictions": [1., .75, .25, 0.]}).repeat()
 
 
-# TODO(priyag): Add TPU Strategy to this once metrics aggregate correctly using
-# ReplicaLocalVariables on TPUs. Submit http://cl/208914352.
 def all_combinations():
   return combinations.combine(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       mode=["graph"])
 
 
@@ -100,25 +100,26 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
       if isinstance(distribution, tpu_strategy.TPUStrategy):
         def step_fn(ctx, inputs):
           value, update = distribution.call_for_each_replica(
-              metric_fn, inputs)
+              metric_fn, args=inputs)
           ctx.set_non_tensor_output(name="value", output=value)
           return distribution.group(update)
 
         ctx = distribution.run_steps_on_dataset(
-            step_fn, iterator, iterations=distribution.steps_per_run)
+            step_fn, iterator, iterations=distribution.extended.steps_per_run)
         update = ctx.run_op
         value = ctx.non_tensor_outputs["value"]
         # In each run, we run multiple steps, and each steps consumes as many
         # batches as number of replicas.
         batches_per_update = (
-            distribution.num_replicas * distribution.steps_per_run)
+            distribution.num_replicas_in_sync *
+            distribution.extended.steps_per_run)
       else:
         value, update = distribution.call_for_each_replica(
             metric_fn, iterator.get_next())
         update = distribution.group(update)
         # TODO(josh11b): Once we switch to using a global batch size for input,
-        # replace "distribution.num_replicas" with "1".
-        batches_per_update = distribution.num_replicas
+        # replace "distribution.num_replicas_in_sync" with "1".
+        batches_per_update = distribution.num_replicas_in_sync
 
       self.evaluate(iterator.initializer)
       self.evaluate(distribution.initialize())
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 165732d578fd25b1ef631efc5827fd636427c7c8..e77d3d455b0a79b2fac6a458c3aa009ff5c2f780 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -22,10 +22,10 @@ from absl.testing import parameterized
 import numpy
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
 from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
@@ -64,11 +64,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(
-                model_fn, *inputs, run_concurrently=layer.built))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
       iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
@@ -111,7 +110,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def run_step():
         return distribution.group(
             distribution.call_for_each_replica(
-                model_fn, iterator.get_next(), run_concurrently=layer.built))
+                model_fn, args=(iterator.get_next(),)))
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
@@ -159,11 +158,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(
-                model_fn, *inputs, run_concurrently=layer.built))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
       iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
@@ -221,7 +219,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
                                     renorm, update_ops_in_cross_replica_mode):
     """Verifies that moving mean updates are reduced across replicas."""
     with distribution.scope():
-      num_replicas = len(distribution.worker_devices)
+      num_replicas = distribution.num_replicas_in_sync
       model_fn, dataset_fn, batchnorm = batchnorm_example(
           optimizer_fn,
           batch_per_epoch=num_replicas,
@@ -229,17 +227,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           renorm=renorm,
           update_ops_in_replica_mode=not update_ops_in_cross_replica_mode)
 
-      # Make sure prefetching is disabled since that makes the
-      # specific input on each device to be non deterministic, and
-      # this test relies on specific input being on each device.
-      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
-        self.assertFalse(distribution._prefetch_on_device)
-
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         fetches = distribution.unwrap(
-            distribution.call_for_each_replica(
-                model_fn, *inputs, run_concurrently=batchnorm.built))
+            distribution.call_for_each_replica(model_fn, args=inputs))
         if update_ops_in_cross_replica_mode:
           fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
         return control_flow_ops.group(fetches)
@@ -295,7 +286,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
                   distribution=[
                       combinations.one_device_strategy,
                       combinations.mirrored_strategy_with_gpu_and_cpu,
-                      combinations.mirrored_strategy_with_two_gpus
+                      combinations.mirrored_strategy_with_two_gpus,
+                      combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                      combinations.core_mirrored_strategy_with_two_gpus
                   ]),
               combinations.combine(
                   mode=["graph"], use_callable_loss=[True, False]) +
@@ -331,11 +324,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
         return dataset_ops.Dataset.zip((features, labels)).repeat()
 
-      def step_fn(ctx, x, y):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(
-                model_fn, x, y, run_concurrently=False))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
       iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
@@ -369,10 +361,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
       # with sum loss reduction, or 10.6 with mean.
       if loss_reduction == losses_impl.Reduction.SUM:
-        # Note that the "distribution.num_replicas" factor will go away once
-        # we split the input across replicas, instead of pulling a complete
+        # Note that the "distribution.num_replicas_in_sync" factor will go away
+        # once we split the input across replicas, instead of pulling a complete
         # batch of input per replica.
-        self.assertNear(weight, 2 + 21.2 * distribution.num_replicas, 0.0001)
+        self.assertNear(weight, 2 + 21.2 * distribution.num_replicas_in_sync,
+                        0.0001)
       else:
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
@@ -412,21 +405,21 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         train_op = optimizer.minimize(loss_fn)
         loss = loss_fn()
         output_context.set_last_step_output(
-            name="replica_loss_agg",
+            name="replica_loss_reduced",
             output=loss,
-            aggregation=variables_lib.VariableAggregation.MEAN)
+            reduce_op=reduce_util.ReduceOp.MEAN)
         output_context.set_non_tensor_output(key1, value1)
         return (train_op, loss)
 
-      def step_fn(output_context, *inputs):
+      def step_fn(output_context, inputs):
         (train_op, loss) = distribution.call_for_each_replica(
-            model_fn, output_context, *inputs, run_concurrently=False)
+            model_fn, args=(output_context,) + inputs)
         output_context.set_last_step_output(
-            name="cross_replica_loss_agg",
+            name="cross_replica_loss_reduced",
             output=loss,
-            aggregation=variables_lib.VariableAggregation.MEAN)
+            reduce_op=reduce_util.ReduceOp.MEAN)
         output_context.set_last_step_output(
-            name="cross_replica_loss_noagg",
+            name="cross_replica_loss_not_reduced",
             output=loss)
         return distribution.group(train_op)
 
@@ -434,16 +427,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       def run_step():
         initial_loss = lambda: constant_op.constant(1e7)
-        # Initial values corresponding to aggregated losses are just single
-        # tensors. But for non aggregated losses, we need to have initial
+        # Initial values corresponding to reduced losses are just single
+        # tensors. But for non reduced losses, we need to have initial
         # values that are of the same structure as non reduced losses. In
         # MirroredStrategy, this will be a list of losses, in TPUStrategy
         # it will be single tensor. Using `broadcast` followed by `unwrap`
         # gives us the desired initial value structure.
         initial_loop_values = {
-            "replica_loss_agg": initial_loss(),
-            "cross_replica_loss_agg": initial_loss(),
-            "cross_replica_loss_noagg":
+            "replica_loss_reduced": initial_loss(),
+            "cross_replica_loss_reduced": initial_loss(),
+            "cross_replica_loss_not_reduced":
             distribution.unwrap(distribution.broadcast(initial_loss()))
         }
         ctx = distribution.run_steps_on_dataset(
@@ -453,17 +446,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["replica_loss_agg"],
-            aggregated=True, distribution=distribution)
+            loss_output=ctx.last_step_outputs["replica_loss_reduced"],
+            reduced=True, distribution=distribution)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["cross_replica_loss_agg"],
-            aggregated=True, distribution=distribution)
+            loss_output=ctx.last_step_outputs["cross_replica_loss_reduced"],
+            reduced=True, distribution=distribution)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["cross_replica_loss_noagg"],
-            aggregated=False, distribution=distribution)
-        return (ctx.run_op, ctx.last_step_outputs["replica_loss_agg"])
+            loss_output=ctx.last_step_outputs["cross_replica_loss_not_reduced"],
+            reduced=False, distribution=distribution)
+        return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
@@ -488,17 +481,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(error_is_not_increasing)
 
-  def _verify_loss_output(self, initial_loss, loss_output, aggregated,
+  def _verify_loss_output(self, initial_loss, loss_output, reduced,
                           distribution):
-    if not aggregated:
-      self.assertEqual(distribution.num_replicas,
-                       len(distribution.unwrap(loss_output)))
+    if not reduced:
+      self.assertLen(distribution.unwrap(loss_output),
+                     distribution.num_replicas_in_sync)
       loss_output = distribution.reduce(
-          aggregation=variables_lib.VariableAggregation.MEAN,
-          value=loss_output, destinations="/device:CPU:0")
+          reduce_util.ReduceOp.MEAN, loss_output, destinations="/device:CPU:0")
 
     unwrapped_output = distribution.unwrap(loss_output)
-    self.assertEqual(1, len(unwrapped_output))
+    self.assertLen(unwrapped_output, 1)
     loss_tensor = unwrapped_output[0]
     self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
     self.assertEqual(initial_loss.shape, loss_tensor.shape)
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index c23de0694984076b1c9a8da45219436fc38cd286..a3bcc8db88f9466811aa15d37e14a22eb5ce485e 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -12,300 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class MirroredStrategy implementing DistributionStrategy."""
+"""Contrib version of MirroredStrategy."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-from functools import partial
-import threading
+import functools
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import shared_variable_creator
-from tensorflow.contrib.distribute.python import values
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import device_util
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import values
 from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.util import nest
 
 
-# TODO(josh11b): Replace asserts in this file with if ...: raise ...
-
-
-@contextlib.contextmanager
-def _enter_graph(g):
-  if context.executing_eagerly():
-    with g.as_default(), context.eager_mode():
-      yield
-  else:
-    with g.as_default():
-      yield
-
-
-def _cpu_device(device):
-  cpu_device = tf_device.DeviceSpec.from_string(device)
-  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
-  return cpu_device.to_string()
-
-
-class _RequestedStop(Exception):
-  pass
-
-
-# _call_for_each_replica and _reduce_non_distributed_value are not members of
-# MirroredStrategy so that they are generally not allowed to use anything
-# specific to MirroredStrategy and thus can be shared with other distribution
-# strategies.
-
-
-# TODO(yuefengz): maybe create a common class for those who need to call this
-# _call_for_each_replica.
-def _call_for_each_replica(distribution, fn, *args, **kwargs):
-  """Run `fn` in separate threads, once per replica/worker device.
-
-  Args:
-    distribution: the DistributionStrategy object.
-    fn: function to run (will be run once per device, each in its own thread).
-    *args: positional arguments for `fn`
-    **kwargs: keyword arguments for `fn`.
-        `"run_concurrently"`: Boolean indicating whether executions of `fn`
-           can be run concurrently (under eager execution only), defaults to
-           `True`.
-
-  Returns:
-    Merged return value of `fn` across all replicas.
-
-  Raises:
-    RuntimeError: If fn() calls get_replica_context().merge_call() a different
-        number of times from the available devices.
-  """
-  run_concurrently = kwargs.pop("run_concurrently", True)
-  if not context.executing_eagerly():
-    # Lots of TF library code isn't thread-safe in graph mode, and
-    # there is little to be gained by turning on multithreading when
-    # constructing a graph.
-    run_concurrently = False
-    # Needed for per-thread device, etc. contexts in graph mode.
-    ops.get_default_graph().switch_to_thread_local()
-  elif run_concurrently is None:
-    run_concurrently = True
-
-  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
-
-  shared_variable_store = {}
-
-  # TODO(isaprykin): Create these threads once instead of during every run()
-  # call.
-  threads = []
-  for index, d in enumerate(distribution.worker_devices):
-    variable_creator_fn = shared_variable_creator.make_fn(
-        shared_variable_store, index)
-    t = MirroredStrategy._MirroredReplicaThread(  # pylint: disable=protected-access
-        distribution, coord, d, variable_creator_fn, fn,
-        *values.select_device(d, args), **values.select_device(d, kwargs))
-    threads.append(t)
-
-  for t in threads:
-    t.start()
-
-  # When `fn` starts `should_run` event is set on _MirroredReplicaThread
-  # (`MRT`) threads. The execution waits until
-  # `MRT.has_paused` is set, which indicates that either `fn` is
-  # complete or a `get_replica_context().merge_call()` is called.  If `fn` is
-  # complete, then `MRT.done` is set to True.  Otherwise, arguments
-  # of `get_replica_context().merge_call` from all paused threads are grouped
-  # and the `merge_fn` is performed.  Results of the
-  # `get_replica_context().merge_call` are then set to `MRT.merge_result`.
-  # Each such `get_replica_context().merge_call` call returns the
-  # `MRT.merge_result` for that thread when `MRT.should_run` event
-  # is reset again. Execution of `fn` resumes.
-
-  try:
-    with coord.stop_on_exception():
-      all_done = False
-      while not all_done and not coord.should_stop():
-        done = []
-        if run_concurrently:
-          for t in threads:
-            t.should_run.set()
-          for t in threads:
-            t.has_paused.wait()
-            t.has_paused.clear()
-            if coord.should_stop():
-              return None
-            done.append(t.done)
-        else:
-          for t in threads:
-            t.should_run.set()
-            t.has_paused.wait()
-            t.has_paused.clear()
-            if coord.should_stop():
-              return None
-            done.append(t.done)
-        if coord.should_stop():
-          return None
-        all_done = all(done)
-        if not all_done:
-          if any(done):
-            raise RuntimeError("Some replicas made a different number of "
-                               "replica_context().merge_call() calls.")
-          # get_replica_context().merge_call() case
-          merge_args = values.regroup({t.device: t.merge_args for t in threads})
-          merge_kwargs = values.regroup(
-              {t.device: t.merge_kwargs for t in threads})
-          # We capture the name_scope of the MRT when we call merge_fn
-          # to ensure that if we have opened a name scope in the MRT,
-          # it will be respected when executing the merge function. We only
-          # capture the name_scope from the first MRT and assume it is
-          # the same for all other MRTs.
-          mtt_captured_name_scope = threads[0].captured_name_scope
-          with ops.name_scope(mtt_captured_name_scope):
-            merge_result = threads[0].merge_fn(distribution, *merge_args,
-                                               **merge_kwargs)
-          for t in threads:
-            t.merge_result = values.select_device(t.device, merge_result)
-  finally:
-    for t in threads:
-      t.should_run.set()
-    coord.join(threads)
-
-  return values.regroup({t.device: t.main_result for t in threads})
-
-
-def _reduce_non_distributed_value(distribution, aggregation, value,
-                                  destinations):
-  """Reduce a non-DistributedValue `value` to `destinations`."""
-  if isinstance(value, values.DistributedValues):
-    raise ValueError("You are passing a `DistributedValue` to "
-                     "`_reduce_non_distributed_value`, which is not allowed.")
-
-  # If the same value is present on all replicas then the PerDevice value will
-  # be a single value. We also handle the case when `value` is a single value
-  # and equal to 0.
-  if value == 0:
-    return 0
-  # If the aggregation type is MEAN or ONLY_FIRST_REPLICA, then this
-  # essentially means that the same value should be on all destinations.
-  if aggregation in (
-      variable_scope.VariableAggregation.MEAN,
-      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA):
-    return value
-
-  cross_tower_ops_lib.validate_destinations(destinations)
-  # We do not support an aggregation type of SUM if the value is the same across
-  # all replicas. We call this as part of assign functions for MirroredVariables
-  # and summing up identical values across replicas is not clearly defined.
-  if (len(distribution.worker_devices) != 1 or
-      not cross_tower_ops_lib.check_destinations(destinations)):
-    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
-                     "the given aggregation %s." % (value, aggregation))
-  # TODO(anjalisridhar): Moves these methods to a device utility file?
-  devices = cross_tower_ops_lib.get_devices_from(destinations)
-  if len(devices) == 1:
-    with ops.device(devices[0]):
-      return array_ops.identity(value)
-  else:
-    value_updates = {}
-    for d in devices:
-      with ops.device(d):
-        value_updates[d] = array_ops.identity(value)
-    return values.Mirrored(value_updates)
-
-
-def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
-  # Figure out what collections this variable should be added to.
-  # We'll add the MirroredVariable to those collections instead.
-  collections = kwargs.pop("collections", None)
-  if collections is None:
-    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-  kwargs["collections"] = []
-
-  # Get synchronization value
-  synchronization = kwargs.get("synchronization",
-                               variable_scope.VariableSynchronization.ON_WRITE)
-  if synchronization == variable_scope.VariableSynchronization.NONE:
-    raise ValueError("`NONE` variable synchronization mode is not "
-                     "supported with `Mirrored` distribution strategy. Please"
-                     " change the `synchronization` for variable: " +
-                     kwargs["name"])
-  elif synchronization == variable_scope.VariableSynchronization.ON_READ:
-    # Variables that are to be synced on read are replica local.
-    is_replica_local = True
-    kwargs["trainable"] = False
-  elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
-        synchronization == variable_scope.VariableSynchronization.AUTO):
-    # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
-    is_replica_local = False
-  else:
-    raise ValueError("Invalid variable synchronization mode: " +
-                     synchronization + " for variable: " + kwargs["name"])
-
-  # Get aggregation value
-  aggregation = kwargs.pop("aggregation",
-                           variable_scope.VariableAggregation.NONE)
-  if aggregation not in (
-      variable_scope.VariableAggregation.NONE,
-      variable_scope.VariableAggregation.SUM,
-      variable_scope.VariableAggregation.MEAN,
-      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
-  ):
-    raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                     " for variable: " + kwargs["name"])
-
-  # Ignore user-specified caching device, not needed for mirrored variables.
-  kwargs.pop("caching_device", None)
-
-  # TODO(josh11b,apassos): It would be better if variable initialization
-  # was never recorded on the tape instead of having to do this manually
-  # here.
-  with tape.stop_recording():
-    index = real_mirrored_creator(devices, *args, **kwargs)
-
-    if is_replica_local:
-      result = values.ReplicaLocalVariable(
-          index, index[devices[0]], aggregation)
-    else:
-      result = values.MirroredVariable(index, index[devices[0]], aggregation)
-
-  # Add the wrapped variable to the requested collections.
-  # The handling of eager mode and the global step matches
-  # ResourceVariable._init_from_args().
-  if not context.executing_eagerly():
-    g = ops.get_default_graph()
-    # If "trainable" is True, next_creator() will add the member variables
-    # to the TRAINABLE_VARIABLES collection, so we manually remove
-    # them and replace with the MirroredVariable. We can't set
-    # "trainable" to False for next_creator() since that causes functions
-    # like implicit_gradients to skip those variables.
-    if kwargs.get("trainable", True):
-      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-      for v in index.values():
-        if v in l:
-          l.remove(v)
-    g.add_to_collections(collections, result)
-  elif ops.GraphKeys.GLOBAL_STEP in collections:
-    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
-
-  return result
+# pylint: disable=protected-access,invalid-name
+_call_for_each_replica = mirrored_strategy._call_for_each_replica
+_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value
+_create_mirrored_variable = mirrored_strategy._create_mirrored_variable
+CoreMirroredStrategy = mirrored_strategy.MirroredStrategy
+CoreMirroredExtended = mirrored_strategy.MirroredExtended
+# pylint: enable=protected-access,invalid-name
 
 
 class MirroredStrategy(distribute_lib.DistributionStrategy):
   """Mirrors vars to distribute across multiple devices and machines.
 
+  *** contrib version ***
+
   This strategy uses one replica per device and sync replication for its
   multi-GPU version.
 
@@ -348,8 +81,6 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       specified.
     cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
       set, the `configure` method will try to find the best one.
-    prefetch_on_device: optional boolean to specify whether to prefetch input
-      data to devices.
     auto_shard_dataset: whether to auto-shard the dataset when there are
       multiple workers.
     cross_tower_ops: Deprecated alias for `cross_device_ops`.
@@ -360,482 +91,62 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                num_gpus=None,
                num_gpus_per_worker=None,
                cross_device_ops=None,
-               prefetch_on_device=None,
                auto_shard_dataset=False,
                cross_tower_ops=None):
-    super(MirroredStrategy, self).__init__()
-
     assert not (cross_device_ops and cross_tower_ops)
-    self._cross_tower_ops = cross_device_ops or cross_tower_ops
-    self._prefetch_on_device = prefetch_on_device
-    self._auto_shard_dataset = auto_shard_dataset
-    # Remember num GPUs which might be needed by `configure` method.
     if num_gpus is not None and num_gpus_per_worker is not None:
       raise ValueError(
           "You cannot specify both `num_gpus` and `num_gpus_per_worker`.")
-    if num_gpus is not None:
-      self._num_gpus = num_gpus
-    else:
-      self._num_gpus = num_gpus_per_worker
-
-    self._initialize_local(self._num_gpus, devices)
-
-  def _initialize_local(self, num_gpus, devices):
-    """Initializes the object for local training."""
-    self._cluster_spec = None
-    # Convert `num_gpus` into `devices`, shouldn't specify both.
-    if devices is None:
-      if num_gpus is None:
-        num_gpus = context.num_gpus()
-      if num_gpus == 0:
-        devices = ["/device:CPU:0"]
-      else:
-        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
-    elif num_gpus is not None:
-      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
-    self._num_gpus = num_gpus
-    # TODO(yuefengz): consider setting the default device.
-
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerDevice({d: i for i, d in enumerate(devices)})
-
-  def _initialize_multi_worker(self, num_gpus, cluster_spec):
-    """Initializes the object for multi-worker training."""
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._cluster_spec = cluster_spec
-
-    self._workers = []
-    for job in ["chief", "worker"]:
-      for task in range(len(cluster_spec.as_dict().get(job, []))):
-        self._workers.append("/job:%s/task:%d" % (job, task))
-
     if num_gpus is None:
-      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
-    if num_gpus > 0:
-      self._worker_device_map = {
-          worker: [
-              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
-              for gpu in range(num_gpus)
-          ] for worker in self._workers
-      }
-    else:
-      self._worker_device_map = {
-          worker: [device_util.canonicalize(worker, "/device:CPU:0")]
-          for worker in self._workers
-      }
+      num_gpus = num_gpus_per_worker
+    extended = MirroredExtended(self, devices, num_gpus,
+                                cross_device_ops or cross_tower_ops,
+                                auto_shard_dataset)
+    super(MirroredStrategy, self).__init__(extended)
 
-    devices = nest.flatten(self._worker_device_map)
 
-    # Setting `_default_device` will add a device scope in the
-    # distribution.scope. We set the default device to the first worker. When
-    # users specify device under distribution.scope by
-    #   with tf.device("/cpu:0"):
-    #     ...
-    # their ops will end up on the cpu device of its first worker, e.g.
-    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
-    self._default_device = self._workers[0]
+class MirroredExtended(CoreMirroredExtended):
+  """Implementation of (contrib) MirroredStrategy."""
 
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerDevice(
-        {d: i for i, d in enumerate(devices)})
+  def __init__(self,
+               container_strategy,
+               devices=None,
+               num_gpus_per_worker=None,
+               cross_device_ops=None,
+               auto_shard_dataset=False):
+    super(MirroredExtended, self).__init__(
+        container_strategy, devices, num_gpus_per_worker, cross_device_ops)
+    self._auto_shard_dataset = auto_shard_dataset
 
-  def _create_variable(self, next_creator, *args, **kwargs):
-    """Create a mirrored variable. See `DistributionStrategy.scope`."""
-    colocate_with = kwargs.pop("colocate_with", None)
-    devices = self._get_devices_from(colocate_with)
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch.
 
-    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
-      index = {}
-      for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
-            # Give replicas meaningful distinct names:
-            var0name = index[devices[0]].name.split(":")[0]
-            # We append a / to variable names created on replicas with id > 0 to
-            # ensure that we ignore the name scope and instead use the given
-            # name as the absolute name of the variable.
-            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
-            # Initialize replicas with the same value:
-            def initial_value_fn(device=d):
-              if context.executing_eagerly():
-                init_value = index[devices[0]].value()
-                return array_ops.identity(init_value)
-              else:
-                with ops.device(device):
-                  init_value = index[devices[0]].initial_value
-                  return array_ops.identity(init_value)
-            kwargs["initial_value"] = initial_value_fn
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            # Don't record operations (e.g. other variable reads) during
-            # variable creation.
-            with tape.stop_recording():
-              v = next_creator(*args, **kwargs)
-          assert not isinstance(v, values.DistributedVariable)
-          index[d] = v
-      return index
+    This implementation is different than the one in
+    `tf.distribute.MirroredStrategy` for purposes of backward compatibility.
+    We treat the incoming dataset's batch size as per replica batch size.
 
-    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
-                                     **kwargs)
+    Args:
+      dataset: `tf.data.Dataset` for input.
+    Returns:
+      An `InputIterator` which returns inputs for each step of the computation.
+    """
+    if self._cluster_spec:
+      worker_device_pairs = self._worker_devices
+    else:
+      worker_device_pairs = [("/job:localhost", self._devices)]
+    return values.DatasetIterator(dataset, worker_device_pairs)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     if self._cluster_spec:
       return values.MultiWorkerDataset(
-          partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
-          self._prefetch_on_device, self._auto_shard_dataset)
+          functools.partial(self._call_dataset_fn, dataset_fn),
+          self._worker_devices,
+          auto_shard=self._auto_shard_dataset)
     else:
-      return values.PerDeviceDataset(
-          self._call_dataset_fn(dataset_fn), self._devices,
-          self._prefetch_on_device)
-
-  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-
-    ctx = values.MultiStepContext()
-    def body(i, *args):
-      """A wrapper around `fn` to create the while loop body."""
-      del args
-      fn_inputs = iterator.get_next()
-      if not isinstance(fn_inputs, tuple):
-        fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
-      for (name, output) in ctx.last_step_outputs.items():
-        # Convert all outputs to tensors, potentially from `DistributedValues`.
-        ctx.last_step_outputs[name] = self.unwrap(output)
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      with ops.control_dependencies([fn_result]):
-        return [i + 1] + flat_last_step_outputs
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop. This is useful in cases where we might need to exit
-    # these contexts and get back to the outer context to do some things, for
-    # e.g. create an op which should be evaluated only once at the end of the
-    # loop on the host. One such usage is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    cond = lambda i, *args: i < iterations
-    i = constant_op.constant(0)
-    loop_result = control_flow_ops.while_loop(
-        cond, body, [i] + initial_loop_values, name="",
-        parallel_iterations=1, back_prop=False, swap_memory=False,
-        return_same_structure=True)
-    del self._outer_control_flow_context
-
-    ctx.run_op = control_flow_ops.group(loop_result)
-
-    # Convert the last_step_outputs from a list to the original dict structure
-    # of last_step_outputs.
-    last_step_tensor_outputs = loop_result[1:]
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
-      output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been aggregated, wrap them in a Mirrored
-      # container, else in a PerDevice container.
-      if aggregation is variables_lib.VariableAggregation.NONE:
-        last_step_tensor_outputs_dict[name] = values.regroup(
-            {d: t for d, t in zip(self._devices, output)}, values.PerDevice)
-      else:
-        assert len(output) == 1
-        last_step_tensor_outputs_dict[name] = output[0]
-
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-    return ctx
-
-  def _broadcast(self, tensor, destinations):
-    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
-    return self._get_cross_tower_ops().broadcast(tensor, destinations or
-                                                 self._devices)
-
-  def _call_for_each_replica(self, fn, *args, **kwargs):
-    return _call_for_each_replica(self, fn, *args, **kwargs)
-
-  def map(self, map_over, fn, *args, **kwargs):
-    # TODO(josh11b): In eager mode, use one thread per device.
-    index = {}
-    for i, m in enumerate(map_over):
-      d = self._devices[i % len(self._devices)]
-      with ops.device(d):
-        l = index.get(d, [])
-        l.append(fn(m,
-                    *values.select_device_mirrored(d, args),
-                    **values.select_device_mirrored(d, kwargs)))
-        index[d] = l
-    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
-    # in addition to PerDevice data.
-    return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
-
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
-    del task_type, task_id
-
-    if session_config:
-      session_config.isolate_session_state = True
-
-    if cluster_spec:
-      self._initialize_multi_worker(self._num_gpus, cluster_spec)
-
-    if self._cross_tower_ops is None:
-      if self._cluster_spec:
-        # It currently cannot detect the toplogy of remote workers. So we
-        # hard-code the multi-worker all-reduce algorithm for now.
-        if len(self._workers) == 1:
-          # The default is "nccl".
-          self._cross_tower_ops = cross_tower_ops_lib.AllReduceCrossDeviceOps()
-        else:
-          # The default is hierarchical reduce and broadcast.
-          self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
-              self._workers, self._num_gpus)
-      else:
-        self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
-            self._devices, session_config=session_config)
-
-  def _get_cross_tower_ops(self):
-    if self._cross_tower_ops is None:
-      self._cross_tower_ops = (
-          cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps())
-    return self._cross_tower_ops
-
-  def _reduce(self, aggregation, value, destinations):
-    assert not isinstance(value, values.Mirrored)
-    if not isinstance(value, values.DistributedValues):
-      # This function handles reducing values that are not PerDevice or Mirrored
-      # values. For example, the same value could be present on all replicas in
-      # which case `value` would be a single value or value could be 0.
-      return _reduce_non_distributed_value(self, aggregation, value,
-                                           destinations)
-    if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA:
-      value = value.get(self._devices[0])
-      if isinstance(value, (int, float)):
-        return value
-      return self.broadcast(value, destinations)
-    return self._get_cross_tower_ops().reduce(
-        aggregation, value, destinations=destinations)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA:
-      return [self.broadcast(v.get(self._devices[0]), d)
-              for v, d in value_destination_pairs]
-    return self._get_cross_tower_ops().batch_reduce(aggregation,
-                                                    value_destination_pairs)
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    # TODO(josh11b): In eager mode, use one thread per device.
-    assert isinstance(var, values.DistributedVariable)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    updates = {}
-    for d, v in var._index.items():  # pylint: disable=protected-access
-      name = "update_%d" % self._device_index.get(d)
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        # If args and kwargs are not mirrored, the value is returned as is.
-        updates[d] = fn(v,
-                        *values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    assert isinstance(colocate_with, list)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    # TODO(josh11b): In eager mode, use one thread per device.
-    updates = {}
-    for d in colocate_with:
-      name = "update_%d" % self._device_index.get(d)
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        updates[d] = fn(*values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  def read_var(self, replica_local_var):
-    """Read the aggregate value of a replica-local variable."""
-    if isinstance(replica_local_var, values.ReplicaLocalVariable):
-      return replica_local_var._get_cross_replica()  # pylint: disable=protected-access
-    assert isinstance(replica_local_var, values.Mirrored)
-    return array_ops.identity(replica_local_var.get())
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      # Return in a deterministic order.
-      if set(val.devices) == self._canonical_device_set:
-        return [val.get(device=d) for d in self._devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
-
-  def value_container(self, val):
-    return values.value_container(val)
-
-  @property
-  def num_replicas(self):
-    return len(self._devices)
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
 
+  # TODO(priyag): Delete this once all strategies use global batch size.
   @property
-  def num_replicas_in_sync(self):
-    return len(self._devices)
-
-  def _worker_device_index(self):
-    return self._device_index
-
-  @property
-  def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._devices)
-
-  @property
-  def parameter_devices(self):
-    return list(self._devices)
-
-  @property
-  def between_graph(self):
+  def _global_batch_size(self):
     return False
-
-  @property
-  def should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return True
-
-  @property
-  def should_save_summary(self):
-    return True
-
-  def non_slot_devices(self, var_list):
-    del var_list
-    return list(self._devices)
-
-  def _get_devices_from(self, colocate_with=None):
-    if colocate_with is None:
-      return self._devices
-    else:
-      return cross_tower_ops_lib.get_devices_from(colocate_with)
-
-  class _MirroredReplicaThread(threading.Thread):
-    """A thread that runs() a function on a device."""
-
-    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
-                 **kwargs):
-      super(MirroredStrategy._MirroredReplicaThread, self).__init__()  # pylint: disable=protected-access
-      self.coord = coord
-      self.distribution = dist
-      self.device = device
-      self.replica_id = dist.worker_devices.index(device)
-      self.variable_creator_fn = variable_creator_fn
-      # State needed to run and return the results of `fn`.
-      self.main_fn = fn
-      self.main_args = args
-      self.main_kwargs = kwargs
-      self.main_result = None
-      self.done = False
-      # State needed to run the next merge_call() (if any) requested via
-      # ReplicaContext.
-      self.merge_fn = None
-      self.merge_args = None
-      self.merge_kwargs = None
-      self.merge_result = None
-      self.captured_name_scope = None
-      # We use a thread.Event for the main thread to signal when this
-      # thread should start running (`should_run`), and another for
-      # this thread to transfer control back to the main thread
-      # (`has_paused`, either when it gets to a
-      # `get_replica_context().merge_call` or when `fn` returns). In
-      # either case the event starts cleared, is signaled by calling
-      # set(). The receiving thread waits for the signal by calling
-      # wait() and then immediately clearing the event using clear().
-      self.should_run = threading.Event()
-      self.has_paused = threading.Event()
-      # These fields have to do with inheriting various contexts from the
-      # parent thread:
-      # pylint: disable=protected-access
-      self.context_mode = context.context()._eager_context.mode
-      if not context.context()._context_handle:
-        context.context()._initialize_handle_and_devices()
-      self.context_device_policy = (
-          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-              context.context()._context_handle))
-      self.graph = ops.get_default_graph()
-      self._variable_creator_stack = self.graph._variable_creator_stack[:]
-      self._captured_var_scope = variable_scope.get_variable_scope()
-      # Adding a "/" at end lets us re-enter this scope later.
-      self._name_scope = self.graph.get_name_scope()
-      if self._name_scope:
-        self._name_scope += "/"
-      if self.replica_id > 0:
-        if not self._name_scope:
-          self._name_scope = ""
-        self._name_scope += "replica_%d/" % self.replica_id
-
-    def run(self):
-      # pylint: disable=protected-access
-      self.graph._variable_creator_stack = self._variable_creator_stack
-      self.should_run.wait()
-      self.should_run.clear()
-      try:
-        if self.coord.should_stop():
-          return
-        with self.coord.stop_on_exception(), \
-            context.context()._mode(self.context_mode), \
-            context.context().device_policy(self.context_device_policy), \
-            _enter_graph(self.graph), \
-            MirroredReplicaContext(self.distribution, self.replica_id), \
-            ops.device(self.device), \
-            ops.name_scope(self._name_scope), \
-            variable_scope.variable_scope(
-                self._captured_var_scope, reuse=self.replica_id > 0), \
-            variable_scope.variable_creator_scope(self.variable_creator_fn):
-          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
-          self.done = True
-      finally:
-        self.has_paused.set()
-
-
-class MirroredReplicaContext(distribute_lib.ReplicaContext):
-  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
-
-  Opened in `_MirroredReplicaThread`, to allow the user to invoke
-  `MirroredStrategy`'s specific implementation of `merge_call()`,
-  which works by delegating the function and its arguments to
-  the main thread (the one that invoked
-  `MirroredStrategy.call_for_each_replica()`).
-  """
-
-  def _merge_call(self, fn, *args, **kwargs):
-    """Delegate to the main thread to actually perform merge_call()."""
-    t = threading.current_thread()  # a _MirroredReplicaThread
-    t.merge_fn = fn
-    t.merge_args = args
-    t.merge_kwargs = kwargs
-    t.captured_name_scope = t.graph.get_name_scope()
-    # Adding a "/" at end lets us re-enter this scope later.
-    if t.captured_name_scope:
-      t.captured_name_scope += "/"
-    t.has_paused.set()
-    t.should_run.wait()
-    t.should_run.clear()
-    if t.coord.should_stop():
-      raise _RequestedStop()
-    return t.merge_result
-
-  @property
-  def device(self):
-    distribute_lib.require_replica_context(self)
-    return self._distribution_strategy.worker_devices[self._replica_id]
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index b8e7edaaf82804e6741cc4c94c44ed77189d7ad9..1027da857d3042e5f3699bf9e373c4be4d3a754a 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -20,14 +20,16 @@ from __future__ import print_function
 
 import sys
 
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.contrib.distribute.python import values
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
@@ -35,7 +37,6 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.layers import core
@@ -47,7 +48,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import distribution_strategy_context as ds_context
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import server_lib
@@ -56,251 +57,240 @@ from tensorflow.python.training import server_lib
 GPU_TEST = "test_gpu" in sys.argv[0]
 
 
-class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.mirrored_strategy_with_two_gpus,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_two_gpus],
+    mode=["graph", "eager"]))
+class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
+                                        parameterized.TestCase):
 
-  def _get_distribution_strategy(self):
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    if GPU_TEST:
-      self.assertGreater(context.num_gpus(), 0)
-      if context.num_gpus() > 1:
-        devices = ["/device:GPU:0", "/device:GPU:1"]
-    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
-    return mirrored_strategy.MirroredStrategy(devices)
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-  def testMinimizeLossEager(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  def testMinimizeLossGraph(self):
-    soft_placement = not GPU_TEST
-    print("testMinimizeLossGraph soft_placement:", soft_placement)
-    self._test_minimize_loss_graph(
-        self._get_distribution_strategy(), soft_placement=soft_placement)
-
-  def testMapReduce(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testReplicaId(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_replica_id(self._get_distribution_strategy())
-
-  def testNumReplicas(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self.assertEqual(2, self._get_distribution_strategy().num_replicas)
-
-  def testNumReplicasInSync(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self.assertEqual(2, self._get_distribution_strategy().
-                     num_replicas_in_sync)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testRunRegroupError(self):
-
-    def run_fn(device_id):
-      # Generates a list with different lengths on different devices.
-      # Will fail in _regroup() (if more than one device).
-      return list(range(device_id))
+  def testNumReplicasInSync(self, distribution):
+    self.assertEqual(2, distribution.num_replicas_in_sync)
 
-    dist = self._get_distribution_strategy()
-    with dist.scope(), self.assertRaises(AssertionError):
-      dist.call_for_each_replica(run_fn, dist.worker_device_index)
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testReduceToCpu(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
+  def testRunRegroupError(self, distribution):
+    def run_fn():
+      replica_id = int(self.evaluate(_replica_id()))
+      # Generates a list with different lengths on different devices.
+      # Will fail in _regroup() (if more than one device).
+      return list(range(replica_id))
 
-    def run_fn(device_id):
-      return device_id
+    with distribution.scope(), self.assertRaises(AssertionError):
+      distribution.extended.call_for_each_replica(run_fn)
 
-    dist = self._get_distribution_strategy()
-    with dist.scope():
-      result = dist.call_for_each_replica(run_fn, dist.worker_device_index)
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.SUM,
+  def testReduceToCpu(self, distribution):
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(_replica_id)
+      reduced = distribution.reduce(
+          reduce_util.ReduceOp.SUM,
           result,
           destinations="/device:CPU:0")
-      unwrapped = dist.unwrap(reduced)
+      unwrapped = distribution.unwrap(reduced)
       self.assertEqual(1, len(unwrapped))
-      expected = sum(range(len(dist.worker_devices)))
+      expected = sum(range(distribution.num_replicas_in_sync))
       self.assertEqual(expected, self.evaluate(unwrapped[0]))
 
-  @test_util.run_in_graph_and_eager_modes
-  def testReduceOnlyFirstReplicaUpdates(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    def run_fn(device_id):
-      return constant_op.constant(3 + 5 * device_id)
-
-    dist = self._get_distribution_strategy()
-    with dist.scope():
-      result = dist.call_for_each_replica(run_fn, dist.worker_device_index)
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.ONLY_FIRST_REPLICA,
-          result,
-          destinations="/device:CPU:0")
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(1, len(unwrapped))
-      self.assertEqual(3, self.evaluate(unwrapped[0]))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testReduceToMultipleDestinations(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    devices = ["/device:GPU:0"]
-    if GPU_TEST:
-      self.assertGreater(context.num_gpus(), 0)
-    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
-
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.SUM,
+  def testMakeInputFnIterator(self, distribution):
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=2,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
+                                 expected_values)
+
+  def testGlobalStepUpdate(self, distribution):
+    self._test_global_step_update(distribution)
+
+
+def one_device_combinations():
+  return combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_cpu,
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.core_mirrored_strategy_with_one_cpu,
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph", "eager"])
+
+
+class MirroredOneDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.NamedDistribution(
+              "Mirrored1CPU",
+              lambda: mirrored_strategy.MirroredStrategy(["/device:CPU:0"]),
+              required_gpus=1),
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.NamedDistribution(
+              "CoreMirrored1CPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]),
+              required_gpus=1),
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph", "eager"]))
+  def testReduceToMultipleDestinations(self, distribution):
+    with distribution.scope():
+      reduced = distribution.extended.reduce_to(
+          reduce_util.ReduceOp.SUM,
           1.0,
           destinations=["/device:CPU:0", "/device:GPU:0"])
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(2, len(unwrapped))
+      unwrapped = distribution.unwrap(reduced)
+      self.assertLen(unwrapped, 2)
       self.assertEqual(1.0, self.evaluate(unwrapped[0]))
 
+  @combinations.generate(one_device_combinations())
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-class MirroredStrategyVariableCreationTest(test.TestCase):
+  @combinations.generate(one_device_combinations())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
+  @combinations.generate(one_device_combinations())
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Enough GPUs not available for this test in eager mode.")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSingleVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+class MirroredStrategyVariableCreatorStackTest(
+    test.TestCase, parameterized.TestCase):
 
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=["graph"]))
+  def testCreatorStacksAreThreadLocal(self, distribution):
+    def model_fn():
+      replica_id_str = str(self.evaluate(_replica_id()))
+
+      def thread_creator_fn(next_creator, *args, **kwargs):
+        return next_creator(*args, **kwargs) + ":thread_" + replica_id_str
+
+      with variable_scope.variable_creator_scope(thread_creator_fn):
+        # Create a variable in this scope.
+        v = variable_scope.variable(1.0)
+
+        # This will pause the current thread, and execute the other thread.
+        ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
+
+    def main_thread_creator(next_creator, *args, **kwargs):
+      # We are not using the underlying next_creator for test purposes.
+      del next_creator, args, kwargs
+      return "main_thread"
+
+    with context.graph_mode(), \
+        distribution.scope(), \
+        variable_scope.variable_creator_scope(main_thread_creator):
+      result = distribution.extended.call_for_each_replica(model_fn)
+      result = distribution.unwrap(result)
+      expected = ["main_thread:thread_0", "main_thread:thread_1"]
+      self.assertEqual(expected, result)
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class MirroredStrategyVariableCreationTest(test.TestCase):
+
+  def testSingleVariable(self, distribution):
     def model_fn():
       # This variable should be created only once across the threads because of
-      # special variable_creator functions used by `dist.call_for_each_replica`.
+      # special variable_creator functions used by
+      # `distribution.extended.call_for_each_replica`.
       v = variable_scope.variable(1.0, name="foo")
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEquals("foo:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnnamedVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+      self.assertEqual("foo:0", result.name)
 
+  def testUnnamedVariable(self, distribution):
     def model_fn():
       v = variable_scope.variable(1.0)
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       self.assertIsInstance(result, values.MirroredVariable)
       # Default name of "Variable" will be used.
-      self.assertEquals("Variable:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleVariables(self):
-    self._skip_eager_if_gpus_less_than(1)
+      self.assertEqual("Variable:0", result.name)
 
+  def testMultipleVariables(self, distribution):
     def model_fn():
       vs = []
       for i in range(5):
         vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return vs
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       for i, v in enumerate(result):
         self.assertIsInstance(v, values.MirroredVariable)
-        self.assertEquals("foo" + str(i) + ":0", v.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleVariablesWithSameCanonicalName(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("foo" + str(i) + ":0", v.name)
 
+  def testMultipleVariablesWithSameCanonicalName(self, distribution):
     def model_fn():
       vs = []
       vs.append(variable_scope.variable(1.0, name="foo/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
       vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return vs
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       for v in result:
         self.assertIsInstance(v, values.MirroredVariable)
-      self.assertEquals(4, len(result))
-      self.assertEquals("foo/bar:0", result[0].name)
-      self.assertEquals("foo_1/bar:0", result[1].name)
-      self.assertEquals("foo_1/bar_1:0", result[2].name)
-      self.assertEquals("foo/bar_1:0", result[3].name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testVariableWithSameCanonicalNameAcrossThreads(self):
-    self._skip_eager_if_gpus_less_than(1)
-
-    def model_fn(device_id):
-      v = variable_scope.variable(1.0, name="foo_" + str(device_id))
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
-      return v
+      self.assertEqual(4, len(result))
+      self.assertEqual("foo/bar:0", result[0].name)
+      self.assertEqual("foo_1/bar:0", result[1].name)
+      self.assertEqual("foo_1/bar_1:0", result[2].name)
+      self.assertEqual("foo/bar_1:0", result[3].name)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
+  def testVariableWithSameCanonicalNameAcrossThreads(self, distribution):
+    def model_fn():
+      replica_id = self.evaluate(_replica_id())
+      v = variable_scope.variable(1.0, name="foo_" + str(replica_id))
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-    with dist.scope():
-      result = dist.call_for_each_replica(
-          model_fn, dist.worker_device_index, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       self.assertIsInstance(result, values.MirroredVariable)
       # The resulting mirrored variable will use the name from the first device.
-      self.assertEquals("foo_0:0", result.name)
+      self.assertEqual("foo_0:0", result.name)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithLayers(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testWithLayers(self, distribution):
     def model_fn(features):
       with variable_scope.variable_scope("common"):
         layer1 = core.Dense(1)
@@ -308,17 +298,14 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         layer2 = core.Dense(1)
         layer2(features)
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         layer3 = core.Dense(1)
         layer3(features)
         return [(layer1.kernel, layer1.bias),
                 (layer2.kernel, layer2.bias),
                 (layer3.kernel, layer3.bias)]
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-    ds = dist.distribute_dataset(
+    ds = distribution.distribute_dataset(
         lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
     if context.executing_eagerly():
       iterator = ds.make_one_shot_iterator()
@@ -328,27 +315,23 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     features = iterator.get_next()
 
-    with dist.scope():
-      result = dist.call_for_each_replica(
-          model_fn, features, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=(features,))
       suffixes = ["", "_1", "_2"]
       for (kernel, bias), suffix in zip(result, suffixes):
         self.assertIsInstance(kernel, values.MirroredVariable)
-        self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name)
+        self.assertEqual("common/dense" + suffix + "/kernel:0", kernel.name)
         self.assertIsInstance(bias, values.MirroredVariable)
-        self.assertEquals("common/dense" + suffix + "/bias:0", bias.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithVariableAndVariableScope(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("common/dense" + suffix + "/bias:0", bias.name)
 
+  def testWithVariableAndVariableScope(self, distribution):
     def model_fn():
       v0 = variable_scope.variable(1.0, name="var0", aggregation=None)
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.variable(1.0, name="var1")
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         v2 = variable_scope.variable(
             1.0,
             name="var2",
@@ -362,37 +345,31 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       return v0, v1, v2, v3
 
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+    with distribution.scope():
       v = variable_scope.variable(1.0, name="var-main0")
-      self.assertEquals("var-main0:0", v.name)
+      self.assertEqual("var-main0:0", v.name)
 
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
-      self.assertEquals(4, len(result))
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual(4, len(result))
       v0, v1, v2, v3 = result
       self.assertIsInstance(v0, values.MirroredVariable)
-      self.assertEquals("var0:0", v0.name)
+      self.assertEqual("var0:0", v0.name)
       self.assertIsInstance(v1, values.MirroredVariable)
-      self.assertEquals("common/var1:0", v1.name)
+      self.assertEqual("common/var1:0", v1.name)
       self.assertIsInstance(v2, values.ReplicaLocalVariable)
-      self.assertEquals("common/var2:0", v2.name)
-      self.assertEquals(variable_scope.VariableAggregation.SUM, v2.aggregation)
+      self.assertEqual("common/var2:0", v2.name)
+      self.assertEqual(variable_scope.VariableAggregation.SUM, v2.aggregation)
       self.assertIsInstance(v3, values.MirroredVariable)
-      self.assertEquals("common/var3:0", v3.name)
-      self.assertEquals(variable_scope.VariableAggregation.MEAN, v3.aggregation)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithGetVariableAndVariableScope(self):
-    self._skip_eager_if_gpus_less_than(1)
+      self.assertEqual("common/var3:0", v3.name)
+      self.assertEqual(variable_scope.VariableAggregation.MEAN, v3.aggregation)
 
+  def testWithGetVariableAndVariableScope(self, distribution):
     def model_fn():
       v0 = variable_scope.get_variable("var0", [1])
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.get_variable("var1", [1])
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         v2 = variable_scope.get_variable(
             "var2", [1],
             synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -404,33 +381,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       return v0, v1, v2, v3
 
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+    with distribution.scope():
       with variable_scope.variable_scope("main"):
         v = variable_scope.get_variable("var-main0", [1])
-        self.assertEquals("main/var-main0:0", v.name)
+        self.assertEqual("main/var-main0:0", v.name)
 
-        result = dist.call_for_each_replica(model_fn, run_concurrently=False)
-        self.assertEquals(4, len(result))
+        result = distribution.extended.call_for_each_replica(model_fn)
+        self.assertEqual(4, len(result))
         v0, v1, v2, v3 = result
         self.assertIsInstance(v0, values.MirroredVariable)
-        self.assertEquals("main/var0:0", v0.name)
+        self.assertEqual("main/var0:0", v0.name)
         self.assertIsInstance(v1, values.MirroredVariable)
-        self.assertEquals("main/common/var1:0", v1.name)
+        self.assertEqual("main/common/var1:0", v1.name)
         self.assertIsInstance(v2, values.ReplicaLocalVariable)
-        self.assertEquals("main/common/var2:0", v2.name)
-        self.assertEquals(variable_scope.VariableAggregation.SUM,
-                          v2.aggregation)
+        self.assertEqual("main/common/var2:0", v2.name)
+        self.assertEqual(variable_scope.VariableAggregation.SUM,
+                         v2.aggregation)
         self.assertIsInstance(v3, values.MirroredVariable)
-        self.assertEquals("main/common/var3:0", v3.name)
-        self.assertEquals(variable_scope.VariableAggregation.MEAN,
-                          v3.aggregation)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testOnlyFirstReplicaUpdatesVariables(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("main/common/var3:0", v3.name)
+        self.assertEqual(variable_scope.VariableAggregation.MEAN,
+                         v3.aggregation)
 
+  def testOnlyFirstReplicaUpdatesVariables(self, distribution):
     def create_fn():
       aggregation = variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
       v0 = variable_scope.variable(
@@ -446,71 +418,73 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       return v0, v1
 
     devices = ["/device:GPU:0", "/device:CPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      v0, v1 = dist.call_for_each_replica(create_fn, run_concurrently=False)
+    with distribution.scope():
+      v0, v1 = distribution.extended.call_for_each_replica(create_fn)
       self.evaluate(v0.initializer)
       self.assertEqual(2.0, self.evaluate(v0.get(devices[0])))
       self.assertEqual(2.0, self.evaluate(v0.get(devices[1])))
-      self.assertEqual(2.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0, self.evaluate(distribution.extended.read_var(v0)))
       self.evaluate(v1.initializer)
       self.assertEqual(3.0, self.evaluate(v1.get(devices[0])))
       self.assertEqual(3.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0, self.evaluate(dist.read_var(v1)))
+      self.assertEqual(3.0, self.evaluate(distribution.extended.read_var(v1)))
+
+      def replica_id_plus_one():
+        return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32)
 
       # Update using the assign_add member function.
-      def update_member_fn(device_id):
-        update0 = v0.assign_add(5.0 * (device_id + 1))
-        update1 = v1.assign_add(7.0 * (device_id + 1))
+      def update_member_fn():
+        update0 = v0.assign_add(5.0 * replica_id_plus_one())
+        update1 = v1.assign_add(7.0 * replica_id_plus_one())
         return update0, update1
 
-      update0a, update1a = dist.call_for_each_replica(
-          update_member_fn, dist.worker_device_index, run_concurrently=False)
+      update0a, update1a = distribution.extended.call_for_each_replica(
+          update_member_fn)
 
       # Update "sync on read" variable.
-      self.evaluate(dist.group(update0a))
+      self.evaluate(distribution.group(update0a))
       self.assertEqual(2.0 + 5.0, self.evaluate(v0.get(devices[0])))
       # Writes are not synchronized for "sync on read" variables,
       # so device[1] can end up with a different value.
       self.assertEqual(2.0 + 2*5.0, self.evaluate(v0.get(devices[1])))
       # Always reads from device 0.
-      self.assertEqual(2.0 + 5.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0 + 5.0, self.evaluate(
+          distribution.extended.read_var(v0)))
 
       # Update "sync on write" variable.
-      self.evaluate(dist.group(update1a))
+      self.evaluate(distribution.group(update1a))
       self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[0])))
       # Writes are synchronized for v1, only the argument to assign_add on
       # device[0] is used.
       self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0 + 7.0, self.evaluate(dist.read_var(v1)))
+      self.assertEqual(3.0 + 7.0, self.evaluate(
+          distribution.extended.read_var(v1)))
 
       # Update using state_ops.assign_add global function.
-      def update_state_ops_fn(device_id):
-        update0 = state_ops.assign_add(v0, 11.0 * (device_id + 1))
-        update1 = state_ops.assign_add(v1, 13.0 * (device_id + 1))
+      def update_state_ops_fn():
+        update0 = state_ops.assign_add(v0, 11.0 * replica_id_plus_one())
+        update1 = state_ops.assign_add(v1, 13.0 * replica_id_plus_one())
         return update0, update1
 
-      update0b, update1b = dist.call_for_each_replica(
-          update_state_ops_fn, dist.worker_device_index, run_concurrently=False)
-      self.evaluate(dist.group(update0b))
+      update0b, update1b = distribution.extended.call_for_each_replica(
+          update_state_ops_fn)
+      self.evaluate(distribution.group(update0b))
 
       # Update "sync on read" variable.
       self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(v0.get(devices[0])))
       self.assertEqual(2.0 + 2*5.0 + 2*11.0, self.evaluate(v0.get(devices[1])))
-      self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(
+          distribution.extended.read_var(v0)))
 
       # Update "sync on write" variable.
-      self.evaluate(dist.group(update1b))
+      self.evaluate(distribution.group(update1b))
       self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[0])))
       self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(dist.read_var(v1)))
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNoneSynchronizationWithGetVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+      self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(
+          distribution.extended.read_var(v1)))
+
+  def testNoneSynchronizationWithGetVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "`NONE` variable synchronization mode is not "
           "supported with `Mirrored` distribution strategy. Please change "
@@ -519,12 +493,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             "v", [1],
             synchronization=variable_scope.VariableSynchronization.NONE)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNoneSynchronizationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testNoneSynchronizationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "`NONE` variable synchronization mode is not "
           "supported with `Mirrored` distribution strategy. Please change "
@@ -534,23 +504,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             name="v",
             synchronization=variable_scope.VariableSynchronization.NONE)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidSynchronizationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidSynchronizationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable synchronization mode: Invalid for "
           "variable: v"):
         variable_scope.variable(1.0, name="v", synchronization="Invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidAggregationWithGetVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidAggregationWithGetVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable aggregation mode: invalid for "
           "variable: v"):
@@ -559,12 +521,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_WRITE,
             aggregation="invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidAggregationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidAggregationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable aggregation mode: invalid for "
           "variable: v"):
@@ -574,55 +532,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_WRITE,
             aggregation="invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testThreeDevices(self):
-    self._skip_eager_if_gpus_less_than(2)
-
-    def model_fn():
-      v = variable_scope.variable(1.0, name="foo")
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
-      return v
-
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
-      self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEquals("foo:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNonMatchingVariableCreation(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testNonMatchingVariableCreation(self, distribution):
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
+    with distribution.scope():
       names = values.DistributedValues({
           "/device:CPU:0": "foo",
           "/device:GPU:0": "bar"
       })
       with self.assertRaises(RuntimeError):
-        _ = dist.call_for_each_replica(model_fn, names, run_concurrently=False)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testReplicaLocalVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+        _ = distribution.extended.call_for_each_replica(model_fn, args=(names,))
 
+  def testReplicaLocalVariable(self, distribution):
     all_v_sum = {}
     all_v_mean = {}
     components_sum = {}
     components_mean = {}
 
-    def model_fn(device_id):
+    def model_fn():
+      replica_id = self.evaluate(_replica_id())
       v_sum = variable_scope.variable(
           1.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -633,26 +564,22 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.MEAN)
       self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
       self.assertTrue(isinstance(v_mean, values.ReplicaLocalVariable))
-      updates = [v_sum.assign_add(2.0 + device_id),
-                 v_mean.assign(6.0 * device_id)]
-      all_v_sum[device_id] = v_sum
-      all_v_mean[device_id] = v_mean
+      updates = [v_sum.assign_add(2.0 + replica_id),
+                 v_mean.assign(6.0 * replica_id)]
+      all_v_sum[replica_id] = v_sum
+      all_v_mean[replica_id] = v_mean
       c_sum = v_sum.get()
       c_mean = v_mean.get()
-      components_sum[device_id] = c_sum
-      components_mean[device_id] = c_mean
+      components_sum[replica_id] = c_sum
+      components_mean[replica_id] = c_mean
       self.assertIsNot(v_sum, c_sum)
       self.assertIsNot(v_mean, c_mean)
       return updates, v_sum, v_mean, c_sum, c_mean
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
+    with distribution.scope():
       # Create "sum" and "mean" versions of ReplicaLocalVariables.
       ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = (
-          dist.call_for_each_replica(
-              model_fn, dist.worker_device_index, run_concurrently=False))
+          distribution.extended.call_for_each_replica(model_fn))
       # Should see the same wrapping instance in all replicas.
       self.assertIs(all_v_sum[0], ret_v_sum)
       self.assertIs(all_v_mean[0], ret_v_mean)
@@ -667,10 +594,10 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       # Apply updates
       self.evaluate(variables.global_variables_initializer())
-      self.evaluate([y for x in ret_ops for y in dist.unwrap(x)])
+      self.evaluate([y for x in ret_ops for y in distribution.unwrap(x)])
       expected_sum = 0.0
       expected_mean = 0.0
-      for i, d in enumerate(dist.worker_devices):
+      for i, d in enumerate(distribution.extended.worker_devices):
         # Should see different values on different devices.
         v_sum_value = self.evaluate(ret_v_sum.get(d).read_value())
         v_mean_value = self.evaluate(ret_v_mean.get(d).read_value())
@@ -680,69 +607,125 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         expected = i * 6.0
         self.assertEqual(expected, v_mean_value)
         expected_mean += expected
-      expected_mean /= len(dist.worker_devices)
+      expected_mean /= len(distribution.extended.worker_devices)
 
       # Without get(device), should return the value you get by
       # applying the reduction across all replicas (whether you use
       # read_var(), get(), or nothing).
-      self.assertEqual(expected_sum, self.evaluate(dist.read_var(ret_v_sum)))
-      self.assertEqual(expected_mean, self.evaluate(dist.read_var(ret_v_mean)))
+      self.assertEqual(expected_sum, self.evaluate(
+          distribution.extended.read_var(ret_v_sum)))
+      self.assertEqual(expected_mean, self.evaluate(
+          distribution.extended.read_var(ret_v_mean)))
       self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get()))
       self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get()))
       self.assertEqual(expected_sum, self.evaluate(ret_v_sum))
       self.assertEqual(expected_mean, self.evaluate(ret_v_mean))
 
+  # TODO(priyag): Update this test to work in eager mode as well.
+  def testDynamicRnnVariables(self, distribution):
+    def model_fn():
+      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
+      cell_fw = rnn_cell_impl.LSTMCell(300)
+      cell_bw = rnn_cell_impl.LSTMCell(300)
+      (outputs, _) = rnn.bidirectional_dynamic_rnn(
+          cell_fw,
+          cell_bw,
+          inputs,
+          dtype=dtypes.float32)
+      return outputs
+
+    with context.graph_mode(), distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      # Two variables are created by the RNN layer.
+      self.assertEqual(2, len(result))
+      for v in result:
+        self.assertIsInstance(v, values.DistributedValues)
+        _, v1 = distribution.unwrap(v)
+        self.assertStartsWith(v1._op.name, "replica_1/")
+
+  def testReplicaLocalVariableUpdate(self, distribution):
+    def model_fn():
+      v_sum = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ,
+          aggregation=variable_scope.VariableAggregation.SUM)
+      self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
+      return v_sum
+
+    def update(var, value):
+      return var.assign(value)
+
+    with distribution.scope():
+      ret_v_sum = distribution.extended.call_for_each_replica(model_fn)
+
+      # Initialize variables.
+      self.evaluate(variables.global_variables_initializer())
+      # Assert that the aggregated value of the replica local vars is the sum
+      # of the individual values before running the update ops.
+      self.assertEqual(1.0, self.evaluate(ret_v_sum.get(
+          distribution.extended.worker_devices[0]).read_value()))
+      self.assertEqual(2.0, self.evaluate(ret_v_sum))
+
+      # Apply updates.
+      update_ops = distribution.extended.update(
+          ret_v_sum, update, args=(5.0,), group=False)
+      self.evaluate(update_ops)
+      # Assert that the aggregated value of the replica local vars is the sum
+      # of the individual values after running the update ops.
+      self.assertEqual(5.0, self.evaluate(ret_v_sum.get(
+          distribution.extended.worker_devices[0]).read_value()))
+      self.assertEqual(10.0, self.evaluate(ret_v_sum))
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph"]))
+class MirroredStrategyNameScopeTest(test.TestCase):
   # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
   # testing this in eager mode.
 
-  def testNameScope(self):
+  def testNameScope(self, distribution):
     def model_fn():
       with ops.name_scope("foo"):
         a = constant_op.constant(1.0, name="a")
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         b = constant_op.constant(1.0, name="b")
       return a, b
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
-        result = dist.call_for_each_replica(model_fn, run_concurrently=False)
-        self.assertEquals(2, len(result))
+        result = distribution.extended.call_for_each_replica(model_fn)
+        self.assertEqual(2, len(result))
         for v, name in zip(result, ["a", "b"]):
           self.assertIsInstance(v, values.DistributedValues)
-          v0, v1 = dist.unwrap(v)
-          self.assertEquals("main/foo/" + name + ":0", v0.name)
-          self.assertEquals("main/replica_1/foo/" + name + ":0", v1.name)
+          v0, v1 = distribution.unwrap(v)
+          self.assertEqual("main/foo/" + name + ":0", v0.name)
+          self.assertEqual("main/replica_1/foo/" + name + ":0", v1.name)
 
-  def testWithDefaultName(self):
+  def testWithDefaultName(self, distribution):
     def model_fn():
       with ops.name_scope(None, "foo"):
         a = constant_op.constant(1.0, name="a")
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         b = constant_op.constant(2.0, name="b")
       return a, b
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
-      self.assertEquals(2, len(result))
+    with context.graph_mode(), distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual(2, len(result))
       for v, name in zip(result, ["a", "b"]):
         self.assertIsInstance(v, values.DistributedValues)
-        v0, v1 = dist.unwrap(v)
-        self.assertEquals("foo/" + name + ":0", v0.name)
-        self.assertEquals("replica_1/foo/" + name + ":0", v1.name)
+        v0, v1 = distribution.unwrap(v)
+        self.assertEqual("foo/" + name + ":0", v0.name)
+        self.assertEqual("replica_1/foo/" + name + ":0", v1.name)
 
   # variable_scope.variable() respects name scopes when creating
   # variables. On the other hand variable_scope.get_variable() ignores name
   # scopes when creating variables. We test both methods of creating variables
   # to make sure that we have the same variable names in both cases.
-  def testNameScopeWithVariable(self):
+  def testNameScopeWithVariable(self, distribution):
     def in_cross_replica(_):
       c = variable_scope.variable(1.0, name="c")
       return c
@@ -750,32 +733,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       b = variable_scope.variable(1.0, name="b")
       with ops.name_scope("foo"):
-        c = distribution_strategy_context.get_replica_context().merge_call(
-            in_cross_replica)
+        c = ds_context.get_replica_context().merge_call(in_cross_replica)
       return b, c
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
         a = variable_scope.variable(1.0, name="a")
-        result = dist.call_for_each_replica(model_fn, run_concurrently=False)
+        result = distribution.extended.call_for_each_replica(model_fn)
       result_b = result[0]
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = dist.unwrap(a)
-      b0, b1 = dist.unwrap(result_b)
-      c0, c1 = dist.unwrap(result_c)
-      self.assertEquals("main/a:0", a0.name)
-      self.assertEquals("main/a/replica_1:0", a1.name)
-      self.assertEquals("main/b:0", b0.name)
-      self.assertEquals("main/b/replica_1:0", b1.name)
-      self.assertEquals("main/foo/c:0", c0.name)
-      self.assertEquals("main/foo/c/replica_1:0", c1.name)
-
-  def testNameScopeWithGetVariable(self):
+      a0, a1 = distribution.unwrap(a)
+      b0, b1 = distribution.unwrap(result_b)
+      c0, c1 = distribution.unwrap(result_c)
+      self.assertEqual("main/a:0", a0.name)
+      self.assertEqual("main/a/replica_1:0", a1.name)
+      self.assertEqual("main/b:0", b0.name)
+      self.assertEqual("main/b/replica_1:0", b1.name)
+      self.assertEqual("main/foo/c:0", c0.name)
+      self.assertEqual("main/foo/c/replica_1:0", c1.name)
+
+  def testNameScopeWithGetVariable(self, distribution):
     def in_cross_replica(_):
       c = variable_scope.get_variable("c", [1])
       return c
@@ -783,118 +762,78 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       b = variable_scope.get_variable("b", [1])
       with ops.name_scope("foo"):
-        c = distribution_strategy_context.get_replica_context().merge_call(
-            in_cross_replica)
+        c = ds_context.get_replica_context().merge_call(in_cross_replica)
       return b, c
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
         a = variable_scope.get_variable("a", [1])
-        result = dist.call_for_each_replica(model_fn, run_concurrently=False)
+        result = distribution.extended.call_for_each_replica(model_fn)
       result_b = result[0]
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = dist.unwrap(a)
-      b0, b1 = dist.unwrap(result_b)
-      c0, c1 = dist.unwrap(result_c)
-      self.assertEquals("a:0", a0.name)
-      self.assertEquals("a/replica_1:0", a1.name)
-      self.assertEquals("b:0", b0.name)
-      self.assertEquals("b/replica_1:0", b1.name)
-      self.assertEquals("c:0", c0.name)
-      self.assertEquals("c/replica_1:0", c1.name)
-
-  def testDynamicRnnVariables(self):
+      a0, a1 = distribution.unwrap(a)
+      b0, b1 = distribution.unwrap(result_b)
+      c0, c1 = distribution.unwrap(result_c)
+      self.assertEqual("a:0", a0.name)
+      self.assertEqual("a/replica_1:0", a1.name)
+      self.assertEqual("b:0", b0.name)
+      self.assertEqual("b/replica_1:0", b1.name)
+      self.assertEqual("c:0", c0.name)
+      self.assertEqual("c/replica_1:0", c1.name)
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.NamedDistribution(
+            "Mirrored3Devices",
+            # pylint: disable=g-long-lambda
+            lambda: mirrored_strategy.MirroredStrategy(
+                ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+            required_gpus=2),
+        combinations.NamedDistribution(
+            "CoreMirrored3Devices",
+            # pylint: disable=g-long-lambda
+            lambda: mirrored_strategy.CoreMirroredStrategy(
+                ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+            required_gpus=2)],
+    mode=["graph", "eager"]))
+class MirroredThreeDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
+
+  def testThreeDevices(self, distribution):
     def model_fn():
-      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
-      cell_fw = rnn_cell_impl.LSTMCell(300)
-      cell_bw = rnn_cell_impl.LSTMCell(300)
-      (outputs, _) = rnn.bidirectional_dynamic_rnn(
-          cell_fw,
-          cell_bw,
-          inputs,
-          dtype=dtypes.float32)
-      return outputs
-
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
-      result = dist.call_for_each_replica(model_fn, run_concurrently=False)
-      # Two variables are created by the RNN layer.
-      self.assertEquals(2, len(result))
-      for v in result:
-        self.assertIsInstance(v, values.DistributedValues)
-        _, v1 = dist.unwrap(v)
-        self.assertStartsWith(v1.name, "replica_1/")
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testReplicaLocalVariableUpdate(self):
-    with context.graph_mode():
-
-      def model_fn():
-        v_sum = variable_scope.variable(
-            1.0,
-            synchronization=variable_scope.VariableSynchronization.ON_READ,
-            aggregation=variable_scope.VariableAggregation.SUM)
-        self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
-        return v_sum
-
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:GPU:1"])
-
-      def update(var, value):
-        return var.assign(value)
-
-      with dist.scope():
-        ret_v_sum = dist.call_for_each_replica(model_fn, run_concurrently=False)
-        update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False)
-
-        # Initialize variables.
-        self.evaluate(variables.global_variables_initializer())
-        # Assert that the aggregated value of the replica local vars is the sum
-        # of the individual values before running the update ops.
-        self.assertEquals(1.0, self.evaluate(
-            ret_v_sum.get(dist._devices[0]).read_value()))
-        self.assertEquals(2.0, self.evaluate(ret_v_sum))
+      v = variable_scope.variable(1.0, name="foo")
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-        # Apply updates.
-        self.evaluate(update_ops)
-        # Assert that the aggregated value of the replica local vars is the sum
-        # of the individual values after running the update ops.
-        self.assertEquals(5.0, self.evaluate(
-            ret_v_sum.get(dist._devices[0]).read_value()))
-        self.assertEquals(10.0, self.evaluate(ret_v_sum))
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEqual("foo:0", result.name)
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredVariableUpdateTest(test.TestCase):
   # The following tests check assign, assign_add and assign_sub on Mirrored
   # variables in replica and cross replica context.
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Enough GPUs not available for this test in eager mode.")
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContextWithoutAggregationType(self):
+  def testAssignMirroredVarReplicaContextWithoutAggregationType(self,
+                                                                distribution):
     # Test that we always have an aggregation type set on the mirrored variable
     # if we assign to it in replica mode.
-    self._skip_eager_if_gpus_less_than(1)
     def var_fn():
       v = variable_scope.variable(1.0, name="foo")
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
 
@@ -904,23 +843,19 @@ class MirroredVariableUpdateTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError, "You must specify an aggregation method to update a "
                       "MirroredVariable in Replica Context."):
-        self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
+        self.evaluate(distribution.unwrap(
+            distribution.extended.call_for_each_replica(model_fn)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContextWithSum(self):
-    # Test that we don't reduce a non-per-device value with the "sum"
+  def testAssignMirroredVarReplicaContextWithSum(self, distribution):
+    # Test that we don't reduce a non-per-replica value with the "sum"
     # aggregation type.
-    self._skip_eager_if_gpus_less_than(1)
     def var_fn():
       v = variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.SUM)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
 
@@ -929,225 +864,184 @@ class MirroredVariableUpdateTest(test.TestCase):
 
       with self.assertRaisesRegexp(
           ValueError, "A non-DistributedValues value 5.0 cannot be reduced "
-          "with the given aggregation VariableAggregation.SUM."):
-        self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
+          "with the given reduce op ReduceOp.SUM."):
+        self.evaluate(distribution.unwrap(
+            distribution.extended.call_for_each_replica(model_fn)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarCrossDeviceContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(1.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign(6.0))
-      self.assertEquals(6.0, mirrored_var_result)
+      self.assertEqual(6.0, mirrored_var_result)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_replica_context().replica_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(0.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(0.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign(5.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarCrossDeviceContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(1.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       # read_value == True
       mirrored_var_result = self.evaluate(
           mirrored_var.assign_add(6.0, read_value=True))
-      self.assertEquals(7.0, mirrored_var_result)
-      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
-      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(7.0, mirrored_var_result)
+      self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
       # read_value == False
       self.evaluate(mirrored_var.assign_add(2.0, read_value=False))
-      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
-      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarReplicaContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_replica_context().replica_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign_add(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(1.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(1.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarReplicaContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign_add(5.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(6.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(6.0, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarCrossDeviceContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(5.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0))
-      self.assertEquals(3.0, mirrored_var_result)
-      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
-      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(3.0, mirrored_var_result)
+      self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarReplicaContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_replica_context().replica_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign_sub(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(4.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(4.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarReplicaContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign_sub(1.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(4.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(4.0, self.evaluate(mirrored_var))
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
 
-  def testAssignMirroredVarInitializer(self):
+  def testAssignMirroredVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
     # upon construction instead of once the initialization op is run.
     with context.graph_mode():
@@ -1155,17 +1049,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         v = variable_scope.variable(1.0, name="foo")
         return v
 
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:CPU:0"])
-
-      with dist.scope():
-        mirrored_var = dist.call_for_each_replica(var_fn)
+      with distribution.scope():
+        mirrored_var = distribution.extended.call_for_each_replica(var_fn)
         self.assertIsInstance(mirrored_var, values.MirroredVariable)
         self.assertFalse(self.evaluate(mirrored_var.is_initialized()))
         self.evaluate(mirrored_var.initializer)
         self.assertTrue(self.evaluate(mirrored_var.is_initialized()))
 
-  def testAssignReplicaLocalVarInitializer(self):
+  def testAssignReplicaLocalVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
     # upon construction instead of once the initialization op is run.
     with context.graph_mode():
@@ -1177,11 +1068,9 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
         return v_sum
 
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:CPU:0"])
-
-      with dist.scope():
-        replica_local_var = dist.call_for_each_replica(model_fn)
+      with distribution.scope():
+        replica_local_var = distribution.extended.call_for_each_replica(
+            model_fn)
         self.assertTrue(isinstance(replica_local_var,
                                    values.ReplicaLocalVariable))
         self.assertFalse(self.evaluate(replica_local_var.is_initialized()))
@@ -1189,17 +1078,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         self.assertTrue(self.evaluate(replica_local_var.is_initialized()))
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class ReplicaLocalVariableAssignTest(test.TestCase):
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Not enough GPUs available for this test in eager mode.")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignReplicaLocalVarSumAggregation(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignReplicaLocalVarSumAggregation(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
@@ -1207,19 +1093,16 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.SUM)
       return v_sum
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      replica_local_var = dist.call_for_each_replica(model_fn,
-                                                     run_concurrently=False)
+    with distribution.scope():
+      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
       self.assertTrue(isinstance(replica_local_var,
                                  values.ReplicaLocalVariable))
       self.evaluate(variables.global_variables_initializer())
       # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the SUM of each of
       # values on each of the replicas.
-      self.assertEqual(2.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(2.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
       # Assigning 6.0 in cross replica context will assign a value of
       # 6.0/num_replicas to each replica.
       tlv_ops = replica_local_var.assign(6.0)
@@ -1227,11 +1110,10 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       # On reading the replica local var we should get the assigned value back.
       # The value on all the replicas are added before being returned by
       # `read_var`.
-      self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(6.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignReplicaLocalVarMeanAggregation(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignReplicaLocalVarMeanAggregation(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
@@ -1239,24 +1121,22 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.MEAN)
       return v_sum
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      replica_local_var = dist.call_for_each_replica(model_fn,
-                                                     run_concurrently=False)
+    with distribution.scope():
+      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
       self.assertTrue(isinstance(replica_local_var,
                                  values.ReplicaLocalVariable))
       self.evaluate(variables.global_variables_initializer())
       # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the MEAN of values
       # on all replicas which is the value assigned in replica context.
-      self.assertEqual(1.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(1.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
       tlv_ops = replica_local_var.assign(6.0)
       self.evaluate(tlv_ops)
       # On reading the replica local var we should get the MEAN of all values
       # which is equal to the value assigned.
-      self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(6.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
 
 
 class MockModel(object):
@@ -1290,25 +1170,25 @@ class MiniModel(keras_training.Model):
     return self.fc(inputs)
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredStrategyDefunTest(test.TestCase):
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Not enough GPUs available for this test in eager mode.")
-
-  def _call_and_check(self, model_fn, inputs, expected_result, defuns,
-                      two_variables=False):
+  def _call_and_check(self, distribution, model_fn, inputs, expected_result,
+                      defuns, two_variables=False):
     cpu_dev = device_util.canonicalize("CPU:0")
     gpu_dev = device_util.canonicalize("GPU:0")
     devices = [cpu_dev, gpu_dev]
-    dist = mirrored_strategy.MirroredStrategy(devices)
 
-    with dist.scope():
+    with distribution.scope():
       mock_model = MockModel(two_variables)
       self.evaluate(variables.global_variables_initializer())
 
-      result = dist.call_for_each_replica(model_fn, mock_model, *inputs,
-                                          run_concurrently=False)
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=[mock_model] + inputs)
       for device in devices:
         device_result = values.select_device(device, result)
         device_expected_result = values.select_device(device, expected_result)
@@ -1320,18 +1200,15 @@ class MirroredStrategyDefunTest(test.TestCase):
         # call_for_each has one trace per device. To check that the expected set
         # of variables was accessed on each trace, we first retrieve each
         # device-specific graph function.
-        per_device_graph_functions = dist.call_for_each_replica(
-            defun.get_concrete_function,
-            mock_model, *inputs, run_concurrently=False)
+        per_replica_graph_functions = (
+            distribution.extended.call_for_each_replica(
+                defun.get_concrete_function, args=[mock_model] + inputs))
         for device in devices:
-          graph_function = per_device_graph_functions.get(device=device)
+          graph_function = per_replica_graph_functions.get(device=device)
           self.assertEqual(set(mock_model.variables),
                            set(graph_function.graph.variables))
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testVariableInDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testVariableInDefun(self, distribution):
     @function.defun
     def times_two(mock_model):
       return mock_model()
@@ -1339,12 +1216,9 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return times_two(mock_model)
 
-    self._call_and_check(model_fn, [], 2.5, [times_two])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testVariableInNestedDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 2.5, [times_two])
 
+  def testVariableInNestedDefun(self, distribution):
     @function.defun
     def times_two(mock_model):
       return mock_model()
@@ -1356,12 +1230,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return two_x_plus_one(mock_model)
 
-    self._call_and_check(model_fn, [], 3.5, [times_two, two_x_plus_one])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testTwoVariablesInNestedDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 3.5,
+                         [times_two, two_x_plus_one])
 
+  def testTwoVariablesInNestedDefun(self, distribution):
     @function.defun
     def fn1(mock_model):
       return mock_model()
@@ -1373,12 +1245,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return fn2(mock_model)
 
-    self._call_and_check(model_fn, [], 5.5, [fn1, fn2], two_variables=True)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testGradientTapeOverNestedDefuns(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 5.5, [fn1, fn2],
+                         two_variables=True)
 
+  def testGradientTapeOverNestedDefuns(self, distribution):
     @function.defun
     def fn1(mock_model):
       return mock_model()
@@ -1394,32 +1264,21 @@ class MirroredStrategyDefunTest(test.TestCase):
                              [v.get() for v in mock_model.variables])
       return grads
 
-    self._call_and_check(model_fn, [], [2.0, 1.0], [fn1, fn2],
+    self._call_and_check(distribution, model_fn, [], [2.0, 1.0], [fn1, fn2],
                          two_variables=True)
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testPassPerDevice(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testPassPerReplica(self, distribution):
     @function.defun
     def fn1(mock_model, factor):
       return mock_model(factor)
 
-    factors = values.PerDevice({"CPU:0": 5.0, "GPU:0": 3.0})
-    expected_result = values.PerDevice({"CPU:0": 5.0 * 1.25,
-                                        "GPU:0": 3.0 * 1.25})
-    self._call_and_check(fn1, [factors], expected_result, [fn1])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testTrain(self):
-    self._skip_eager_if_gpus_less_than(1)
-
-    cpu_dev = device_util.canonicalize("CPU:0")
-    gpu_dev = device_util.canonicalize("GPU:0")
-    devices = [cpu_dev, gpu_dev]
-    dist = mirrored_strategy.MirroredStrategy(devices)
+    factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0})
+    expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25,
+                                         "GPU:0": 3.0 * 1.25})
+    self._call_and_check(distribution, fn1, [factors], expected_result, [fn1])
 
-    with dist.scope():
+  def testTrain(self, distribution):
+    with distribution.scope():
       mock_model = MiniModel()
       mock_model.call = function.defun(mock_model.call)
 
@@ -1429,11 +1288,11 @@ class MirroredStrategyDefunTest(test.TestCase):
 
       gradients_fn = backprop.implicit_grad(loss_fn)
       gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
-      grads_and_vars = dist.call_for_each_replica(
-          gradients_fn, None, run_concurrently=False)
+      grads_and_vars = distribution.extended.call_for_each_replica(
+          gradients_fn, args=(None,))
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.25)
-      update_ops = optimizer._distributed_apply(dist, grads_and_vars)  # pylint: disable=protected-access
+      update_ops = optimizer._distributed_apply(distribution, grads_and_vars)  # pylint: disable=protected-access
 
       if not context.executing_eagerly():
         self.evaluate(variables.global_variables_initializer())
@@ -1445,30 +1304,73 @@ class MirroredStrategyDefunTest(test.TestCase):
       self.assertAllEqual([0.5], updated_var_values[1])
 
 
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            combinations.NamedDistribution(
+                "Mirrored",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    num_gpus_per_worker=context.num_gpus()),
+                required_gpus=1),
+            combinations.NamedDistribution(
+                "CoreMirrored",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    num_gpus_per_worker=context.num_gpus()),
+                required_gpus=1)
+        ],
+        mode=["graph"]))
 class MultiWorkerMirroredStrategyTest(
     multi_worker_test_base.MultiWorkerTestBase,
     strategy_test_lib.DistributionTestBase):
 
-  def _get_distribution_strategy(self):
+  def _configure_distribution_strategy(self, distribution):
     cluster_spec = server_lib.ClusterSpec({
         "worker": ["/job:worker/task:0", "/job:worker/task:1"]
     })
-    strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
-    strategy.configure(cluster_spec=cluster_spec)
-    return strategy
+    distribution.configure(cluster_spec=cluster_spec)
 
-  def test_num_replicas_in_sync(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    strategy = self._get_distribution_strategy()
+  def test_num_replicas_in_sync(self, distribution):
+    self._configure_distribution_strategy(distribution)
     # We calculate the total number of gpus across the workers(2) specified in
     # the cluster spec.
-    self.assertEqual(context.num_gpus() * 2, strategy.num_replicas_in_sync)
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy(),
-                                   learning_rate=0.05)
+    self.assertEqual(context.num_gpus() * 2, distribution.num_replicas_in_sync)
+
+  def testMinimizeLossGraph(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    self._test_minimize_loss_graph(distribution, learning_rate=0.05)
+
+  def testDeviceScope(self, distribution):
+    """Test the device scope of multi-worker MirroredStrategy."""
+    self._configure_distribution_strategy(distribution)
+    with distribution.scope():
+      a = constant_op.constant(1.)
+      with ops.device("/cpu:0"):
+        b = constant_op.constant(1.)
+      self.assertEqual(a.device, "/job:worker/task:0")
+      self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
+
+  def testMakeInputFnIterator(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    num_gpus = context.num_gpus()
+    num_workers = 2
+
+    expected_values = [[i+j for j in range(num_gpus)] * num_workers
+                       for i in range(0, 100, num_gpus)]
+
+    with context.graph_mode(), self.cached_session() as sess:
+      # `expected_input_pipeline_id` is None because the input_fn will be called
+      # multiple times, each with a different input_pipeline_id.
+      input_fn = self._input_fn_to_test_input_context(
+          dataset_fn,
+          expected_num_replicas_in_sync=num_workers*num_gpus,
+          expected_num_input_pipelines=num_workers,
+          expected_input_pipeline_id=None)
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      self._test_input_fn_iterator(
+          iterator, distribution.extended.worker_devices, expected_values, sess)
 
 
 class MultiWorkerMirroredStrategyTestWithChief(
@@ -1488,6 +1390,19 @@ class MultiWorkerMirroredStrategyTestWithChief(
     strategy.configure(cluster_spec=self._cluster_spec)
     self._test_minimize_loss_graph(strategy, learning_rate=0.05)
 
+  def testMinimizeLossGraphCoreMirroredStrategy(self):
+    strategy = mirrored_strategy.CoreMirroredStrategy(
+        num_gpus_per_worker=context.num_gpus())
+    strategy.configure(cluster_spec=self._cluster_spec)
+    self._test_minimize_loss_graph(strategy, learning_rate=0.05)
+
+
+def _replica_id():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if not isinstance(replica_id, ops.Tensor):
+    replica_id = constant_op.constant(replica_id)
+  return replica_id
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
deleted file mode 100644
index 2bfe0f3e7a66311c9b0673761b73382e477cb24b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for class MirroredStrategy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
-
-
-class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
-
-  def _get_distribution_strategy(self):
-    return mirrored_strategy.MirroredStrategy(["/device:CPU:0"])
-
-  def testMinimizeLossEager(self):
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
-
-  def testMapReduce(self):
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testReplicaId(self):
-    self._test_replica_id(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-
-class VariableCreatorStackTest(test.TestCase):
-
-  def testCreatorStacksAreThreadLocal(self):
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-
-    def model_fn(device_id):
-      assert isinstance(device_id, int)
-
-      def thread_creator_fn(next_creator, *args, **kwargs):
-        return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
-
-      with variable_scope.variable_creator_scope(thread_creator_fn):
-        # Create a variable in this scope.
-        v = variable_scope.variable(1.0)
-
-        # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
-      return v
-
-    def main_thread_creator(next_creator, *args, **kwargs):
-      # We are not using the underlying next_creator for test purposes.
-      del next_creator, args, kwargs
-      return "main_thread"
-
-    with context.graph_mode(), \
-        dist.scope(), \
-        variable_scope.variable_creator_scope(main_thread_creator):
-      result = dist.call_for_each_replica(model_fn, dist.worker_device_index)
-      result = dist.unwrap(result)
-      expected = ["main_thread:thread_0", "main_thread:thread_1"]
-      self.assertEquals(expected, result)
-
-
-class MultiWorkerMirroredStrategyTest(test.TestCase):
-
-  def testDeviceScope(self):
-    """Test the device scope of multi-worker MirroredStrategy."""
-    with context.graph_mode():
-      strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
-      strategy.configure(
-          cluster_spec={"worker": ["/job:worker/task:0", "/job:worker/task:1"]})
-      with strategy.scope():
-        a = constant_op.constant(1.)
-        with ops.device("/cpu:0"):
-          b = constant_op.constant(1.)
-        self.assertEqual(a.device, "/job:worker/task:0")
-        self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
index 815644421e36cc397d6faebf9abd9c54bab557de..c492d8bafc9024ed059f05b92e5466f3702726b9 100644
--- a/tensorflow/contrib/distribute/python/moving_averages_test.py
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -32,7 +32,8 @@ from tensorflow.python.training import moving_averages
 all_combinations = combinations.combine(
     distribution=[combinations.default_strategy,
                   combinations.one_device_strategy,
-                  combinations.mirrored_strategy_with_gpu_and_cpu],
+                  combinations.mirrored_strategy_with_gpu_and_cpu,
+                  combinations.core_mirrored_strategy_with_gpu_and_cpu],
     mode=["graph"])
 
 
@@ -93,7 +94,8 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       var = variables.Variable([10.0, 11.0])
       val = constant_op.constant([1.0, 2.0])
       decay = 0.25
-      # NOTE(josh11b): We currently generate an error if val is a PerDevice value.
+      # NOTE(josh11b): We currently generate an error if val is a PerReplica
+      # value.
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
 
@@ -121,7 +123,8 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       var = variables.Variable([0.0, 0.0])
       val = array_ops.placeholder(dtypes.float32)
       decay = 0.25
-      # NOTE(josh11b): We currently generate an error if val is a PerDevice value.
+      # NOTE(josh11b): We currently generate an error if val is a PerReplica
+      # value.
       assign = moving_averages.assign_moving_average(var, val, decay)
 
       variables.global_variables_initializer().run()
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 8bdf0012087a60cd7d4acfd4eaf0ee0742275655..421507232ac26915741d422d8a23008ddb7bf143 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -20,13 +20,12 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import values
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
@@ -40,10 +39,16 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
   # doing something that won't work with other DistributionStrategy
   # implementations?
 
-  def __init__(self, device, prefetch_on_device=None):
-    super(OneDeviceStrategy, self).__init__()
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
+
+
+class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of OneDeviceStrategy."""
+
+  def __init__(self, container_strategy, device):
+    super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device
-    self._prefetch_on_device = prefetch_on_device
     self._default_device = device
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -61,18 +66,29 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     with ops.colocate_with(colocate_with):
       return next_creator(*args, **kwargs)
 
-  def distribute_dataset(self, dataset_fn):
-    return values.PerDeviceDataset(
-        self._call_dataset_fn(dataset_fn), [self._device],
-        self._prefetch_on_device)
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch."""
+    return values.DatasetIterator(dataset, [("/job:localhost", [self._device])])
+
+  def _distribute_dataset(self, dataset_fn):
+    return values.PerReplicaDataset(
+        self._call_dataset_fn(dataset_fn), [self._device])
 
-  def _broadcast(self, tensor, destinations):
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    return values.InputFunctionIterator(
+        input_fn, [("/job:localhost", [self._device])],
+        [distribute_lib.InputContext()])
+
+  def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
 
   # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
@@ -84,7 +100,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
       fn_inputs = iterator.get_next()
       if not isinstance(fn_inputs, tuple):
         fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
+      fn_result = fn(ctx, fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       with ops.control_dependencies([fn_result]):
         return [i + 1] + flat_last_step_outputs
@@ -117,42 +133,25 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
     return ctx
 
-  def _call_for_each_replica(self, fn, *args, **kwargs):
-    # We don't run `fn` in multiple threads in OneDeviceStrategy.
-    kwargs.pop("run_concurrently", None)
-    with ops.device(self._device), _OneDeviceReplicaContext(self):
+  def _call_for_each_replica(self, fn, args, kwargs):
+    strategy = self._container_strategy()
+    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
       return fn(*args, **kwargs)
 
-  def map(self, map_over, fn, *args, **kwargs):
-    with ops.device(self._device):
-      return values.MapOutput([fn(m, *args, **kwargs) for m in map_over])
-
-  def _reduce(self, aggregation, value, destinations):
-    del destinations
-    if not isinstance(value, values.MapOutput):
-      return value
-    l = value.get()
-    assert l
-    with ops.device(self._device):
-      if aggregation == vs.VariableAggregation.SUM:
-        return math_ops.add_n(l)
-      elif aggregation == vs.VariableAggregation.MEAN:
-        return math_ops.add_n(l) / len(l)
-      else:
-        assert False
+  def _reduce_to(self, reduce_op, value, destinations):
+    del reduce_op, destinations
+    return value
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     # The implementations of _update() and _update_non_slot() are identical
     # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     del colocate_with
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.device(self._device), distribute_lib.UpdateContext(self._device):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -168,7 +167,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     return value
 
   @property
-  def num_replicas(self):
+  def _num_replicas_in_sync(self):
     return 1
 
   @property
@@ -183,16 +182,33 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     del var_list
     return [self._device]
 
-  def _worker_device_index(self):
-    return 0
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
 
 
 class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext for OneDeviceStrategy."""
 
   def __init__(self, distribution_strategy):
     distribute_lib.ReplicaContext.__init__(
-        self, distribution_strategy, replica_id=0)
+        self,
+        distribution_strategy,
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
 
   @property
-  def device(self):
-    return self._distribution_strategy.worker_devices[0]
+  def devices(self):
+    return [self._distribution_strategy.extended.worker_devices[0]]
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index 3fb92273924a665bf2a1ee5fc94b75273b8c5f78..d46cd6f529e363f76bfa2b22339add63530cfde8 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
@@ -35,12 +36,6 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
   def testMinimizeLossGraph(self):
     self._test_minimize_loss_graph(self._get_distribution_strategy())
 
-  def testMapReduce(self):
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    self._test_device_index(self._get_distribution_strategy())
-
   def testReplicaId(self):
     self._test_replica_id(self._get_distribution_strategy())
 
@@ -48,6 +43,20 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
   def testCallAndMergeExceptions(self):
     self._test_call_and_merge_exceptions(self._get_distribution_strategy())
 
+  @test_util.run_in_graph_and_eager_modes
+  def testMakeInputFnIterator(self):
+    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i] for i in range(10)]
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=1,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = d.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(
+        iterator, d.extended.worker_devices, expected_values)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index 0554f4a83bda28142d709020a5a648127d66eab0..fa4705af7cb592119f56686d1f693a156f7b4b13 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -51,7 +51,7 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
             distribution.call_for_each_replica(
-                model_fn, iterator.get_next(), run_concurrently=layer.built)))
+                model_fn, args=(iterator.get_next(),))))
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index bbfd94ed5c0dd5391db0f4e0043b66553b45270d..fc2d2b20c95f0260d8243b662a020ddee8a00b14 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
@@ -64,7 +64,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   Operations that occur only on the first replica (such as incrementing the
   global step), will occur on the first replica *of every worker*.
 
-  It is expected to call `call_for_each_replica(fn, *args, **kwargs)` for any
+  It is expected to call `call_for_each_replica(fn, ...)` for any
   operations which potentially can be replicated across replicas (i.e. multiple
   GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
   caution needs to be taken:
@@ -94,13 +94,21 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
         not.
     """
-    super(ParameterServerStrategy, self).__init__()
+    super(ParameterServerStrategy, self).__init__(
+        ParameterServerExtended(self, num_gpus_per_worker))
+
+
+class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of ParameterServerStrategy."""
+
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    super(ParameterServerExtended, self).__init__(container_strategy)
     self._num_gpus_per_worker = num_gpus_per_worker
     self._initialize_local(num_gpus_per_worker)
 
     # We typically don't need to do all-reduce in this strategy.
-    self._cross_tower_ops = (
-        cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+    self._cross_device_ops = (
+        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
             reduce_to_device=_LOCAL_CPU))
 
   def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
@@ -189,6 +197,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
   def _initialize_local(self, num_gpus_per_worker):
     """Initialize internal devices for local training."""
+    self._worker_device = "/job:localhost"
     # Define compute devices which is a list of device strings and one for each
     # replica. When there are GPUs, replicate operations on these GPUs.
     # Otherwise, place operations on CPU.
@@ -221,20 +230,51 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
         "ParameterServerStrategy with compute_devices = %r, "
         "variable_device = %r", self._compute_devices, self._variable_device)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
-    return values.PerDeviceDataset(
+    return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._compute_devices, True)
 
-  def _broadcast(self, tensor, destinations):
-    if not cross_tower_ops_lib.check_destinations(destinations):
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+      num_input_pipelines = multi_worker_util.worker_count(
+          self._cluster_spec, self._task_type)
+    else:
+      input_pipeline_id = 0
+      num_input_pipelines = 1
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=num_input_pipelines,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+    worker_device_pairs = [(self._worker_device, self._compute_devices)]
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs, [input_context])
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    if not cross_device_ops_lib.check_destinations(destinations):
       destinations = self._compute_devices
-    return self._cross_tower_ops.broadcast(tensor, destinations)
+    return self._cross_device_ops.broadcast(tensor, destinations)
+
+  def _allow_variable_partition(self):
+    return not context.executing_eagerly()
 
   # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
   # this creator, such as "MutableHashTable".
   def _create_variable(self, next_creator, *args, **kwargs):
-    if self.num_replicas > 1:
+    if self._num_replicas_in_sync > 1:
       aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
       if aggregation not in (
           vs.VariableAggregation.NONE,
@@ -288,39 +328,37 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       with ops.device(self._variable_device):
         return var_creator(*args, **kwargs)
 
-  def _call_for_each_replica(self, fn, *args, **kwargs):
+  def _call_for_each_replica(self, fn, args, kwargs):
     # pylint: disable=protected-access
-    return mirrored_strategy._call_for_each_replica(self, fn, *args, **kwargs)
+    return mirrored_strategy._call_for_each_replica(
+        self._container_strategy(), fn, args, kwargs)
 
   def _verify_destinations_not_different_worker(self, destinations):
+    if not self._cluster_spec:
+      return
     if destinations is None:
       return
-    for d in cross_tower_ops_lib.get_devices_from(destinations):
+    for d in cross_device_ops_lib.get_devices_from(destinations):
       d_spec = tf_device.DeviceSpec.from_string(d)
       if d_spec.job == self._task_type and d_spec.task != self._task_id:
         raise ValueError(
             "Cannot reduce to another worker: %r, current worker is %r" %
             (d, self._worker_device))
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     self._verify_destinations_not_different_worker(destinations)
     if not isinstance(value, values.DistributedValues):
       # pylint: disable=protected-access
       return mirrored_strategy._reduce_non_distributed_value(
-          self, aggregation, value, destinations)
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return self.broadcast(value.get(self._compute_devices[0]), destinations)
-    return self._cross_tower_ops.reduce(
-        aggregation, value, destinations=destinations)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return [self.broadcast(v.get(self._compute_devices[0]), d)
-              for v, d in value_destination_pairs]
+          self, reduce_op, value, destinations)
+    return self._cross_device_ops.reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
     for _, destinations in value_destination_pairs:
       self._verify_destinations_not_different_worker(destinations)
-    return self._cross_tower_ops.batch_reduce(aggregation,
-                                              value_destination_pairs)
+    return self._cross_device_ops.batch_reduce(reduce_op,
+                                               value_destination_pairs)
 
   def _select_single_value(self, structured):
     """Select any single values in `structured`."""
@@ -334,9 +372,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
               "You cannot update variable with a Mirrored object with multiple "
               "components %r when using ParameterServerStrategy. You must "
               "specify a single value or a Mirrored with a single value." % x)
-      elif isinstance(x, values.PerDevice):
+      elif isinstance(x, values.PerReplica):
         raise ValueError(
-            "You cannot update variable with a PerDevice object %r when using "
+            "You cannot update variable with a PerReplica object %r when using "
             "ParameterServerStrategy. You must specify a single value or a "
             "Mirrored with a single value" % x)
       else:
@@ -344,30 +382,26 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
     return nest.map_structure(_select_fn, structured)
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     if isinstance(var, values.AggregatingVariable):
       var = var.get()
     if not isinstance(var, resource_variable_ops.ResourceVariable):
       raise ValueError(
           "You can not update `var` %r. It must be a Variable." % var)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
       result = fn(var, *self._select_single_value(args),
                   **self._select_single_value(kwargs))
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
 
   # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     with ops.device(
         colocate_with.device), distribute_lib.UpdateContext(colocate_with):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -393,11 +427,11 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     # variables.
     return array_ops.identity(var)
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the strategy class.
 
     The strategy object will be re-initialized if `cluster_spec` is given but
@@ -445,11 +479,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
         ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
 
   @property
-  def num_replicas(self):
-    return len(self._compute_devices)
-
-  @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     return len(self._compute_devices)
 
   @property
@@ -465,11 +495,12 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     return min(var_list, key=lambda x: x.name)
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
+    # TODO(yuefengz): Should this return False in the local case?
     return True
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return self._is_chief
 
   @property
@@ -479,3 +510,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   @property
   def should_save_summary(self):
     return self._is_chief
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index b8d5d0ecafce700d0e061b132607965a33ca9cb6..1ada6a6ba493563cd56342854f8d84a8ed5a7d40 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -25,23 +25,29 @@ from absl.testing import parameterized
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import parameter_server_strategy
-from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import distribution_strategy_context as ds_context
 from tensorflow.python.training import training_util
 
 CHIEF = run_config.TaskType.CHIEF
@@ -49,6 +55,13 @@ WORKER = run_config.TaskType.WORKER
 PS = run_config.TaskType.PS
 
 
+def _get_replica_id_integer():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id
+
+
 class ParameterServerStrategyTestBase(
     multi_worker_test_base.MultiWorkerTestBase):
 
@@ -85,8 +98,7 @@ class ParameterServerStrategyTestBase(
                              config=sess_config) as sess, \
          d.scope():
 
-      # Define a variable outside the call_for_each_replica scope. This is not
-      # recommended.
+      # Define a variable outside the call_for_each_replica scope.
       n = variable_scope.get_variable('n', initializer=10.0)
       self.assertEqual(n.device, '/job:ps/task:0')
 
@@ -94,9 +106,8 @@ class ParameterServerStrategyTestBase(
         if num_gpus == 0:
           last_part_device = 'device:CPU:0'
         else:
-          last_part_device = (
-              'device:GPU:%d' %
-              distribution_strategy_context.get_replica_context().replica_id)
+          replica_id = _get_replica_id_integer()
+          last_part_device = ('device:GPU:%d' % replica_id)
 
         a = constant_op.constant(1.0)
         b = constant_op.constant(2.0)
@@ -178,6 +189,75 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
+  def _test_device_assignment_distributed_enable_partitioner(
+      self, task_type, task_id, num_gpus):
+    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
+    num_shards = len(d.parameter_devices)
+    partitioner = partitioned_variables.fixed_size_partitioner(num_shards)
+    with ops.Graph().as_default(), \
+         self.cached_session(target=self._default_target,
+                             config=sess_config) as sess, \
+         d.scope():
+
+      n = variable_scope.get_variable(
+          'n',
+          initializer=constant_op.constant([10.0, 20.0]),
+          aggregation=variable_scope.VariableAggregation.SUM,
+          partitioner=partitioner)
+
+      for part_id, var in enumerate(n):
+        self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
+
+      def model_fn():
+        a = constant_op.constant([3.0, 5.0])
+        # The device scope is ignored for variables but not for normal ops.
+        with ops.device('/job:worker/task:0'):
+          x = variable_scope.get_variable(
+              'x',
+              initializer=constant_op.constant([10.0, 20.0]),
+              aggregation=variable_scope.VariableAggregation.SUM,
+              partitioner=partitioner)
+          x_add = x.assign_add(a, name='x_add')
+        # The variable x is on the task 1 since the device_function has been
+        # called once before the model_fn.
+        for part_id, var in enumerate(x):
+          self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
+          self.assertEqual(var.device, x_add[part_id].device)
+
+        # The colocate_vars_with can override the distribution's device.
+        with d.colocate_vars_with(x_add[0]):
+          y = variable_scope.get_variable(
+              'y',
+              initializer=constant_op.constant([20.0, 10.0]),
+              aggregation=variable_scope.VariableAggregation.SUM,
+              partitioner=partitioner)
+        y_add = y.assign_add(
+            [array_ops.identity(x_add[0]),
+             array_ops.identity(x_add[1])])
+
+        for part_id, var in enumerate(y):
+          self.assertEqual(var.device, '/job:ps/task:0')
+          self.assertEqual(y_add[part_id].device, var.device)
+          self.assertEqual(var.device, x_add[0].device)
+
+        return x_add, y_add
+
+      x, y = d.call_for_each_replica(model_fn)
+
+      if context.num_gpus() >= 1:
+        variables.global_variables_initializer().run()
+        x_val, y_val = sess.run([x, y])
+        if num_gpus < 1:
+          self.assertEqual(x_val, [13.0, 25.0])
+          self.assertEqual(y_val, [33.0, 35.0])
+        else:
+          x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus]
+          y_expect = [
+              20.0 + x_expect[0] * num_gpus, 10.0 + x_expect[1] * num_gpus
+          ]
+          self.assertEqual(x_val, x_expect)
+          self.assertEqual(y_val, y_expect)
+
   def _test_device_assignment_local(self,
                                     d,
                                     compute_device='CPU',
@@ -192,18 +272,16 @@ class ParameterServerStrategyTestBase(
         if 'CPU' in compute_device:
           replica_compute_device = '/device:CPU:0'
         else:
-          replica_compute_device = (
-              '/device:GPU:%d' %
-              distribution_strategy_context.get_replica_context().replica_id)
+          replica_id = _get_replica_id_integer()
+          replica_compute_device = ('/device:GPU:%d' % replica_id)
         replica_compute_device = device_util.canonicalize(
             replica_compute_device)
 
         if 'CPU' in variable_device:
           replica_variable_device = '/device:CPU:0'
         else:
-          replica_variable_device = (
-              '/device:GPU:%d' %
-              distribution_strategy_context.get_replica_context().replica_id)
+          replica_id = _get_replica_id_integer()
+          replica_variable_device = ('/device:GPU:%d' % replica_id)
         replica_variable_device = device_util.canonicalize(
             replica_variable_device)
 
@@ -285,9 +363,9 @@ class ParameterServerStrategyTestBase(
   def _test_simple_increment(self, task_type, task_id, num_gpus):
     d, master_target, sess_config = self._get_test_objects(
         task_type, task_id, num_gpus)
-    if hasattr(d, '_cluster_spec') and d._cluster_spec:
-      num_workers = len(d._cluster_spec.as_dict().get(WORKER))
-      if 'chief' in d._cluster_spec.as_dict():
+    if d.extended._cluster_spec:
+      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
+      if 'chief' in d.extended._cluster_spec.as_dict():
         num_workers += 1
     else:
       num_workers = 1
@@ -320,7 +398,7 @@ class ParameterServerStrategyTestBase(
       x, y, z, train_op = d.call_for_each_replica(model_fn)
       train_op = d.group(train_op)
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
       if task_id == 0:
@@ -345,20 +423,25 @@ class ParameterServerStrategyTestBase(
       self._finish_condition.release()
 
       x_val, y_val, z_val = sess.run([x, y, z])
-      self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas)
-      self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas)
+      self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas_in_sync)
+      self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync)
       self.assertEqual(z_val, 30.0 + 1.0 * num_workers)
-      return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas and
-              y_val == 20.0 + 1.0 * num_workers * d.num_replicas and
+      return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas_in_sync and
+              y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and
               z_val == 30.0 + 1.0 * num_workers)
 
   def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
     d, master_target, sess_config = self._get_test_objects(
         task_type, task_id, num_gpus)
-    assert hasattr(d, '_cluster_spec') and d._cluster_spec
-    num_workers = len(d._cluster_spec.as_dict().get(WORKER))
-    if CHIEF in d._cluster_spec.as_dict():
-      num_workers += 1
+    if task_type:
+      # Multi-worker
+      assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
+      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
+      if CHIEF in d.extended._cluster_spec.as_dict():
+        num_workers += 1
+    else:
+      # local
+      num_workers = 1
 
     with ops.Graph().as_default(), \
          self.cached_session(target=master_target,
@@ -389,7 +472,7 @@ class ParameterServerStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, one)
+        g_v = d.call_for_each_replica(grad_fn, args=(one,))
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
@@ -399,7 +482,7 @@ class ParameterServerStrategyTestBase(
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
             g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
                 d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -407,10 +490,12 @@ class ParameterServerStrategyTestBase(
 
       before_out, after_out = step()
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
-      if multi_worker_util.is_chief(d._cluster_spec, task_type, task_id):
+      if (not task_type or
+          multi_worker_util.is_chief(
+              d.extended._cluster_spec, task_type, task_id)):
         variables.global_variables_initializer().run()
 
       # Workers waiting for chief worker's initializing variables.
@@ -433,8 +518,40 @@ class ParameterServerStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
+  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
+                              expected_values):
+    distribution, master_target, config = self._get_test_objects(
+        task_type, task_id, num_gpus)
+    devices = distribution.extended.worker_devices
+
+    with ops.Graph().as_default(), \
+         self.cached_session(config=config,
+                             target=master_target) as sess:
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        sess.run([values.select_device(d, next_element) for d in devices])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
 
 class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
+                                  strategy_test_lib.DistributionTestBase,
                                   parameterized.TestCase):
 
   @classmethod
@@ -473,6 +590,12 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
   def testDeviceAssignmentDistributed(self, num_gpus):
     self._test_device_assignment_distributed('worker', 1, num_gpus)
 
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus):
+    self._test_device_assignment_distributed_enable_partitioner(
+        'worker', 1, num_gpus)
+
   def testSimpleBetweenGraph(self):
     self._run_between_graph_clients(self._test_simple_increment,
                                     self._cluster_spec, context.num_gpus())
@@ -484,10 +607,55 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
 
   @combinations.generate(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraph(self, num_gpus):
+  def testMinimizeLossGraphDistributed(self, num_gpus):
     self._run_between_graph_clients(self._test_minimize_loss_graph,
                                     self._cluster_spec, num_gpus)
 
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testMinimizeLossGraphLocal(self, num_gpus):
+    self._test_minimize_loss_graph(None, None, num_gpus)
+
+  # TODO(priyag): Refactor this and other multi worker tests.
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
+  def testMakeInputFnIteratorDistributed(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    expected_values = [[i+j for j in range(num_gpus)]
+                       for i in range(0, 100, num_gpus)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=3,
+        expected_input_pipeline_id=1)  # because task_id = 1
+    self._test_input_fn_iterator('worker', 1, num_gpus,
+                                 input_fn, expected_values)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
+  def testMakeInputFnIteratorLocal(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    expected_values = [[i+j for j in range(num_gpus)]
+                       for i in range(0, 100, num_gpus)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)  # only one worker and pipeline for local.
+    self._test_input_fn_iterator(None, None, num_gpus,
+                                 input_fn, expected_values)
+
+  def testGlobalStepUpdate(self):
+    strategy = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=context.num_gpus())
+    self._test_global_step_update(strategy)
+
 
 class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                            parameterized.TestCase):
@@ -530,9 +698,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
           v = variable_scope.get_variable('v', initializer=10.0)
           _ = v * v
         v, = tape.watched_variables()
-        w = distribution.value_container(v)
+        w = distribution.extended.value_container(v)
         self.assertIs(values.AggregatingVariable, type(w))
-      distribution.call_for_each_replica(f)
+      distribution.extended.call_for_each_replica(f)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index a5adaac47ceb3e22909bb852c6e3418446710a51..c928b6d9f1f21508edd753f94c38ab2723cc0a9f 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -90,25 +90,21 @@ class StandardSingleLossStep(StandardInputStep):
     super(StandardSingleLossStep, self).__init__(dataset_fn, distribution)
     self._loss_fn = loss_fn
     self._optimizer = optimizer
-    self._is_run_concurrently = False
     self._iterations_per_step = iterations_per_step
 
   def __call__(self):
     with self._distribution.scope():
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         """Function to run one iteration with one input."""
         gradients_fn = backprop.implicit_grad(self._loss_fn)
         gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
 
         grads_and_vars = self.distribution.call_for_each_replica(
-            gradients_fn,
-            ctx, *inputs,
-            run_concurrently=self._is_run_concurrently)
+            gradients_fn, args=(ctx,) + inputs)
         # If threads use layers, then we need to run the first step
         # sequentially, so that layers.build() is not executed in parallel.
         # Otherwise, multiple sets of mirrored variables are going to be
         # created.
-        self._is_run_concurrently = True
         return self._optimizer._distributed_apply(  # pylint: disable=protected-access
             self.distribution, grads_and_vars)
 
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index 60ef0a2106a17d2eede6acec9b9178d1a9d736ff..5a8e8ed0dda0b99e759edbe916a46dab953929a0 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -19,16 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import distribution_strategy_context as ds_context
 from tensorflow.python.training import optimizer
 
 
@@ -45,8 +50,7 @@ def _raise_exception_fn(_=None):
 # Must be the argument to a distribution.call_for_each_replica() call, calls a
 # get_replica_context().merge_call() that raises an exception.
 def _merge_raises_fn():
-  distribution_strategy_context.get_replica_context().merge_call(
-      _raise_exception_fn)
+  ds_context.get_replica_context().merge_call(_raise_exception_fn)
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
@@ -59,8 +63,7 @@ def _call_raises_fn(dist):
 # calls a get_replica_context().merge_call() that calls a
 # call_for_each_replica() that raises an exception.
 def _merge_call_raises_fn():
-  distribution_strategy_context.get_replica_context().merge_call(
-      _call_raises_fn)
+  ds_context.get_replica_context().merge_call(_call_raises_fn)
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
@@ -74,8 +77,7 @@ def _call_merge_raises_fn(dist):
 # get_replica_context().merge_call() that calls a call_for_each_replica() that
 # calls a get_replica_context().merge_call() that raises an exception.
 def _merge_call_merge_raises_fn():
-  distribution_strategy_context.get_replica_context().merge_call(
-      _call_merge_raises_fn)
+  ds_context.get_replica_context().merge_call(_call_merge_raises_fn)
 
 
 class DistributionTestBase(test.TestCase):
@@ -104,7 +106,7 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, one, run_concurrently=l.built)
+        g_v = d.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
@@ -114,8 +116,7 @@ class DistributionTestBase(test.TestCase):
           before_list.append(fetched)
           # control_dependencies irrelevant but harmless in eager execution
           with ops.control_dependencies([fetched]):
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.reduce(reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(d.update(
                 v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -160,7 +161,7 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, one)
+        g_v = d.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
@@ -169,8 +170,7 @@ class DistributionTestBase(test.TestCase):
           fetched = d.read_var(v)
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.reduce(reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(d.update(
                 v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -189,40 +189,20 @@ class DistributionTestBase(test.TestCase):
       # Error should go down
       self.assertLess(error_after, error_before)
 
-  def _test_map_reduce(self, d, in_graph=None):
-    with d.scope():
-      map_in = [constant_op.constant(i) for i in range(10)]
-      map_out = d.map(map_in, lambda x, y: x * y, 2)
-      observed = d.reduce(variable_scope.VariableAggregation.SUM, map_out,
-                          "/device:CPU:0")
-      expected = 90  # 2 * (0 + 1 + ... + 9)
-      self.assertEqual(expected, observed.numpy())
-
-  def _test_device_index(self, d):
-    with d.scope():
-      expected_devices = [False] * len(d.worker_devices)
-
-      def mark_devices_fn(device_id):
-        self.assertLess(device_id, len(d.worker_devices))
-        self.assertFalse(expected_devices[device_id])
-        expected_devices[device_id] = True
-
-      d.call_for_each_replica(mark_devices_fn, d.worker_device_index)
-      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
-
   def _test_replica_id(self, d):
     with d.scope():
-      expected_devices = [False] * len(d.worker_devices)
+      expected_devices = [False] * len(d.extended.worker_devices)
 
       def mark_devices_fn():
-        replica_id = (
-            distribution_strategy_context.get_replica_context().replica_id)
-        self.assertLess(replica_id, len(d.worker_devices))
+        replica_id = self.evaluate(
+            ds_context.get_replica_context().replica_id_in_sync_group)
+        self.assertLess(replica_id, len(d.extended.worker_devices))
         self.assertFalse(expected_devices[replica_id])
         expected_devices[replica_id] = True
 
       d.call_for_each_replica(mark_devices_fn)
-      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+      self.assertAllEqual(expected_devices,
+                          [True] * len(d.extended.worker_devices))
 
   def _test_call_and_merge_exceptions(self, dist):
     with dist.scope():
@@ -234,3 +214,78 @@ class DistributionTestBase(test.TestCase):
         dist.call_for_each_replica(_merge_call_raises_fn)
       with self.assertRaises(_TestException):
         dist.call_for_each_replica(_merge_call_merge_raises_fn)
+
+  def _input_fn_to_test_input_context(self,
+                                      dataset_fn,
+                                      expected_num_replicas_in_sync,
+                                      expected_num_input_pipelines,
+                                      expected_input_pipeline_id):
+    # Use a list of one element as counter so that it can be captured by the
+    # `_input_fn`. This counter is incremented by 1 each time an input_fn is
+    # called. We use this counter to check whether the `input_pipeline_id`
+    # matches the counter in the in-graph replication.
+    worker_id_counter = [0]
+
+    def _input_fn(input_context):
+      """Input fn for testing."""
+      self.assertIsNotNone(input_context)
+      self.assertEqual(expected_num_replicas_in_sync,
+                       input_context.num_replicas_in_sync)
+      self.assertEqual(expected_num_input_pipelines,
+                       input_context.num_input_pipelines)
+      if expected_input_pipeline_id is not None:
+        self.assertEqual(expected_input_pipeline_id,
+                         input_context.input_pipeline_id)
+      else:
+        self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id)
+        worker_id_counter[0] += 1
+
+      return dataset_fn()
+
+    return _input_fn
+
+  def _test_input_fn_iterator(self, iterator, devices, expected_values,
+                              sess=None):
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+    evaluate(iterator.initialize())
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_device(d, next_element) for d in devices])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(iterator.initialize())
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
+
+  def _test_global_step_update(self, strategy):
+    with strategy.scope():
+      global_step = variable_scope.get_variable(
+          "global_step",
+          shape=[],
+          dtype=dtypes.int64,
+          initializer=init_ops.zeros_initializer(),
+          trainable=False,
+          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self.evaluate(variables.global_variables_initializer())
+
+      def model_fn():
+        train_op = global_step.assign_add(1)
+        value = global_step.read_value()
+        return train_op, value
+
+      train_ops, value = strategy.call_for_each_replica(model_fn)
+      self.evaluate(strategy.group(train_ops))
+      global_step_tensors = strategy.unwrap(value)
+      global_step_values = self.evaluate(global_step_tensors)
+      self.assertEqual([1] * len(global_step_tensors), global_step_values)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 65ef21df09ba34c274cdce73996bff7b9c32da85..f1115cb0c07666e9fe3a640cab6fb927d6d508c0 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -23,21 +23,23 @@ from __future__ import print_function
 
 import functools
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
@@ -130,8 +132,21 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       num_cores: Number of cores to use on the TPU. If None specified, then
           auto-detect the cores and topology of the TPU system.
     """
-    super(TPUStrategy, self).__init__()
+    super(TPUStrategy, self).__init__(TPUExtended(
+        self, tpu_cluster_resolver, steps_per_run, num_cores))
 
+  @property
+  def steps_per_run(self):
+    """DEPRECATED: use .extended.steps_per_run instead."""
+    return self._extended.steps_per_run
+
+
+class TPUExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of TPUStrategy."""
+
+  def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run,
+               num_cores=None):
+    super(TPUExtended, self).__init__(container_strategy)
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
     # TODO(sourabhbajaj): Change this from num_cores to metadata_override
@@ -141,11 +156,11 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     # parallelism.
     device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices)
                   if "device:TPU:" in d.name}
-    self._device_index = values.PerDevice(device_map)
+    self._device_index = values.PerReplica(device_map)
     self._host_device = self.get_host_cpu_device(0)
     self._tpu_devices = sorted(device_map.keys())
     # Only create variables for the number of replicas we're running.
-    self._tpu_devices = self._tpu_devices[:self.num_replicas]
+    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
@@ -214,20 +229,29 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
 
     return enqueue_op_per_host
 
-  def distribute_dataset(self, dataset_fn):
-    worker_map = {
-        self.get_host(hid): [self.get_host_cpu_device(hid)]
+  def _make_dataset_iterator(self, dataset):
+    """Make iterators for each of the TPU hosts."""
+
+    worker_devices = [
+        (self.get_host(hid), [self.get_host_cpu_device(hid)])
         for hid in range(self.num_hosts)
-    }
+    ]
+    return values.DatasetIterator(dataset, worker_devices,
+                                  self._num_replicas_in_sync)
+
+  def _distribute_dataset(self, dataset_fn):
+    worker_devices = [
+        (self.get_host(hid), [self.get_host_cpu_device(hid)])
+        for hid in range(self.num_hosts)
+    ]
     return values.MultiWorkerDataset(
-        functools.partial(self._call_dataset_fn, dataset_fn), worker_map)
+        functools.partial(self._call_dataset_fn, dataset_fn), worker_devices)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
-  def _run_steps_on_dataset(self, fn, multi_worker_iterator, iterations,
-                            initial_loop_values=None):
-
+  def _experimental_run_steps_on_iterator(
+      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
     output_shapes = multi_worker_iterator.output_shapes
     shapes = nest.flatten(output_shapes)
     if any([not s.is_fully_defined() for s in shapes]):
@@ -257,7 +281,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       fn_inputs = dequeue_fn()
       if not isinstance(fn_inputs, tuple):
         fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
+      fn_result = fn(ctx, fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
         with ops.control_dependencies([fn_result]):
@@ -279,7 +303,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     self._outer_control_flow_context = (
         ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
 
-    replicate_inputs = [[]] * self.num_replicas
+    replicate_inputs = [[]] * self._num_replicas_in_sync
     replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
     del self._outer_control_flow_context
     ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
@@ -303,27 +327,27 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     last_step_tensor_outputs_dict = nest.pack_sequence_as(
         ctx.last_step_outputs, last_step_tensor_outputs)
 
-    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
+    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
       output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been aggregated, take the first value
+      # For outputs that have already been reduced, take the first value
       # from the list as each value should be the same. Else return the full
       # list of values.
-      # TODO(josh11b): If aggregation is NONE, we should return a PerDevice value.
-      if aggregation is not variables_lib.VariableAggregation.NONE:
+      # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
+      # value.
+      if reduce_op is not None:
         # TODO(priyag): Should this return the element or a list with 1 element
         last_step_tensor_outputs_dict[name] = output[0]
     ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
 
     return ctx
 
-  def _call_for_each_replica(self, fn, *args, **kwargs):
+  def _call_for_each_replica(self, fn, args, kwargs):
     # TODO(jhseu): Consider making it so call_for_each_replica implies that
     # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
-    kwargs.pop("run_concurrently", None)
-    with _TPUReplicaContext(self):
+    with _TPUReplicaContext(self._container_strategy()):
       return fn(*args, **kwargs)
 
-  def initialize(self):
+  def _initialize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError("Eager mode not supported in TPUStrategy.")
@@ -338,7 +362,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
                               tpu.initialize_system())
       return graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
 
-  def finalize(self):
+  def _finalize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError("Eager mode not supported in TPUStrategy.")
@@ -346,7 +370,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       return [tpu.shutdown_system()]
 
   def _get_devices_from(self, colocate_with=None):
-     # TODO(jhseu): Change this when we support model parallelism.
+    # TODO(jhseu): Change this when we support model parallelism.
     return self._tpu_devices
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -383,12 +407,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     return _create_tpu_mirrored_variable(devices, _real_mirrored_creator, *args,
                                          **kwargs)
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if aggregation == vs.VariableAggregation.MEAN:
+      if reduce_op == reduce_util.ReduceOp.MEAN:
         # TODO(jhseu):  Revisit once we support model-parallelism.
-        value *= (1. / self.num_replicas)
-      elif aggregation != vs.VariableAggregation.SUM:
+        value *= (1. / self._num_replicas_in_sync)
+      elif reduce_op != reduce_util.ReduceOp.SUM:
         raise NotImplementedError(
             "Currently only support sum & mean in TPUStrategy.")
       return tpu_ops.cross_replica_sum(value)
@@ -396,27 +420,22 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     # Validate that the destination is same as the host device
     # Note we don't do this when in replicate context as the reduction is
     # performed on the TPU device itself.
-    devices = cross_tower_ops_lib.get_devices_from(destinations)
+    devices = cross_device_ops_lib.get_devices_from(destinations)
     if len(devices) == 1:
       assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
           self._host_device)
     else:
       raise ValueError("Multiple devices are not supported for TPUStrategy")
 
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return value[0]
     output = math_ops.add_n(value)
-    if aggregation == vs.VariableAggregation.MEAN:
+    if reduce_op == reduce_util.ReduceOp.MEAN:
       return output * (1. / len(value))
     return output
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     assert isinstance(var, values.TPUMirroredVariable)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if should_group:
+      if group:
         return fn(var, *args, **kwargs)
       else:
         return [fn(var, *args, **kwargs)]
@@ -431,9 +450,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
         updates[d] = fn(v,
                         *values.select_device_mirrored(d, args),
                         **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  # TODO(josh11b): Need to implement _update_non_slot()!
+    return values.update_regroup(self, updates, group)
 
   def read_var(self, var):
     assert isinstance(var, values.TPUMirroredVariable)
@@ -445,7 +462,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       return [val.get(device=d) for d in sorted(val.devices)]
     elif isinstance(val, list):
       # TODO(josh11b): We need to remove this case; per device values should
-      # be represented using a PerDevice wrapper instead of a list with
+      # be represented using a PerReplica wrapper instead of a list with
       # one entry per device.
       return val
     return [val]
@@ -453,14 +470,10 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def value_container(self, value):
     return value
 
-  def _broadcast(self, tensor, destinations):
+  def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
 
-  @property
-  def num_replicas(self):
-    return self._num_cores_override or self._tpu_metadata.num_cores
-
   @property
   def num_hosts(self):
     return self._tpu_metadata.num_hosts
@@ -470,15 +483,15 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     return self._tpu_metadata.num_of_cores_per_host
 
   @property
-  def num_replicas_in_sync(self):
-    return self.num_replicas
+  def _num_replicas_in_sync(self):
+    return self._num_cores_override or self._tpu_metadata.num_cores
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
     return False
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return True
 
   @property
@@ -500,14 +513,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def non_slot_devices(self, var_list):
     return self._host_device
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     del colocate_with
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.device(self._host_device), distribute_lib.UpdateContext(
         self._host_device):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -521,11 +532,11 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def get_host_cpu_device(self, host_id):
     return self.get_host(host_id) + "/device:CPU:0"
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     del cluster_spec, task_type, task_id
     if session_config:
       session_config.isolate_session_state = True
@@ -533,6 +544,11 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       if cluster_spec:
         session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
 
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
 
 class _TPUReplicaContext(distribute_lib.ReplicaContext):
   """Replication Context class for TPU Strategy."""
@@ -540,9 +556,14 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext):
   # TODO(sourabhbajaj): Call for each tower should be updating this.
   def __init__(self, distribution_strategy):
     distribute_lib.ReplicaContext.__init__(
-        self, distribution_strategy, replica_id=0)
+        self,
+        distribution_strategy,
+        # TODO(b/118385803): properly initialize replica_id, instead of always 0
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
 
   @property
-  def device(self):
+  def devices(self):
     distribute_lib.require_replica_context(self)
-    return self._distribution_strategy.worker_devices[self._replica_id]
+    ds = self._distribution_strategy
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+    return [ds.extended.worker_devices[replica_id]]
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index d514e6f4c158d15665a2cd46be0547178da66544..855b9c29aec0c0a65f1a715eea764067a41ba2f3 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -18,14 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import os
+from absl.testing import parameterized
 
-from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
@@ -35,10 +35,12 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
@@ -190,10 +192,10 @@ def _make_mirrored():
 
 class RegroupAndSelectDeviceTest(test.TestCase):
 
-  def _is_per_device(self, result, expected, klass=values.PerDevice):
+  def _is_per_replica(self, result, expected, klass=values.PerReplica):
     self.assertIsInstance(result, klass)
     # We canonicalize the devices to match the device strings returned
-    # by PerDevice, which also does device string canonicalization.
+    # by PerReplica, which also does device string canonicalization.
     devices = [device_util.canonicalize(_device_str(i))
                for i in range(len(expected))]
     self.assertEqual(set(devices), set(result.devices))
@@ -206,18 +208,18 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                              _device_str(1): _nested_value("2")})
     self.assertIsInstance(result, tuple)
     self.assertEqual(3, len(result))
-    self._is_per_device(result[0], ["a1", "a2"])
-    self._is_per_device(result[2], ["h1", "h2"])
+    self._is_per_replica(result[0], ["a1", "a2"])
+    self._is_per_replica(result[2], ["h1", "h2"])
 
     self.assertIsInstance(result[1], list)
     self.assertEqual(3, len(result[1]))
-    self._is_per_device(result[1][0], ["b1", "b2"])
-    self._is_per_device(result[1][2], ["g1", "g2"])
+    self._is_per_replica(result[1][0], ["b1", "b2"])
+    self._is_per_replica(result[1][2], ["g1", "g2"])
 
     self.assertIsInstance(result[1][1], dict)
     self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
-    self._is_per_device(result[1][1]["c"], ["d1", "d2"])
-    self._is_per_device(result[1][1]["e"], ["f1", "f2"])
+    self._is_per_replica(result[1][1]["c"], ["d1", "d2"])
+    self._is_per_replica(result[1][1]["e"], ["f1", "f2"])
 
     # Also test that we can undo the merge using select_device()
     self.assertEqual(_nested_value("1"),
@@ -238,18 +240,18 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                             values.Mirrored)
     self.assertIsInstance(result, tuple)
     self.assertEqual(3, len(result))
-    self._is_per_device(result[0], ["a1", "a2"], values.Mirrored)
-    self._is_per_device(result[2], ["h1", "h2"], values.Mirrored)
+    self._is_per_replica(result[0], ["a1", "a2"], values.Mirrored)
+    self._is_per_replica(result[2], ["h1", "h2"], values.Mirrored)
 
     self.assertIsInstance(result[1], list)
     self.assertEqual(3, len(result[1]))
-    self._is_per_device(result[1][0], ["b1", "b2"], values.Mirrored)
-    self._is_per_device(result[1][2], ["g1", "g2"], values.Mirrored)
+    self._is_per_replica(result[1][0], ["b1", "b2"], values.Mirrored)
+    self._is_per_replica(result[1][2], ["g1", "g2"], values.Mirrored)
 
     self.assertIsInstance(result[1][1], dict)
     self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
-    self._is_per_device(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
-    self._is_per_device(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
+    self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
+    self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
 
     # Also test that we can undo the merge using select_device()
     self.assertEqual(_nested_value("1"),
@@ -275,7 +277,7 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                              _device_str(1): ("b", foo)})
     self.assertIsInstance(result, tuple)
     self.assertEqual(2, len(result))
-    self._is_per_device(result[0], ["a", "b"])
+    self._is_per_replica(result[0], ["a", "b"])
     self.assertIs(foo, result[1])
 
     # Test select_device(), should undo the merge done by regroup().
@@ -325,69 +327,46 @@ class RegroupAndSelectDeviceTest(test.TestCase):
 
       self.assertTrue(
           isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
-      self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
+      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
       for device_id in range(3):
         d = _device_str(device_id)
-        self.assertEquals(created_estimator_specs[device_id].loss,
-                          merged_estimator_spec.loss.get(d))
-        self.assertEquals(created_estimator_specs[device_id].train_op,
-                          merged_estimator_spec.train_op.get(d))
+        self.assertEqual(created_estimator_specs[device_id].loss,
+                         merged_estimator_spec.loss.get(d))
+        self.assertEqual(created_estimator_specs[device_id].train_op,
+                         merged_estimator_spec.train_op.get(d))
         # Scaffold is populated by `EstimatorSpec.__new__`.
-        self.assertEquals(created_estimator_specs[device_id].scaffold,
-                          merged_estimator_spec.scaffold.get(d))
+        self.assertEqual(created_estimator_specs[device_id].scaffold,
+                         merged_estimator_spec.scaffold.get(d))
         # Also test that we can undo the merge using select_device()
-        self.assertEquals(created_estimator_specs[device_id],
-                          values.select_device(_device_str(device_id),
-                                               merged_estimator_spec))
+        self.assertEqual(created_estimator_specs[device_id],
+                         values.select_device(_device_str(device_id),
+                                              merged_estimator_spec))
 
 
-class PerDeviceDatasetTest(test.TestCase):
+class PerReplicaDatasetTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
 
-  def _test_iterator_no_prefetch(self, devices, dataset, expected_values):
-    per_device_dataset = values.PerDeviceDataset(
-        dataset, devices, prefetch_on_device=False)
+  def _test_iterator(self, devices, dataset, expected_values):
+    per_replica_dataset = values.PerReplicaDataset(dataset, devices)
     if context.executing_eagerly():
-      iterator = per_device_dataset.make_one_shot_iterator()
+      iterator = per_replica_dataset.make_one_shot_iterator()
     else:
-      iterator = per_device_dataset.make_initializable_iterator()
+      iterator = per_replica_dataset.make_initializable_iterator()
       self.evaluate([iterator.initializer])
 
     for expected_value in expected_values:
       next_element = iterator.get_next()
-      actual = self.evaluate([
-          values.select_device(d, next_element) for d in devices])
-      self.assertEqual(expected_value, actual)
+      computed_value = self.evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
 
     with self.assertRaises(errors.OutOfRangeError):
       next_element = iterator.get_next()
       self.evaluate([
           values.select_device(d, next_element) for d in devices])
 
-  def _test_iterator_with_prefetch(self, devices, dataset, expected_values):
-    if not context.executing_eagerly():
-      per_device_dataset = values.PerDeviceDataset(
-          dataset, devices, prefetch_on_device=True)
-      iterator = per_device_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
-      for expected_value in expected_values:
-        next_element = iterator.get_next()
-        computed_value = self.evaluate(
-            [values.select_device(d, next_element) for d in devices])
-        self.assertEqual(expected_value, computed_value)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        next_element = iterator.get_next()
-        self.evaluate([
-            values.select_device(d, next_element) for d in devices])
-
-  def _test_iterator(self, devices, dataset, expected_values):
-    self._test_iterator_no_prefetch(devices, dataset, expected_values)
-    self._test_iterator_with_prefetch(devices, dataset, expected_values)
-
   @test_util.run_in_graph_and_eager_modes
   def testOneDevice(self):
     devices = ["/device:CPU:0"]
@@ -442,9 +421,8 @@ class PerDeviceDatasetTest(test.TestCase):
       dataset = dataset_ops.Dataset.from_tensor_slices(
           random_ops.random_uniform((10,)))
 
-      per_device_dataset = values.PerDeviceDataset(
-          dataset, devices, prefetch_on_device=False)
-      iterator = per_device_dataset.make_initializable_iterator()
+      per_replica_dataset = values.PerReplicaDataset(dataset, devices)
+      iterator = per_replica_dataset.make_initializable_iterator()
 
       self.evaluate(iterator.initializer)
       next_element = iterator.get_next()
@@ -463,7 +441,7 @@ class PerDeviceDatasetTest(test.TestCase):
 
 class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
 
-  def _test_iterator(self, iterator, devices, expected_values):
+  def _test_iterator(self, sess, iterator, devices, expected_values):
     next_element = iterator.get_next()
     for device in devices:
       v = values.select_device(device, next_element)
@@ -472,73 +450,79 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
         self.assertTrue(element.device in device)
 
     for expected_value in expected_values:
-      actual = self.evaluate(
+      actual = sess.run(
           [values.select_device(d, next_element) for d in devices])
       self.assertEqual(expected_value, actual)
 
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate([values.select_device(d, next_element) for d in devices])
+      sess.run([values.select_device(d, next_element) for d in devices])
 
-  def _test_dataset(self, dataset_fn, worker_device_map, devices,
-                    expected_values):
+  def _test_dataset(self, dataset_fn, worker_devices, devices,
+                    expected_values, auto_shard=True):
     multi_worker_dataset = values.MultiWorkerDataset(
-        dataset_fn, worker_device_map, prefetch_on_device=False)
-    multi_worker_iterator = multi_worker_dataset.make_one_shot_iterator()
-    self._test_iterator(multi_worker_iterator, devices, expected_values)
+        dataset_fn, worker_devices, auto_shard=auto_shard)
+    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
+    with self.cached_session() as sess:
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
 
   def _cpu_devices(self):
-    worker_device_map = collections.OrderedDict(
-        [("/job:worker/replica:0/task:0",
-          ["/job:worker/replica:0/task:0/device:CPU:0"]),
-         ("/job:worker/replica:0/task:1",
-          ["/job:worker/replica:0/task:1/device:CPU:0"])])
+    worker_devices = [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
     devices = [
         "/job:worker/replica:0/task:0/device:CPU:0",
         "/job:worker/replica:0/task:1/device:CPU:0"
     ]
-    return worker_device_map, devices
+    return worker_devices, devices
 
   def _cpu_and_one_gpu_devices(self):
-    # The worker_device_map doesn't have to be a OrderDict object, this is just
-    # to simplify the testing so that we can pass expected values as a list
-    # instead of a dict.
-    worker_device_map = collections.OrderedDict(
-        [("/job:worker/replica:0/task:0", [
+    worker_devices = [
+        ("/job:worker/replica:0/task:0", [
             "/job:worker/replica:0/task:0/device:GPU:0",
             "/job:worker/replica:0/task:0/device:CPU:0"
-        ]), ("/job:worker/replica:0/task:1", [
+        ]),
+        ("/job:worker/replica:0/task:1", [
             "/job:worker/replica:0/task:1/device:GPU:0",
             "/job:worker/replica:0/task:1/device:CPU:0"
-        ])])
+        ])
+    ]
     devices = [
         "/job:worker/replica:0/task:0/device:GPU:0",
         "/job:worker/replica:0/task:0/device:CPU:0",
         "/job:worker/replica:0/task:1/device:GPU:0",
         "/job:worker/replica:0/task:1/device:CPU:0"
     ]
-    return worker_device_map, devices
+    return worker_devices, devices
 
   def testDataDistributionOneDevicePerWorker(self):
-    self.skipTest("Temporarily disabled.")
-    worker_device_map, devices = self._cpu_devices()
+    worker_devices, devices = self._cpu_devices()
     with context.graph_mode():
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(dataset_fn, worker_device_map, devices,
+      self._test_dataset(dataset_fn, worker_devices, devices,
                          [[0, 1], [2, 3], [4, 5], [6, 7]])
 
+  def testDataDistributionNoAutoShard(self):
+    worker_devices, devices = self._cpu_devices()
+    with context.graph_mode():
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_dataset(dataset_fn, worker_devices, devices,
+                         [[0, 0], [1, 1], [2, 2], [3, 3]],
+                         auto_shard=False)
+
   def testDataDistributionTwoDevicePerWorker(self):
-    self.skipTest("Temporarily disabled.")
     if context.num_gpus() < 1:
       self.skipTest("A GPU is not available for this test.")
-    worker_device_map, devices = self._cpu_and_one_gpu_devices()
+    worker_devices, devices = self._cpu_and_one_gpu_devices()
     with context.graph_mode():
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(dataset_fn, worker_device_map, devices,
+      self._test_dataset(dataset_fn, worker_devices, devices,
                          [[0, 2, 1, 3], [4, 6, 5, 7]])
 
   def testTupleDataset(self):
-    self.skipTest("Temporarily disabled.")
-    worker_device_map, devices = self._cpu_devices()
+    worker_devices, devices = self._cpu_devices()
 
     with context.graph_mode():
 
@@ -550,47 +534,221 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
       expected_values = [
           [(i, i**2), (i + 1, (i + 1)**2)] for i in range(0, 8, 2)
       ]
-      self._test_dataset(dataset_fn, worker_device_map, devices,
+      self._test_dataset(dataset_fn, worker_devices, devices,
                          expected_values)
 
   def testInitializableIterator(self):
-    self.skipTest("Temporarily disabled.")
-    worker_device_map, devices = self._cpu_devices()
-    with context.graph_mode():
+    worker_devices, devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
       multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, worker_device_map, prefetch_on_device=False)
+          dataset_fn, worker_devices, auto_shard=True)
       multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
 
-      self.evaluate(multi_worker_iterator.initializer)
-      self._test_iterator(multi_worker_iterator, devices,
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices,
                           [[0, 1], [2, 3], [4, 5], [6, 7]])
 
       # After re-initializing the iterator, should be able to iterate again.
-      self.evaluate(multi_worker_iterator.initializer)
-      self._test_iterator(multi_worker_iterator, devices,
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices,
                           [[0, 1], [2, 3], [4, 5], [6, 7]])
 
   def testValueErrorForIterator(self):
-    self.skipTest("Temporarily disabled.")
     # Incompatiable arguments.
     with self.assertRaises(ValueError):
       values.MultiWorkerDataIterator({"w1": None}, {"w1": "d1", "w2": "d2"})
 
     # Test duplicated devices under same worker.
-    worker_device_map, _ = self._cpu_devices()
-    worker_device_map["/job:worker/replica:0/task:0"].append(
-        "/job:worker/replica:0/task:0/device:CPU:0")
+    worker_devices, _ = self._cpu_devices()
+    worker_devices[0][1].append("/job:worker/replica:0/task:0/device:CPU:0")
     with context.graph_mode():
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
       multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, worker_device_map, prefetch_on_device=False)
+          dataset_fn, worker_devices, auto_shard=True)
       multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
       with self.assertRaises(ValueError):
         multi_worker_iterator.get_next()
 
 
-class MirroredVariableTest(test.TestCase):
+class InputIteratorTestBase(test.TestCase):
+
+  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                     expected_values, sess=None, split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+
+    if input_type == "input_fn":
+      input_contexts = [
+          distribute_lib.InputContext() for _ in worker_device_pairs]
+      input_fn = lambda _: dataset_fn()
+      iterator = values.InputFunctionIterator(input_fn, worker_device_pairs,
+                                              input_contexts)
+    else:
+      iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs,
+                                        split_batch_by)
+
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertAllEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_device(d, next_element) for d in devices])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertAllEqual(expected_value, computed_value)
+
+
+class InputIteratorSingleWorkerTest(InputIteratorTestBase,
+                                    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDeviceCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesOneGPUOneCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTupleDataset(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    def dataset_fn():
+      dataset1 = dataset_ops.Dataset.range(10)
+      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+      return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["dataset"],
+      split_batch_by=[None, 2],
+      required_gpus=1))
+  def testBatchSplitting(self, input_type, split_batch_by):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    batch_size = 10
+    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+
+    updated_batch_size = (
+        batch_size // split_batch_by if split_batch_by else batch_size)
+    expected_values = [[range(i, i+updated_batch_size),
+                        range(i+updated_batch_size, i+2*updated_batch_size)]
+                       for i in range(0, 100, updated_batch_size*2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values, sess=None,
+                        split_batch_by=split_batch_by)
+
+
+class InputIteratorMultiWorkerTest(
+    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
+    parameterized.TestCase):
+
+  def _cpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
+
+  def _cpu_and_one_gpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0", [
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        ]),
+        ("/job:worker/replica:0/task:1", [
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ])
+    ]
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDevicePerWorker(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesPerWorker(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testTupleDataset(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(4)
+        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+
+class MirroredVariableTest(test.TestCase, parameterized.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -602,9 +760,9 @@ class MirroredVariableTest(test.TestCase):
 
     v, _, mirrored = _make_mirrored()
 
-    self.assertEquals(v[0].name, mirrored.name)
-    self.assertEquals(v[0].dtype, mirrored.dtype)
-    self.assertEquals(v[0].shape, mirrored.shape)
+    self.assertEqual(v[0].name, mirrored.name)
+    self.assertEqual(v[0].dtype, mirrored.dtype)
+    self.assertEqual(v[0].shape, mirrored.shape)
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testVariableOnAnotherDevice(self):
@@ -614,9 +772,9 @@ class MirroredVariableTest(test.TestCase):
     mirrored = values.MirroredVariable(index, v,
                                        variable_scope.VariableAggregation.MEAN)
 
-    self.assertEquals(v.name, mirrored.name)
-    self.assertEquals(v.dtype, mirrored.dtype)
-    self.assertEquals(v.shape, mirrored.shape)
+    self.assertEqual(v.name, mirrored.name)
+    self.assertEqual(v.dtype, mirrored.dtype)
+    self.assertEqual(v.shape, mirrored.shape)
 
   def _assign_mirrored(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
@@ -736,14 +894,13 @@ class MirroredVariableTest(test.TestCase):
     save_path = self._save_normal()
     self._restore_mirrored(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testFetchAMirroredVariable(self):
-    if context.num_gpus() < 1 or context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test or it's eager mode.")
-
-    with self.session(
-        graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy(
-            ["/device:GPU:0"]).scope():
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph"]))
+  def testFetchAMirroredVariable(self, distribution):
+    with self.session(graph=ops.Graph()) as sess, distribution.scope():
       with ops.device("/device:GPU:0"):
         v = variable_scope.get_variable(
             name="v", initializer=1., use_resource=True)
@@ -769,7 +926,7 @@ def _make_replica_local(method):
   return v, replica_local
 
 
-class ReplicaLocalVariableTest(test.TestCase):
+class ReplicaLocalVariablePropertiesTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -778,15 +935,14 @@ class ReplicaLocalVariableTest(test.TestCase):
   def testProperties(self):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
-
     v, replica_local = _make_replica_local(
         variable_scope.VariableAggregation.SUM)
 
-    self.assertEquals(v[0].name, replica_local.name)
-    self.assertEquals(v[0].dtype, replica_local.dtype)
-    self.assertEquals(v[0].shape, replica_local.shape)
-    self.assertEquals(variable_scope.VariableAggregation.SUM,
-                      replica_local.aggregation)
+    self.assertEqual(v[0].name, replica_local.name)
+    self.assertEqual(v[0].dtype, replica_local.dtype)
+    self.assertEqual(v[0].shape, replica_local.shape)
+    self.assertEqual(variable_scope.VariableAggregation.SUM,
+                     replica_local.aggregation)
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testVariableOnAnotherDevice(self):
@@ -796,11 +952,32 @@ class ReplicaLocalVariableTest(test.TestCase):
     replica_local = values.ReplicaLocalVariable(
         index, v, variable_scope.VariableAggregation.MEAN)
 
-    self.assertEquals(v.name, replica_local.name)
-    self.assertEquals(v.dtype, replica_local.dtype)
-    self.assertEquals(v.shape, replica_local.shape)
-    self.assertEquals(variable_scope.VariableAggregation.MEAN,
-                      replica_local.aggregation)
+    self.assertEqual(v.name, replica_local.name)
+    self.assertEqual(v.dtype, replica_local.dtype)
+    self.assertEqual(v.shape, replica_local.shape)
+    self.assertEqual(variable_scope.VariableAggregation.MEAN,
+                     replica_local.aggregation)
+
+  def testTensorConversion(self):
+    with context.graph_mode():
+      _, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM)
+      converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
+      self.assertIsInstance(converted, ops.Tensor)
+      self.assertEqual(converted.dtype, replica_local.dtype)
+
+      converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
+      # Resources variable are converted to tensors as well when as_ref is True.
+      self.assertIsInstance(converted, ops.Tensor)
+      self.assertEqual(converted.dtype, replica_local.dtype)
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
 
   def _assign_replica_local(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
@@ -817,22 +994,15 @@ class ReplicaLocalVariableTest(test.TestCase):
     save_path, _ = self._save_return_saver(sess, var)
     return save_path
 
-  def _dist_scope(self):
-    return mirrored_strategy.MirroredStrategy(_devices).scope()
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveAndRestoreReplicaLocalSumOneGraph(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    with self.cached_session(config=self.config) as sess:
+  def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
+    with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.SUM)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of v[0] + v[1], 7.
         save_path, saver = self._save_return_saver(sess, replica_local)
 
@@ -844,19 +1014,18 @@ class ReplicaLocalVariableTest(test.TestCase):
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveAndRestoreReplicaLocalMeanOneGraph(self):
+  def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
 
-    with self.cached_session(config=self.config) as sess:
+    with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.MEAN)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of (v[0] + v[1])/2, 3.5.
         save_path, saver = self._save_return_saver(sess, replica_local)
 
@@ -867,7 +1036,7 @@ class ReplicaLocalVariableTest(test.TestCase):
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  def _save_replica_local_mean(self):
+  def _save_replica_local_mean(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
@@ -876,7 +1045,7 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of (v[0] + v[1])/2, 3.5
         save_path = self._save(sess, replica_local)
 
@@ -884,7 +1053,7 @@ class ReplicaLocalVariableTest(test.TestCase):
         self._assign_replica_local(_devices, v, [5., 6.])
     return save_path
 
-  def _save_replica_local_sum(self):
+  def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local("sum")
@@ -892,7 +1061,7 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [1.5, 2.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of v[0] + v[1], 3.5
         save_path = self._save(sess, replica_local)
 
@@ -930,7 +1099,7 @@ class ReplicaLocalVariableTest(test.TestCase):
       saver.restore(sess, save_path)
       self.assertEqual(3.5, self.evaluate(var))
 
-  def _restore_replica_local_mean(self, save_path):
+  def _restore_replica_local_mean(self, save_path, distribution):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
@@ -939,13 +1108,13 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Restores the saved value of 3.5 to both variables.
         saver = saver_lib.Saver(var_list=[replica_local])
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  def _restore_replica_local_sum(self, save_path):
+  def _restore_replica_local_sum(self, save_path, distribution):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
@@ -954,72 +1123,35 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Restores the saved value of 3.5 to both variables.
         saver = saver_lib.Saver(var_list=[replica_local])
         saver.restore(sess, save_path)
         self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalRestoreReplicaLocalMean(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
+  def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution):
+    save_path = self._save_replica_local_mean(distribution)
+    self._restore_replica_local_mean(save_path, distribution)
 
-    save_path = self._save_replica_local_mean()
-    self._restore_replica_local_mean(save_path)
+  def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution):
+    save_path = self._save_replica_local_sum(distribution)
+    self._restore_replica_local_sum(save_path, distribution)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalRestoreReplicaLocalSum(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_replica_local_sum()
-    self._restore_replica_local_sum(save_path)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalMeanRestoreNormal(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_replica_local_mean()
+  def testSaveReplicaLocalMeanRestoreNormal(self, distribution):
+    save_path = self._save_replica_local_mean(distribution)
     self._restore_normal(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalSumRestoreNormal(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_replica_local_sum()
+  def testSaveReplicaLocalSumRestoreNormal(self, distribution):
+    save_path = self._save_replica_local_sum(distribution)
     self._restore_normal(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveNormalRestoreReplicaLocalMean(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
+  def testSaveNormalRestoreReplicaLocalMean(self, distribution):
     save_path = self._save_normal()
-    self._restore_replica_local_mean(save_path)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveNormalRestoreReplicaLocalSum(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
+    self._restore_replica_local_mean(save_path, distribution)
 
+  def testSaveNormalRestoreReplicaLocalSum(self, distribution):
     save_path = self._save_normal()
-    self._restore_replica_local_sum(save_path)
-
-  def testTensorConversion(self):
-    with context.graph_mode():
-      _, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.SUM)
-      converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
-      self.assertIsInstance(converted, ops.Tensor)
-      self.assertEqual(converted.dtype, replica_local.dtype)
-
-      converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
-      # Resources variable are converted to tensors as well when as_ref is True.
-      self.assertIsInstance(converted, ops.Tensor)
-      self.assertEqual(converted.dtype, replica_local.dtype)
+    self._restore_replica_local_sum(save_path, distribution)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/warm_starting_util_test.py b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
index 5d57d144c1c16a08280970ecd89eb54f7cf1ffd4..b0bcf9b17456c938204a4892451928daf90b6743 100644
--- a/tensorflow/contrib/distribute/python/warm_starting_util_test.py
+++ b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
@@ -44,7 +44,9 @@ class WarmStartingUtilWithDistributionStrategyTest(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       save_with_distribution=[True, False],
       restore_with_distribution=[True, False],
       mode=["graph"]))
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 60f6b90edcb71f04bca29b90744db201e83cd545..3079175015a9aee1625404902070df8f13b2089c 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -72,7 +72,6 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
-        "//tensorflow/python:spectral_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
@@ -80,6 +79,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index aa680a92be64cf0f099acd335369f2a1610c5953..978e627d6638ddeea9df288d389354f0ac53d115 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -29,8 +29,8 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops.distributions import util
+from tensorflow.python.ops.signal import fft_ops
 
 __all__ = [
     "auto_correlation",
@@ -157,11 +157,11 @@ def auto_correlation(
                                        dtype.real_dtype.as_numpy_dtype(0.))
 
     # Autocorrelation is IFFT of power-spectral density (up to some scaling).
-    fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad)
+    fft_x_rotated_pad = fft_ops.fft(x_rotated_pad)
     spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad)
     # shifted_product is R[m] from above detailed explanation.
     # It is the inner product sum_n X[n] * Conj(X[n - m]).
-    shifted_product = spectral_ops.ifft(spectral_density)
+    shifted_product = fft_ops.ifft(spectral_density)
 
     # Cast back to real-valued if x was real to begin with.
     shifted_product = math_ops.cast(shifted_product, dtype)
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 3aed121233be1268531495a2fa83fd323412e1fd..db77a39626900ec4d46263b1891e08c0262ce7da 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import prefetching_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
@@ -52,11 +53,16 @@ class Iterator(iterator_ops.EagerIterator):
       TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
-    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    if (isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset)
+        or (isinstance(dataset, dataset_ops.DatasetV1Adapter)
+            and isinstance(
+                dataset._dataset, prefetching_ops._PrefetchToDeviceDataset))):
       raise TypeError(
           "`tf.data.experimental.prefetch_to_device()` is not compatible with "
           "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
           "over the dataset instead.")
+    # pylint: enable=protected-access
 
     if not context.context().device_spec.device_type:
       is_remote_device = False
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 480777d948769b56ac1cc3be2052fe48459e98d6..66d52a74943d0d81fde05ce51b019558b327978d 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -768,7 +768,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -781,7 +781,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -794,7 +794,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -808,7 +808,7 @@
       "outputs": [],
       "source": [
         "# wrong translation\n",
-        "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index f384d761a8430074f022c973d7ec3d46cd90f70b..3eb396a29ccdc0478384f9fa122465731740a30d 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -26,7 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export_output
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index 1ab5418fe4659cb0068ee8c3ca1442f6f723ee76..2f7cd131d3ed20df307ed231cce2ecb50ecfbceb 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -27,7 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans
 # pylint: disable=g-import-not-at-top
 from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib
 from tensorflow.python.estimator import run_config
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index a926ffd5982116a21dc7a0fd1ff957d4ecc6bf94..1cd83bdb5de7c2f6dc91c980750b49aca1a7790b 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -14,6 +14,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sequence_feature_column",
+        ":sequence_feature_column_v2",
         "//tensorflow/python:util",
     ],
 )
@@ -32,7 +33,7 @@ py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
     ],
 )
 
@@ -51,7 +52,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -69,7 +70,49 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras:layers",
     ],
 )
+
+py_library(
+    name = "sequence_feature_column_v2",
+    srcs = ["python/feature_column/sequence_feature_column_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+)
+
+py_test(
+    name = "sequence_feature_column_v2_test",
+    srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sequence_feature_column",
+        ":sequence_feature_column_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index dd6da35ed009c07ad3819e7860a283c7837c1f83..9b3a5c58aaa9498257fc971ac60b97f31d5185d8 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -222,10 +222,8 @@ def sequence_categorical_column_with_identity(
     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_identity(
-          key=key,
-          num_buckets=num_buckets,
-          default_value=default_value))
+      fc._categorical_column_with_identity(
+          key=key, num_buckets=num_buckets, default_value=default_value))
 
 
 def sequence_categorical_column_with_hash_bucket(
@@ -265,10 +263,8 @@ def sequence_categorical_column_with_hash_bucket(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_hash_bucket(
-          key=key,
-          hash_bucket_size=hash_bucket_size,
-          dtype=dtype))
+      fc._categorical_column_with_hash_bucket(
+          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
 
 
 def sequence_categorical_column_with_vocabulary_file(
@@ -324,7 +320,7 @@ def sequence_categorical_column_with_vocabulary_file(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key=key,
           vocabulary_file=vocabulary_file,
           vocabulary_size=vocabulary_size,
@@ -384,7 +380,7 @@ def sequence_categorical_column_with_vocabulary_list(
     ValueError: if `dtype` is not integer or string.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key=key,
           vocabulary_list=vocabulary_list,
           dtype=dtype,
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
index d8ca363627eace15e039679545366648df174c33..bcc25b8de895a769f9e11b207c2092e23d029b1f 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
@@ -53,19 +53,20 @@ class SequenceFeatureColumnIntegrationTest(test.TestCase):
     return example
 
   def _build_feature_columns(self):
-    col = fc.categorical_column_with_identity(
-        'int_ctx', num_buckets=100)
+    col = fc._categorical_column_with_identity('int_ctx', num_buckets=100)
     ctx_cols = [
-        fc.embedding_column(col, dimension=10),
-        fc.numeric_column('float_ctx')]
+        fc._embedding_column(col, dimension=10),
+        fc._numeric_column('float_ctx')
+    ]
 
     identity_col = sfc.sequence_categorical_column_with_identity(
         'int_list', num_buckets=10)
     bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
         'bytes_list', hash_bucket_size=100)
     seq_cols = [
-        fc.embedding_column(identity_col, dimension=10),
-        fc.embedding_column(bucket_col, dimension=20)]
+        fc._embedding_column(identity_col, dimension=10),
+        fc._embedding_column(bucket_col, dimension=20)
+    ]
 
     return ctx_cols, seq_cols
 
@@ -148,8 +149,8 @@ class SequenceExampleParsingTest(test.TestCase):
     """
     example = _make_sequence_example()
     columns = [
-        fc.categorical_column_with_identity('int_ctx', num_buckets=100),
-        fc.numeric_column('float_ctx'),
+        fc._categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc._numeric_column('float_ctx'),
         col_fn(col_name, col_arg)
     ]
     context, seq_features = parsing_ops.parse_single_sequence_example(
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 2163af0b43864c96483df529f07881f2f985a80e..d5f74028298ee7015f5b2e3aaee7d9330c1acac1 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -109,13 +110,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=embedding_dimension_a,
+    embedding_column_a = fc._embedding_column(
+        categorical_column_a,
+        dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc.embedding_column(
-        categorical_column_b, dimension=embedding_dimension_b,
+    embedding_column_b = fc._embedding_column(
+        categorical_column_b,
+        dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -148,10 +151,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=2)
+    embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -206,7 +208,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
     # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension,
         initializer=_get_initializer(embedding_dimension, embedding_values))
@@ -244,11 +246,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
@@ -315,10 +317,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc.indicator_column(categorical_column_b)
+    indicator_column_b = fc._indicator_column(categorical_column_b)
     input_layer, sequence_length = sfc.sequence_input_layer(
         features={
             'aaa': sparse_input_a,
@@ -342,9 +344,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -530,7 +532,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     input_layer, _ = sfc.sequence_input_layer(
         features={'aaa': sparse_input}, feature_columns=[indicator_column])
@@ -616,8 +618,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=2)
+    embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -639,7 +640,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -918,8 +919,9 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
@@ -956,8 +958,7 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -984,8 +985,7 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
@@ -1055,7 +1055,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -1101,7 +1101,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     expected_sequence_length_b = [2, 1]
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -1152,7 +1152,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
 
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -1218,7 +1218,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -1250,7 +1250,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -1277,7 +1277,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d34ad161855476b6a4cd9a258521dbe122b4140
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
@@ -0,0 +1,558 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This API defines FeatureColumn for sequential input.
+
+NOTE: This API is a work in progress and will likely be changing frequently.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import collections
+
+
+from tensorflow.python.feature_column import feature_column as fc_old
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
+
+# pylint: disable=protected-access
+
+
+def sequence_input_layer(
+    features,
+    feature_columns,
+    weight_collections=None,
+    trainable=True):
+  """"Builds input layer for sequence input.
+
+  All `feature_columns` must be sequence dense columns with the same
+  `sequence_length`. The output of this method can be fed into sequence
+  networks, such as RNN.
+
+  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+  `T` is the maximum sequence length for this batch, which could differ from
+  batch to batch.
+
+  If multiple `feature_columns` are given with `Di` `num_elements` each, their
+  outputs are concatenated. So, the final `Tensor` has shape
+  `[batch_size, T, D0 + D1 + ... + Dn]`.
+
+  Example:
+
+  ```python
+  rating = sequence_numeric_column('rating')
+  watches = sequence_categorical_column_with_identity(
+      'watches', num_buckets=1000)
+  watches_embedding = embedding_column(watches, dimension=10)
+  columns = [rating, watches]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    features: A dict mapping keys to tensors.
+    feature_columns: An iterable of dense sequence columns. Valid columns are
+      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
+      - `sequence_numeric_column`.
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES`.
+
+  Returns:
+    An `(input_layer, sequence_length)` tuple where:
+    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+        `T` is the maximum sequence length for this batch, which could differ
+        from batch to batch. `D` is the sum of `num_elements` for all
+        `feature_columns`.
+    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+        length for each example.
+
+  Raises:
+    ValueError: If any of the `feature_columns` is the wrong type.
+  """
+  feature_columns = fc_old._normalize_feature_columns(feature_columns)
+  for c in feature_columns:
+    if not isinstance(c, fc_old._SequenceDenseColumn):
+      raise ValueError(
+          'All feature_columns must be of type _SequenceDenseColumn. '
+          'You can wrap a sequence_categorical_column with an embedding_column '
+          'or indicator_column. '
+          'Given (type {}): {}'.format(type(c), c))
+
+  with variable_scope.variable_scope(
+      None, default_name='sequence_input_layer', values=features.values()):
+    builder = fc_old._LazyBuilder(features)
+    output_tensors = []
+    sequence_lengths = []
+    ordered_columns = []
+
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):
+        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        # Flattens the final dimension to produce a 3D Tensor.
+        num_elements = column._variable_shape.num_elements()
+        shape = array_ops.shape(dense_tensor)
+        target_shape = [shape[0], shape[1], num_elements]
+        output_tensors.append(
+            array_ops.reshape(dense_tensor, shape=target_shape))
+        sequence_lengths.append(sequence_length)
+
+    fc_old._verify_static_batch_size_equality(output_tensors, ordered_columns)
+    fc_old._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
+    sequence_length = _assert_all_equal_and_return(sequence_lengths)
+
+    return array_ops.concat(output_tensors, -1), sequence_length
+
+
+def concatenate_context_input(context_input, sequence_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
+def sequence_categorical_column_with_identity(
+    key, num_buckets, default_value=None):
+  """Returns a feature column that represents sequences of integers.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  watches = sequence_categorical_column_with_identity(
+      'watches', num_buckets=1000)
+  watches_embedding = embedding_column(watches, dimension=10)
+  columns = [watches_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    num_buckets: Range of inputs. Namely, inputs are expected to be in the
+      range `[0, num_buckets)`.
+    default_value: If `None`, this column's graph operations will fail for
+      out-of-range inputs. Otherwise, this value must be in the range
+      `[0, num_buckets)`, and will replace out-of-range inputs.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `num_buckets` is less than one.
+    ValueError: if `default_value` is not in range `[0, num_buckets)`.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_identity(
+          key=key, num_buckets=num_buckets, default_value=default_value))
+
+
+def sequence_categorical_column_with_hash_bucket(
+    key, hash_bucket_size, dtype=dtypes.string):
+  """A sequence of categorical terms where ids are set by hashing.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  tokens = sequence_categorical_column_with_hash_bucket(
+      'tokens', hash_bucket_size=1000)
+  tokens_embedding = embedding_column(tokens, dimension=10)
+  columns = [tokens_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    hash_bucket_size: An int > 1. The number of buckets.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `hash_bucket_size` is not greater than 1.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_hash_bucket(
+          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
+
+
+def sequence_categorical_column_with_vocabulary_file(
+    key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
+    default_value=None, dtype=dtypes.string):
+  """A sequence of categorical terms where ids use a vocabulary file.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  states = sequence_categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  states_embedding = embedding_column(states, dimension=10)
+  columns = [states_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_vocabulary_file(
+          key=key,
+          vocabulary_file=vocabulary_file,
+          vocabulary_size=vocabulary_size,
+          num_oov_buckets=num_oov_buckets,
+          default_value=default_value,
+          dtype=dtype))
+
+
+def sequence_categorical_column_with_vocabulary_list(
+    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+  """A sequence of categorical terms where ids use an in-memory list.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  colors = sequence_categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
+      num_oov_buckets=2)
+  colors_embedding = embedding_column(colors, dimension=3)
+  columns = [colors_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
+      is mapped to the index of its value (if present) in `vocabulary_list`.
+      Must be castable to `dtype`.
+    dtype: The type of features. Only string and integer types are supported.
+      If `None`, it will be inferred from `vocabulary_list`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
+      hash of the input value. A positive `num_oov_buckets` can not be specified
+      with `default_value`.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: if `dtype` is not integer or string.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_vocabulary_list(
+          key=key,
+          vocabulary_list=vocabulary_list,
+          dtype=dtype,
+          default_value=default_value,
+          num_oov_buckets=num_oov_buckets))
+
+
+def sequence_numeric_column(
+    key,
+    shape=(1,),
+    default_value=0.,
+    dtype=dtypes.float32,
+    normalizer_fn=None):
+  """Returns a feature column that represents sequences of numeric data.
+
+  Example:
+
+  ```python
+  temperature = sequence_numeric_column('temperature')
+  columns = [temperature]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  sequence_feature_layer = SequenceFeatureLayer(columns)
+  input_layer, sequence_length = sequence_feature_layer(features)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input features.
+    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
+      each example must contain `2 * sequence_length` values.
+    default_value: A single value compatible with `dtype` that is used for
+      padding the sparse data into a dense `Tensor`.
+    dtype: The type of values.
+    normalizer_fn: If not `None`, a function that can be used to normalize the
+      value of the tensor after `default_value` is applied for parsing.
+      Normalizer function takes the input `Tensor` as its argument, and returns
+      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
+      even though the most common use case of this function is normalization, it
+      can be used for any kind of Tensorflow transformations.
+
+  Returns:
+    A `SequenceNumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int.
+    ValueError: if any dimension in shape is not a positive integer.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = fc_old._check_shape(shape=shape, key=key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+  if normalizer_fn is not None and not callable(normalizer_fn):
+    raise TypeError(
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+
+  return SequenceNumericColumn(
+      key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
+
+
+def _assert_all_equal_and_return(tensors, name=None):
+  """Asserts that all tensors are equal and returns the first one."""
+  with ops.name_scope(name, 'assert_all_equal', values=tensors):
+    if len(tensors) == 1:
+      return tensors[0]
+    assert_equal_ops = []
+    for t in tensors[1:]:
+      assert_equal_ops.append(check_ops.assert_equal(tensors[0], t))
+    with ops.control_dependencies(assert_equal_ops):
+      return array_ops.identity(tensors[0])
+
+
+class SequenceNumericColumn(
+    fc.SequenceDenseColumn,
+    collections.namedtuple(
+        'SequenceNumericColumn',
+        ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
+  """Represents sequences of numeric data."""
+
+  @property
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    """See `FeatureColumn` base class."""
+    return self.key
+
+  @property
+  def parse_example_spec(self):
+    """See `FeatureColumn` base class."""
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def transform_feature(self, transformation_cache, state_manager):
+    """See `FeatureColumn` base class.
+
+    In this case, we apply the `normalizer_fn` to the input tensor.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      Normalized input tensor.
+    """
+    input_tensor = transformation_cache.get(self.key, state_manager)
+    if self.normalizer_fn is not None:
+      input_tensor = self.normalizer_fn(input_tensor)
+    return input_tensor
+
+  @property
+  def variable_shape(self):
+    """Returns a `TensorShape` representing the shape of sequence input."""
+    return tensor_shape.TensorShape(self.shape)
+
+  def get_sequence_dense_tensor(self, transformation_cache, state_manager):
+    """Returns a `TensorSequenceLengthPair`.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+    """
+    sp_tensor = transformation_cache.get(self, state_manager)
+    dense_tensor = sparse_ops.sparse_tensor_to_dense(
+        sp_tensor, default_value=self.default_value)
+    # Reshape into [batch_size, T, variable_shape].
+    dense_shape = array_ops.concat(
+        [array_ops.shape(dense_tensor)[:1], [-1], self.variable_shape],
+        axis=0)
+    dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
+
+    # Get the number of timesteps per example
+    # For the 2D case, the raw values are grouped according to num_elements;
+    # for the 3D case, the grouping happens in the third dimension, and
+    # sequence length is not affected.
+    num_elements = (self.variable_shape.num_elements()
+                    if sp_tensor.shape.ndims == 2 else 1)
+    seq_length = fc_old._sequence_length_from_sparse_tensor(
+        sp_tensor, num_elements=num_elements)
+
+    return fc.SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=seq_length)
+
+  # TODO(b/119409767): Implement parents, _{get,from}_config.
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+# pylint: enable=protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca4398a142065de0be7bee57cd7e54670bbae12e
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
@@ -0,0 +1,1508 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sequential_feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
+from tensorflow.python.feature_column import feature_column as fc_old
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+
+
+class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args_a': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'sparse_input_args_b': {
+           # example 0, ids [1]
+           # example 1, ids [2, 0]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 2, 0),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [2, 0]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_args_a': {
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           'indices': (
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 0, 0, 1),
+           'dense_shape': (2, 2, 2)},
+       'sparse_input_args_b': {
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[2], [0]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (1, 1, 1, 2, 0),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[5., 6., 14., 15., 16.], [2., 3., 14., 15., 16.]],
+           # feature 1, [a: 0, 0, b: 2, -], [a: 1, -, b: 0, -]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_embedding_column(
+      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
+      expected_sequence_length):
+
+    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
+    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
+    vocabulary_size = 3
+    embedding_dimension_a = 2
+    embedding_values_a = (
+        (1., 2.),  # id 0
+        (3., 4.),  # id 1
+        (5., 6.)  # id 2
+    )
+    embedding_dimension_b = 3
+    embedding_values_b = (
+        (11., 12., 13.),  # id 0
+        (14., 15., 16.),  # id 1
+        (17., 18., 19.)  # id 2
+    )
+    def _get_initializer(embedding_dimension, embedding_values):
+      def _initializer(shape, dtype, partition_info):
+        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+        self.assertEqual(dtypes.float32, dtype)
+        self.assertIsNone(partition_info)
+        return embedding_values
+      return _initializer
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a,
+        dimension=embedding_dimension_a,
+        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_b = fc_old._embedding_column(
+        categorical_column_b,
+        dimension=embedding_dimension_b,
+        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[embedding_column_b, embedding_column_a])
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
+         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
+        tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
+      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_embedding_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence embedding column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a, dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_embedding\. categorical_column must be of '
+        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[embedding_column_a])
+
+  def test_shared_embedding_column(self):
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [2, 0]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 2, 0),
+        dense_shape=(2, 2))
+
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 4.),  # id 1
+        (5., 6.)  # id 2
+    )
+
+    def _get_initializer(embedding_dimension, embedding_values):
+
+      def _initializer(shape, dtype, partition_info):
+        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+        self.assertEqual(dtypes.float32, dtype)
+        self.assertIsNone(partition_info)
+        return embedding_values
+
+      return _initializer
+
+    expected_input_layer = [
+        # example 0, ids_a [2], ids_b [1]
+        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
+        # example 1, ids_a [0, 1], ids_b [2, 0]
+        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
+    ]
+    expected_sequence_length = [1, 2]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    # Test that columns are reordered alphabetically.
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension,
+        initializer=_get_initializer(embedding_dimension, embedding_values))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        feature_columns=shared_embedding_columns)
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
+        tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_shared_embedding_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence shared embedding column."""
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc_old._categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_shared_embedding\. categorical_column must '
+        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={
+              'aaa': sparse_input_a,
+              'bbb': sparse_input_b
+          },
+          feature_columns=shared_embedding_columns)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args_a': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'sparse_input_args_b': {
+           # example 0, ids [1]
+           # example 1, ids [1, 0]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 1, 0),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [1, 0]
+           [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_args_a': {
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           'indices': (
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 0, 0, 1),
+           'dense_shape': (2, 2, 2)},
+       'sparse_input_args_b': {
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[1], [0]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (1, 1, 1, 1, 0),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[0., 0., 1., 0., 2.], [1., 1., 0., 0., 1.]],
+           # feature 1, [a: 0, 0, b: 1, -], [a: 1, -, b: 0, -]
+           [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_indicator_column(
+      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
+      expected_sequence_length):
+    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
+    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
+
+    vocabulary_size_a = 3
+    vocabulary_size_b = 2
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size_a)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size_b)
+    indicator_column_b = fc_old._indicator_column(categorical_column_b)
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[indicator_column_b, indicator_column_a])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_indicator_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence categorical column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In indicator_column: aaa_indicator\. categorical_column must be of '
+        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[indicator_column_a])
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           'indices': ((0, 0), (0, 1), (1, 0)),
+           'values': (0., 1., 10.),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           [[0.], [1.]],
+           [[10.], [0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (20, 3, 5., 3., 8.),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_numeric_column(
+      self, sparse_input_args, expected_input_layer, expected_sequence_length):
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+
+    numeric_column = sfc_old.sequence_numeric_column('aaa')
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [0., 1.,  2., 3., 4., 5., 6., 7.]
+           # example 1, [10., 11., 12., 13.]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
+                       (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 4)},
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      )
+  def test_numeric_column_multi_dim(
+      self, sparse_input_args, expected_input_layer, expected_sequence_length):
+    """Tests sequence_input_layer for multi-dimensional numeric_column."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+
+    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_sequence_length_not_equal(self):
+    """Tests that an error is raised when sequence lengths are not equal."""
+    # Input a with sequence_length = [2, 1]
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+    # Input b with sequence_length = [1, 1]
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0)),
+        values=(1., 10.),
+        dense_shape=(2, 2))
+    numeric_column_a = sfc_old.sequence_numeric_column('aaa')
+    numeric_column_b = sfc_old.sequence_numeric_column('bbb')
+
+    _, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        feature_columns=[numeric_column_a, numeric_column_b])
+
+    with monitored_session.MonitoredSession() as sess:
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[Condition x == y did not hold element-wise:\] '
+          r'\[x \(sequence_input_layer/aaa/sequence_length:0\) = \] \[2 1\] '
+          r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
+        sess.run(sequence_length)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_shape': [2, 2, 4]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                       (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 4)},
+       'expected_shape': [2, 2, 4]},
+      )
+  def test_static_shape_from_tensors_numeric(
+      self, sparse_input_args, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected_shape': [4, 2, 3]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 0, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected_shape': [4, 2, 3]}
+      )
+  def test_static_shape_from_tensors_indicator(
+      self, sparse_input_args, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    indicator_column = fc_old._indicator_column(categorical_column)
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+
+class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
+  """Tests the utility fn concatenate_context_input."""
+
+  def test_concatenate_context_input(self):
+    seq_input = ops.convert_to_tensor(np.arange(12).reshape(2, 3, 2))
+    context_input = ops.convert_to_tensor(np.arange(10).reshape(2, 5))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    input_layer = sfc.concatenate_context_input(context_input, seq_input)
+
+    expected = np.array([
+        [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
+        [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
+    ], dtype=np.float32)
+    with monitored_session.MonitoredSession() as sess:
+      output = sess.run(input_layer)
+      self.assertAllEqual(expected, output)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_3',
+       'seq_input_arg': np.arange(100).reshape(10, 10)},
+      {'testcase_name': 'rank_gt_3',
+       'seq_input_arg': np.arange(100).reshape(5, 5, 2, 2)}
+      )
+  def test_sequence_input_throws_error(self, seq_input_arg):
+    seq_input = ops.convert_to_tensor(seq_input_arg)
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'sequence_input must have rank 3'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_2',
+       'context_input_arg': np.arange(100)},
+      {'testcase_name': 'rank_gt_2',
+       'context_input_arg': np.arange(100).reshape(5, 5, 4)}
+      )
+  def test_context_input_throws_error(self, context_input_arg):
+    context_input = ops.convert_to_tensor(context_input_arg)
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'context_input must have rank 2'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_seq_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'sequence_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_context_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'context_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+
+class InputLayerTest(test.TestCase):
+  """Tests input_layer with sequence feature columns."""
+
+  def test_embedding_column(self):
+    """Tests that error is raised for sequence embedding column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a, dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_embedding\. categorical_column must not be '
+        r'of type _SequenceCategoricalColumn\.'):
+      _ = fc_old.input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[embedding_column_a])
+
+  def test_indicator_column(self):
+    """Tests that error is raised for sequence indicator column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In indicator_column: aaa_indicator\. categorical_column must not be '
+        r'of type _SequenceCategoricalColumn\.'):
+      _ = fc_old.input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[indicator_column_a])
+
+
+def _assert_sparse_tensor_value(test_case, expected, actual):
+  _assert_sparse_tensor_indices_shape(test_case, expected, actual)
+
+  test_case.assertEqual(
+      np.array(expected.values).dtype, np.array(actual.values).dtype)
+  test_case.assertAllEqual(expected.values, actual.values)
+
+
+def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
+  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+  test_case.assertAllEqual(expected.indices, actual.indices)
+
+  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class SequenceCategoricalColumnWithIdentityTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 2, 0),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((1, 2, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': (6, 7, 8),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': (6, 7, 8),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_identity('aaa', num_buckets=9)
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithHashBucketTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('omar', 'stringer', 'marlo'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           # Ignored to avoid hash dependence in test.
+           'values': np.array((0, 0, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'stringer', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           # Ignored to avoid hash dependence in test.
+           'values': np.array((0, 0, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_hash_bucket(
+        'aaa', hash_bucket_size=10)
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_indices_shape(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithVocabularyFileTest(
+    test.TestCase, parameterized.TestCase):
+
+  def _write_vocab(self, vocab_strings, file_name):
+    vocab_file = os.path.join(self.get_temp_dir(), file_name)
+    with open(vocab_file, 'w') as f:
+      f.write('\n'.join(vocab_strings))
+    return vocab_file
+
+  def setUp(self):
+    super(SequenceCategoricalColumnWithVocabularyFileTest, self).setUp()
+
+    vocab_strings = ['omar', 'stringer', 'marlo']
+    self._wire_vocabulary_file_name = self._write_vocab(vocab_strings,
+                                                        'wire_vocabulary.txt')
+    self._wire_vocabulary_size = 3
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('marlo', 'skywalker', 'omar'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((2, -1, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'skywalker', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': np.array((0, -1, 2), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+  def test_get_sparse_tensors_dynamic_zero_length(self):
+    """Tests _get_sparse_tensors with a dynamic sequence length."""
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
+    expected = sparse_tensor.SparseTensorValue(
+        indices=np.zeros((0, 3)),
+        values=np.array((), dtype=np.int64),
+        dense_shape=(2, 0, 1))
+    column = sfc.sequence_categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    input_placeholder_shape = list(inputs.dense_shape)
+    # Make second dimension (sequence length) dynamic.
+    input_placeholder_shape[1] = None
+    input_placeholder = array_ops.sparse_placeholder(
+        dtypes.string, shape=input_placeholder_shape)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({'aaa': input_placeholder}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      result = id_weight_pair.id_tensor.eval(
+          session=sess, feed_dict={input_placeholder: inputs})
+      _assert_sparse_tensor_value(
+          self, expected, result)
+
+
+class SequenceCategoricalColumnWithVocabularyListTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('marlo', 'skywalker', 'omar'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((2, -1, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'skywalker', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': np.array((0, -1, 2), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceEmbeddingColumnTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected': [
+           # example 0, ids [2]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 2.], [3., 5.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [1]
+           [[3., 5.], [0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 0, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected': [
+           # example 0, ids [[2]]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[2, 3.5], [7., 11.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [[1], [0, 2]]
+           [[3., 5.], [4., 6.5]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc_old._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 2),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+
+    _, sequence_length = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+
+    _, sequence_length = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+class SequenceSharedEmbeddingColumnTest(test.TestCase):
+
+  def test_get_sequence_dense_tensor(self):
+    vocabulary_size = 3
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [0, 2]
+        # example 2, ids [0]
+        # example 3, ids []
+        indices=((0, 0), (1, 0), (1, 1), (2, 0)),
+        values=(1, 0, 2, 0),
+        dense_shape=(4, 2))
+
+    expected_lookups_a = [
+        # example 0, ids [2]
+        [[7., 11.], [0., 0.]],
+        # example 1, ids [0, 1]
+        [[1., 2.], [3., 5.]],
+        # example 2, ids []
+        [[0., 0.], [0., 0.]],
+        # example 3, ids [1]
+        [[3., 5.], [0., 0.]],
+    ]
+
+    expected_lookups_b = [
+        # example 0, ids [1]
+        [[3., 5.], [0., 0.]],
+        # example 1, ids [0, 2]
+        [[1., 2.], [7., 11.]],
+        # example 2, ids [0]
+        [[1., 2.], [0., 0.]],
+        # example 3, ids []
+        [[0., 0.], [0., 0.]],
+    ]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    embedding_lookup_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input_a
+        }))[0]
+    embedding_lookup_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'bbb': sparse_input_b
+        }))[0]
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(
+          expected_lookups_a, embedding_lookup_a.eval(session=sess))
+      self.assertAllEqual(
+          expected_lookups_b, embedding_lookup_b.eval(session=sess))
+
+  def test_sequence_length(self):
+    vocabulary_size = 3
+
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length_a = [1, 2]
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [0, 2]
+        # example 1, ids [1]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0, 2, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length_b = [2, 1]
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+
+    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input_a
+        }))[1]
+    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'bbb': sparse_input_b
+        }))[1]
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length_a = sess.run(sequence_length_a)
+      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
+      self.assertEqual(np.int64, sequence_length_a.dtype)
+      sequence_length_b = sess.run(sequence_length_b)
+      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
+      self.assertEqual(np.int64, sequence_length_b.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids []
+        # example 2, ids []
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids [0, 1]
+        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
+        values=(2, 1, 0, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+
+    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input_a
+        }))[1]
+    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'bbb': sparse_input_b
+        }))[1]
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length_a, sequence_length_a.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length_b, sequence_length_b.eval(session=sess))
+
+
+class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected': [
+           # example 0, ids [2]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 0., 0.], [0., 1., 0.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [1]
+           [[0., 1., 0.], [0., 0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [2, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 2, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected': [
+           # example 0, ids [[2]]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[1., 1., 0.], [0., 0., 1.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [[1], [2, 2]]
+           [[0., 1., 0.], [0., 0., 2.]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc_old._indicator_column(categorical_column)
+
+    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 2),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc_old._indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc.indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+def _get_sequence_dense_tensor(column, features):
+  return column.get_sequence_dense_tensor(
+      fc.FeatureTransformationCache(features), None)
+
+
+class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
+
+  def test_defaults(self):
+    a = sfc.sequence_numeric_column('aaa')
+    self.assertEqual('aaa', a.key)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual((1,), a.shape)
+    self.assertEqual(0., a.default_value)
+    self.assertEqual(dtypes.float32, a.dtype)
+    self.assertIsNone(a.normalizer_fn)
+
+  def test_shape_saved_as_tuple(self):
+    a = sfc.sequence_numeric_column('aaa', shape=[1, 2])
+    self.assertEqual((1, 2), a.shape)
+
+  def test_shape_must_be_positive_integer(self):
+    with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
+      sfc.sequence_numeric_column('aaa', shape=[1.0])
+
+    with self.assertRaisesRegexp(
+        ValueError, 'shape dimensions must be greater than 0'):
+      sfc.sequence_numeric_column('aaa', shape=[0])
+
+  def test_dtype_is_convertible_to_float(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'dtype must be convertible to float'):
+      sfc.sequence_numeric_column('aaa', dtype=dtypes.string)
+
+  def test_normalizer_fn_must_be_callable(self):
+    with self.assertRaisesRegexp(TypeError, 'must be a callable'):
+      sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable')
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           'indices': ((0, 0), (0, 1), (1, 0)),
+           'values': (0., 1., 10.),
+           'dense_shape': (2, 2)},
+       'expected': [
+           [[0.], [1.]],
+           [[10.], [0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (20, 3, 5., 3., 8.),
+           'dense_shape': (2, 2, 2)},
+       'expected': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]]},
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    dense_tensor, _ = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': inputs})
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
+
+  def test_get_sequence_dense_tensor_with_normalizer_fn(self):
+
+    def _increment_two(input_sparse_tensor):
+      return sparse_ops.sparse_add(
+          input_sparse_tensor,
+          sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2))
+      )
+
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0.], [1]]
+        # example 1, [[10.]]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+
+    # Before _increment_two:
+    #   [[0.], [1.]],
+    #   [[10.], [0.]],
+    # After _increment_two:
+    #   [[2.], [1.]],
+    #   [[10.], [2.]],
+    expected_dense_tensor = [
+        [[2.], [1.]],
+        [[10.], [2.]],
+    ]
+    numeric_column = sfc.sequence_numeric_column(
+        'aaa', normalizer_fn=_increment_two)
+
+    dense_tensor, _ = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': sparse_input})
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_dense_tensor': [
+           [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
+           [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]]]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           'indices': ((0, 0, 0), (0, 0, 2), (0, 0, 4), (0, 0, 6),
+                       (0, 1, 0), (0, 1, 2), (0, 1, 4), (0, 1, 6),
+                       (1, 0, 0), (1, 0, 2), (1, 0, 4), (1, 0, 6)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 8)},
+       'expected_dense_tensor': [
+           [[[0., 0.], [1., 0.]], [[2., 0.], [3., 0.]],
+            [[4., 0.], [5., 0.]], [[6., 0.], [7., 0.]]],
+           [[[10., 0.], [11., 0.]], [[12., 0.], [13., 0.]],
+            [[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]]]]},
+      )
+  def test_get_dense_tensor_multi_dim(
+      self, sparse_input_args, expected_dense_tensor):
+    """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+    dense_tensor, _ = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': sparse_input})
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2., 0., 1.),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2., 0., 1., 2.),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '2D_with_shape',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2., 0., 1.),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 1],
+       'shape': (2,)},
+      {'testcase_name': '3D_with_shape',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2., 0., 1., 2.),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (2,)},
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length, shape):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=shape)
+
+    _, sequence_length = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': inputs})
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values []
+        # example 1, values [[0.], [1.]]
+        # example 2, [[2.]]
+        # example 3, values []
+        # example 4, [[3.]]
+        # example 5, values []
+        indices=((1, 0), (1, 1), (2, 0), (4, 0)),
+        values=(0., 1., 2., 3.),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 2, 1, 0, 1, 0]
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    _, sequence_length = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': sparse_input})
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index cd747df4d69d2c264f5a64b491da9570b1423770..53efae1e10f30a2c5a42c9997c92ad909d77f58e 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -66,6 +66,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python:smart_cond",
+        "//tensorflow/python:sort_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:state_ops_gen",
@@ -311,17 +312,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-py_test(
-    name = "sort_ops_test",
-    size = "medium",
-    srcs = ["python/ops/sort_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 95f5ba90aba6ff8d3f1f5b93bde2211ddf1c231b..e72e50585a3861d4527b66f89e1659d76c85960a 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -15,10 +15,6 @@
 
 """Framework utilities.
 
-See the
-[Contrib Framework](https://tensorflow.org/api_guides/python/contrib.framework)
-guide.
-
 @@assert_same_float_dtype
 @@assert_scalar
 @@assert_scalar_int
diff --git a/tensorflow/contrib/framework/python/framework/experimental_test.py b/tensorflow/contrib/framework/python/framework/experimental_test.py
index cfdc7df7d8fd4c1406bf447a79038ac33b11e047..00e04b83ac45a83e54eee7a6e4e146fb683c3d98 100644
--- a/tensorflow/contrib/framework/python/framework/experimental_test.py
+++ b/tensorflow/contrib/framework/python/framework/experimental_test.py
@@ -44,17 +44,18 @@ class ExperimentalTest(test.TestCase):
 
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
-    self.assertEqual("fn doc. (experimental)"
-                     "\n"
-                     "\nTHIS FUNCTION IS EXPERIMENTAL. It may change or "
-                     "be removed at any time, and without warning."
-                     "\n"
-                     "\nArgs:"
-                     "\n  arg0: Arg 0."
-                     "\n  arg1: Arg 1."
-                     "\n"
-                     "\nReturns:"
-                     "\n  Sum of args.", _fn.__doc__)
+    self.assertEqual(
+        "fn doc. (experimental)"
+        "\n"
+        "\nWarning: THIS FUNCTION IS EXPERIMENTAL. It may change "
+        "or be removed at any time, and without warning."
+        "\n"
+        "\nArgs:"
+        "\n  arg0: Arg 0."
+        "\n  arg1: Arg 1."
+        "\n"
+        "\nReturns:"
+        "\n  Sum of args.", _fn.__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual(3, _fn(1, 2))
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py
index 1921a77c1e96ee3531d1ed0f98e41c27c9d427ac..42184a4e55e292f7921702e3f8909ae54f717702 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops.py
@@ -22,173 +22,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
+from tensorflow.python.ops import sort_ops
 
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops as framework_ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-
-
-def sort(values, axis=-1, direction='ASCENDING', name=None):
-  """Sorts a tensor.
-
-  Args:
-    values: 1-D or higher numeric `Tensor`.
-    axis: The axis along which to sort. The default is -1, which sorts the last
-        axis.
-    direction: The direction in which to sort the values (`'ASCENDING'` or
-        `'DESCENDING'`).
-    name: Optional name for the operation.
-
-  Returns:
-    A `Tensor` with the same dtype and shape as `values`, with the elements
-        sorted along the given `axis`.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  with framework_ops.name_scope(name, 'sort'):
-    return _sort_or_argsort(values, axis, direction, return_argsort=False)
-
-
-def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
-  """Returns the indices of a tensor that give its sorted order along an axis.
-
-  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
-  `tf.sort(values)`. For higher dimensions, the output has the same shape as
-  `values`, but along the given axis, values represent the index of the sorted
-  element in that slice of the tensor at the given position.
-
-  Args:
-    values: 1-D or higher numeric `Tensor`.
-    axis: The axis along which to sort. The default is -1, which sorts the last
-        axis.
-    direction: The direction in which to sort the values (`'ASCENDING'` or
-        `'DESCENDING'`).
-    stable: If True, equal elements in the original tensor will not be
-        re-ordered in the returned order. Unstable sort is not yet implemented,
-        but will eventually be the default for performance reasons. If you
-        require a stable order, pass `stable=True` for forwards compatibility.
-    name: Optional name for the operation.
-
-  Returns:
-    An int32 `Tensor` with the same shape as `values`. The indices that would
-        sort each slice of the given `values` along the given `axis`.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  del stable  # Unused.
-  with framework_ops.name_scope(name, 'argsort'):
-    return _sort_or_argsort(values, axis, direction, return_argsort=True)
-
-
-def _sort_or_argsort(values, axis, direction, return_argsort):
-  """Internal sort/argsort implementation.
-
-  Args:
-    values: The input values.
-    axis: The axis along which to sort.
-    direction: 'ASCENDING' or 'DESCENDING'.
-    return_argsort: Whether to return the argsort result.
-
-  Returns:
-    Either the sorted values, or the indices of the sorted values in the
-        original tensor. See the `sort` and `argsort` docstrings.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  if direction not in _SORT_IMPL:
-    raise ValueError('%s should be one of %s' %
-                     (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
-  # Axis must be an integer, not a Tensor.
-  axis = framework_ops.convert_to_tensor(axis, name='axis')
-  axis_static = tensor_util.constant_value(axis)
-  if axis.shape.ndims != 0 or axis_static is None:
-    raise ValueError('axis must be a constant scalar')
-  axis_static = int(axis_static)  # Avoids NumPy casting error
-
-  values = framework_ops.convert_to_tensor(values, name='values')
-
-  return _SORT_IMPL[direction](values, axis_static, return_argsort)
-
-
-def _descending_sort(values, axis, return_argsort=False):
-  """Sorts values in reverse using `top_k`.
-
-  Args:
-    values: Tensor of numeric values.
-    axis: Index of the axis which values should be sorted along.
-    return_argsort: If False, return the sorted values. If True, return the
-        indices that would sort the values.
-
-  Returns:
-    The sorted values.
-  """
-  k = array_ops.shape(values)[axis]
-  rank = array_ops.rank(values)
-  static_rank = values.shape.ndims
-  # Fast path: sorting the last axis.
-  if axis == -1 or axis + 1 == values.get_shape().ndims:
-    top_k_input = values
-    transposition = None
-  else:
-    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
-    if axis < 0:
-      # Calculate the actual axis index if counting from the end. Use the static
-      # rank if available, or else make the axis back into a tensor.
-      axis += static_rank or rank
-    if static_rank is not None:
-      # Prefer to calculate the transposition array in NumPy and make it a
-      # constant.
-      transposition = constant_op.constant(
-          np.r_[
-              # Axes up to axis are unchanged.
-              np.arange(axis),
-              # Swap axis and rank - 1.
-              [static_rank - 1],
-              # Axes in [axis + 1, rank - 1) are unchanged.
-              np.arange(axis + 1, static_rank - 1),
-              # Swap axis and rank - 1.
-              [axis]],
-          name='transposition')
-    else:
-      # Generate the transposition array from the tensors.
-      transposition = array_ops.concat(
-          [
-              # Axes up to axis are unchanged.
-              math_ops.range(axis),
-              # Swap axis and rank - 1.
-              [rank - 1],
-              # Axes in [axis + 1, rank - 1) are unchanged.
-              math_ops.range(axis + 1, rank - 1),
-              # Swap axis and rank - 1.
-              [axis]
-          ],
-          axis=0)
-    top_k_input = array_ops.transpose(values, transposition)
-
-  values, indices = nn_ops.top_k(top_k_input, k)
-  return_value = indices if return_argsort else values
-  if transposition is not None:
-    # transposition contains a single cycle of length 2 (swapping 2 elements),
-    # so it is an involution (it is its own inverse).
-    return_value = array_ops.transpose(return_value, transposition)
-  return return_value
-
-
-def _ascending_sort(values, axis, return_argsort=False):
-  # Negate the values to get the ascending order from descending sort.
-  values_or_indices = _descending_sort(-values, axis, return_argsort)
-  # If not argsort, negate the values again.
-  return values_or_indices if return_argsort else -values_or_indices
-
-
-_SORT_IMPL = {
-    'ASCENDING': _ascending_sort,
-    'DESCENDING': _descending_sort,
-}
+sort = sort_ops.sort
+argsort = sort_ops.argsort
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 7ee39f304ab213a8fa4e7a6f03cda88037bff9a1..cf5b9d9476738e58f6f1286bf5652d55b49ed4d5 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -114,7 +114,7 @@ def gan_model(
     discriminator_gen_outputs = discriminator_fn(generated_data,
                                                  generator_inputs)
   with variable_scope.variable_scope(dis_scope, reuse=True):
-    real_data = ops.convert_to_tensor(real_data)
+    real_data = _convert_tensor_or_l_or_d(real_data)
     discriminator_real_outputs = discriminator_fn(real_data, generator_inputs)
 
   if check_shapes:
@@ -1071,8 +1071,19 @@ def get_sequential_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
   return get_hooks
 
 
+def _num_joint_steps(train_steps):
+  g_steps = train_steps.generator_train_steps
+  d_steps = train_steps.discriminator_train_steps
+  # Get the number of each type of step that should be run.
+  num_d_and_g_steps = min(g_steps, d_steps)
+  num_g_steps = g_steps - num_d_and_g_steps
+  num_d_steps = d_steps - num_d_and_g_steps
+
+  return num_d_and_g_steps, num_g_steps, num_d_steps
+
+
 def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
-  """Returns a hooks function for sequential GAN training.
+  """Returns a hooks function for joint GAN training.
 
   When using these train hooks, IT IS RECOMMENDED TO USE `use_locking=True` ON
   ALL OPTIMIZERS TO AVOID RACE CONDITIONS.
@@ -1105,12 +1116,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
   Returns:
     A function that takes a GANTrainOps tuple and returns a list of hooks.
   """
-  g_steps = train_steps.generator_train_steps
-  d_steps = train_steps.discriminator_train_steps
-  # Get the number of each type of step that should be run.
-  num_d_and_g_steps = min(g_steps, d_steps)
-  num_g_steps = g_steps - num_d_and_g_steps
-  num_d_steps = d_steps - num_d_and_g_steps
+  num_d_and_g_steps, num_g_steps, num_d_steps = _num_joint_steps(train_steps)
 
   def get_hooks(train_ops):
     g_op = train_ops.generator_train_op
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 64d670619905a427a84bee4b661228abca591fae..31d9e827005219bdc07df86d42bef40a38f314f1 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -519,7 +519,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase):
     """Test output type."""
     loss = train.gan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.GANLoss)
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+    self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES))
 
   @parameterized.named_parameters(
       ('cyclegan', create_cyclegan_model),
@@ -528,7 +528,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase):
   def test_cyclegan_output_type(self, get_gan_model_fn):
     loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.CycleGANLoss)
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+    self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES))
 
   @parameterized.named_parameters(
       ('gan', create_gan_model, False),
@@ -923,8 +923,7 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
         model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt)
     self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
     # No new trainable variables should have been added.
-    self.assertEqual(num_trainable_vars,
-                     len(variables_lib.get_trainable_variables()))
+    self.assertLen(variables_lib.get_trainable_variables(), num_trainable_vars)
 
     g_sync_init_op = g_opt.get_init_tokens_op(num_tokens=1)
     d_sync_init_op = d_opt.get_init_tokens_op(num_tokens=1)
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 94f522c04e5a09ed2d9355fa675125c340407923..fbccbead03fc0d641db40ede661bf3677d44c45d 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -170,6 +170,14 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
     // Record "call" in active_ so that it can be aborted cleanly.
     RegisterCall(call);
 
+    // RendezvousMgr already aborted, shouldn't send RPC call any more
+    if (!call->status().ok()) {
+      done(call->status(), Args(), Args(), Tensor(), false);
+      session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      delete call;
+      return;
+    }
+
     // Start "call".
     Ref();
     call->Start([this, call, src_worker, rwi, done]() {
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index d8584e4e6b7470472b0e1911b9e34c8c80a42e0f..b3f48ec1dd9c75055f4e1ea76eb203b6ccf94718 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -52,9 +52,10 @@ Status GdrServer::Init() {
       [this](const WorkerEnv* env) {
         return new GdrRendezvousMgr(env, remote_memory_manager_.get());
       };
-  WorkerCreationFunction worker_func = [this](WorkerEnv* env) {
+  WorkerCreationFunction worker_func = [this](WorkerEnv* env,
+                                              const ConfigProto& config) {
     return std::unique_ptr<GdrWorker>(
-        new GdrWorker(env, remote_memory_manager_.get()));
+        new GdrWorker(env, config, remote_memory_manager_.get()));
   };
 
   TF_RETURN_IF_ERROR(remote_memory_manager_->Init());
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index ce1d8d2d73000559f03046aceacb169890ecc1b6..867cb83f42034c8e9061e333ea671457745f92c3 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -39,9 +39,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-GdrWorker::GdrWorker(WorkerEnv* worker_env,
+GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config,
                      RemoteMemoryManager* remote_memory_manager)
-    : GrpcWorker(worker_env),
+    : GrpcWorker(worker_env, config),
       remote_memory_manager_(remote_memory_manager),
       recv_tensor_recent_request_ids_(100000) {}
 
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index 65105ed997300aa77202301cdd8dddacb0309880..39f11e6bde5a1ca7ae91ead02279d22d70af027b 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -25,7 +25,8 @@ namespace tensorflow {
 
 class GdrWorker : public GrpcWorker {
  public:
-  GdrWorker(WorkerEnv* env, RemoteMemoryManager* remote_memory_manager);
+  GdrWorker(WorkerEnv* env, const ConfigProto& config,
+            RemoteMemoryManager* remote_memory_manager);
 
   // Serve the RecvTensorRequest but omit the tensor content and transmit it
   // out-of-band using GPU Direct RDMA whenever possible.
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index 9393b702d11a2ef84586f712d30c26fe2a8972bb..2698b83a56a1121fa30f5b05ffa027b4dfd4ba95 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -22,48 +22,92 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_ops",
+        ":igfs_ops",
     ],
 )
 
 tf_custom_op_library(
-    name = "_dataset_ops.so",
-    srcs = ["ops/dataset_ops.cc"],
-    deps = [":dataset_kernels"],
+    name = "_ignite_ops.so",
+    srcs = [
+        "kernels/igfs/igfs.h",
+        "ops/dataset_ops.cc",
+        "ops/igfs_ops.cc",
+    ],
+    deps = [
+        ":dataset_kernels",
+        ":igfs_kernels",
+    ],
 )
 
 tf_gen_op_libs(
     op_lib_names = ["dataset_ops"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = ["igfs_ops"],
+    deps = [":igfs_kernels"],
+)
+
 cc_library(
-    name = "dataset_kernels",
+    name = "ignite_client",
     srcs = [
-        "kernels/ignite_dataset_ops.cc",
-        "kernels/ignite_client.h",
-        "kernels/ignite_byte_swapper.h",
-        "kernels/ignite_plain_client.h",
-        "kernels/ignite_ssl_wrapper.h",
-        "kernels/ignite_ssl_wrapper.cc",
-        "kernels/ignite_binary_object_parser.h",
-        "kernels/ignite_binary_object_parser.cc",
-        "kernels/ignite_dataset.h",
-        "kernels/ignite_dataset.cc",
-        "kernels/ignite_dataset_iterator.h",
-        "kernels/ignite_dataset_iterator.cc",
+        "kernels/client/ignite_client.h",
+        "kernels/client/ignite_byte_swapper.h",
+        "kernels/client/ignite_plain_client.h",
+        "kernels/client/ignite_ssl_wrapper.h",
+        "kernels/client/ignite_ssl_wrapper.cc",
     ] + if_not_windows([
-        "kernels/ignite_plain_client_unix.cc",
+        "kernels/client/ignite_plain_client_unix.cc",
     ]) + if_windows([
-        "kernels/ignite_plain_client_windows.cc",
+        "kernels/client/ignite_plain_client_windows.cc",
     ]),
     copts = if_windows([
         "-DWIN32_LEAN_AND_MEAN",
     ]),
     deps = [
         "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
         "@boringssl//:ssl",
         "@protobuf_archive//:protobuf_headers",
     ],
+)
+
+cc_library(
+    name = "dataset_kernels",
+    srcs = [
+        "kernels/dataset/ignite_binary_object_parser.cc",
+        "kernels/dataset/ignite_binary_object_parser.h",
+        "kernels/dataset/ignite_dataset.cc",
+        "kernels/dataset/ignite_dataset.h",
+        "kernels/dataset/ignite_dataset_iterator.cc",
+        "kernels/dataset/ignite_dataset_iterator.h",
+        "kernels/dataset/ignite_dataset_ops.cc",
+    ],
+    deps = [
+        ":ignite_client",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "igfs_kernels",
+    srcs = [
+        "kernels/igfs/igfs.cc",
+        "kernels/igfs/igfs.h",
+        "kernels/igfs/igfs_client.cc",
+        "kernels/igfs/igfs_client.h",
+        "kernels/igfs/igfs_extended_tcp_client.cc",
+        "kernels/igfs/igfs_extended_tcp_client.h",
+        "kernels/igfs/igfs_messages.cc",
+        "kernels/igfs/igfs_messages.h",
+        "kernels/igfs/igfs_random_access_file.cc",
+        "kernels/igfs/igfs_random_access_file.h",
+        "kernels/igfs/igfs_writable_file.cc",
+        "kernels/igfs/igfs_writable_file.h",
+    ],
+    deps = [":ignite_client"],
     alwayslink = 1,
 )
 
@@ -82,10 +126,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "igfs_ops",
+    srcs = [
+        "python/ops/igfs_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":igfs_op_loader",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "gen_dataset_ops",
     out = "python/ops/gen_dataset_ops.py",
-    deps = ["//tensorflow/contrib/ignite:dataset_ops_op_lib"],
+    deps = [":dataset_ops_op_lib"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_igfs_ops",
+    out = "python/ops/gen_igfs_ops.py",
+    deps = [":igfs_ops_op_lib"],
 )
 
 tf_kernel_library(
@@ -97,13 +160,22 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
+tf_kernel_library(
+    name = "igfs_ops_kernels",
+    deps = [
+        ":igfs_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_py_library(
     name = "ignite_op_loader",
     srcs = ["python/ops/ignite_op_loader.py"],
-    dso = ["//tensorflow/contrib/ignite:_dataset_ops.so"],
+    dso = [":_ignite_ops.so"],
     kernels = [
         ":dataset_ops_kernels",
-        "//tensorflow/contrib/ignite:dataset_ops_op_lib",
+        ":dataset_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -113,6 +185,22 @@ tf_custom_op_py_library(
     ],
 )
 
+tf_custom_op_py_library(
+    name = "igfs_op_loader",
+    srcs = ["python/ops/igfs_op_loader.py"],
+    dso = [":_ignite_ops.so"],
+    kernels = [
+        ":igfs_ops_kernels",
+        ":igfs_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_igfs_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
 # The Apache Ignite servers have to setup before the test and tear down
 # after the test manually. The docker engine has to be installed.
 #
@@ -122,8 +210,11 @@ tf_custom_op_py_library(
 # To tear down Apache Ignite servers:
 # $ bash ./python/tests/stop_ignite.sh
 tf_py_test(
-    name = "ignite_dataset_test",
-    srcs = ["python/tests/ignite_dataset_test.py"],
+    name = "ignite_test",
+    srcs = [
+        "python/tests/igfs_test.py",
+        "python/tests/ignite_dataset_test.py",
+    ],
     additional_deps = [
         ":ignite",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 55c89d27996318dabb29bb15372411005301ebd9..c7db0b77e25668fb8a42d204776044420f403e44 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -1,19 +1,32 @@
-# Ignite Dataset
-
-- [Overview](#overview)
-- [Features](#features)
-  * [Distributed In-Memory Datasource](#distributed-in-memory-datasource)
-  * [Structured Objects](#structured-objects)
-  * [Distributed Training](#distributed-training)
-  * [SSL Connection](#ssl-connection)
-  * [Windows Support](#windows-support)
-- [Try it out](#try-it-out)
-- [Limitations](#limitations)
+# Apache Ignite Integration
+
+-   [Overview](#overview)
+-   [Features](#features)
+    *   [Distributed In-Memory Datasource](#distributed-in-memory-datasource)
+    *   [Structured Objects](#structured-objects)
+    *   [Distributed Training](#distributed-training)
+    *   [Distributed File System](#distributed-file-system)
+    *   [SSL Connection](#ssl-connection)
+    *   [Windows Support](#windows-support)
+-   [Try it out](#try-it-out)
+    *   [Ignite Dataset](#ignite-dataset)
+    *   [IGFS](#igfs)
+-   [Limitations](#limitations)
 
 ## Overview
 
-[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
-transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a data source for neural network training, inference and all other computations supported by TensorFlow. 
+[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed
+database, caching, and processing platform for transactional, analytical, and
+streaming workloads, delivering in-memory speeds at petabyte scale. This contrib
+package contains an integration between Apache Ignite and TensorFlow. The
+integration is based on
+[tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow
+side and
+[Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol)
+from Apache Ignite side. It allows to use Apache Ignite as a data source for
+neural network training, inference and all other computations supported by
+TensorFlow. Another part of this module is an integration with distributed file
+system based on Apache Ignite.
 
 ## Features
 
@@ -134,6 +147,23 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 
 High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
 
+### Distributed File System
+
+In addition to database functionality Apache Ignite provides a distributed file
+system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS
+delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in
+addition to its own APIs, IGFS implements Hadoop FileSystem API and can be
+transparently plugged into Hadoop or Spark deployments. This contrib package
+contains an integration between IGFS and TensorFlow. The integration is based
+on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys)
+from TensorFlow side and
+[IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache
+Ignite side. It has numerous uses, for example: * Checkpoints of state can be
+saved to IGFS for reliability and fault-tolerance. * Training processes
+communicate with TensorBoard by writing event files to a directory, which
+TensorBoard watches. IGFS allows this communication to work even when
+TensorBoard runs in a different process or machine.
+
 ### SSL Connection
 
 Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
@@ -141,9 +171,12 @@ Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikip
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
->>> dataset = IgniteDataset(cache_name="IMAGES", certfile="client.pem", cert_password="password", username="ignite", password="ignite")
->>> ...
+>>>
+>>> dataset = IgniteDataset(cache_name="IMAGES",
+                            certfile="client.pem",
+                            cert_password="password",
+                            username="ignite",
+                            password="ignite")
 ```
 
 ### Windows Support
@@ -152,7 +185,16 @@ Ignite Dataset is fully compatible with Windows. You can use it as part of Tenso
 
 ## Try it out
 
-The simplest way to try Ignite Dataset is to run a [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine:
+Following examples will help you to easily start working with this module.
+
+### Ignite Dataset
+
+The simplest way to try Ignite Dataset is to run a
+[Docker](https://www.docker.com/) container with Apache Ignite and loaded
+[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with
+it using Ignite Dataset. Such container is available on Docker Hub:
+[dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/).
+You need to start this container on your machine:
 
 ```
 docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
@@ -162,6 +204,35 @@ After that you will be able to work with it following way:
 
 ![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
 
+### IGFS
+
+The simplest way to try IGFS with TensorFlow is to run
+[Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS
+and then interruct with it using TensorFlow
+[tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container
+is available on Docker Hub:
+[dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/).
+You need to start this container on your machine:
+
+```
+docker run -it -p 10500:10500 dmitrievanthony/ignite-with-igfs
+```
+
+After that you will be able to work with it following way:
+
+```python
+>>> import tensorflow as tf
+>>> import tensorflow.contrib.ignite.python.ops.igfs_ops
+>>>
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='w') as w:
+>>>   w.write("Hello, world!")
+>>>
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r:
+>>>   print(r.read())
+
+Hello, world!
+```
+
 ## Limitations
 
 Presently, Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of an object structure.
diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
similarity index 67%
rename from tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
index 46df3e39dc4ec6dd4ef5730a184264eaa9fc5872..aac950fcc2aaf016959bbda876ac93df4baea417 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_
 
 #include <stdint.h>
 #include "tensorflow/core/platform/byte_order.h"
@@ -25,76 +25,75 @@ class ByteSwapper {
  public:
   ByteSwapper(bool big_endian) { swap_ = big_endian == port::kLittleEndian; }
 
-  inline void SwapIfRequiredInt16(int16_t *x) const {
+  void SwapIfRequiredInt16(int16_t *x) const {
     if (swap_) {
       Swap16(x);
     }
   }
 
-  inline void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
+  void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
     if (swap_) {
       Swap16(reinterpret_cast<int16_t *>(x));
     }
   }
 
-  inline void SwapIfRequiredInt32(int32_t *x) const {
+  void SwapIfRequiredInt32(int32_t *x) const {
     if (swap_) {
       Swap32(x);
     }
   }
 
-  inline void SwapIfRequiredFloat(float *x) const {
+  void SwapIfRequiredFloat(float *x) const {
     if (swap_) {
       Swap32(reinterpret_cast<int32_t *>(x));
     }
   }
 
-  inline void SwapIfRequiredInt64(int64_t *x) const {
+  void SwapIfRequiredInt64(int64_t *x) const {
     if (swap_) {
       Swap64(x);
     }
   }
 
-  inline void SwapIfRequiredDouble(double *x) const {
+  void SwapIfRequiredDouble(double *x) const {
     if (swap_) {
       Swap64(reinterpret_cast<int64_t *>(x));
     }
   }
 
-  inline void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
+  void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++) Swap16(&x[i]);
     }
   }
 
-  inline void SwapIfRequiredUnsignedInt16Arr(uint16_t *x,
-                                             int32_t length) const {
+  void SwapIfRequiredUnsignedInt16Arr(uint16_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++)
         Swap16(reinterpret_cast<int16_t *>(&x[i]));
     }
   }
 
-  inline void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
+  void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++) Swap32(&x[i]);
     }
   }
 
-  inline void SwapIfRequiredFloatArr(float *x, int32_t length) const {
+  void SwapIfRequiredFloatArr(float *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++)
         Swap32(reinterpret_cast<int32_t *>(&x[i]));
     }
   }
 
-  inline void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
+  void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++) Swap64(&x[i]);
     }
   }
 
-  inline void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
+  void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++)
         Swap64(reinterpret_cast<int64_t *>(&x[i]));
@@ -102,16 +101,16 @@ class ByteSwapper {
   }
 
  private:
-  inline void Swap16(int16_t *x) const {
+  void Swap16(int16_t *x) const {
     *x = ((*x & 0xFF) << 8) | ((*x >> 8) & 0xFF);
   }
 
-  inline void Swap32(int32_t *x) const {
+  void Swap32(int32_t *x) const {
     *x = ((*x & 0xFF) << 24) | (((*x >> 8) & 0xFF) << 16) |
          (((*x >> 16) & 0xFF) << 8) | ((*x >> 24) & 0xFF);
   }
 
-  inline void Swap64(int64_t *x) const {
+  void Swap64(int64_t *x) const {
     *x = ((*x & 0xFF) << 56) | (((*x >> 8) & 0xFF) << 48) |
          (((*x >> 16) & 0xFF) << 40) | (((*x >> 24) & 0xFF) << 32) |
          (((*x >> 32) & 0xFF) << 24) | (((*x >> 40) & 0xFF) << 16) |
@@ -123,4 +122,4 @@ class ByteSwapper {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_client.h
similarity index 74%
rename from tensorflow/contrib/ignite/kernels/ignite_client.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_client.h
index 459b50b48fd95ad105bccaca4076160e0ef152ee..0da80769260d065c4ac6601c0e5cd7050b6b61cb 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_client.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_client.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -32,44 +32,44 @@ class Client {
   virtual Status ReadData(uint8_t *buf, const int32_t length) = 0;
   virtual Status WriteData(const uint8_t *buf, const int32_t length) = 0;
 
-  inline Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
+  Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
 
-  inline Status ReadShort(int16_t *data) {
+  Status ReadShort(int16_t *data) {
     TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 2));
     byte_swapper_.SwapIfRequiredInt16(data);
 
     return Status::OK();
   }
 
-  inline Status ReadInt(int32_t *data) {
+  Status ReadInt(int32_t *data) {
     TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 4));
     byte_swapper_.SwapIfRequiredInt32(data);
 
     return Status::OK();
   }
 
-  inline Status ReadLong(int64_t *data) {
+  Status ReadLong(int64_t *data) {
     TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 8));
     byte_swapper_.SwapIfRequiredInt64(data);
 
     return Status::OK();
   }
 
-  inline Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
+  Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
 
-  inline Status WriteShort(const int16_t data) {
+  Status WriteShort(const int16_t data) {
     int16_t tmp = data;
     byte_swapper_.SwapIfRequiredInt16(&tmp);
     return WriteData((uint8_t *)&tmp, 2);
   }
 
-  inline Status WriteInt(const int32_t data) {
+  Status WriteInt(const int32_t data) {
     int32_t tmp = data;
     byte_swapper_.SwapIfRequiredInt32(&tmp);
     return WriteData((uint8_t *)&tmp, 4);
   }
 
-  inline Status WriteLong(const int64_t data) {
+  Status WriteLong(const int64_t data) {
     int64_t tmp = data;
     byte_swapper_.SwapIfRequiredInt64(&tmp);
     return WriteData((uint8_t *)&tmp, 8);
@@ -81,4 +81,4 @@ class Client {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
similarity index 80%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
index 75424c19ee4b7df5378aa23cb41db1752e8d0651..546583246042855d179ebbb18b7dca711063b3f4 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
 
 namespace tensorflow {
 
@@ -40,4 +40,4 @@ class PlainClient : public Client {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
index cf672942c61e1239332711db12e62088737c4f41..54efb5b61761708a28dd031b8321ffba9a53ffa9 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 #include <arpa/inet.h>
 #include <netdb.h>
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
index dad5aace5fabe1df58bb9579bf578f4c35324315..a99a3ada558e51c13ed47eb72911eb5862e71a60 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
index ceb479b0846574a35d86002ebb9c3e8e1d3687ac..8f09c24a3bedda524264f30282a0ad019d515540 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h"
 
 #include <openssl/err.h>
 #include <openssl/ssl.h>
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
similarity index 82%
rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
index 0406644bbaab3de816540ce85e84b489ea9fff12..543e03d1efc3ff186c9db399af18f7aa8ad2c450 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
 
 #include <openssl/ssl.h>
 
@@ -48,4 +48,4 @@ class SslWrapper : public Client {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
similarity index 99%
rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
index 2c8a7d44b07b43f788bcbc0850b5162cc14dd951..4218ec05f2c3486dd91e2188b674e01d6aadaa2b 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
similarity index 87%
rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
index eb1f856643a790de6acaa82d4b8ad894fd364376..3e8a1a19623fab3e027db16228e0228e8ec4989a 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_
 
 #include <vector>
-#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -78,4 +78,4 @@ enum ObjectType {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
index c4a7d3c513a796c9d95b371bedc609fd75188817..ace96e7b09fcf314757367baed66f622b294e43c 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
similarity index 91%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
index 66bfdf2e2a168e59cd2fec8e2ac5b8fd482d5c15..db3bafb11f2a0047c22ece6d2bc1722afaa5ffdf 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_
 
 #include "tensorflow/core/framework/dataset.h"
 
@@ -60,4 +60,4 @@ class IgniteDataset : public DatasetBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
index 5da9127aa6a3a4bc16347e6890cc1ba44406c0d5..ce8972f1e7fd59235556cb9514011f0b836077de 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h"
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
similarity index 87%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
index c499e2c9ccfac5c15db08c8fd8b26c37aa0404f3..5868c2cb67f9d5c91654db8cf4bb4bbc072fc1ac 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
@@ -96,4 +96,4 @@ constexpr int32_t kMinResLength = 12;
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
index f75b1c5ff55ca9ee493148ff79c2edd4b15ac42a..f2108775e29b53765138dcd971bec89d7a10ce40 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <stdlib.h>
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae2dbcc2cf5d0ae7e09a26a199dc0c3c80fe22c1
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc
@@ -0,0 +1,331 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
+
+namespace tensorflow {
+
+static string GetEnvOrElse(const string &env, string default_value) {
+  const char *env_c_str = env.c_str();
+  return getenv(env_c_str) != nullptr ? getenv(env_c_str) : default_value;
+}
+
+static string MakeRelative(const string &a, const string &b) {
+  string max = a;
+  string min = b;
+  bool first = b.size() > a.size();
+
+  if (first) {
+    max = b;
+    min = a;
+  }
+
+  auto r = mismatch(min.begin(), min.end(), max.begin());
+  return string((first ? r.first : r.second), first ? min.end() : max.end());
+}
+
+string IGFS::TranslateName(const string &name) const {
+  StringPiece scheme, namenode, path;
+  io::ParseURI(name, &scheme, &namenode, &path);
+  return string(path.data(), path.length());
+}
+
+IGFS::IGFS()
+    : host_(GetEnvOrElse("IGFS_HOST", "localhost")),
+      port_([] {
+        int port;
+        if (strings::safe_strto32(GetEnvOrElse("IGFS_PORT", "10500").c_str(),
+                                  &port)) {
+          return port;
+        } else {
+          LOG(WARNING)
+              << "IGFS_PORT environment variable had an invalid value: "
+              << getenv("IGFS_PORT") << "\nUsing default port 10500.";
+          return 10500;
+        }
+      }()),
+      fs_name_(GetEnvOrElse("IGFS_FS_NAME", "default_fs")) {
+  LOG(INFO) << "IGFS created [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+}
+
+IGFS::~IGFS() {
+  LOG(INFO) << "IGFS destroyed [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+}
+
+Status IGFS::NewRandomAccessFile(const string &file_name,
+                                 std::unique_ptr<RandomAccessFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<OpenReadResponse> open_read_response(true);
+  TF_RETURN_IF_ERROR(client->OpenRead(&open_read_response, path));
+
+  int64 resource_id = open_read_response.res.stream_id;
+  result->reset(new IGFSRandomAccessFile(path, resource_id, std::move(client)));
+
+  LOG(INFO) << "New random access file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewWritableFile(const string &file_name,
+                             std::unique_ptr<WritableFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (exists_response.res.exists) {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+  }
+
+  CtrlResponse<OpenCreateResponse> open_create_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenCreate(&open_create_resp, path));
+
+  int64 resource_id = open_create_resp.res.stream_id;
+  result->reset(new IGFSWritableFile(path, resource_id, std::move(client)));
+
+  LOG(INFO) << "New writable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewAppendableFile(const string &file_name,
+                               std::unique_ptr<WritableFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, file_name));
+
+  if (exists_response.res.exists) {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, file_name, false));
+  }
+
+  CtrlResponse<OpenAppendResponse> open_append_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenAppend(&open_append_resp, file_name));
+
+  result->reset(new IGFSWritableFile(TranslateName(file_name),
+                                     open_append_resp.res.stream_id,
+                                     std::move(client)));
+
+  LOG(INFO) << "New appendable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewReadOnlyMemoryRegionFromFile(
+    const string &file_name, std::unique_ptr<ReadOnlyMemoryRegion> *result) {
+  return errors::Unimplemented("IGFS does not support ReadOnlyMemoryRegion");
+}
+
+Status IGFS::FileExists(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  const string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (!exists_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "File exists completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetChildren(const string &file_name, std::vector<string> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+  path = path + "/";
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ListPathsResponse> list_paths_response(false);
+  TF_RETURN_IF_ERROR(client->ListPaths(&list_paths_response, path));
+
+  *result = std::vector<string>();
+  std::vector<IGFSPath> entries = list_paths_response.res.entries;
+
+  for (IGFSPath &value : entries)
+    result->push_back(MakeRelative(value.path, path));
+
+  LOG(INFO) << "Get children completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetMatchingPaths(const string &pattern,
+                              std::vector<string> *results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
+Status IGFS::DeleteFile(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<DeleteResponse> del_response(false);
+  TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+
+  if (!del_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "Delete file completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::CreateDir(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  const string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<MakeDirectoriesResponse> mkdir_response(false);
+  TF_RETURN_IF_ERROR(client->MkDir(&mkdir_response, path));
+
+  if (!mkdir_response.res.successful)
+    return errors::Unknown("Can't create directory ", path);
+
+  LOG(INFO) << "Create dir completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::DeleteDir(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ListFilesResponse> list_files_response(false);
+  TF_RETURN_IF_ERROR(client->ListFiles(&list_files_response, path));
+
+  if (!list_files_response.res.entries.empty()) {
+    return errors::FailedPrecondition("Can't delete a non-empty directory");
+  } else {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, true));
+  }
+
+  LOG(INFO) << "Delete dir completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetFileSize(const string &file_name, uint64 *size) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<InfoResponse> info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  *size = info_response.res.file_info.length;
+
+  LOG(INFO) << "Get file size completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::RenameFile(const string &src, const string &dst) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string src_path = TranslateName(src);
+  string dst_path = TranslateName(dst);
+
+  if (FileExists(dst).ok()) DeleteFile(dst);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<RenameResponse> rename_response(false);
+  TF_RETURN_IF_ERROR(client->Rename(&rename_response, src_path, dst_path));
+
+  if (!rename_response.res.successful)
+    return errors::NotFound("File ", src_path, " not found");
+
+  LOG(INFO) << "Rename file completed successful [src=" << src
+            << ", dst=" << dst << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::Stat(const string &file_name, FileStatistics *stats) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<InfoResponse> info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  IGFSFile info = info_response.res.file_info;
+
+  *stats = FileStatistics(info.length, info.modification_time * 1000000,
+                          (info.flags & 0x1) != 0);
+
+  LOG(INFO) << "Stat completed successful [file_name=" << file_name << "]";
+
+  return Status::OK();
+}
+
+std::unique_ptr<IGFSClient> IGFS::CreateClient() const {
+  return std::unique_ptr<IGFSClient>(
+      new IGFSClient(host_, port_, fs_name_, ""));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs.h b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c347e937f75e8eea108811e6a3189412e22a982
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFS : public FileSystem {
+ public:
+  IGFS();
+  ~IGFS();
+  Status NewRandomAccessFile(
+      const string& file_name,
+      std::unique_ptr<RandomAccessFile>* result) override;
+  Status NewWritableFile(const string& fname,
+                         std::unique_ptr<WritableFile>* result) override;
+  Status NewAppendableFile(const string& fname,
+                           std::unique_ptr<WritableFile>* result) override;
+  Status NewReadOnlyMemoryRegionFromFile(
+      const string& fname,
+      std::unique_ptr<ReadOnlyMemoryRegion>* result) override;
+  Status FileExists(const string& fname) override;
+  Status GetChildren(const string& dir, std::vector<string>* result) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+  Status DeleteFile(const string& fname) override;
+  Status CreateDir(const string& name) override;
+  Status DeleteDir(const string& name) override;
+  Status GetFileSize(const string& fname, uint64* size) override;
+  Status RenameFile(const string& src, const string& target) override;
+  Status Stat(const string& fname, FileStatistics* stat) override;
+  string TranslateName(const string& name) const override;
+
+ private:
+  std::unique_ptr<IGFSClient> CreateClient() const;
+
+  const string host_;
+  const int port_;
+  const string fs_name_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f97c34fdd8b026a04506fd0ef9f3cc74129a9da
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+
+namespace tensorflow {
+
+IGFSClient::IGFSClient(const string &host, int port, const string &fs_name,
+                       const string &user_name)
+    : fs_name_(fs_name),
+      user_name_(user_name),
+      client_(ExtendedTCPClient(host, port, true)) {
+  client_.Connect();
+}
+
+IGFSClient::~IGFSClient() { client_.Disconnect(); }
+
+Status IGFSClient::SendRequestGetResponse(const Request &request,
+                                          Response *response) {
+  TF_RETURN_IF_ERROR(request.Write(&client_));
+  client_.reset();
+
+  if (response != nullptr) {
+    TF_RETURN_IF_ERROR(response->Read(&client_));
+    client_.reset();
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbec7b000779be8772e850a556affffa1b3b6803
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+class IGFSClient {
+ public:
+  IGFSClient(const string &host, int port, const string &fs_name,
+             const string &user_name);
+  ~IGFSClient();
+
+  Status Handshake(CtrlResponse<HandshakeResponse> *res) {
+    return SendRequestGetResponse(HandshakeRequest(fs_name_, {}), res);
+  }
+
+  Status ListFiles(CtrlResponse<ListFilesResponse> *res, const string &path) {
+    return SendRequestGetResponse(ListFilesRequest(user_name_, path), res);
+  }
+
+  Status ListPaths(CtrlResponse<ListPathsResponse> *res, const string &path) {
+    return SendRequestGetResponse(ListPathsRequest(user_name_, path), res);
+  }
+
+  Status Info(CtrlResponse<InfoResponse> *res, const string &path) {
+    return SendRequestGetResponse(InfoRequest(user_name_, path), res);
+  }
+
+  Status OpenCreate(CtrlResponse<OpenCreateResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenCreateRequest(user_name_, path), res);
+  }
+
+  Status OpenAppend(CtrlResponse<OpenAppendResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenAppendRequest(user_name_, path), res);
+  }
+
+  Status OpenRead(CtrlResponse<OpenReadResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenReadRequest(user_name_, path), res);
+  }
+
+  Status Exists(CtrlResponse<ExistsResponse> *res, const string &path) {
+    return SendRequestGetResponse(ExistsRequest(user_name_, path), res);
+  }
+
+  Status MkDir(CtrlResponse<MakeDirectoriesResponse> *res, const string &path) {
+    return SendRequestGetResponse(MakeDirectoriesRequest(user_name_, path),
+                                  res);
+  }
+
+  Status Delete(CtrlResponse<DeleteResponse> *res, const string &path,
+                bool recursive) {
+    return SendRequestGetResponse(DeleteRequest(user_name_, path, recursive),
+                                  res);
+  }
+
+  Status WriteBlock(int64_t stream_id, const uint8_t *data, int32_t len) {
+    return SendRequestGetResponse(WriteBlockRequest(stream_id, data, len),
+                                  nullptr);
+  }
+
+  Status ReadBlock(ReadBlockCtrlResponse *res, int64_t stream_id, int64_t pos,
+                   int32_t length) {
+    return SendRequestGetResponse(ReadBlockRequest(stream_id, pos, length),
+                                  res);
+  }
+
+  Status Close(CtrlResponse<CloseResponse> *res, int64_t stream_id) {
+    return SendRequestGetResponse(CloseRequest(stream_id), res);
+  }
+
+  Status Rename(CtrlResponse<RenameResponse> *res, const string &source,
+                const string &dest) {
+    return SendRequestGetResponse(RenameRequest(user_name_, source, dest), res);
+  }
+
+ private:
+  Status SendRequestGetResponse(const Request &request, Response *response);
+
+  const string fs_name_;
+  const string user_name_;
+  ExtendedTCPClient client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea63436546d8b244b921206f9577c91b6578a775
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+ExtendedTCPClient::ExtendedTCPClient(const string &host, int port,
+                                     bool big_endian)
+    : PlainClient(host, port, big_endian), pos_(0) {}
+
+Status ExtendedTCPClient::ReadData(uint8_t *buf, const int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::ReadData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteData(const uint8_t *buf, const int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::WriteData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::Ignore(int n) {
+  uint8_t buf[n];
+  return ReadData(buf, n);
+}
+
+Status ExtendedTCPClient::SkipToPos(int target_pos) {
+  return Ignore(std::max(0, target_pos - pos_));
+}
+
+Status ExtendedTCPClient::ReadBool(bool *res) {
+  uint8_t buf = 0;
+  TF_RETURN_IF_ERROR(ReadData(&buf, 1));
+  *res = buf != 0;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadNullableString(string *res) {
+  bool is_empty = false;
+  TF_RETURN_IF_ERROR(ReadBool(&is_empty));
+
+  if (!is_empty) {
+    TF_RETURN_IF_ERROR(ReadString(res));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadString(string *res) {
+  int16_t length;
+  TF_RETURN_IF_ERROR(ReadShort(&length));
+
+  uint8_t *buf = new uint8_t[length];
+  Status status = ReadData(buf, length);
+
+  if (status.ok()) res->assign(reinterpret_cast<char *>(buf), length);
+
+  delete[] buf;
+  return status;
+}
+
+Status ExtendedTCPClient::ReadStringMap(std::map<string, string> *res) {
+  int size;
+  TF_RETURN_IF_ERROR(ReadInt(&size));
+
+  for (int i = 0; i < size; i++) {
+    string key;
+    string val;
+    TF_RETURN_IF_ERROR(ReadString(&key));
+    TF_RETURN_IF_ERROR(ReadString(&val));
+
+    res->insert(std::pair<string, string>(std::move(key), std::move(val)));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteSize(std::map<string, string>::size_type s) {
+  return WriteInt(s);
+}
+
+Status ExtendedTCPClient::FillWithZerosUntil(int n) {
+  int to_skip = std::max(0, n - pos_);
+
+  for (int i = 0; i < to_skip; i++) {
+    TF_RETURN_IF_ERROR(WriteByte(0));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteBool(bool val) {
+  return WriteByte((char)(val ? 1 : 0));
+}
+
+Status ExtendedTCPClient::WriteString(string str) {
+  if (!str.empty()) {
+    TF_RETURN_IF_ERROR(WriteBool(false));
+    size_t l = str.length();
+    if (l > std::numeric_limits<int16_t>::max())
+      return errors::InvalidArgument("String is too long");
+
+    TF_RETURN_IF_ERROR(WriteShort(l));
+    TF_RETURN_IF_ERROR(WriteData(reinterpret_cast<const uint8_t *>(str.c_str()),
+                                 str.length()));
+  } else {
+    TF_RETURN_IF_ERROR(WriteBool(true));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteStringMap(std::map<string, string> map) {
+  std::map<string, string>::size_type size = map.size();
+  TF_RETURN_IF_ERROR(WriteSize(size));
+
+  for (auto &x : map) {
+    TF_RETURN_IF_ERROR(WriteString(x.first));
+    TF_RETURN_IF_ERROR(WriteString(x.second));
+  }
+
+  return Status::OK();
+}
+
+void ExtendedTCPClient::reset() { pos_ = 0; }
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5de342fd0c20cf5b01b647756797631b8a3f203
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
+
+namespace tensorflow {
+
+class ExtendedTCPClient : public PlainClient {
+ public:
+  ExtendedTCPClient(const string &host, int port, bool big_endian);
+  Status ReadData(uint8_t *buf, const int32_t length) override;
+  Status WriteData(const uint8_t *buf, const int32_t length) override;
+  Status Ignore(int n);
+  Status SkipToPos(int target_pos);
+  Status ReadBool(bool *res);
+  Status ReadNullableString(string *res);
+  Status ReadString(string *res);
+  Status ReadStringMap(std::map<string, string> *res);
+  Status WriteSize(std::map<string, string>::size_type s);
+  Status FillWithZerosUntil(int n);
+  Status WriteBool(bool val);
+  Status WriteString(string str);
+  Status WriteStringMap(std::map<string, string> map);
+  void reset();
+
+ private:
+  int pos_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c63f40f35fa53bc51c44f574df50ad0c79fba91
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
@@ -0,0 +1,344 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+Status IGFSPath::Read(ExtendedTCPClient *client) {
+  return client->ReadNullableString(&path);
+}
+
+Status IGFSFile::Read(ExtendedTCPClient *client) {
+  int32_t block_size;
+  int64_t group_block_size;
+  std::map<string, string> properties = {};
+  int64_t access_time;
+
+  bool has_path;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_path));
+  if (has_path) {
+    IGFSPath path = {};
+    TF_RETURN_IF_ERROR(path.Read(client));
+  }
+
+  TF_RETURN_IF_ERROR(client->ReadInt(&block_size));
+  TF_RETURN_IF_ERROR(client->ReadLong(&group_block_size));
+  TF_RETURN_IF_ERROR(client->ReadLong(&length));
+  TF_RETURN_IF_ERROR(client->ReadStringMap(&properties));
+  TF_RETURN_IF_ERROR(client->ReadLong(&access_time));
+  TF_RETURN_IF_ERROR(client->ReadLong(&modification_time));
+  TF_RETURN_IF_ERROR(client->ReadByte(&flags));
+
+  return Status::OK();
+}
+
+Request::Request(int32_t command_id) : command_id_(command_id) {}
+
+Status Request::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(client->WriteByte(0));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8));
+  TF_RETURN_IF_ERROR(client->WriteInt(command_id_));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(24));
+
+  return Status::OK();
+}
+
+Status Response::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->Ignore(1));
+  TF_RETURN_IF_ERROR(client->SkipToPos(8));
+  TF_RETURN_IF_ERROR(client->ReadInt(&req_id));
+  TF_RETURN_IF_ERROR(client->SkipToPos(24));
+  TF_RETURN_IF_ERROR(client->ReadInt(&res_type));
+
+  bool has_error;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_error));
+
+  if (has_error) {
+    int32_t error_code;
+    string error_msg;
+    TF_RETURN_IF_ERROR(client->ReadString(&error_msg));
+    TF_RETURN_IF_ERROR(client->ReadInt(&error_code));
+
+    return errors::Unknown("Error [code=", error_code, ", message=\"",
+                           error_msg, "\"]");
+  }
+
+  TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + 5));
+  TF_RETURN_IF_ERROR(client->ReadInt(&length));
+  TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + response_header_size_));
+
+  return Status::OK();
+}
+
+PathCtrlRequest::PathCtrlRequest(int32_t command_id_, const string &user_name,
+                                 const string &path,
+                                 const string &destination_path, bool flag,
+                                 bool collocate,
+                                 const std::map<string, string> &properties)
+    : Request(command_id_),
+      user_name_(user_name),
+      path_(path),
+      destination_path_(destination_path),
+      flag_(flag),
+      collocate_(collocate),
+      props_(properties) {}
+
+Status PathCtrlRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(Request::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteString(user_name_));
+  TF_RETURN_IF_ERROR(WritePath(client, path_));
+  TF_RETURN_IF_ERROR(WritePath(client, destination_path_));
+  TF_RETURN_IF_ERROR(client->WriteBool(flag_));
+  TF_RETURN_IF_ERROR(client->WriteBool(collocate_));
+  TF_RETURN_IF_ERROR(client->WriteStringMap(props_));
+
+  return Status::OK();
+}
+
+Status PathCtrlRequest::WritePath(ExtendedTCPClient *client,
+                                  const string &path) const {
+  TF_RETURN_IF_ERROR(client->WriteBool(!path.empty()));
+  if (!path.empty()) TF_RETURN_IF_ERROR(client->WriteString(path));
+
+  return Status::OK();
+}
+
+Status StreamCtrlRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(client->WriteByte(0));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8));
+  TF_RETURN_IF_ERROR(client->WriteInt(command_id_));
+  TF_RETURN_IF_ERROR(client->WriteLong(stream_id_));
+  TF_RETURN_IF_ERROR(client->WriteInt(length_));
+
+  return Status::OK();
+}
+
+StreamCtrlRequest::StreamCtrlRequest(int32_t command_id_, int64_t stream_id,
+                                     int32_t length)
+    : Request(command_id_), stream_id_(stream_id), length_(length) {}
+
+DeleteRequest::DeleteRequest(const string &user_name, const string &path,
+                             bool flag)
+    : PathCtrlRequest(DELETE_ID, user_name, path, {}, flag, true, {}) {}
+
+Status DeleteResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&exists));
+
+  return Status::OK();
+}
+
+ExistsRequest::ExistsRequest(const string &user_name, const string &path)
+    : PathCtrlRequest(EXISTS_ID, user_name, path, {}, false, true, {}) {}
+
+Status ExistsResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&exists));
+
+  return Status::OK();
+}
+
+HandshakeRequest::HandshakeRequest(const string &fs_name, const string &log_dir)
+    : Request(HANDSHAKE_ID), fs_name_(fs_name), log_dir_(log_dir) {}
+
+Status HandshakeRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(Request::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteString(fs_name_));
+  TF_RETURN_IF_ERROR(client->WriteString(log_dir_));
+
+  return Status::OK();
+}
+
+Status HandshakeResponse::Read(ExtendedTCPClient *client) {
+  int64_t block_size;
+  bool sampling;
+
+  TF_RETURN_IF_ERROR(client->ReadNullableString(&fs_name));
+  TF_RETURN_IF_ERROR(client->ReadLong(&block_size));
+
+  bool has_sampling_;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_sampling_));
+
+  if (has_sampling_) {
+    TF_RETURN_IF_ERROR(client->ReadBool(&sampling));
+  }
+
+  return Status::OK();
+}
+
+ListRequest::ListRequest(int32_t command_id_, const string &user_name,
+                         const string &path)
+    : PathCtrlRequest(command_id_, user_name, path, {}, false, true, {}) {}
+
+ListFilesRequest::ListFilesRequest(const string &user_name, const string &path)
+    : ListRequest(LIST_FILES_ID, user_name, path) {}
+
+ListPathsRequest::ListPathsRequest(const string &user_name, const string &path)
+    : ListRequest(LIST_PATHS_ID, user_name, path) {}
+
+OpenCreateRequest::OpenCreateRequest(const string &user_name,
+                                     const string &path)
+    : PathCtrlRequest(OPEN_CREATE_ID, user_name, path, {}, false, true, {}) {}
+
+Status OpenCreateRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteInt(replication_));
+  TF_RETURN_IF_ERROR(client->WriteLong(blockSize_));
+
+  return Status::OK();
+}
+
+Status OpenCreateResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+
+  return Status::OK();
+}
+
+OpenAppendRequest::OpenAppendRequest(const string &user_name,
+                                     const string &path)
+    : PathCtrlRequest(OPEN_APPEND_ID, user_name, path, {}, false, true, {}) {}
+
+Status OpenAppendRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  return Status::OK();
+}
+
+Status OpenAppendResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+
+  return Status::OK();
+}
+
+OpenReadRequest::OpenReadRequest(const string &user_name, const string &path,
+                                 bool flag,
+                                 int32_t sequential_reads_before_prefetch)
+    : PathCtrlRequest(OPEN_READ_ID, user_name, path, {}, flag, true, {}),
+      sequential_reads_before_prefetch_(sequential_reads_before_prefetch) {}
+
+OpenReadRequest::OpenReadRequest(const string &user_name, const string &path)
+    : OpenReadRequest(user_name, path, false, 0) {}
+
+Status OpenReadRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  if (flag_) {
+    TF_RETURN_IF_ERROR(client->WriteInt(sequential_reads_before_prefetch_));
+  }
+
+  return Status::OK();
+}
+
+Status OpenReadResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+  TF_RETURN_IF_ERROR(client->ReadLong(&length));
+
+  return Status::OK();
+}
+
+InfoRequest::InfoRequest(const string &user_name, const string &path)
+    : PathCtrlRequest(INFO_ID, user_name, path, {}, false, true, {}) {}
+
+Status InfoResponse::Read(ExtendedTCPClient *client) {
+  file_info = IGFSFile();
+  TF_RETURN_IF_ERROR(file_info.Read(client));
+
+  return Status::OK();
+}
+
+MakeDirectoriesRequest::MakeDirectoriesRequest(const string &user_name,
+                                               const string &path)
+    : PathCtrlRequest(MKDIR_ID, user_name, path, {}, false, true, {}) {}
+
+Status MakeDirectoriesResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+CloseRequest::CloseRequest(int64_t streamId)
+    : StreamCtrlRequest(CLOSE_ID, streamId, 0) {}
+
+Status CloseResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+ReadBlockRequest::ReadBlockRequest(int64_t stream_id, int64_t pos,
+                                   int32_t length)
+    : StreamCtrlRequest(READ_BLOCK_ID, stream_id, length), pos(pos) {}
+
+Status ReadBlockRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteLong(pos));
+
+  return Status::OK();
+}
+
+Status ReadBlockResponse::Read(ExtendedTCPClient *client, int32_t length,
+                               uint8_t *dst) {
+  TF_RETURN_IF_ERROR(client->ReadData(dst, length));
+  successfully_read = length;
+
+  return Status::OK();
+}
+
+Status ReadBlockResponse::Read(ExtendedTCPClient *client) {
+  return Status::OK();
+}
+
+std::streamsize ReadBlockResponse::GetSuccessfullyRead() {
+  return successfully_read;
+}
+
+ReadBlockCtrlResponse::ReadBlockCtrlResponse(uint8_t *dst)
+    : CtrlResponse(false), dst(dst) {}
+
+Status ReadBlockCtrlResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(Response::Read(client));
+
+  res = ReadBlockResponse();
+  TF_RETURN_IF_ERROR(res.Read(client, length, dst));
+
+  return Status::OK();
+}
+
+WriteBlockRequest::WriteBlockRequest(int64_t stream_id, const uint8_t *data,
+                                     int32_t length)
+    : StreamCtrlRequest(WRITE_BLOCK_ID, stream_id, length), data(data) {}
+
+Status WriteBlockRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client));
+  TF_RETURN_IF_ERROR(client->WriteData((uint8_t *)data, length_));
+
+  return Status::OK();
+}
+
+RenameRequest::RenameRequest(const string &user_name, const string &path,
+                             const string &destination_path)
+    : PathCtrlRequest(RENAME_ID, user_name, path, destination_path, false, true,
+                      {}) {}
+
+Status RenameResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
new file mode 100644
index 0000000000000000000000000000000000000000..44a2928a2b2b48849c7ba4454e0e7848c2217b3b
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
@@ -0,0 +1,356 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+enum CommandId {
+  HANDSHAKE_ID = 0,
+  EXISTS_ID = 2,
+  INFO_ID = 3,
+  RENAME_ID = 6,
+  DELETE_ID = 7,
+  MKDIR_ID = 8,
+  LIST_PATHS_ID = 9,
+  LIST_FILES_ID = 10,
+  OPEN_READ_ID = 13,
+  OPEN_APPEND_ID = 14,
+  OPEN_CREATE_ID = 15,
+  CLOSE_ID = 16,
+  READ_BLOCK_ID = 17,
+  WRITE_BLOCK_ID = 18,
+};
+
+class IGFSPath {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  string path;
+};
+
+class IGFSFile {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t length;
+  int64_t modification_time;
+  uint8_t flags;
+};
+
+class Request {
+ public:
+  Request(int32_t command_id);
+  virtual Status Write(ExtendedTCPClient *client) const;
+
+ protected:
+  const int32_t command_id_;
+};
+
+class Response {
+ public:
+  virtual Status Read(ExtendedTCPClient *client);
+
+  int32_t res_type;
+  int32_t req_id;
+  int32_t length;
+
+ protected:
+  static const int32_t header_size_ = 24;
+  static const int32_t response_header_size_ = 9;
+};
+
+class PathCtrlRequest : public Request {
+ public:
+  PathCtrlRequest(int32_t command_id, const string &user_name,
+                  const string &path, const string &destination_path, bool flag,
+                  bool collocate, const std::map<string, string> &properties);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  Status WritePath(ExtendedTCPClient *client, const string &path) const;
+
+  const string user_name_;
+  const string path_;
+  const string destination_path_;
+  const bool flag_;
+  const bool collocate_;
+  const std::map<string, string> props_;
+};
+
+class StreamCtrlRequest : public Request {
+ public:
+  StreamCtrlRequest(int32_t command_id, int64_t stream_id, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  int64_t stream_id_;
+  int32_t length_;
+};
+
+template <class R>
+class CtrlResponse : public Response {
+ public:
+  CtrlResponse(bool optional) : optional_(optional) {}
+  Status Read(ExtendedTCPClient *client) override {
+    TF_RETURN_IF_ERROR(Response::Read(client));
+
+    if (optional_) {
+      TF_RETURN_IF_ERROR(client->ReadBool(&has_content));
+
+      if (!has_content) return Status::OK();
+    }
+
+    res = R();
+    has_content = true;
+    TF_RETURN_IF_ERROR(res.Read(client));
+
+    return Status::OK();
+  }
+
+  R res;
+  bool has_content;
+
+ private:
+  bool optional_;
+};
+
+template <class T>
+class ListResponse {
+ public:
+  Status Read(ExtendedTCPClient *client) {
+    int32_t len;
+    TF_RETURN_IF_ERROR(client->ReadInt(&len));
+
+    entries.clear();
+
+    for (int32_t i = 0; i < len; i++) {
+      T f = {};
+      TF_RETURN_IF_ERROR(f.Read(client));
+      entries.push_back(f);
+    }
+
+    return Status::OK();
+  }
+
+  std::vector<T> entries;
+};
+
+class DeleteRequest : public PathCtrlRequest {
+ public:
+  DeleteRequest(const string &user_name, const string &path, bool flag);
+};
+
+class DeleteResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool exists;
+};
+
+class ExistsRequest : public PathCtrlRequest {
+ public:
+  explicit ExistsRequest(const string &user_name, const string &path);
+};
+
+class ExistsResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool exists;
+};
+
+class HandshakeRequest : public Request {
+ public:
+  HandshakeRequest(const string &fs_name, const string &log_dir);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  string fs_name_;
+  string log_dir_;
+};
+
+class HandshakeResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  string fs_name;
+};
+
+class ListRequest : public PathCtrlRequest {
+ public:
+  explicit ListRequest(int32_t command_id, const string &user_name,
+                       const string &path);
+};
+
+class ListFilesRequest : public ListRequest {
+ public:
+  ListFilesRequest(const string &user_name, const string &path);
+};
+
+class ListFilesResponse : public ListResponse<IGFSFile> {};
+
+class ListPathsRequest : public ListRequest {
+ public:
+  ListPathsRequest(const string &user_name, const string &path);
+};
+
+class ListPathsResponse : public ListResponse<IGFSPath> {};
+
+class OpenCreateRequest : public PathCtrlRequest {
+ public:
+  OpenCreateRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  int32_t replication_;
+  int64_t blockSize_;
+};
+
+class OpenCreateResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
+};
+
+class OpenAppendRequest : public PathCtrlRequest {
+ public:
+  explicit OpenAppendRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+};
+
+class OpenAppendResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
+};
+
+class OpenReadRequest : public PathCtrlRequest {
+ public:
+  OpenReadRequest(const string &user_name, const string &path, bool flag,
+                  int32_t seqReadsBeforePrefetch);
+  OpenReadRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  /** Sequential reads before prefetch. */
+  int32_t sequential_reads_before_prefetch_;
+};
+
+class OpenReadResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
+  int64_t length;
+};
+
+class InfoRequest : public PathCtrlRequest {
+ public:
+  InfoRequest(const string &user_name, const string &path);
+};
+
+class InfoResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  IGFSFile file_info;
+};
+
+class MakeDirectoriesRequest : public PathCtrlRequest {
+ public:
+  MakeDirectoriesRequest(const string &userName, const string &path);
+};
+
+class MakeDirectoriesResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool successful;
+};
+
+/** Stream control requests. **/
+
+class CloseRequest : public StreamCtrlRequest {
+ public:
+  explicit CloseRequest(int64_t stream_id);
+};
+
+class CloseResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool successful;
+};
+
+class ReadBlockRequest : public StreamCtrlRequest {
+ public:
+  ReadBlockRequest(int64_t stream_id, int64_t pos, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  int64_t pos;
+};
+
+class ReadBlockResponse {
+ public:
+  Status Read(ExtendedTCPClient *client, int32_t length, uint8_t *dst);
+  Status Read(ExtendedTCPClient *client);
+  std::streamsize GetSuccessfullyRead();
+
+ private:
+  int32_t length;
+  std::streamsize successfully_read;
+};
+
+class ReadBlockCtrlResponse : public CtrlResponse<ReadBlockResponse> {
+ public:
+  ReadBlockCtrlResponse(uint8_t *dst);
+  Status Read(ExtendedTCPClient *client) override;
+
+ private:
+  uint8_t *dst;
+};
+
+class WriteBlockRequest : public StreamCtrlRequest {
+ public:
+  WriteBlockRequest(int64_t stream_id, const uint8_t *data, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  const uint8_t *data;
+};
+
+class RenameRequest : public PathCtrlRequest {
+ public:
+  RenameRequest(const string &user_name, const string &path,
+                const string &destination_path);
+};
+
+class RenameResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool successful;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4c898f14e6d298e65f563f4493a822172c40851
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+IGFSRandomAccessFile::IGFSRandomAccessFile(const string &file_name,
+                                           int64_t resource_id,
+                                           std::unique_ptr<IGFSClient> &&client)
+    : file_name_(file_name),
+      resource_id_(resource_id),
+      client_(std::move(client)) {}
+
+IGFSRandomAccessFile::~IGFSRandomAccessFile() {
+  CtrlResponse<CloseResponse> close_response = {false};
+  Status status = client_->Close(&close_response, resource_id_);
+
+  if (!status.ok()) LOG(ERROR) << status.ToString();
+}
+
+Status IGFSRandomAccessFile::Read(uint64 offset, size_t n, StringPiece *result,
+                                  char *scratch) const {
+  ReadBlockCtrlResponse response = ReadBlockCtrlResponse((uint8_t *)scratch);
+  TF_RETURN_IF_ERROR(client_->ReadBlock(&response, resource_id_, offset, n));
+
+  std::streamsize sz = response.res.GetSuccessfullyRead();
+  if (sz == 0) return errors::OutOfRange("End of file");
+
+  *result = StringPiece(scratch, sz);
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
new file mode 100644
index 0000000000000000000000000000000000000000..b21369ff8a3b19774bcc743f93a5ec4ae1c9b49a
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFSRandomAccessFile : public RandomAccessFile {
+ public:
+  IGFSRandomAccessFile(const string &file_name, int64_t resource_id,
+                       std::unique_ptr<IGFSClient> &&client);
+  ~IGFSRandomAccessFile() override;
+  Status Read(uint64 offset, size_t n, StringPiece *result,
+              char *scratch) const override;
+
+ private:
+  const string file_name_;
+  const int64_t resource_id_;
+  std::unique_ptr<IGFSClient> client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c15ecb7deeb0cf5a8a040e0d1e4b70c732729474
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+IGFSWritableFile::IGFSWritableFile(const string &file_name, int64_t resource_id,
+                                   std::unique_ptr<IGFSClient> &&client)
+    : file_name_(file_name),
+      resource_id_(resource_id),
+      client_(std::move(client)) {}
+
+IGFSWritableFile::~IGFSWritableFile() {
+  if (resource_id_ >= 0) {
+    CtrlResponse<CloseResponse> close_response = {false};
+
+    Status status = client_->Close(&close_response, resource_id_);
+    if (!status.ok()) LOG(ERROR) << status.ToString();
+  }
+}
+
+Status IGFSWritableFile::Append(StringPiece data) {
+  return client_->WriteBlock(resource_id_, (uint8_t *)data.data(), data.size());
+}
+
+Status IGFSWritableFile::Close() {
+  int64_t resource_to_be_closed = resource_id_;
+  resource_id_ = -1;
+
+  CtrlResponse<CloseResponse> close_response = {false};
+  return client_->Close(&close_response, resource_to_be_closed);
+}
+
+Status IGFSWritableFile::Flush() { return Sync(); }
+
+Status IGFSWritableFile::Sync() {
+  CtrlResponse<CloseResponse> close_response = {false};
+  TF_RETURN_IF_ERROR(client_->Close(&close_response, resource_id_));
+
+  CtrlResponse<OpenAppendResponse> open_append_resp(false);
+  TF_RETURN_IF_ERROR(client_->OpenAppend(&open_append_resp, file_name_));
+
+  resource_id_ = open_append_resp.res.stream_id;
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
new file mode 100644
index 0000000000000000000000000000000000000000..b406db17e0e350e2cef610bb05c40f658e100140
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFSWritableFile : public WritableFile {
+ public:
+  IGFSWritableFile(const string &file_name, int64_t resource_id,
+                   std::unique_ptr<IGFSClient> &&client);
+  ~IGFSWritableFile() override;
+  Status Append(StringPiece data) override;
+  Status Close() override;
+  Status Flush() override;
+  Status Sync() override;
+
+ private:
+  const string file_name_;
+  int64_t resource_id_;
+  std::unique_ptr<IGFSClient> client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_
diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/contrib/ignite/ops/igfs_ops.cc
similarity index 62%
rename from tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
rename to tensorflow/contrib/ignite/ops/igfs_ops.cc
index f3b24b2341e590adfbeac1a18b6a65fbfd34f598..473bddff08b339d3b76a33d40fe34486acdbe151 100644
--- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
+++ b/tensorflow/contrib/ignite/ops/igfs_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,17 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/core/platform/env.h"
 
-namespace tensorflow {
-namespace fuzzing {
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
 
-class FuzzDecodeJpeg : public FuzzStringInputOp {
-  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeJpeg);
-};
+namespace tensorflow {
 
-STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeJpeg);
+REGISTER_FILE_SYSTEM("igfs", IGFS);
 
-}  // end namespace fuzzing
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e1d6707d6400a7cd84016150d20973809aca20e
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python helper for loading IGFS ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_ignite_ops.so"))
diff --git a/tensorflow/contrib/ignite/python/ops/igfs_ops.py b/tensorflow/contrib/ignite/python/ops/igfs_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b973b707730f6ba5b057b74a46b27d8f973ede
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/ops/igfs_ops.py
@@ -0,0 +1,40 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ignite File System for checkpointing and communication with TensorBoard.
+
+Apache Ignite is a memory-centric distributed database, caching, and
+processing platform for transactional, analytical, and streaming workloads,
+delivering in-memory speeds at petabyte scale. In addition to database
+functionality Apache Ignite provides a distributed file system called
+IGFS (https://ignite.apache.org/features/igfs.html). IGFS delivers a similar
+functionality to Hadoop HDFS, but only in-memory. In fact, in addition to
+its own APIs, IGFS implements Hadoop FileSystem API and can be transparently
+plugged into Hadoop or Spark deployments. This contrib package contains an
+integration between IGFS and TensorFlow.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
+from tensorflow.python.framework import load_library
+from tensorflow.python.platform import resource_loader
+
+file_system_library = os.path.join(resource_loader.get_data_files_path(),
+                                   "../../_ignite_ops.so")
+load_library.load_file_system_library(file_system_library)
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
index c9af7386cf0a26ed1a950130aa36caa7fb831fd0..e450e2d84ba31a7de925fdb78fc972a592c6ad8c 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
@@ -21,4 +21,4 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.platform import resource_loader
 
 _dataset_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
+    resource_loader.get_path_to_datafile("../../_ignite_ops.so"))
diff --git a/tensorflow/contrib/signal/python/__init__.py b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
old mode 100644
new mode 100755
similarity index 73%
rename from tensorflow/contrib/signal/python/__init__.py
rename to tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
index e672d1146c53a813613c9076c0cb6056f7081441..5e39e16c05290f6b5786421670c69a3bd1e27add
--- a/tensorflow/contrib/signal/python/__init__.py
+++ b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
@@ -1,4 +1,5 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Signal ops."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-igfs.xml &
+sleep 5 # Wait Apache Ignite to be started
+
+tail -f nohup.out
diff --git a/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5d81bf33226cad0d5cc0ea1fb5c5b55672494976
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xmlns:util="http://www.springframework.org/schema/util"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://www.springframework.org/schema/util
+       http://www.springframework.org/schema/util/spring-util.xsd">
+
+  <bean class="org.apache.ignite.configuration.IgniteConfiguration">
+    <property name="fileSystemConfiguration">
+      <bean class="org.apache.ignite.configuration.FileSystemConfiguration">
+        <!-- Distinguished file system name. -->
+        <property name="name" value="default_fs"/>
+        <property name="managementPort" value="9000"/>
+        <property name="ipcEndpointEnabled" value="true"/>
+        <property name="defaultMode" value="PRIMARY"/>
+        <property name="ipcEndpointConfiguration">
+          <bean class="org.apache.ignite.igfs.IgfsIpcEndpointConfiguration">
+            <property name="host" value="" />
+            <property name="port" value="10500"/>
+            <property name="type" value="TCP"/>
+          </bean>
+        </property>
+      </bean>
+    </property>
+    <property name="discoverySpi">
+      <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
+        <property name="ipFinder">
+          <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">
+            <property name="addresses">
+              <list>
+                <value>127.0.0.1</value>
+              </list>
+            </property>
+          </bean>
+        </property>
+      </bean>
+    </property>
+  </bean>
+
+</beans>
diff --git a/tensorflow/contrib/ignite/python/tests/igfs_test.py b/tensorflow/contrib/ignite/python/tests/igfs_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cacfc568942e20200b7daf10599dde513a4a0a68
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/igfs_test.py
@@ -0,0 +1,215 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for IGFS."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.contrib.ignite.python.ops.igfs_ops  # pylint: disable=unused-import
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class IGFSTest(test.TestCase):
+  """The Apache Ignite servers have to setup before the test and tear down
+
+     after the test manually. The docker engine has to be installed.
+
+     To setup Apache Ignite servers:
+     $ bash start_ignite.sh
+
+     To tear down Apache Ignite servers:
+     $ bash stop_ignite.sh
+  """
+
+  def test_create_file(self):
+    """Test create file.
+
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_create_file/1"
+    self.assertFalse(gfile.Exists(file_name))
+    # Create file.
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    # Check that file was created.
+    self.assertTrue(gfile.Exists(file_name))
+
+  def test_write_read_file(self):
+    """Test write/read file.
+
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_write_read_file/1"
+    rows = 10000
+    self.assertFalse(gfile.Exists(file_name))
+    # Write data.
+    with gfile.Open(file_name, mode="w") as w:
+      for i in range(rows):
+        w.write("This is row\n")
+    # Read data.
+    with gfile.Open(file_name, mode="r") as r:
+      lines = r.readlines()
+    # Check that data is equal.
+    self.assertEqual(rows, len(lines))
+    for i in range(rows):
+      self.assertEqual("This is row\n", lines[i])
+
+  def test_delete_recursively(self):
+    """Test delete recursively.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_delete_recursively/"
+    file_name = "igfs:///test_delete_recursively/1"
+    self.assertFalse(gfile.Exists(dir_name))
+    self.assertFalse(gfile.Exists(file_name))
+    gfile.MkDir(dir_name)
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    self.assertTrue(gfile.Exists(dir_name))
+    self.assertTrue(gfile.Exists(file_name))
+    # Delete directory recursively.
+    gfile.DeleteRecursively(dir_name)
+    # Check that directory was deleted.
+    self.assertFalse(gfile.Exists(dir_name))
+    self.assertFalse(gfile.Exists(file_name))
+
+  def test_copy(self):
+    """Test copy.
+
+    """
+    # Setup and check preconditions.
+    src_file_name = "igfs:///test_copy/1"
+    dst_file_name = "igfs:///test_copy/2"
+    self.assertFalse(gfile.Exists(src_file_name))
+    self.assertFalse(gfile.Exists(dst_file_name))
+    with gfile.Open(src_file_name, mode="w") as w:
+      w.write("42")
+    self.assertTrue(gfile.Exists(src_file_name))
+    self.assertFalse(gfile.Exists(dst_file_name))
+    # Copy file.
+    gfile.Copy(src_file_name, dst_file_name)
+    # Check that files are identical.
+    self.assertTrue(gfile.Exists(src_file_name))
+    self.assertTrue(gfile.Exists(dst_file_name))
+    with gfile.Open(dst_file_name, mode="r") as r:
+      data = r.read()
+    self.assertEqual("42", data)
+
+  def test_is_directory(self):
+    """Test is directory.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_is_directory/1"
+    file_name = "igfs:///test_is_directory/2"
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    gfile.MkDir(dir_name)
+    # Check that directory is a directory.
+    self.assertTrue(gfile.IsDirectory(dir_name))
+    # Check that file is not a directory.
+    self.assertFalse(gfile.IsDirectory(file_name))
+
+  def test_list_directory(self):
+    """Test list directory.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_list_directory/"
+    file_names = [
+        "igfs:///test_list_directory/1", "igfs:///test_list_directory/2/3"
+    ]
+    ch_dir_names = [
+        "igfs:///test_list_directory/4",
+    ]
+    for file_name in file_names:
+      with gfile.Open(file_name, mode="w") as w:
+        w.write("")
+    for ch_dir_name in ch_dir_names:
+      gfile.MkDir(ch_dir_name)
+    ls_expected_result = file_names + ch_dir_names
+    # Get list of files in directory.
+    ls_result = gfile.ListDirectory(dir_name)
+    # Check that list of files is correct.
+    self.assertEqual(len(ls_expected_result), len(ls_result))
+    for e in ["1", "2", "4"]:
+      self.assertTrue(e in ls_result)
+
+  def test_make_dirs(self):
+    """Test make dirs.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_make_dirs/"
+    self.assertFalse(gfile.Exists(dir_name))
+    # Make directory.
+    gfile.MkDir(dir_name)
+    # Check that directory was created.
+    self.assertTrue(gfile.Exists(dir_name))
+
+  def test_remove(self):
+    """Test remove.
+
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_remove/1"
+    self.assertFalse(gfile.Exists(file_name))
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    self.assertTrue(gfile.Exists(file_name))
+    # Remove file.
+    gfile.Remove(file_name)
+    # Check that file was removed.
+    self.assertFalse(gfile.Exists(file_name))
+
+  def test_rename_file(self):
+    """Test rename file.
+
+    """
+    # Setup and check preconditions.
+    src_file_name = "igfs:///test_rename_file/1"
+    dst_file_name = "igfs:///test_rename_file/2"
+    with gfile.Open(src_file_name, mode="w") as w:
+      w.write("42")
+    self.assertTrue(gfile.Exists(src_file_name))
+    # Rename file.
+    gfile.Rename(src_file_name, dst_file_name)
+    # Check that only new name of file is available.
+    self.assertFalse(gfile.Exists(src_file_name))
+    self.assertTrue(gfile.Exists(dst_file_name))
+    with gfile.Open(dst_file_name, mode="r") as r:
+      data = r.read()
+    self.assertEqual("42", data)
+
+  def test_rename_dir(self):
+    """Test rename dir.
+
+    """
+    # Setup and check preconditions.
+    src_dir_name = "igfs:///test_rename_dir/1"
+    dst_dir_name = "igfs:///test_rename_dir/2"
+    gfile.MkDir(src_dir_name)
+    # Rename directory.
+    gfile.Rename(src_dir_name, dst_dir_name)
+    # Check that only new name of directory is available.
+    self.assertFalse(gfile.Exists(src_dir_name))
+    self.assertTrue(gfile.Exists(dst_dir_name))
+    self.assertTrue(gfile.IsDirectory(dst_dir_name))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/ignite/python/tests/start_ignite.sh b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
index a67bd44f2fb0d654ba07f022a5070c68df8e2ede..112e0dea844620de600e277bff3685dd7c42c49c 100755
--- a/tensorflow/contrib/ignite/python/tests/start_ignite.sh
+++ b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
@@ -20,3 +20,7 @@ SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
 # Start Apache Ignite with plain client listener.
 docker run -itd --name ignite-plain -p 42300:10800 \
 -v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-plain.sh
+
+# Start Apache Ignite with IGFS.
+docker run -itd --name ignite-igfs -p 10500:10500 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-igfs.sh
\ No newline at end of file
diff --git a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
index 8f03dbd1ede61f548d3de9d9738f97667e75df3c..35b0f32d1b3e1373a231ff23f2b40c8ccc417baf 100755
--- a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
+++ b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
@@ -15,5 +15,4 @@
 # ==============================================================================
 
 docker rm -f ignite-plain
-docker rm -f ignite-ssl
-docker rm -f ignite-ssl-auth
+docker rm -f ignite-igfs
\ No newline at end of file
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
index 478b716d88321101c971789f36c0ff8ecd3f418e..108da04494685f06f9afc26a26a5dadcdd99b0ff 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
@@ -115,7 +115,7 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, &tranformation_matrix](
+          [&input_data, &output_data, &tranformation_matrix](
               int64 start_channel, int64 end_channel) {
             // Applying projection matrix to input RGB vectors.
             const float* p = input_data.data() + start_channel * kChannelSize;
diff --git a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
index 24b790977dfdb675ff7bf0a119a08e243a30d3aa..ae9c7a611945e1445c933d74b9944054b3f0e0a4 100644
--- a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
@@ -24,7 +24,7 @@ from tensorflow.contrib.image.python.ops import dense_image_warp
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
@@ -259,7 +259,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
 
     shape = [1, 2, 1, 1]
     msg = 'Should have raised an exception for invalid image size'
-    with self.assertRaises(ValueError, msg=msg):
+    with self.assertRaises(errors.InvalidArgumentError, msg=msg):
       self.check_interpolation_correctness(shape, 'float32', 'float32')
 
 
diff --git a/tensorflow/contrib/image/python/ops/dense_image_warp.py b/tensorflow/contrib/image/python/ops/dense_image_warp.py
index 9c7ada7afb7cb620c2e06887795053778f133287..f7ced440720209cb05dfcd79395c51517f9de0d5 100644
--- a/tensorflow/contrib/image/python/ops/dense_image_warp.py
+++ b/tensorflow/contrib/image/python/ops/dense_image_warp.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -60,28 +61,38 @@ def _interpolate_bilinear(grid,
       msg = 'Grid must be 4 dimensional. Received size: '
       raise ValueError(msg + str(grid.get_shape()))
 
-    batch_size, height, width, channels = shape
+    batch_size, height, width, channels = (array_ops.shape(grid)[0],
+                                           array_ops.shape(grid)[1],
+                                           array_ops.shape(grid)[2],
+                                           array_ops.shape(grid)[3])
+
+    shape = [batch_size, height, width, channels]
     query_type = query_points.dtype
     grid_type = grid.dtype
 
-    if (query_points.shape.rank != 3 or
-        query_points.shape.dims[2].value != 2):
-      msg = ('Query points must be 3 dimensional and size 2 in dim 2. Received '
-             'size: ')
-      raise ValueError(msg + str(query_points.get_shape()))
-
-    _, num_queries, _ = query_points.get_shape().as_list()
-
-    if height < 2 or width < 2:
-      msg = 'Grid must be at least batch_size x 2 x 2 in size. Received size: '
-      raise ValueError(msg + str(grid.get_shape()))
-
-    alphas = []
-    floors = []
-    ceils = []
-
-    index_order = [0, 1] if indexing == 'ij' else [1, 0]
-    unstacked_query_points = array_ops.unstack(query_points, axis=2)
+    with ops.control_dependencies([
+        check_ops.assert_equal(
+            len(query_points.get_shape()),
+            3,
+            message='Query points must be 3 dimensional.'),
+        check_ops.assert_equal(
+            array_ops.shape(query_points)[2],
+            2,
+            message='Query points must be size 2 in dim 2.')
+    ]):
+      num_queries = array_ops.shape(query_points)[1]
+
+    with ops.control_dependencies([
+        check_ops.assert_greater_equal(
+            height, 2, message='Grid height must be at least 2.'),
+        check_ops.assert_greater_equal(
+            width, 2, message='Grid width must be at least 2.')
+    ]):
+      alphas = []
+      floors = []
+      ceils = []
+      index_order = [0, 1] if indexing == 'ij' else [1, 0]
+      unstacked_query_points = array_ops.unstack(query_points, axis=2)
 
     for dim in index_order:
       with ops.name_scope('dim-' + str(dim)):
@@ -112,16 +123,18 @@ def _interpolate_bilinear(grid,
         alpha = array_ops.expand_dims(alpha, 2)
         alphas.append(alpha)
 
-    if batch_size * height * width > np.iinfo(np.int32).max / 8:
-      error_msg = """The image size or batch size is sufficiently large
-                     that the linearized addresses used by array_ops.gather
-                     may exceed the int32 limit."""
-      raise ValueError(error_msg)
-
-    flattened_grid = array_ops.reshape(grid,
-                                       [batch_size * height * width, channels])
-    batch_offsets = array_ops.reshape(
-        math_ops.range(batch_size) * height * width, [batch_size, 1])
+    with ops.control_dependencies([
+        check_ops.assert_less_equal(
+            math_ops.cast(batch_size * height * width, dtype=dtypes.float32),
+            np.iinfo(np.int32).max / 8,
+            message="""The image size or batch size is sufficiently large
+                       that the linearized addresses used by array_ops.gather
+                       may exceed the int32 limit.""")
+    ]):
+      flattened_grid = array_ops.reshape(
+          grid, [batch_size * height * width, channels])
+      batch_offsets = array_ops.reshape(
+          math_ops.range(batch_size) * height * width, [batch_size, 1])
 
     # This wraps array_ops.gather. We reshape the image data such that the
     # batch, y, and x coordinates are pulled into the first dimension.
@@ -182,7 +195,11 @@ def dense_image_warp(image, flow, name='dense_image_warp'):
                 of dimensions.
   """
   with ops.name_scope(name):
-    batch_size, height, width, channels = image.get_shape().as_list()
+    batch_size, height, width, channels = (array_ops.shape(image)[0],
+                                           array_ops.shape(image)[1],
+                                           array_ops.shape(image)[2],
+                                           array_ops.shape(image)[3])
+
     # The flow is defined on the image grid. Turn the flow into a list of query
     # points in the grid space.
     grid_x, grid_y = array_ops.meshgrid(
diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py
index 3327a9f9a613bfb56e6a25af0fe1c0ca18609035..9e19884df852c0fd259a55aef56c62b4189cd1da 100644
--- a/tensorflow/contrib/keras/api/keras/layers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 # Generic layers.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index c8812d4b23f94102d093db878a709b090a3318d6..588f15b867c1fedbadd5a5d945d870a356549468 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -70,7 +70,10 @@ py_test(
         "python/ops/core_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "noasan",  # TODO(b/119323169)
+    ],
     deps = [
         ":_typecheck",
         ":core",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index e6596bfdfb9b153e5946ab7f8933c160cf2f2326..795591ea621dd192e203d4c4c680aebed961f690 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -253,7 +253,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -277,7 +277,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index af8e673f5906ad972408d30f23f2e8ba7e031a00..32f3006b749e3b34572a8d642054c0ec4c4664b0 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -14,10 +14,6 @@
 # ==============================================================================
 """Ops for building neural network layers, regularizers, summaries, etc.
 
-See the
-[Contrib Layers](https://tensorflow.org/api_guides/python/contrib.layers)
-guide.
-
 @@avg_pool2d
 @@avg_pool3d
 @@batch_norm
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
index 124515e5a6474f2cc1038830346e27277c6ceea7..8015a571e14d0024b0beca700936c21f705b5752 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
@@ -691,7 +691,6 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
       index += num_val
     return grouped_vals
 
-  @test_util.enable_c_shapes
   def testEmbeddingLookupSparse(self):
     vocab_size = 13
     batch_size = 10
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 6fb4b9ff3534cab34c84de5d13fea7aff756556d..7e6eafaa0d6f60cfc28a4c422abac0b6d5a991fb 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -27,7 +27,7 @@ from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index d90d6ecf7f671a40a7ff2b066b6782c7421f9887..cab8da808b6413518ff4864cb0b03a42809260f1 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -27,7 +27,7 @@ import numpy as np
 
 from tensorflow.contrib.layers.python.layers import feature_column as fc
 from tensorflow.contrib.layers.python.layers import feature_column_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index ac9561c7693fc4ad994a00889aa3f15b4b5a5ee4..403b522ce45ac6ad98a321378626b87aaa7738aa 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base
 from tensorflow.python.layers import convolutional as convolutional_layers
 from tensorflow.python.layers import core as core_layers
@@ -1958,7 +1959,7 @@ class GDN(base.Layer):
     self._reparam_offset = reparam_offset
     self.data_format = data_format
     self._channel_axis()  # trigger ValueError early
-    self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5)
+    self.input_spec = input_spec.InputSpec(min_ndim=3, max_ndim=5)
 
   def _channel_axis(self):
     try:
@@ -2015,7 +2016,7 @@ class GDN(base.Layer):
       raise ValueError('The channel dimension of the inputs to `GDN` '
                        'must be defined.')
     self._input_rank = input_shape.ndims
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         ndim=input_shape.ndims, axes={
             channel_axis: num_channels
         })
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 28a6f5aed99b1443ebcc9c391ec332e0febbb04b..7bf2ac62d76d67f0eb131f8f57c5c063955424fa 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -19,9 +19,6 @@ This module and all its submodules are deprecated. See
 [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
 for migration instructions.
 
-See the [Contrib Learn](https://tensorflow.org/api_guides/python/contrib.learn)
-guide.
-
 @@BaseEstimator
 @@Estimator
 @@Trainable
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index eabebb7e881558471c343c0573cc9a8f4a425312..18ca4214a1c407653294ecfac0116bf00cda46a1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -28,7 +28,6 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -38,11 +37,12 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 # The default learning rate of 0.05 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 3d85533d92d17095bae9a69f229171e1bf61ba10..7a3cc8bd984b1b621f50d9dbf2979dcd6fa8b11f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 4e65c180d8bee9ab8fe9b1fbf32edc229c31af09..d46a873bfaa297e7f6242aa56e9d0bf0eb551867 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 2bd57597c2e9444b51b1dacfbe4180b443c95a3d..ee25cebd484f1e831fe8b6d3aa7290da7558adee 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index e100bc7a1e7be4896e9ab1c965775b5185b38897..439b17e505d1146492a32cc2fd58febee2b2456d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index 597ca4e86dbf66c86182f14a2a364b662d52fb0a..dfc76bfde6c0109f98093232b6f223d6938007f9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -1745,7 +1745,7 @@ class LinearRegressorTest(test.TestCase):
           'place_holder':
               constant_op.constant([[0.0]] * num_examples),
       }, constant_op.constant(
-          [[1 if i % 4 is 0 else 0] for i in range(num_examples)])
+          [[1 if i % 4 == 0 else 0] for i in range(num_examples)])
 
     place_holder = feature_column_lib.real_valued_column('place_holder')
     sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 647667188238dc18b137eaad98356a79b3a549b4..7a5354222f103aa0f45adc513079e420bbbfd30c 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -524,7 +524,7 @@ class SDCALinearRegressorTest(test.TestCase):
           # LinearClassifier requires at least one column.
           'place_holder':
               constant_op.constant([[0.0]] * num_examples),
-      }, constant_op.constant([[1 if i % 4 is 0 else 0]
+      }, constant_op.constant([[1 if i % 4 == 0 else 0]
                                for i in range(num_examples)])
 
     with self._single_threaded_test_session():
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 619294b51822bd9983eda777acae5cf0d253926d..d8ac4163b21ce9accceb35f68cf13b0d6b093f9c 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -22,7 +22,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework.python.ops import add_arg_scope
-from tensorflow.python.compat import compat
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -67,34 +66,6 @@ def _scale_losses(losses, weights):
   return math_ops.reduce_sum(reduced_losses)
 
 
-def _safe_div(numerator, denominator, name="value"):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-    name: An optional name for the returned op.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator, name=name)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator),
-      name=name)
-
-
 def _safe_mean(losses, num_present):
   """Computes a safe mean of the losses.
 
@@ -107,7 +78,7 @@ def _safe_mean(losses, num_present):
       then zero is returned.
   """
   total_loss = math_ops.reduce_sum(losses)
-  return _safe_div(total_loss, num_present, name="value")
+  return math_ops.div_no_nan(total_loss, num_present, name="value")
 
 
 @deprecated("2016-12-30", "Use tf.losses.compute_weighted_loss instead.")
@@ -612,14 +583,14 @@ def mean_pairwise_squared_error(predictions,
         math_ops.square(diffs), reduction_indices=reduction_indices)
     num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                            num_present_per_batch,
-                            name="value")
+    term1 = 2.0 * math_ops.div_no_nan(
+        sum_squares_diff_per_batch, num_present_per_batch, name="value")
 
     sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                            math_ops.square(num_present_per_batch),
-                            name="value")
+    term2 = 2.0 * math_ops.div_no_nan(
+        math_ops.square(sum_diff),
+        math_ops.square(num_present_per_batch),
+        name="value")
 
     loss = _scale_losses(term1 - term2, weights)
 
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 6c3b02e12b3082be8bfcc316c4c6122931eb5f76..1293e59cbcba86115e99b505b1f0672a01526462 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -142,7 +142,7 @@ First, download and install JetPack for Android version 3.2 or greater from [Nvi
 git clone https://github.com/tensorflow/tensorflow.git
 cd tensorflow
 JETPACK=$HOME/JetPack_Android_3.2
-TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so  $JETPACK/cuda-9.0/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
+TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so  $JETPACK/cuda/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
 ```
 
 #### Building all CUDA-enabled native binaries:
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 0a07588f07f0bb89dbf5dc5909f511f74470fb41..b396c527673902d61072dc9cf7d2766476be8369 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -34,7 +34,7 @@ NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.
 # 1.10 branch does not work. `make distclean` fails and blocks the build
 # process. For now we're hardcoding to the version which is used by
 # TensorFlow 1.9.
-PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz"
+PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz"
 # TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once
 # the archive has been propagated in mirror.bazel.build.
 RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index eab93f2cc5ed3d5179a58fa717d8b83d0c4d7337..655c7eefcb978d40c8bc16a23685e03ed71bfb63 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -42,6 +42,7 @@ tensorflow/core/kernels/conv_grad_filter_ops.cc
 tensorflow/core/kernels/conv_grad_input_ops.cc
 tensorflow/core/kernels/conv_grad_ops.cc
 tensorflow/core/kernels/conv_ops.cc
+tensorflow/core/kernels/conv_ops_3d.cc
 tensorflow/core/kernels/conv_ops_fused.cc
 tensorflow/core/kernels/conv_ops_using_gemm.cc
 tensorflow/core/kernels/crop_and_resize_op.cc
@@ -156,6 +157,7 @@ tensorflow/core/kernels/mirror_pad_op_cpu_impl_2.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_3.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_4.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_5.cc
+tensorflow/core/kernels/multinomial_op.cc
 tensorflow/core/kernels/no_op.cc
 tensorflow/core/kernels/non_max_suppression_op.cc
 tensorflow/core/kernels/one_hot_op.cc
@@ -163,6 +165,7 @@ tensorflow/core/kernels/pack_op.cc
 tensorflow/core/kernels/pad_op.cc
 tensorflow/core/kernels/padding_fifo_queue.cc
 tensorflow/core/kernels/padding_fifo_queue_op.cc
+tensorflow/core/kernels/pooling_ops_3d.cc
 tensorflow/core/kernels/pooling_ops_common.cc
 tensorflow/core/kernels/population_count_op.cc
 tensorflow/core/kernels/quantization_utils.cc
@@ -248,7 +251,9 @@ tensorflow/core/kernels/spectrogram_op.cc
 tensorflow/core/kernels/split_lib_cpu.cc
 tensorflow/core/kernels/split_op.cc
 tensorflow/core/kernels/split_v_op.cc
+tensorflow/core/kernels/stack.cc
 tensorflow/core/kernels/stack_ops.cc
+tensorflow/core/kernels/stateless_random_ops.cc
 tensorflow/core/kernels/strided_slice_op.cc
 tensorflow/core/kernels/strided_slice_op_inst_0.cc
 tensorflow/core/kernels/strided_slice_op_inst_1.cc
diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py
index ac1236086503a7c6e541bdf098efcb92f84e577f..062deb74b165329d8e72efa73b9d81f4174f8831 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification.py
@@ -175,7 +175,7 @@ def f1_score(labels, predictions, weights=None, num_thresholds=200,
       return best_f1
 
     best_f1 = distribution_strategy_context.get_replica_context().merge_call(
-        f1_across_replicas, values)
+        f1_across_replicas, args=(values,))
 
     update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'],
                                       fn=update_ops['fn'], name='update')
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index d6932f6e4b603b1a76250ab622f5fe8eaea81bc9..09fe65b73f8f866a02a5f0c4d7d736973782882a 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -24,7 +24,6 @@ from __future__ import print_function
 
 import collections as collections_lib
 
-from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -46,32 +45,6 @@ from tensorflow.python.util.deprecation import deprecated
 _EPSILON = 1e-7
 
 
-def _safe_div(numerator, denominator):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator))
-
-
 @deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
             'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
@@ -3247,24 +3220,20 @@ def streaming_covariance(predictions,
 
     # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount)
     # batch_mean_prediction is E[x_B] in the update equation
-    batch_mean_prediction = _safe_div(
-        math_ops.reduce_sum(weighted_predictions),
-        batch_count)
-    delta_mean_prediction = _safe_div(
-        (batch_mean_prediction - mean_prediction) * batch_count,
-        update_count)
+    batch_mean_prediction = math_ops.div_no_nan(
+        math_ops.reduce_sum(weighted_predictions), batch_count)
+    delta_mean_prediction = math_ops.div_no_nan(
+        (batch_mean_prediction - mean_prediction) * batch_count, update_count)
     update_mean_prediction = state_ops.assign_add(mean_prediction,
                                                   delta_mean_prediction)
     # prev_mean_prediction is E[x_A] in the update equation
     prev_mean_prediction = update_mean_prediction - delta_mean_prediction
 
     # batch_mean_label is E[y_B] in the update equation
-    batch_mean_label = _safe_div(
-        math_ops.reduce_sum(weighted_labels),
-        batch_count)
-    delta_mean_label = _safe_div(
-        (batch_mean_label - mean_label) * batch_count,
-        update_count)
+    batch_mean_label = math_ops.div_no_nan(
+        math_ops.reduce_sum(weighted_labels), batch_count)
+    delta_mean_label = math_ops.div_no_nan(
+        (batch_mean_label - mean_label) * batch_count, update_count)
     update_mean_label = state_ops.assign_add(mean_label, delta_mean_label)
     # prev_mean_label is E[y_A] in the update equation
     prev_mean_label = update_mean_label - delta_mean_label
@@ -3926,9 +3895,8 @@ def cohen_kappa(labels,
       po_sum = math_ops.reduce_sum(po)
       total = math_ops.reduce_sum(pe_row)
       pe_sum = math_ops.reduce_sum(
-          _safe_div(
-              math_ops.to_double(pe_row * pe_col),
-              math_ops.to_double(total)))
+          math_ops.div_no_nan(
+              math_ops.to_double(pe_row * pe_col), math_ops.to_double(total)))
       po_sum, pe_sum, total = (math_ops.to_double(po_sum),
                                math_ops.to_double(pe_sum),
                                math_ops.to_double(total))
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index b313024e2852caf2385454771b289ad0162cc463..45a60d79482787df4564ae3360f8252af93c7a26 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -51,7 +51,7 @@ The pruning library allows for specification of the following hyper parameters:
 | begin_pruning_step | integer | 0 | The global step at which to begin pruning |
 | end_pruning_step   | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till  the training stops |
 | weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. |
-| threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
+| threshold_decay | float | 0.0 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
 | nbins | integer | 256 | Number of bins to use for histogram computation. Note: When running on TPUs, a large (>1024) value for `nbins` may adversely affect the training time. |
 | block_height|integer | 1 | Number of rows in a block for block sparse matrices|
diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
index f0ce6fe03966c2de2dfd8ebcca07bf46afcf4fce..1fa5c8cb485704a5fccc486e823bbc4050bf505a 100644
--- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -119,7 +120,7 @@ class _MaskedConv(base.Layer):
     self.bias_initializer = bias_initializer
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
+    self.input_spec = input_spec.InputSpec(ndim=self.rank + 2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -171,7 +172,7 @@ class _MaskedConv(base.Layer):
           dtype=self.dtype)
     else:
       self.bias = None
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         ndim=self.rank + 2, axes={channel_axis: input_dim})
     self.built = True
 
@@ -393,14 +394,14 @@ class MaskedFullyConnected(base.Layer):
     self.bias_initializer = bias_initializer
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
-    self.input_spec = base.InputSpec(min_ndim=2)
+    self.input_spec = input_spec.InputSpec(min_ndim=2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
                        'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])})
 
     self.kernel = self.add_variable(
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index d2b811641764df05c66654dfcb044fa7e78853a5..f6b4373edd0544555dd16a373802d2feb5d674b1 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -204,7 +204,7 @@ def get_pruning_hparams():
       begin_pruning_step=0,
       end_pruning_step=-1,
       weight_sparsity_map=[''],
-      threshold_decay=0.9,
+      threshold_decay=0.0,
       pruning_frequency=10,
       nbins=256,
       block_height=1,
@@ -456,13 +456,14 @@ class Pruning(object):
 
       pool_window = [self._block_dim[0], self._block_dim[1]]
       pool_fn = pruning_utils.factorized_pool
-
+      squeeze_axis = None
       if not self._spec.use_tpu:
         pool_fn = nn_ops.pool
         abs_weights = array_ops.reshape(
             abs_weights,
             [1, abs_weights.get_shape()[0],
              abs_weights.get_shape()[1], 1])
+        squeeze_axis = [0, 3]
 
       pooled_weights = pool_fn(
           abs_weights,
@@ -473,7 +474,7 @@ class Pruning(object):
           name=weights.op.name + '_pooled')
 
       if pooled_weights.get_shape().ndims != 2:
-        pooled_weights = array_ops.squeeze(pooled_weights)
+        pooled_weights = array_ops.squeeze(pooled_weights, axis=squeeze_axis)
 
       smoothed_threshold, new_mask = self._update_mask(pooled_weights,
                                                        threshold)
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
index 91b0bb7f6003c047e4dcf342695f433edbc11614..14fc51229ab53a77e8089040e8a8576babd0fafd 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -188,7 +188,6 @@ def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
   with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
     values = ops.convert_to_tensor(values, name='values')
     values = array_ops.reshape(values, [-1])
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
     nbins_float = np.float32(nbins)
 
     # Map tensor values that fall within value_range to [0, 1].
@@ -250,7 +249,6 @@ def compute_cdf(values, value_range, **kwargs):
   name = kwargs.get('name', None)
   with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
     values = ops.convert_to_tensor(values, name='values')
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
     nbins_float = np.float32(nbins)
 
     # Map tensor values that fall within value_range to [0, 1].
@@ -336,7 +334,7 @@ def factorized_pool(input_tensor,
         padding=padding)
 
   return array_ops.squeeze(
-      array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]))
+      array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]), axis=[0, 1])
 
 
 def determine_partitioned_axis(partitioned_variable):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
index 0aca843497611552d922715514118cac003c29b2..d6f2bfcb6c2e2beda912eb538d8a4a0a17b486b3 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -85,8 +85,28 @@ class PruningUtilsTest(test.TestCase):
 
 
 @parameterized.named_parameters(
-    ("1x1", [1, 1]), ("4x4", [4, 4]), ("6x6", [6, 6]), ("1x4", [1, 4]),
-    ("4x1", [4, 1]), ("1x8", [1, 8]), ("8x1", [8, 1]))
+    ("Input_32x32_block_1x1", [32, 32], [1, 1]),
+    # block size 6x6
+    ("Input_3x3_block_6x6", [3, 3], [6, 6]),
+    ("Input_32x32_block_6x6", [32, 32], [6, 6]),
+    ("Input_2x32_block_6x6", [2, 32], [6, 6]),
+    ("Input_32x2_block_6x6", [32, 2], [6, 6]),
+    ("Input_30x30_block_6x6", [30, 30], [6, 6]),
+    # block size 4x4
+    ("Input_32x32_block_4x4", [32, 32], [4, 4]),
+    ("Input_2x32_block_4x4", [2, 32], [4, 4]),
+    ("Input_32x2_block_4x4", [32, 2], [4, 4]),
+    ("Input_30x30_block_4x4", [30, 30], [4, 4]),
+    # block size 1x4
+    ("Input_32x32_block_1x4", [32, 32], [1, 4]),
+    ("Input_2x32_block_1x4", [2, 32], [1, 4]),
+    ("Input_32x2_block_1x4", [32, 2], [1, 4]),
+    ("Input_30x30_block_1x4", [30, 30], [1, 4]),
+    # block size 4x1
+    ("Input_32x32_block_4x1", [32, 32], [4, 1]),
+    ("Input_2x32_block_4x1", [2, 32], [4, 1]),
+    ("Input_32x2_block_4x1", [32, 2], [4, 1]),
+    ("Input_30x30_block_4x1", [30, 30], [4, 1]))
 class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
 
   def _compare_pooling_methods(self, weights, pooling_kwargs):
@@ -97,9 +117,11 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
               array_ops.reshape(
                   weights,
                   [1, weights.get_shape()[0],
-                   weights.get_shape()[1], 1]), **pooling_kwargs))
+                   weights.get_shape()[1], 1]), **pooling_kwargs),
+          axis=[0, 3])
       pooled_weights_factorized_pool = pruning_utils.factorized_pool(
           weights, **pooling_kwargs)
+
       self.assertAllClose(pooled_weights_tf.eval(),
                           pooled_weights_factorized_pool.eval())
 
@@ -113,8 +135,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
           [expanded_tensor, kronecker_product])
       self.assertAllEqual(expanded_tensor_val, kronecker_product_val)
 
-  def testFactorizedAvgPool(self, window_shape):
-    weights = variable_scope.get_variable("weights", shape=[1024, 2048])
+  def testFactorizedAvgPool(self, input_shape, window_shape):
+    weights = variable_scope.get_variable("weights", shape=input_shape)
     pooling_kwargs = {
         "window_shape": window_shape,
         "pooling_type": "AVG",
@@ -123,8 +145,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
     }
     self._compare_pooling_methods(weights, pooling_kwargs)
 
-  def testFactorizedMaxPool(self, window_shape):
-    weights = variable_scope.get_variable("weights", shape=[1024, 2048])
+  def testFactorizedMaxPool(self, input_shape, window_shape):
+    weights = variable_scope.get_variable("weights", shape=input_shape)
     pooling_kwargs = {
         "window_shape": window_shape,
         "pooling_type": "MAX",
@@ -133,8 +155,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
     }
     self._compare_pooling_methods(weights, pooling_kwargs)
 
-  def testExpandTensor(self, block_dim):
-    weights = random_ops.random_normal(shape=[1024, 512])
+  def testExpandTensor(self, input_shape, block_dim):
+    weights = random_ops.random_normal(shape=input_shape)
     self._compare_expand_tensor_with_kronecker_product(weights, block_dim)
 
 
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index 9ce50bfe1054072b315adecb87f1ba729dfe0d83..b7fd2d2fb9db3eed15eb1cc2934199939790b1c0 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -106,6 +106,32 @@ class MovingAverageOptimizer(optimizer.Optimizer):
       self._swapped_variable_name_map[v_avg.op.name] = v.op.name
     return control_flow_ops.group(train_op, ma_op, name='train_with_avg')
 
+  def _find_swapped_variable(self, v_name_to_tensor, v_name, tensor):
+    """Returns name of swapped variable for given tensor.
+
+    Args:
+      v_name_to_tensor: Mapping from variable names to tensors.
+      v_name: name of the variable for which swapped variable should be returned
+      tensor: Tensor which correspond to variable for which swapped variable
+        should be returned.
+
+    Returns:
+      Tensor which correspond to swapped variable.
+
+    Raises:
+      ValueError: If swapped variable could not be found in v_name_to_tensor.
+    """
+    swapped_v_name = self._swapped_variable_name_map.get(v_name, None)
+    if swapped_v_name is None:
+      return tensor
+    else:
+      if swapped_v_name in v_name_to_tensor:
+        return v_name_to_tensor[swapped_v_name]
+      else:
+        raise ValueError(
+            ('Variable to swap %s is not part of variables to save. '
+             'This breaks MovingAverageOptimizer.') % swapped_v_name)
+
   def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
     """Create a saver swapping moving averages and variables.
 
@@ -141,33 +167,33 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     if not isinstance(var_list, dict):
       var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
 
-    # OpListToDict converts variables to tensors. We make sure we can get
-    # the unique variable name for normal and resource vaiables.
-    def get_v_name(tensor):
-      if tensor.op.type == 'ReadVariableOp':
-        return tensor.op.inputs[0].op.name
-      else:
-        return tensor.op.name
-
     v_name_to_tensor = {}
-    for tensor in six.itervalues(var_list):
-      v_name = get_v_name(tensor)
-      v_name_to_tensor[v_name] = tensor
+    for k, tensor_or_list in six.iteritems(var_list):
+      # For each partitioned variable OpListToDict returns list of constituent
+      # parts instead of single tensor.
+      if (isinstance(tensor_or_list, list)
+          or isinstance(tensor_or_list, variables.PartitionedVariable)):
+        for tensor in tensor_or_list:
+          v_name = tensor.op.name
+          v_name_to_tensor[v_name] = tensor
+      else:
+        v_name_to_tensor[k] = tensor_or_list
 
     # Now swap variables and moving averages
     swapped_var_list = {}
-    for k, tensor in six.iteritems(var_list):
-      v_name = get_v_name(tensor)
-      swapped_v_name = self._swapped_variable_name_map.get(v_name, None)
-      tensor_to_save = tensor
-      if swapped_v_name is not None:
-        if swapped_v_name in v_name_to_tensor:
-          tensor_to_save = v_name_to_tensor[swapped_v_name]
-        else:
-          raise ValueError(
-              ('Variable to swap %s is not part of variables to save. '
-               'This breaks MovingAverageOptimizer.') % swapped_v_name)
-      swapped_var_list[k] = tensor_to_save
+    for k, tensor_or_list in six.iteritems(var_list):
+      if isinstance(tensor_or_list, list):
+        tensor_list_to_save = []
+        for tensor in tensor_or_list:
+          v_name = tensor.op.name
+          swapped_variable = self._find_swapped_variable(v_name_to_tensor,
+                                                         v_name,
+                                                         tensor)
+          tensor_list_to_save.append(swapped_variable)
+        swapped_var_list[k] = tensor_list_to_save
+      else:
+        swapped_var_list[k] = self._find_swapped_variable(
+            v_name_to_tensor, k, tensor_or_list)
 
     # Build the swapping saver.
     return saver.Saver(swapped_var_list, name=name, **kwargs)
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index f22e7245285a8b2716645f9789eb5997928a22d2..643403eea6f88bcb33aa96d6539bc9a45a109c6b 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -43,97 +45,171 @@ class MovingAverageOptimizerTest(test.TestCase):
     # Test that MovingAverageOptimizer works with resource variables.
     self._helpTestRun(use_resource=True)
 
-  def _helpTestRun(self, use_resource=False):
+  def testRunUsePartitionedVars(self):
+    # Test that MovingAverageOptimizer works with partitioned variables.
+    self._helpTestRun(use_partitioned_vars=True)
+
+  def testRunUseResourcePartitionedVars(self):
+    # Test that MovingAverageOptimizer works with resource and partitioned
+    # variables.
+    self._helpTestRun(use_partitioned_vars=True, use_resource=True)
+
+  def _helpTestRun(self, use_resource=False, use_partitioned_vars=False):
+    # Partitioned variables are represented as a "collection" of partitions.
+    # To simplify the test and reuse as much code as possible we employ
+    # following test strategy for partitioned variables.
+    #
+    # In the case of non-partitioned variables test runs on variables with
+    # shape [2].
+    #
+    # In the case of partitioned variables we use shape [4] with two partitions,
+    # thus each partition has shape [2].
+    # For partitioned variables the test is run twice (for loop over
+    # variable_part_names), first time on the first partition of each variable,
+    # second time on the second partition of each variable.
+    variable_part_names = ['part_0', 'part_1'] if use_partitioned_vars else ['']
     for sequential_update in [True, False]:
       for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-        with self.session(graph=ops.Graph()) as sess:
-          orig_val0 = [1.0, 2.0]
-          orig_val1 = [3.0, 4.0]
-          var0 = variable_scope.get_variable(
-              'var0',
-              initializer=constant_op.constant(orig_val0, dtype=dtype),
-              use_resource=use_resource)
-          var1 = variable_scope.get_variable(
-              'var1',
-              initializer=constant_op.constant(orig_val1, dtype=dtype),
-              use_resource=use_resource)
-          grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-          grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-
-          opt = moving_average_optimizer.MovingAverageOptimizer(
-              gradient_descent.GradientDescentOptimizer(learning_rate=2.0),
-              average_decay=0.5,
-              sequential_update=sequential_update)
-          save_dir = tempfile.mkdtemp(
-              prefix=os.path.join(self.get_temp_dir(), 'run_1'))
-          save_path = os.path.join(save_dir, 'model')
-          update = opt.apply_gradients(
-              list(six.moves.zip([grads0, grads1], [var0, var1])))
-          global_vars = variables.global_variables()
-          ema_var0 = [
-              v for v in global_vars
-              if v.op.name == 'var0/ExponentialMovingAverage'
-          ][0]
-          ema_var1 = [
-              v for v in global_vars
-              if v.op.name == 'var1/ExponentialMovingAverage'
-          ][0]
-          perturb = control_flow_ops.group([
-              state_ops.assign_add(var0, [1.0, 1.0]),
-              state_ops.assign_add(var1, [2.0, 2.0]),
-              state_ops.assign_add(ema_var0, [3.0, 3.0]),
-              state_ops.assign_add(ema_var1, [4.0, 4.0])
-          ])
-
-          # Test that saver with missing ema variables will fail.
-          with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
-            opt.swapping_saver(var_list=[var0])
-
-          train_saver = opt.swapping_saver()
-          train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0])
-          inference_saver = saver.Saver()
-          variables.global_variables_initializer().run()
-          # Step 1.
-          update.run()
-          self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
-          self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
-          if sequential_update:
-            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
-          # Test that the swapping saver save/restore operation is identity.
-          train_saver.save(sess, save_path)
-          train_saver.restore(sess, save_path)
-          self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
-          self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
-          if sequential_update:
-            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
-          # Test that the subset saver saves the EMA variable as well.
-          if sequential_update:
-            subset_save_path = save_path + '_subset'
-            train_saver_subset.save(sess, subset_save_path)
-            perturb.run()
-            self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval())
-            self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
-            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restoring should only restore var0 and ema_var0.
-            train_saver_subset.restore(sess, subset_save_path)
+        for var_part_name in variable_part_names:
+          with self.session(graph=ops.Graph()) as sess:
+            orig_val0 = [1.0, 2.0]
+            orig_val1 = [3.0, 4.0]
+            grads0 = [0.1, 0.1]
+            grads1 = [0.01, 0.01]
+            if use_partitioned_vars:
+              # Use partitioned variables.
+              # Create partitioned and duplicate each value used as initial
+              # value of variables.
+              partitioner = partitioned_variables.fixed_size_partitioner(
+                  num_shards=2)
+              orig_val0 = orig_val0 * 2
+              orig_val1 = orig_val1 * 2
+              grads0 = grads0 * 2
+              grads1 = grads1 * 2
+            else:
+              # Regular (non-partitioned) variables.
+              partitioner = None
+            var0 = variable_scope.get_variable(
+                'var0',
+                initializer=constant_op.constant(orig_val0, dtype=dtype),
+                use_resource=use_resource,
+                partitioner=partitioner)
+            var1 = variable_scope.get_variable(
+                'var1',
+                initializer=constant_op.constant(orig_val1, dtype=dtype),
+                use_resource=use_resource,
+                partitioner=partitioner)
+            # Make a fake loss, such that gradient(loss, var0) == grads0
+            # and gradient(loss, var1) == grads1
+            grads0 = constant_op.constant(grads0, dtype=dtype)
+            grads1 = constant_op.constant(grads1, dtype=dtype)
+            loss = (math_ops.reduce_sum(grads0 * var0)
+                    + math_ops.reduce_sum(grads1 * var1))
+
+            opt = moving_average_optimizer.MovingAverageOptimizer(
+                gradient_descent.GradientDescentOptimizer(learning_rate=2.0),
+                average_decay=0.5,
+                sequential_update=sequential_update)
+            save_dir = tempfile.mkdtemp(
+                prefix=os.path.join(self.get_temp_dir(), 'run_1'))
+            save_path = os.path.join(save_dir, 'model')
+
+            update = opt.minimize(loss)
+
+            # Get variables and their EMAs. In case of partitioned variables
+            # get proper part of each variable.
+            def _get_variable(var_name, part_name, ema):
+              """Returns variable of it's moving average by name."""
+              matches = [
+                  v for v in variables.global_variables()
+                  if ((var_name in v.op.name)
+                      and (part_name in v.op.name)
+                      and (('ExponentialMovingAverage' in v.op.name) == ema))
+              ]
+              self.assertEqual(len(matches), 1)
+              return matches[0]
+            var0 = _get_variable('var0', var_part_name, ema=False)
+            var1 = _get_variable('var1', var_part_name, ema=False)
+            ema_var0 = _get_variable('var0', var_part_name, ema=True)
+            ema_var1 = _get_variable('var1', var_part_name, ema=True)
+
+            perturb = control_flow_ops.group([
+                state_ops.assign_add(var0, [1.0, 1.0]),
+                state_ops.assign_add(var1, [2.0, 2.0]),
+                state_ops.assign_add(ema_var0, [3.0, 3.0]),
+                state_ops.assign_add(ema_var1, [4.0, 4.0])
+            ])
+
+            # Test that saver with missing ema variables will fail.
+            with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
+              opt.swapping_saver(var_list=[var0])
+
+            train_saver = opt.swapping_saver()
+            train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0])
+            inference_saver = saver.Saver()
+            variables.global_variables_initializer().run()
+            # Step 1.
+            update.run()
             self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
-            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
-            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previous state.
+            self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
+            if sequential_update:
+              self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
+            # Test that the swapping saver save/restore operation is identity.
+            train_saver.save(sess, save_path)
             train_saver.restore(sess, save_path)
+            self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
+            self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
+            if sequential_update:
+              self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
+            # Test that the subset saver saves the EMA variable as well.
+            if sequential_update:
+              subset_save_path = save_path + '_subset'
+              train_saver_subset.save(sess, subset_save_path)
+              perturb.run()
+              self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval())
+              self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
+              self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
+              # Restoring should only restore var0 and ema_var0.
+              train_saver_subset.restore(sess, subset_save_path)
+              self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
+              self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
+              self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
+              # Restore back to previous state.
+              train_saver.restore(sess, save_path)
 
-          # If updates are parallel, this is not always true after the 1st step.
-          if sequential_update:
+            # If updates are parallel,
+            # this is not always true after the 1st step.
+            if sequential_update:
+              # Test that the normal saver will have the averaged variables.
+              # We test that the average values are between the original value
+              # and the most recent variable values (since they are an average
+              # of the two).
+              val0 = var0.eval()
+              val1 = var1.eval()
+              train_saver.save(sess, save_path)
+              inference_saver.restore(sess, save_path)
+              avg_val0 = var0.eval()
+              avg_val1 = var1.eval()
+              for i in six.moves.range(len(val0)):
+                self.assertLess(val0[i], avg_val0[i])
+                self.assertLess(avg_val0[i], orig_val0[i])
+                self.assertLess(val1[i], avg_val1[i])
+                self.assertLess(avg_val1[i], orig_val1[i])
+              train_saver.restore(sess, save_path)
+            # Step 2.
+            update.run()
             # Test that the normal saver will have the averaged variables.
-            # We test that the average values are between the original value
-            # and the most recent variable values (since they are an average
-            # of the two).
+            # We test that the average values are between the original value and
+            # the most recent variable values (since they are an average of the
+            # two).
             val0 = var0.eval()
             val1 = var1.eval()
+            self.assertAllCloseAccordingToType([0.6, 1.6], val0)
+            self.assertAllCloseAccordingToType([2.96, 3.96], val1)
             train_saver.save(sess, save_path)
             inference_saver.restore(sess, save_path)
             avg_val0 = var0.eval()
@@ -143,26 +219,6 @@ class MovingAverageOptimizerTest(test.TestCase):
               self.assertLess(avg_val0[i], orig_val0[i])
               self.assertLess(val1[i], avg_val1[i])
               self.assertLess(avg_val1[i], orig_val1[i])
-            train_saver.restore(sess, save_path)
-          # Step 2.
-          update.run()
-          # Test that the normal saver will have the averaged variables.
-          # We test that the average values are between the original value and
-          # the most recent variable values (since they are an average of the
-          # two).
-          val0 = var0.eval()
-          val1 = var1.eval()
-          self.assertAllCloseAccordingToType([0.6, 1.6], val0)
-          self.assertAllCloseAccordingToType([2.96, 3.96], val1)
-          train_saver.save(sess, save_path)
-          inference_saver.restore(sess, save_path)
-          avg_val0 = var0.eval()
-          avg_val1 = var1.eval()
-          for i in six.moves.range(len(val0)):
-            self.assertLess(val0[i], avg_val0[i])
-            self.assertLess(avg_val0[i], orig_val0[i])
-            self.assertLess(val1[i], avg_val1[i])
-            self.assertLess(avg_val1[i], orig_val1[i])
 
   def testFailWhenSaverCreatedBeforeInitialized(self):
     with self.cached_session():
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
index 44a8890cb107440b79cf8fbbdfcfda503b1c910f..960826407b66b4efa3c2693efb6d2e17c4b47b33 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -83,14 +84,14 @@ class NadamOptimizer(adam.AdamOptimizer):
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
       # m_bar = (1 - beta1) * g_t + beta1 * m_t
-      m_bar = m_scaled_g_values + beta1_t * m_t
+      m_bar = m_scaled_g_values + beta1_t * array_ops.gather(m_t, indices)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, "v")
     v_scaled_g_values = (grad * grad) * (1 - beta2_t)
     v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
-    v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(
-        var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+    v_t_slice = array_ops.gather(v_t, indices)
+    v_sqrt = math_ops.sqrt(v_t_slice)
+    var_update = scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t))
     return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
index 85e05ce71cec6ef897cadb7d123e630febb3c064..a4372f64874e7591dbceac901fad6c941209bef9 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -52,14 +52,19 @@ def nadam_update_numpy(param,
 class NadamOptimizerTest(test.TestCase):
 
   def doTestSparse(self, use_resource=False):
+    # need to use a larger value of epsilon here so that
+    # np.sqrt(v_t) + epsilon doesn't get rounded to 0 when
+    # the dtype is half and np.sqrt(v_t) = 0, as is the case
+    # when the gradient is 0
+    sparse_epsilon = 1e-7
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
 
         if use_resource:
           var0 = resource_variable_ops.ResourceVariable(var0_np)
@@ -67,21 +72,21 @@ class NadamOptimizerTest(test.TestCase):
         else:
           var0 = variables.Variable(var0_np)
           var1 = variables.Variable(var1_np)
-        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
         grads0 = ops.IndexedSlices(
-            constant_op.constant(grads0_np),
-            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
-        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
         grads1 = ops.IndexedSlices(
-            constant_op.constant(grads1_np),
-            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
-        opt = nadam_optimizer.NadamOptimizer()
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = nadam_optimizer.NadamOptimizer(epsilon=sparse_epsilon)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
@@ -91,8 +96,10 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
           update.run()
 
-          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
-          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0,
+                                               epsilon=sparse_epsilon)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1,
+                                               epsilon=sparse_epsilon)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 3ba3ee29ec79687df522eb330665a2ce80061682..835fb4aec4f88572cb54d24ca2deae022e277c5c 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -56,6 +56,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:reduce_util",
     ],
 )
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index f789c83e005ab7ad7e7caff4ef9ee3c2f57c21fe..a72db5e12fc086c3ec817d25d4964bbb9df2db60 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -24,6 +24,7 @@ import abc
 
 import six
 
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -446,7 +447,7 @@ class _OptimizerV2State(object):
     if v is None:
       if colocate_with is None:
         colocate_with = self._non_slot_devices
-      with self._distribution.colocate_vars_with(colocate_with):
+      with self._distribution.extended.colocate_vars_with(colocate_with):
         # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
         v = variable_scope.variable(initial_value, name=name, trainable=False)
       self._non_slot_dict[name] = v
@@ -790,14 +791,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
         # Scale loss for number of replicas (callable-loss case). In this case,
         # we have to be careful to call distribute_lib.get_loss_reduction()
         # *after* loss() is evaluated, so we know what loss reduction it uses.
-        if scale_loss_by_num_replicas is None:
-          scale_loss_by_num_replicas = (
-              distribute_lib.get_loss_reduction() == variable_scope
-              .VariableAggregation.MEAN)
-        if scale_loss_by_num_replicas:
-          num_replicas = distribute_ctx.get_distribution_strategy().num_replicas
-          if num_replicas > 1:
-            loss_value *= 1. / num_replicas
+        loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas)
 
       if var_list is None:
         var_list = tape.watched_variables()
@@ -808,14 +802,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                          "be a function when eager execution is enabled.")
 
     # Scale loss for number of replicas (non-callable-loss case).
-    if scale_loss_by_num_replicas is None:
-      scale_loss_by_num_replicas = (
-          distribute_lib.get_loss_reduction() == variable_scope
-          .VariableAggregation.MEAN)
-    if scale_loss_by_num_replicas:
-      num_replicas = distribute_ctx.get_distribution_strategy().num_replicas
-      if num_replicas > 1:
-        loss *= 1. / num_replicas
+    loss = self._scale_loss(loss, scale_loss_by_num_replicas)
 
     if gate_gradients not in [
         optimizer_v1.Optimizer.GATE_NONE, optimizer_v1.Optimizer.GATE_OP,
@@ -857,6 +844,19 @@ class OptimizerV2(optimizer_v1.Optimizer):
     ])
     return grads_and_vars
 
+  @staticmethod
+  def _scale_loss(loss_value, scale_loss_by_num_replicas):
+    """Scale loss for the number of replicas."""
+    if scale_loss_by_num_replicas is None:
+      scale_loss_by_num_replicas = (
+          distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
+    if scale_loss_by_num_replicas:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= 1. / num_replicas
+    return loss_value
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -892,7 +892,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, v in grads_and_vars],))
     return distribute_ctx.get_replica_context().merge_call(
-        self._distributed_apply, filtered, global_step=global_step, name=name)
+        self._distributed_apply, args=(filtered,),
+        kwargs={"global_step": global_step, "name": name})
 
   def _get_or_create_state(self, var_list=None):
     """Either looks up or creates `_OptimizerV2State`.
@@ -927,8 +928,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
     """`apply_gradients` for use with a `DistributionStrategy`."""
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
 
@@ -944,7 +945,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     with ops.name_scope(name, self._name) as name:
       per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
       # Include the current value of any dynamic hyper parameters in `state`.
-      non_slot_devices = distribution.non_slot_devices(var_list)
+      non_slot_devices = distribution.extended.non_slot_devices(var_list)
       state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
           self._hyper, distribution, non_slot_devices)
 
@@ -989,7 +990,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
       # Use the processors to update the variables.
       update_ops = []
       for grad, var in grads_and_vars:
-        update_ops.extend(distribution.update(var, update, grad, grouped=False))
+        update_ops.extend(distribution.extended.update(
+            var, update, args=(grad,), group=False))
 
       # Give the child class a chance to do something after applying
       # gradients
@@ -1001,8 +1003,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
       update_ops = control_flow_ops.group(update_ops)
       with ops.control_dependencies([update_ops]):
-        finish_updates = distribution.update_non_slot(
-            non_slot_devices, finish, grouped=False)
+        finish_updates = distribution.extended.update_non_slot(
+            non_slot_devices, finish, group=False)
       # We said grouped=False, which means finish_updates is always a list.
       # It will be [None] when finish() returns None.
       if finish_updates == [None]:
@@ -1017,8 +1019,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
           def update_global_step(global_step, name):
             return global_step.assign_add(1, read_value=False, name=name)
 
-          apply_updates = distribution.update(global_step, update_global_step,
-                                              name)
+          apply_updates = distribution.extended.update(
+              global_step, update_global_step, args=(name,))
 
       # Add the training op to the TRAIN_OP graph collection in graph mode.
       if not eager_execution:
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
index b3f32b8f34e7b956b44bc82322bba16ed6fe43c7..38fcca03116721f3dabfa6d1e7122c369b6b405d 100644
--- a/tensorflow/contrib/resampler/BUILD
+++ b/tensorflow/contrib/resampler/BUILD
@@ -50,6 +50,7 @@ tf_kernel_library(
     prefix = "resampler_ops",
     deps = [
         ":resampler_ops_op_lib",
+        "//tensorflow/compiler/tf2xla/kernels:resampler_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/contrib/resampler/ops/resampler_ops.cc b/tensorflow/contrib/resampler/ops/resampler_ops.cc
index 5ab212032e50ace9545762bebda5679f68fbf77c..f785d4ee5fcd63212882ccf736bfc61c35d68545 100644
--- a/tensorflow/contrib/resampler/ops/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/ops/resampler_ops.cc
@@ -25,7 +25,7 @@ REGISTER_OP("Resampler")
     .Input("data: T")
     .Input("warp: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle data;
       ShapeHandle warp;
@@ -48,7 +48,7 @@ REGISTER_OP("ResamplerGrad")
     .Input("grad_output: T")
     .Output("grad_data: T")
     .Output("grad_warp: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(0));
       c->set_output(1, c->input(1));
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 391df8cdb4b1c6cd0e22ff2e27527c58abd4c303..e124867415f94fb5052f34f50363ea718d71053b 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -196,6 +196,7 @@ cuda_py_tests(
     srcs = ["python/kernel_tests/lstm_ops_test.py"],
     additional_deps = [
         ":rnn_py",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index 026bf08ced33cf0d663cf0940e8bea3f3f2aca28..cbc8af5350276bf3398cf29a24554fd27e0621ee 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """RNN Cells and additional RNN operations.
 
-See [Contrib RNN](https://tensorflow.org/api_guides/python/contrib.rnn) guide.
-
 <!--From core-->
 @@RNNCell
 @@LayerRNNCell
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index 9535a76566748eaf8b4756ad0dc26218262ed990..d37210d4b81203287fb633adc309688a35d093bb 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -32,15 +32,26 @@ struct TensorCuBlasGemm {
                   const T* b, int ldb, float beta, T* c, int ldc);
 };
 
+template <typename T>
+struct gemm_compute_type {
+  typedef T type;
+};
+
+template <>
+struct gemm_compute_type<Eigen::half> {
+  typedef float type;
+};
+
 template <typename Device, typename T, bool USE_CUBLAS>
 struct TensorBlasGemm;
 
 template <typename Device, typename T>
 struct TensorBlasGemm<Device, T, true /* USE_CUBLAS */> {
   static void compute(OpKernelContext* ctx, const Device& d, bool transa,
-                      bool transb, float alpha,
+                      bool transb, typename gemm_compute_type<T>::type alpha,
                       typename TTypes<T>::ConstMatrix a,
-                      typename TTypes<T>::ConstMatrix b, float beta,
+                      typename TTypes<T>::ConstMatrix b,
+                      typename gemm_compute_type<T>::type beta,
                       typename TTypes<T>::Matrix c) {
     int64 m = c.dimensions()[0];
     int64 n = c.dimensions()[1];
@@ -55,19 +66,23 @@ struct TensorBlasGemm<Device, T, true /* USE_CUBLAS */> {
 template <typename Device, typename T>
 struct TensorBlasGemm<Device, T, false /* USE_CUBLAS */> {
   static void compute(OpKernelContext* ctx, const Device& d, bool transa,
-                      bool transb, T alpha, typename TTypes<T>::ConstMatrix a,
-                      typename TTypes<T>::ConstMatrix b, T beta,
+                      bool transb, typename gemm_compute_type<T>::type alpha,
+                      typename TTypes<T>::ConstMatrix a,
+                      typename TTypes<T>::ConstMatrix b,
+                      typename gemm_compute_type<T>::type beta,
                       typename TTypes<T>::Matrix c) {
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
     contract_pairs[0] =
         Eigen::IndexPair<Eigen::DenseIndex>(transa == false, transb == true);
-    if (alpha == T(1) && beta == T(0)) {
+    if (alpha == typename gemm_compute_type<T>::type(1.f) &&
+        beta == typename gemm_compute_type<T>::type(0.f)) {
       c.device(d) = a.contract(b, contract_pairs);
-    } else if (alpha == T(1) && beta == T(1)) {
+    } else if (alpha == typename gemm_compute_type<T>::type(1.f) &&
+               beta == typename gemm_compute_type<T>::type(1.f)) {
       c.device(d) += a.contract(b, contract_pairs);
     } else {
-      c.device(d) = c.constant(alpha) * a.contract(b, contract_pairs) +
-                    c.constant(beta) * c;
+      c.device(d) = c.constant(T(alpha)) * a.contract(b, contract_pairs) +
+                    c.constant(T(beta)) * c;
     }
   }
 };
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.h b/tensorflow/contrib/rnn/kernels/gru_ops.h
index 3e2cb39e64bb3f0b22ea66c5601af36c5fb9b0fd..38be58fa104f8b30e4aede6d18330960fc30dcb5 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.h
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.h
@@ -88,7 +88,9 @@ struct GRUBlockCellFprop : public GRUCell {
     typename TTypes<T>::ConstMatrix const_x_h_prev(x_h_prev.data(),
                                                    x_h_prev.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, false, T(1), const_x_h_prev, w_ru, T(0), r_u_bar);
+        ctx, d, false, false, typename gemm_compute_type<T>::type(1.f),
+        const_x_h_prev, w_ru, typename gemm_compute_type<T>::type(0.f),
+        r_u_bar);
 
     // Creating a bias matrix for adding by broadcasting 'b_ru'
     Eigen::array<Eigen::DenseIndex, 2> broadcast_shape({batch_size_, 1});
@@ -107,7 +109,8 @@ struct GRUBlockCellFprop : public GRUCell {
     typename TTypes<T>::ConstMatrix const_x_h_prevr(x_h_prevr.data(),
                                                     x_h_prevr.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, false, T(1), const_x_h_prevr, w_c, T(0), c);
+        ctx, d, false, false, typename gemm_compute_type<T>::type(1.f),
+        const_x_h_prevr, w_c, typename gemm_compute_type<T>::type(0.f), c);
 
     Eigen::array<Eigen::DenseIndex, 2> b_c_shape({1, b_c.dimensions()[0]});
     c.device(d) += (b_c.reshape(b_c_shape).broadcast(broadcast_shape));
@@ -148,9 +151,10 @@ struct GRUBlockCellBprop : public GRUCell {
     // [2nd_component_of_d_x d_h_prevr] = d_c_bar X w_c^T
     typename TTypes<T>::ConstMatrix const_d_c_bar(d_c_bar.data(),
                                                   d_c_bar.dimensions());
-    TensorBlasGemm<Device, T, USE_CUBLAS>::compute(ctx, d, false, true, T(1),
-                                                   const_d_c_bar, w_c, T(0),
-                                                   d_x_comp2_and_h_prevr);
+    TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
+        ctx, d, false, true, typename gemm_compute_type<T>::type(1.f),
+        const_d_c_bar, w_c, typename gemm_compute_type<T>::type(0.f),
+        d_x_comp2_and_h_prevr);
 
     d_hr.device(d) = d_x_comp2_and_h_prevr.slice(h_offsets(), h_extends());
     d_r_bar.device(d) = (d_hr * h_prev * r) * (r.constant(T(1)) - r);
@@ -164,7 +168,8 @@ struct GRUBlockCellBprop : public GRUCell {
     typename TTypes<T>::ConstMatrix const_d_r_bar_u_bar(
         d_r_bar_u_bar.data(), d_r_bar_u_bar.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, true, T(1), const_d_r_bar_u_bar, w_ru, T(0),
+        ctx, d, false, true, typename gemm_compute_type<T>::type(1.f),
+        const_d_r_bar_u_bar, w_ru, typename gemm_compute_type<T>::type(0.f),
         d_x_comp1_and_h_prev_comp1);
 
     // d_x = d_x_comp1 + d_x_comp2
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
index ee08d306f84baaba8b774ce3fa1a04d5f9a4f6dd..d369bc12ae88dafb4e3ca0095a08bcc3ee09bf70 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
@@ -61,7 +61,8 @@ void LSTMBlockCellFpropWithEigen(
   // states1 = xh * w + b
   typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
   TensorBlasGemm<CPUDevice, T, false /* USE_CUBLAS */>::compute(
-      ctx, d, false, false, T(1), const_xh, w, T(0), icfo);
+      ctx, d, false, false, typename gemm_compute_type<T>::type(1.f), const_xh,
+      w, typename gemm_compute_type<T>::type(0.f), icfo);
   Eigen::array<Eigen::DenseIndex, 2> b_shape({1, b.dimensions()[0]});
   Eigen::array<Eigen::DenseIndex, 2> broadcast_shape({cell.batch_size(), 1});
   icfo.device(d) += b.reshape(b_shape).broadcast(broadcast_shape);
@@ -87,11 +88,11 @@ void LSTMBlockCellFpropWithEigen(
   if (use_peephole) {
     auto f_peep = cs_prev * wcf.reshape(p_shape).broadcast(p_broadcast_shape);
     f.device(d) = (icfo.slice(cell.icfo_f_offsets(), cell.cell_extents()) +
-                   f.constant(forget_bias) + f_peep)
+                   f.constant(T(forget_bias)) + f_peep)
                       .sigmoid();
   } else {
     f.device(d) = (icfo.slice(cell.icfo_f_offsets(), cell.cell_extents()) +
-                   f.constant(forget_bias))
+                   f.constant(T(forget_bias)))
                       .sigmoid();
   }
 
@@ -100,7 +101,7 @@ void LSTMBlockCellFpropWithEigen(
 
   if (cell_clip > 0.0f) {
     cs.device(d) =
-        cs.binaryExpr(cs.constant(cell_clip), Eigen::scalar_clip_op<T>());
+        cs.binaryExpr(cs.constant(T(cell_clip)), Eigen::scalar_clip_op<T>());
   }
 
   // co = tanh(cs)
@@ -225,6 +226,7 @@ void LSTMBlockCellBpropWithEigen(
   template struct LSTMBlockCellBprop<CPUDevice, T, false /* USE_CUBLAS */>;
 
 DEFINE_CPU_SPECS(float);
+DEFINE_CPU_SPECS(Eigen::half);
 #undef DEFINE_CPU_SPECS
 
 }  // namespace functor
@@ -373,7 +375,7 @@ class LSTMBlockCellOp : public OpKernel {
       Name("LSTMBlockCell").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       LSTMBlockCellOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -398,7 +400,6 @@ namespace functor {
 
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
-// DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // end namespace functor
 
@@ -661,7 +662,7 @@ class LSTMBlockCellGradOp : public OpKernel {
       Name("LSTMBlockCellGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       LSTMBlockCellGradOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -1008,7 +1009,7 @@ class BlockLSTMOp : public OpKernel {
       Name("BlockLSTM").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       BlockLSTMOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -1283,7 +1284,7 @@ class BlockLSTMGradOp : public OpKernel {
       Name("BlockLSTMGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       BlockLSTMGradOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
index b664b0f45ee08648e4dc10e8244340df1615ad19..15ae95f13cffa5d1469d737b23f2a83b9e5a694f 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
@@ -141,7 +141,7 @@ __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev,
   //
   const int gid = batch_id * cell_size * 4 + act_id;
   const int cid = batch_id * cell_size + act_id;
-  Eigen::internal::scalar_sigmoid_op<T> sigmoid_op;
+  Eigen::internal::scalar_logistic_op<T> sigmoid_op;
   Eigen::internal::scalar_tanh_op<T> tanh_op;
   Eigen::scalar_clip_op<T> clip_op;
 
@@ -169,7 +169,7 @@ __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev,
   f[cid] = f_local;
 
   T cs_local = i_local * ci_local + f_local * cs_prev[cid];
-  if (cell_clip_t > strict_cast<T>(0.0f)) {
+  if (cell_clip > 0.0f) {
     cs_local = clip_op(cs_local, cell_clip_t);
   }
   cs[cid] = cs_local;
@@ -248,7 +248,8 @@ void LSTMBlockCellFpropWithCUDA(
   // states1 = xh * w
   typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
   TensorBlasGemm<GPUDevice, T, true /* USE_CUBLAS */>::compute(
-      ctx, d, false, false, 1.f, const_xh, w, 0.f, icfo);
+      ctx, d, false, false, typename gemm_compute_type<T>::type(1.f), const_xh,
+      w, typename gemm_compute_type<T>::type(0.f), icfo);
 
   // Add bias, apply non-linearities and gating.
   //
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index 9ce0b399ba173b67285e907a050c71af5d57068c..d5700d2a200f6cdac06183366c0d11ec3531235b 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.rnn.python.kernel_tests import benchmarking
@@ -27,6 +28,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_bitwise_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import rnn
@@ -38,7 +41,70 @@ from tensorflow.python.platform import test
 block_lstm = lstm_ops._block_lstm  # pylint: disable=protected-access
 
 
-def blocks_match(sess, use_peephole):
+class _MaskedRandomUniformInitializer(init_ops.RandomUniform):
+  """Initializer for uniform dist tensors with trailing bits zeroed-out.
+
+  Allow returning tensors with last few mantissa bits set to 0. This potentially
+  helps avoid getting into precision issues when testing low precision (float16)
+  computation.
+  """
+
+  def __init__(self,
+               minval=0,
+               maxval=None,
+               seed=None,
+               dtype=dtypes.float16,
+               num_valid_mantissa_bits=4):
+    """Constructor.
+
+    Args:
+      minval: A python scalar or a scalar tensor. Lower bound of the range of
+        random values to generate.
+      maxval: A python scalar or a scalar tensor. Upper bound of the range of
+        random values to generate.  Defaults to 1 for float types.
+      seed: A Python integer. Used to create random seeds. See
+        `tf.set_random_seed` for behavior.
+      dtype: The data type. Only supports tf.float16 for now.
+      num_valid_mantissa_bits: number of non-zero mantissa bits, default to 4.
+
+    Raises:
+      ValueError: An error if `dtype` is not tf.float16.
+    """
+    if dtype not in (dtypes.float16,):
+      raise ValueError("dtype: %s not supported" % dtype.name)
+
+    super(_MaskedRandomUniformInitializer, self).__init__(
+        minval=minval, maxval=maxval, seed=seed, dtype=dtype)
+    self._num_mantissa_bits = 10
+    self._num_valid_mantissa_bits = num_valid_mantissa_bits
+
+  def __call__(self, shape, dtype=dtypes.float16, partition_info=None):
+    if dtype and dtype != dtypes.float16:
+      raise ValueError("dtype: %s not supported" % dtype.name)
+    res = super(_MaskedRandomUniformInitializer, self).__call__(
+        shape, dtype, partition_info)
+    # get uint16 view of the underlying buffer.
+    res = gen_array_ops.bitcast(res, dtypes.uint16)
+
+    # mask the last `shift` mantissa bits.
+    shift = self._num_mantissa_bits - self._num_valid_mantissa_bits
+    mask = (0xffff >> shift) << shift
+    res = gen_bitwise_ops.bitwise_and(res, mask)
+
+    # restore float16 view.
+    return gen_array_ops.bitcast(res, dtype)
+
+
+def _get_initializer(init_bound, dtype, seed):
+  if dtype == dtypes.float16:
+    return _MaskedRandomUniformInitializer(
+        -init_bound, init_bound, dtype=dtype, seed=seed)
+  else:
+    return init_ops.random_uniform_initializer(
+        -init_bound, init_bound, dtype=dtype, seed=seed)
+
+
+def blocks_match(sess, use_peephole, dtype=dtypes.float32, cell_clip=None):
   batch_size = 2
   input_size = 3
   cell_size = 4
@@ -47,36 +113,42 @@ def blocks_match(sess, use_peephole):
   inputs = []
   for _ in range(sequence_length):
     inp = ops.convert_to_tensor(
-        np.random.randn(batch_size, input_size), dtype=dtypes.float32)
+        np.random.randn(batch_size, input_size), dtype=dtype)
     inputs.append(inp)
   stacked_inputs = array_ops.stack(inputs)
 
-  initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)
+  init_bound = 1e-1 if dtype == dtypes.float16 else 1e-2
+  initializer = _get_initializer(init_bound, dtype=dtype, seed=19890212)
 
   with variable_scope.variable_scope("test", initializer=initializer):
     # magic naming so that the cells pick up these variables and reuse them
     if use_peephole:
       wci = variable_scope.get_variable(
-          "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtypes.float32)
+          "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtype)
       wcf = variable_scope.get_variable(
-          "rnn/lstm_cell/w_f_diag", shape=[cell_size], dtype=dtypes.float32)
+          "rnn/lstm_cell/w_f_diag", shape=[cell_size], dtype=dtype)
       wco = variable_scope.get_variable(
-          "rnn/lstm_cell/w_o_diag", shape=[cell_size], dtype=dtypes.float32)
+          "rnn/lstm_cell/w_o_diag", shape=[cell_size], dtype=dtype)
 
     w = variable_scope.get_variable(
         "rnn/lstm_cell/kernel",
         shape=[input_size + cell_size, cell_size * 4],
-        dtype=dtypes.float32)
+        dtype=dtype)
     b = variable_scope.get_variable(
         "rnn/lstm_cell/bias",
         shape=[cell_size * 4],
-        dtype=dtypes.float32,
+        dtype=dtype,
         initializer=init_ops.zeros_initializer())
 
     basic_cell = rnn_cell.LSTMCell(
-        cell_size, use_peepholes=use_peephole, state_is_tuple=True, reuse=True)
+        cell_size,
+        use_peepholes=use_peephole,
+        cell_clip=cell_clip,
+        dtype=dtype,
+        state_is_tuple=True,
+        reuse=True)
     basic_outputs_op, basic_state_op = rnn.static_rnn(
-        basic_cell, inputs, dtype=dtypes.float32)
+        basic_cell, inputs, dtype=dtype)
 
     if use_peephole:
       _, _, _, _, _, _, block_outputs_op = block_lstm(
@@ -87,7 +159,7 @@ def blocks_match(sess, use_peephole):
           wci=wci,
           wcf=wcf,
           wco=wco,
-          cell_clip=0,
+          cell_clip=cell_clip,
           use_peephole=True)
     else:
       _, _, _, _, _, _, block_outputs_op = block_lstm(
@@ -95,13 +167,15 @@ def blocks_match(sess, use_peephole):
           inputs,
           w,
           b,
-          cell_clip=0)
+          cell_clip=cell_clip)
 
     fused_cell = lstm_ops.LSTMBlockFusedCell(
-        cell_size, cell_clip=0, use_peephole=use_peephole, reuse=True,
+        cell_size,
+        cell_clip=cell_clip,
+        use_peephole=use_peephole,
+        reuse=True,
         name="rnn/lstm_cell")
-    fused_outputs_op, fused_state_op = fused_cell(
-        stacked_inputs, dtype=dtypes.float32)
+    fused_outputs_op, fused_state_op = fused_cell(stacked_inputs, dtype=dtype)
 
     sess.run([variables.global_variables_initializer()])
     basic_outputs, basic_state = sess.run([basic_outputs_op, basic_state_op[0]])
@@ -127,7 +201,19 @@ def blocks_match(sess, use_peephole):
             block_wgrads, fused_wgrads)
 
 
-class LSTMBlockCellTest(test.TestCase):
+class LSTMBlockCellTest(test.TestCase, parameterized.TestCase):
+
+  TEST_CASES = ({
+      "testcase_name": "Fp32",
+      "dtype": dtypes.float32,
+      "rtol": 1e-6,
+      "atol": 1e-6
+  }, {
+      "testcase_name": "Fp16",
+      "dtype": dtypes.float16,
+      "rtol": 8e-3,
+      "atol": 8e-4
+  })
 
   def testNoneDimsWithDynamicRNN(self):
     with self.session(use_gpu=True, graph=ops.Graph()) as sess:
@@ -314,41 +400,43 @@ class LSTMBlockCellTest(test.TestCase):
       for basic, block in zip(basic_res, block_res):
         self.assertAllClose(basic, block)
 
-  def testLSTMBasicToBlock(self):
-    with self.session(use_gpu=True) as sess:
+  def LSTMBasicToBlockTestHelper(self,
+                                 dtype=dtypes.float32,
+                                 use_peephole=False,
+                                 cell_clip=None,
+                                 rtol=1e-6,
+                                 atol=1e-6):
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       (basic_state, fused_state, basic_outputs, block_outputs, fused_outputs,
        basic_grads, block_grads, fused_grads, basic_wgrads, block_wgrads,
        fused_wgrads) = blocks_match(
-           sess, use_peephole=False)
+           sess, use_peephole=use_peephole, dtype=dtype, cell_clip=cell_clip)
 
-      self.assertAllClose(basic_outputs, block_outputs)
-      self.assertAllClose(basic_grads, block_grads)
+      self.assertAllClose(basic_outputs, block_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(basic_grads, block_grads, rtol=rtol, atol=atol)
       for basic, block in zip(basic_wgrads, block_wgrads):
-        self.assertAllClose(basic, block, rtol=1e-6, atol=1e-6)
+        self.assertAllClose(basic, block, rtol=rtol, atol=atol)
 
-      self.assertAllClose(basic_outputs, fused_outputs)
-      self.assertAllClose(basic_state, fused_state)
-      self.assertAllClose(basic_grads, fused_grads)
-      for basic, fused in zip(block_wgrads, fused_wgrads):
-        self.assertAllClose(basic, fused, rtol=1e-6, atol=1e-6)
+      self.assertAllClose(basic_outputs, fused_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(basic_state, fused_state, rtol=rtol, atol=atol)
+      self.assertAllClose(basic_grads, fused_grads, rtol=rtol, atol=atol)
+      for basic, fused in zip(basic_wgrads, fused_wgrads):
+        self.assertAllClose(basic, fused, rtol=rtol, atol=atol)
 
-  def testLSTMBasicToBlockPeeping(self):
-    with self.session(use_gpu=True) as sess:
-      (basic_state, fused_state, basic_outputs, block_outputs, fused_outputs,
-       basic_grads, block_grads, fused_grads, basic_wgrads, block_wgrads,
-       fused_wgrads) = blocks_match(
-           sess, use_peephole=True)
+  @parameterized.named_parameters(*TEST_CASES)
+  def testLSTMBasicToBlock(self, dtype, rtol, atol):
+    self.LSTMBasicToBlockTestHelper(
+        dtype, use_peephole=False, rtol=rtol, atol=atol)
 
-      self.assertAllClose(basic_outputs, block_outputs)
-      self.assertAllClose(basic_grads, block_grads)
-      for basic, block in zip(basic_wgrads, block_wgrads):
-        self.assertAllClose(basic, block, rtol=1e-6, atol=1e-6)
+  @parameterized.named_parameters(*TEST_CASES)
+  def testLSTMBasicToBlockPeeping(self, dtype, rtol, atol):
+    self.LSTMBasicToBlockTestHelper(
+        dtype, use_peephole=True, rtol=rtol, atol=atol)
 
-      self.assertAllClose(basic_outputs, fused_outputs)
-      self.assertAllClose(basic_state, fused_state)
-      self.assertAllClose(basic_grads, fused_grads)
-      for basic, fused in zip(block_wgrads, fused_wgrads):
-        self.assertAllClose(basic, fused, rtol=1e-6, atol=1e-6)
+  @parameterized.named_parameters(*TEST_CASES)
+  def testLSTMBasicToBlockCellClip(self, dtype, rtol, atol):
+    self.LSTMBasicToBlockTestHelper(
+        dtype, use_peephole=True, cell_clip=0.5, rtol=rtol, atol=atol)
 
   def testLSTMFusedSequenceLengths(self):
     """Verify proper support for sequence lengths in LSTMBlockFusedCell."""
@@ -444,16 +532,21 @@ class BenchmarkLSTMBlock(test.Benchmark):
         "batch_size": [1, 8, 13, 32, 67, 128],
         "cell_size": [128, 250, 512, 650, 1024, 1350],
         "time_steps": [40],
-        "use_gpu": [True, False]
+        "use_gpu": [True, False],
+        "dtype": ["float32", "float16"],
     }):
+      dtype = dtypes.float32 if config["dtype"] == "float32" else dtypes.float16
       with ops.Graph().as_default():
         with benchmarking.device(use_gpu=config["use_gpu"]):
           inputs = variable_scope.get_variable(
               "x",
-              [config["time_steps"], config["batch_size"], config["cell_size"]])
-          cell = lstm_ops.LSTMBlockCell(config["cell_size"])
-          outputs = rnn.dynamic_rnn(
-              cell, inputs, time_major=True, dtype=dtypes.float32)
+              dtype=dtype,
+              shape=[
+                  config["time_steps"], config["batch_size"],
+                  config["cell_size"]
+              ])
+          cell = lstm_ops.LSTMBlockCell(config["cell_size"], dtype=dtype)
+          outputs = rnn.dynamic_rnn(cell, inputs, time_major=True, dtype=dtype)
           init_op = variables.global_variables_initializer()
 
         with session.Session() as sess:
@@ -464,12 +557,14 @@ class BenchmarkLSTMBlock(test.Benchmark):
         # is set, this will produce a copy-paste-able CSV file.
         print(",".join(
             map(str, [
-                config["batch_size"], config["cell_size"], config["cell_size"],
-                config["time_steps"], config["use_gpu"], wall_time
+                config["dtype"], config["batch_size"], config["cell_size"],
+                config["cell_size"], config["time_steps"], config["use_gpu"],
+                wall_time
             ])))
         benchmark_name_template = "_".join([
-            "LSTMBlockCell_fprop", "BS%(batch_size)i", "CS%(cell_size)i",
-            "IS%(cell_size)i", "TS%(time_steps)i", "gpu_%(use_gpu)s"
+            "LSTMBlockCell_fprop", "DT_%(dtype)s", "BS%(batch_size)i",
+            "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
+            "gpu_%(use_gpu)s"
         ])
 
         self.report_benchmark(
@@ -488,8 +583,10 @@ class BenchmarkLSTMBlock(test.Benchmark):
         "batch_size": [1, 8, 13, 32, 67, 128],
         "cell_size": [128, 250, 512, 650, 1024, 1350],
         "time_steps": [40],
-        "use_gpu": [True, False]
+        "use_gpu": [True, False],
+        "dtype": ["float32", "float16"],
     }):
+      dtype = dtypes.float32 if config["dtype"] == "float32" else dtypes.float16
       with ops.Graph().as_default():
         with benchmarking.device(use_gpu=config["use_gpu"]):
           time_steps = config["time_steps"]
@@ -498,21 +595,21 @@ class BenchmarkLSTMBlock(test.Benchmark):
           inputs = variable_scope.get_variable(
               "x", [time_steps, batch_size, cell_size],
               trainable=False,
-              dtype=dtypes.float32)
+              dtype=dtype)
           with variable_scope.variable_scope(
               "rnn", reuse=variable_scope.AUTO_REUSE):
             w = variable_scope.get_variable(
                 "rnn/lstm_cell/kernel",
                 shape=[input_size + cell_size, cell_size * 4],
-                dtype=dtypes.float32)
+                dtype=dtype)
             b = variable_scope.get_variable(
                 "rnn/lstm_cell/bias",
                 shape=[cell_size * 4],
-                dtype=dtypes.float32,
+                dtype=dtype,
                 initializer=init_ops.zeros_initializer())
-            cell = lstm_ops.LSTMBlockCell(cell_size)
+            cell = lstm_ops.LSTMBlockCell(cell_size, dtype=dtype)
             outputs = rnn.dynamic_rnn(
-                cell, inputs, time_major=True, dtype=dtypes.float32)
+                cell, inputs, time_major=True, dtype=dtype)
           grads = gradients_impl.gradients(outputs, [inputs, w, b])
           init_op = variables.global_variables_initializer()
 
@@ -524,12 +621,13 @@ class BenchmarkLSTMBlock(test.Benchmark):
         # is set, this will produce a copy-paste-able CSV file.
         print(",".join(
             map(str, [
-                batch_size, cell_size, cell_size, time_steps, config["use_gpu"],
-                wall_time
+                config["dtype"], batch_size, cell_size, cell_size, time_steps,
+                config["use_gpu"], wall_time
             ])))
         benchmark_name_template = "_".join([
-            "LSTMBlockCell_bprop", "BS%(batch_size)i", "CS%(cell_size)i",
-            "IS%(cell_size)i", "TS%(time_steps)i", "gpu_%(use_gpu)s"
+            "LSTMBlockCell_bprop", "DT_%(dtype)s", "BS%(batch_size)i",
+            "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
+            "gpu_%(use_gpu)s"
         ])
 
         self.report_benchmark(
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index b30ca7882fce1747cb1dcb27f97f5b012ff9da02..251a933eaec826b08266123245d9aef8573d3e06 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -21,7 +21,7 @@ from tensorflow.contrib.rnn.ops import gen_gru_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -165,7 +165,7 @@ class GRUBlockCell(LayerRNNCell):
       num_units = cell_size
     self._cell_size = num_units
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 2f65c73470110922081166e067443f9e7a6c0596..b043026bc556a8879b15b432829baf8136250c0e 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.rnn.ops import gen_lstm_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -156,7 +157,7 @@ def _block_lstm(seq_len_max,
 
   Args:
     seq_len_max: A `Tensor` of type `int64`.
-    x: A list of at least 1 `Tensor` objects of the same type in: `float32`.
+    x: A list of at least 1 `Tensor` objects of the same type.
     w: A `Tensor`. Must have the same type as `x`.
     b: A `Tensor`. Must have the same type as `x`.
     cs_prev: A `Tensor`. Must have the same type as `x`.
@@ -189,6 +190,7 @@ def _block_lstm(seq_len_max,
   Raises:
     ValueError: If `b` does not have a valid shape.
   """
+  dtype = x[0].dtype
   batch_size = x[0].get_shape().with_rank(2).dims[0].value
   cell_size4 = b.get_shape().with_rank(1).dims[0].value
   if cell_size4 is None:
@@ -197,13 +199,13 @@ def _block_lstm(seq_len_max,
   zero_state = None
   if cs_prev is None or h_prev is None:
     zero_state = array_ops.constant(
-        0, dtype=dtypes.float32, shape=[batch_size, cell_size])
+        0, dtype=dtype, shape=[batch_size, cell_size])
   if cs_prev is None:
     cs_prev = zero_state
   if h_prev is None:
     h_prev = zero_state
   if wci is None:
-    wci = array_ops.constant(0, dtype=dtypes.float32, shape=[cell_size])
+    wci = array_ops.constant(0, dtype=dtype, shape=[cell_size])
     wcf = wci
     wco = wci
 
@@ -384,7 +386,7 @@ class LSTMBlockCell(LayerRNNCell):
         "scope": "lstm_cell"
     }
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -627,7 +629,7 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
     self._use_peephole = use_peephole
 
     # Inputs must be 3-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=3)
+    self.input_spec = input_spec.InputSpec(ndim=3)
 
   @property
   def num_units(self):
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index e159dc95796e8f02287a4b6db4d25023348fe8da..8a1c09f171e6108174671e3122d5ff4c0b236003 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_array_ops
@@ -2752,7 +2752,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     self._activation = activation or math_ops.tanh
 
     # Restrict inputs to be 2-dimensional matrices
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -3089,7 +3089,7 @@ class IndRNNCell(rnn_cell_impl.LayerRNNCell):
     super(IndRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -3183,7 +3183,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     super(IndyGRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -3323,7 +3323,7 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     super(IndyLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._forget_bias = forget_bias
@@ -3444,7 +3444,7 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
     super(MinimalRNNCell, self).__init__(name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self.units = units
     self.activation = activations.get(activation)
@@ -3558,7 +3558,7 @@ class CFNCell(rnn_cell_impl.LayerRNNCell):
     super(CFNCell, self).__init__(name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self.units = units
     self.activation = activations.get(activation)
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 291ff83791c7cded2dccc4719bb12e84f00afa42..269443b2c6508bb618d30f64487b1a6a84e8646f 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -82,7 +82,6 @@ py_library(
     name = "keras_saved_model",
     srcs = ["python/saved_model/keras_saved_model.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/python:array_ops",
@@ -103,7 +102,10 @@ py_test(
     size = "medium",
     srcs = ["python/saved_model/keras_saved_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_oss",  # TODO(b/119349471): Re-enable
+        "no_windows",
+    ],
     deps = [
         ":keras_saved_model",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index 6aae4bc5e2981ca4e36e434e577a35a5ac531bba..4c8db94d6f48749d880da284d18aa5a7879b1494 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -19,17 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import six
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import keras as estimator_keras_util
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export as export_helpers
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import models as models_lib
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.metrics import Metric
 from tensorflow.python.keras.models import model_from_json
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
@@ -276,42 +277,40 @@ def _create_signature_def_map(model, mode):
     inputs_dict.update(targets_dict)
   outputs_dict = {name: x
                   for name, x in zip(model.output_names, model.outputs)}
+  metrics = estimator_keras_util._convert_keras_metrics_to_estimator(model)
+
+  # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
+  # are by default not added to any collections. We are doing this here, so
+  # that metric variables get initialized.
+  local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+  vars_to_add = set()
+  if metrics is not None:
+    for key, value in six.iteritems(metrics):
+      if isinstance(value, Metric):
+        vars_to_add.update(value.variables)
+        # Convert Metric instances to (value_tensor, update_op) tuple.
+        metrics[key] = (value.result(), value.updates[0])
+  # Remove variables that are in the local variables collection already.
+  vars_to_add = vars_to_add.difference(local_vars)
+  for v in vars_to_add:
+    ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
+
   export_outputs = model_fn_lib.export_outputs_for_mode(
       mode,
       predictions=outputs_dict,
       loss=model.total_loss if model.optimizer else None,
-      metrics=estimator_keras_util._convert_keras_metrics_to_estimator(model))
+      metrics=metrics)
   return export_helpers.build_all_signature_defs(
       inputs_dict,
       export_outputs=export_outputs,
       serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
 
 
-def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):
+def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
   """Assert model and clone contain the same checkpointable objects."""
 
-  def get_non_optimizer_objects(m, g):
-    """Gather set of model and optimizer checkpointable objects."""
-    # Set default graph because optimizer.variables() returns optimizer
-    # variables defined in the default graph.
-    with g.as_default():
-      all_objects = set(checkpointable_utils.list_objects(m))
-      optimizer_and_variables = set()
-      for obj in all_objects:
-        if isinstance(obj, optimizers.TFOptimizer):
-          optimizer_and_variables.update(checkpointable_utils.list_objects(obj))
-          optimizer_and_variables.update(set(obj.optimizer.variables()))
-      return all_objects - optimizer_and_variables
-
-  model_objects = get_non_optimizer_objects(model, model_graph)
-  clone_objects = get_non_optimizer_objects(clone, clone_graph)
-
-  if len(model_objects) != len(clone_objects):
-    raise errors.InternalError(
-        None, None,
-        'Model and clone must use the same variables.'
-        '\n\tModel variables: %s\n\t Clone variables: %s'
-        % (model_objects, clone_objects))
+  # TODO(fchollet, kathywu): make sure this works in eager mode.
+  return True
 
 
 def load_keras_model(saved_model_path):
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
index 364b65e06a3cdccc5ec23ddca2403bb28e38598e..d8637effe2ba88689d591482b067ac6f4a1683c1 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
@@ -29,7 +29,6 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
@@ -150,8 +149,6 @@ class TestModelSavingandLoading(test.TestCase):
       x = np.random.random((1, 3))
       y = np.random.random((1, 3))
       model.train_on_batch(x, y)
-      model.train_on_batch(x, y)
-
       ref_y = model.predict(x)
 
       temp_saved_model = self._save_model_dir()
@@ -308,6 +305,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       self, model_builder, uses_learning_phase, optimizer, train_before_export):
     saved_model_path = self._save_model_dir()
     with self.session(graph=ops.Graph()):
+      np.random.seed(130)
       input_arr = np.random.random((1, 3))
       target_arr = np.random.random((1, 3))
 
@@ -346,16 +344,24 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
         inputs, outputs = load_model(sess, output_path,
                                      model_fn_lib.ModeKeys.EVAL)
 
-        eval_results = sess.run(outputs, {inputs[input_name]: input_arr,
-                                          inputs[target_name]: target_arr})
+        # First obtain the loss and predictions, and run the metric update op by
+        # feeding in the inputs and targets.
+        loss, predictions, _ = sess.run(
+            (outputs['loss'], outputs['predictions/' + output_name],
+             outputs['metrics/mean_absolute_error/update_op']), {
+                 inputs[input_name]: input_arr,
+                 inputs[target_name]: target_arr
+             })
+
+        # The metric value should be run after the update op, to ensure that it
+        # reflects the correct value.
+        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
 
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
-        self.assertAllClose(ref_loss, eval_results['loss'], atol=1e-05)
-        self.assertAllClose(
-            ref_mae, eval_results['metrics/mae/update_op'], atol=1e-05)
-        self.assertAllClose(
-            ref_predict, eval_results['predictions/' + output_name], atol=1e-05)
+        self.assertAllClose(ref_loss, loss, atol=1e-05)
+        self.assertAllClose(ref_mae, metric_value, atol=1e-05)
+        self.assertAllClose(ref_predict, predictions, atol=1e-05)
 
       # Load train graph, and check for the train op, and prediction values
       with session.Session(graph=ops.Graph()) as sess:
@@ -364,8 +370,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
         self.assertIn('loss', outputs)
-        self.assertIn('metrics/mae/update_op', outputs)
-        self.assertIn('metrics/mae/value', outputs)
+        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
+        self.assertIn('metrics/mean_absolute_error/value', outputs)
         self.assertIn('predictions/' + output_name, outputs)
 
         # Train for a step
@@ -458,11 +464,6 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
       clone.train_on_batch(input_arr, target_arr)
 
-    with self.assertRaisesRegexp(
-        errors.InternalError, 'Model and clone must use the same variables.'):
-      keras_saved_model._assert_same_non_optimizer_objects(
-          model, model_graph, clone, clone_graph)
-
   def testSaveSeqModelWithoutInputShapesRaisesError(self):
     """A Sequential model that hasn't been built should raise an error."""
     model = sequential_model_without_input_shape(True)
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index 6bd58c4d322c04d4d14d04678e24a05c0f876208..5e4f130b31483204a111e2f778fa5d0fc4526fea 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -4,129 +4,11 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
-load("//tensorflow:tensorflow.bzl", "py_test")  # @unused
-
 py_library(
     name = "signal_py",
-    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:tensor_util",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "test_util",
-    srcs = ["python/kernel_tests/test_util.py"],
+    srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:tf_optimizer",
-        "//tensorflow/python:training",
-    ],
-)
-
-cuda_py_tests(
-    name = "mel_ops_test",
-    srcs = ["python/kernel_tests/mel_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        ":test_util",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-cuda_py_tests(
-    name = "mfcc_ops_test",
-    srcs = ["python/kernel_tests/mfcc_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-)
-
-cuda_py_tests(
-    name = "reconstruction_ops_test",
-    srcs = ["python/kernel_tests/reconstruction_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_tests(
-    name = "shape_ops_test",
-    srcs = ["python/kernel_tests/shape_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        ":test_util",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_tests(
-    name = "spectral_ops_test",
-    size = "large",
-    srcs = ["python/kernel_tests/spectral_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-    tags = ["nomac"],
-)
-
-cuda_py_tests(
-    name = "window_ops_test",
-    srcs = ["python/kernel_tests/window_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        ":test_util",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python/ops/signal",
     ],
 )
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index d088e744346aac0aa8675b95d7b792379fc7b019..d01f5ccf51c132082a419ec7db49045ef8bab725 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -14,6 +14,9 @@
 # ==============================================================================
 """Signal processing operations.
 
+`tf.contrib.signal` has been renamed to `tf.signal`. `tf.contrib.signal` will be
+removed in TensorFlow 2.0.
+
 See the
 [Contrib Signal](https://tensorflow.org/api_guides/python/contrib.signal)
 guide.
@@ -39,18 +42,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops.mel_ops import linear_to_mel_weight_matrix
-from tensorflow.contrib.signal.python.ops.mfcc_ops import mfccs_from_log_mel_spectrograms
-from tensorflow.contrib.signal.python.ops.reconstruction_ops import overlap_and_add
-from tensorflow.contrib.signal.python.ops.shape_ops import frame
+from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix
+from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms
+from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add
+from tensorflow.python.ops.signal.shape_ops import frame
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn
+from tensorflow.python.ops.signal.spectral_ops import stft
+from tensorflow.python.ops.signal.window_ops import hamming_window
+from tensorflow.python.ops.signal.window_ops import hann_window
+
+from tensorflow.python.util.all_util import remove_undocumented
+
 # `frame` used to be named `frames`, which is a noun and not a verb.
 # Keep an alias to `frames` for backwards compatibility.
-from tensorflow.contrib.signal.python.ops.shape_ops import frame as frames
-from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft
-from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft_window_fn
-from tensorflow.contrib.signal.python.ops.spectral_ops import stft
-from tensorflow.contrib.signal.python.ops.window_ops import hamming_window
-from tensorflow.contrib.signal.python.ops.window_ops import hann_window
+frames = frame
 
-from tensorflow.python.util.all_util import remove_undocumented
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 42898e797cc351e3de290cc65fc825f1406c739d..605625c3059868d349da015b8286d219691fc255 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -79,6 +79,7 @@ from tensorflow.python.ops.summary_ops_v2 import image
 from tensorflow.python.ops.summary_ops_v2 import import_event
 from tensorflow.python.ops.summary_ops_v2 import initialize
 from tensorflow.python.ops.summary_ops_v2 import never_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import record_summaries
 from tensorflow.python.ops.summary_ops_v2 import record_summaries_every_n_global_steps
 from tensorflow.python.ops.summary_ops_v2 import scalar
 from tensorflow.python.ops.summary_ops_v2 import should_record_summaries
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 4d1807130c57039976dfa57c27bb0d4807e75212..10e4556dacbc17ec02c2bd698389b04d517d7076 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -152,6 +152,27 @@ class EagerFileTest(test_util.TensorFlowTestCase):
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
+  def testRecordEveryNGlobalSteps(self):
+    step = training_util.get_or_create_global_step()
+    logdir = tempfile.mkdtemp()
+
+    def run_step():
+      summary_ops.scalar('scalar', i, step=step)
+      step.assign_add(1)
+
+    with summary_ops.create_file_writer(
+        logdir).as_default(), summary_ops.record_summaries_every_n_global_steps(
+            2, step):
+      for i in range(10):
+        run_step()
+      # And another 10 steps as a graph function.
+      run_step_fn = function.defun(run_step)
+      for i in range(10):
+        run_step_fn()
+
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(len(events), 11)
+
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
@@ -279,12 +300,9 @@ class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testDbURIOpen(self):
     tmpdb_path = os.path.join(self.get_temp_dir(), 'tmpDbURITest.sqlite')
-    tmpdb_uri = six.moves.urllib_parse.urljoin("file:", tmpdb_path)
-    tmpdb_writer = summary_ops.create_db_writer(
-        tmpdb_uri,
-        "experimentA",
-        "run1",
-        "user1")
+    tmpdb_uri = six.moves.urllib_parse.urljoin('file:', tmpdb_path)
+    tmpdb_writer = summary_ops.create_db_writer(tmpdb_uri, 'experimentA',
+                                                'run1', 'user1')
     with summary_ops.always_record_summaries():
       with tmpdb_writer.as_default():
         summary_ops.scalar('t1', 2.0)
diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
index 596c59ead3460aa63eeff44d5a11a4a8c5cde0da..290c16fe3966791ea78986539750caf938a37322 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.tensor_forest.python.ops import gen_model_ops
 
 # pylint: disable=unused-import
@@ -28,10 +30,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import update_mod
 # pylint: enable=unused-import
 
 from tensorflow.contrib.util import loader
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 
 _model_ops = loader.load_op_library(
@@ -88,6 +92,59 @@ class TreeVariableSavable(saver.BaseSaverBuilder.SaveableObject):
           params=self.params.serialized_params_proto)
 
 
+class TreeVariable(tracking.TrackableResource):
+  """A tree model."""
+
+  def __init__(self, params, tree_config, stats_handle, name, container=None):
+    self._params = params
+    self._tree_config = tree_config
+    self._stats_handle = stats_handle
+    self._name = name
+    self._container = container
+    self._init_op = None
+    super(TreeVariable, self).__init__()
+    self._resource_handle = self.create_resource()
+
+  def create_resource(self):
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      shared_name = "tree_variable_%d" % (ops.uid(),)
+    else:
+      shared_name = self._name
+    return gen_model_ops.decision_tree_resource_handle_op(
+        self._container, shared_name=shared_name, name=self._name)
+
+  def initialize(self):
+    return gen_model_ops.create_tree_variable(
+        self.resource_handle,
+        self._tree_config,
+        params=self._params.serialized_params_proto)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_model_ops.tree_is_initialized_op(self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {
+        "tree_variable":
+            functools.partial(
+                TreeVariableSavable,
+                params=self._params,
+                tree_handle=self.resource_handle,
+                stats_handle=self._stats_handle,
+                create_op=self._init_op)
+    }
+
+
 def tree_variable(params, tree_config, stats_handle, name, container=None):
   r"""Creates a tree model and returns a handle to it.
 
@@ -102,18 +159,13 @@ def tree_variable(params, tree_config, stats_handle, name, container=None):
     A `Tensor` of type mutable `string`. The handle to the tree.
   """
   with ops.name_scope(name, "TreeVariable") as name:
-    resource_handle = gen_model_ops.decision_tree_resource_handle_op(
-        container, shared_name=name, name=name)
-
-    create_op = gen_model_ops.create_tree_variable(
-        resource_handle,
-        tree_config,
-        params=params.serialized_params_proto)
-    is_initialized_op = gen_model_ops.tree_is_initialized_op(resource_handle)
+    tree_var = TreeVariable(params, tree_config, stats_handle, name, container)
+    resource_handle = tree_var.resource_handle
+    create_op = tree_var.initializer
+    is_initialized_op = tree_var.is_initialized()
     # Adds the variable to the savable list.
-    saveable = TreeVariableSavable(params, resource_handle, stats_handle,
-                                   create_op,
-                                   resource_handle.name)
+    saveable = tree_var._gather_saveables_for_checkpoint()["tree_variable"](  # pylint: disable=protected-access
+        name=resource_handle.name)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     resources.register_resource(resource_handle, create_op, is_initialized_op)
     return resource_handle
diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
index 44d486edecc4e4f7ba8a9b6d680178298813621b..9184198cd4c8fd2a7609714d094d5ef2b6868658 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.tensor_forest.python.ops import gen_stats_ops
 # pylint: disable=unused-import
 from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import finalize_tree
@@ -25,10 +27,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import process_in
 # pylint: enable=unused-import
 
 from tensorflow.contrib.util import loader
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 
 _stats_ops = loader.load_op_library(
@@ -84,8 +88,58 @@ class FertileStatsVariableSavable(saver.BaseSaverBuilder.SaveableObject):
           params=self.params.serialized_params_proto)
 
 
-def fertile_stats_variable(params, stats_config, name,
-                           container=None):
+class FertileStatsVariable(tracking.TrackableResource):
+  """A Fertile stats variable."""
+
+  def __init__(self, params, stats_config, name, container=None):
+    self._params = params
+    self._stats_config = stats_config
+    self._name = name
+    self._container = container
+    self._init_op = None
+    super(FertileStatsVariable, self).__init__()
+    self._resource_handle = self.create_resource()
+
+  def create_resource(self):
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      shared_name = "fertile_stats_variable_%d" % (ops.uid(),)
+    else:
+      shared_name = self._name
+    return gen_stats_ops.fertile_stats_resource_handle_op(
+        self._container, shared_name=shared_name, name=self._name)
+
+  def initialize(self):
+    return gen_stats_ops.create_fertile_stats_variable(
+        self.resource_handle,
+        self._stats_config,
+        params=self._params.serialized_params_proto)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_stats_ops.fertile_stats_is_initialized_op(self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {
+        "fertile_stats_variable":
+            functools.partial(
+                FertileStatsVariableSavable,
+                params=self._params,
+                stats_handle=self.resource_handle,
+                create_op=self.initializer)
+    }
+
+
+def fertile_stats_variable(params, stats_config, name, container=None):
   r"""Creates a stats object and returns a handle to it.
 
   Args:
@@ -98,17 +152,15 @@ def fertile_stats_variable(params, stats_config, name,
     A `Tensor` of type mutable `string`. The handle to the stats.
   """
   with ops.name_scope(name, "FertileStatsVariable") as name:
-    resource_handle = gen_stats_ops.fertile_stats_resource_handle_op(
-        container, shared_name=name, name=name)
-
-    create_op = gen_stats_ops.create_fertile_stats_variable(
-        resource_handle, stats_config,
-        params=params.serialized_params_proto)
-    is_initialized_op = gen_stats_ops.fertile_stats_is_initialized_op(
-        resource_handle)
+    fertile_stats_var = FertileStatsVariable(params, stats_config, name,
+                                             container)
+    resource_handle = fertile_stats_var.resource_handle
+    create_op = fertile_stats_var.initializer
+    is_initialized_op = fertile_stats_var.is_initialized()
     # Adds the variable to the savable list.
-    saveable = FertileStatsVariableSavable(params, resource_handle, create_op,
-                                           resource_handle.name)
+    saveable = (
+        fertile_stats_var._gather_saveables_for_checkpoint()[  # pylint: disable=protected-access
+            "fertile_stats_variable"](name=resource_handle.name))
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     resources.register_resource(resource_handle, create_op, is_initialized_op)
     return resource_handle
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 3f24f58f03aac2ba6d368d7eccf8731f611a81b4..22b6f09d0cd88068f7bedabe7687920420a3028f 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -73,7 +73,16 @@ class SummaryFileWriter : public SummaryWriterInterface {
     e->set_step(global_step);
     e->set_wall_time(GetWallTime());
     Summary::Value* v = e->mutable_summary()->add_value();
-    t.AsProtoTensorContent(v->mutable_tensor());
+
+    if (t.dtype() == DT_STRING) {
+      // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python
+      // can convert the TensorProto to string-type numpy array. MakeNdarray
+      // does not work with strings encoded by AsProtoTensorContent() in
+      // tensor_content.
+      t.AsProtoField(v->mutable_tensor());
+    } else {
+      t.AsProtoTensorContent(v->mutable_tensor());
+    }
     v->set_tag(tag);
     if (!serialized_metadata.empty()) {
       v->mutable_metadata()->ParseFromString(serialized_metadata);
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index cd3f712256f2293ed725745f8cbe48109856ef86..ffbfb9533e887e54b0f5bdfde11dadce21073a94 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -104,6 +105,23 @@ TEST_F(SummaryFileWriterTest, WriteTensor) {
                                   CHECK_EQ(e.summary().value_size(), 1);
                                   EXPECT_EQ(e.summary().value(0).tag(), "name");
                                 }));
+  TF_CHECK_OK(SummaryTestHelper(
+      "string_tensor_test",
+      [](SummaryWriterInterface* writer) {
+        Tensor hello(DT_STRING, TensorShape({}));
+        hello.scalar<string>()() = "hello";
+        TF_RETURN_IF_ERROR(writer->WriteTensor(
+            2, hello, "name", SummaryMetadata().SerializeAsString()));
+        TF_RETURN_IF_ERROR(writer->Flush());
+        return Status::OK();
+      },
+      [](const Event& e) {
+        EXPECT_EQ(e.step(), 2);
+        CHECK_EQ(e.summary().value_size(), 1);
+        EXPECT_EQ(e.summary().value(0).tag(), "name");
+        EXPECT_EQ(e.summary().value(0).tensor().dtype(), DT_STRING);
+        EXPECT_EQ(e.summary().value(0).tensor().string_val()[0], "hello");
+      }));
 }
 
 TEST_F(SummaryFileWriterTest, WriteScalar) {
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index f30c31a78925801da69ebf4e950d70b018cb15d2..d304d72c6aa4092b7f6afdd6859847bd37c93e95 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -29,6 +29,10 @@ load(
     "if_tensorrt",
 )
 
+exports_files(glob([
+    "test/testdata/*",
+]))
+
 tf_cuda_cc_test(
     name = "tensorrt_test_cc",
     size = "small",
@@ -550,6 +554,30 @@ cuda_py_tests(
     ],
 )
 
+cuda_py_test(
+    name = "quantization_mnist_test",
+    srcs = ["test/quantization_mnist_test.py"],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/keras:keras",
+        "//tensorflow/python/estimator:estimator",
+    ],
+    data = [
+        "test/testdata/checkpoint",
+        "test/testdata/model.ckpt-46900.data-00000-of-00001",
+        "test/testdata/model.ckpt-46900.index",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+        "no_tap",  # It is not able to download the mnist data.
+        "no_windows",
+        "nomac",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["convert/utils.cc"],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index b1443e7791d6d3c5ed8952d29df12ede77fe8c23..f6d44cb719123ac55ea8c56c34d157e87e244626 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -82,81 +82,79 @@ std::vector<int> GetLoadedTensorRTVersion() {
 }
 
 TrtCandidateSelector::TrtCandidateSelector(
-    const grappler::GraphProperties& graph_properties,
-    int precision_mode)
+    const grappler::GraphProperties& graph_properties, int precision_mode)
     : graph_properties_(graph_properties), precision_mode_(precision_mode) {}
 
 Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // TODO(laigd): move this set to TrtNodeValidator where it should belong.
   // LINT.IfChange
   static const std::set<string> candidate_ops = {
-    "Identity",
-    "Snapshot",
-    "Const",
-    "Conv2D",
-    "MaxPool",
-    "BiasAdd",
-    "Relu",
-    "Add",
-    "Mul",
-    "Sub",
-    "Rsqrt",
-    "Pad",
-    "Mean",
-    "AvgPool",
-    "ConcatV2",
-    "DepthwiseConv2dNative",
-    "FusedBatchNorm",
-    "FusedBatchNormV2",
-    "Div",
-    "RealDiv",
-    "Rsqrt",
-    "Reciprocal",
-    "Exp",
-    "Log",
-    "Sqrt",
-    "Abs",
-    "Neg",
-    "Transpose",
-    "Reshape",
-    "MatMul",
-    "BatchMatMul",
-    "Softmax",
-    "Minimum",
-    "Maximum",
-    "TopKV2",
-    "Sum",
-    "Prod",
-    "Max",
-    "Min",
-    "Relu6",
+      "Identity",
+      "Snapshot",
+      "Const",
+      "Conv2D",
+      "MaxPool",
+      "BiasAdd",
+      "Relu",
+      "Add",
+      "Mul",
+      "Sub",
+      "Rsqrt",
+      "Pad",
+      "Mean",
+      "AvgPool",
+      "ConcatV2",
+      "DepthwiseConv2dNative",
+      "FusedBatchNorm",
+      "FusedBatchNormV2",
+      "Div",
+      "RealDiv",
+      "Rsqrt",
+      "Reciprocal",
+      "Exp",
+      "Log",
+      "Sqrt",
+      "Abs",
+      "Neg",
+      "Transpose",
+      "Reshape",
+      "MatMul",
+      "BatchMatMul",
+      "Softmax",
+      "Minimum",
+      "Maximum",
+      "TopKV2",
+      "Sum",
+      "Prod",
+      "Max",
+      "Min",
+      "Relu6",
   };
-  bool is_supported_op_type = (candidate_ops.count(node->type_string()) ||
-      PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
-#if NV_TENSORRT_MAJOR >= 5
+  bool is_supported_op_type =
+      (candidate_ops.count(node->type_string()) ||
+       PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
   static const std::set<string> quantize_ops = {
-    "QuantizeAndDequantizeV2",
-    "QuantizeAndDequantizeV3",
-    "FakeQuantWithMinMaxVars",
-    "FakeQuantWithMinMaxArgs",
+      "QuantizeAndDequantizeV2",
+      "QuantizeAndDequantizeV3",
+      "FakeQuantWithMinMaxVars",
+      "FakeQuantWithMinMaxArgs",
   };
   // In INT8 mode, we will always apply the quantization ranges provided by
   // these ops to the relevant tensors. This happens regardless of the value of
   // use_calibration.
-  if (precision_mode_ == INT8MODE &&
-      quantize_ops.count(node->type_string())) {
+  if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) {
     is_supported_op_type = true;
   }
-#endif
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
   if (!is_supported_op_type) {
     return errors::Unimplemented("Op type ", node->type_string(),
-                                 " is not supported.");
+                                 " is not supported");
   }
 
   std::vector<const Edge*> input_edges;
   TF_RETURN_IF_ERROR(node->input_edges(&input_edges));
   std::vector<std::pair<const NodeDef*, int>> input_node_and_ports;
+  input_node_and_ports.reserve(input_edges.size());
   for (const Edge* input_edge : input_edges) {
     input_node_and_ports.emplace_back(&input_edge->src()->def(),
                                       input_edge->src_output());
@@ -280,7 +278,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 #endif
 
   // Create RewriterConfig.
-  tensorflow::RewriterConfig rw_cfg;
+  tensorflow::ConfigProto config_proto;
+  auto& rw_cfg =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   // TODO(aaroey): use only const folding and layout for the time being since
   // new optimizers break the graph for trt.
   rw_cfg.add_optimizers("constfold");
@@ -304,7 +304,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   parameters["use_calibration"].set_b(use_calibration);
 
   // Run optimizer.
-  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
+  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, config_proto);
   TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def));
 
   if (VLOG_IS_ON(5)) {
@@ -582,11 +582,11 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     }
   }
 
-  const bool calibrate_int8 = (info.precision_mode == INT8MODE && info.use_calibration);
+  const bool calibrate_int8 =
+      (info.precision_mode == INT8MODE && info.use_calibration);
   // Build the engine and get its serialized representation.
   string segment_string;
-  if (info.engine_type == EngineInfo::EngineType::TRTStatic || 
-      calibrate_int8) {
+  if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) {
     // Create static engine for fp32/fp16 mode, and test validity of the engine
     // for int8 calibration mode. We don't want engine to fail at the
     // calibration time. So we are constructing a FP32 engine here to check its
@@ -596,8 +596,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     // TODO(sami): What happens if 1st dim is not batch?
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
-        info.segment_graph_def,
-        calibrate_int8 ? FP32MODE : info.precision_mode,
+        info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode,
         max_batch_size, info.max_workspace_size_bytes, input_shapes,
         &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
         info.use_calibration,
@@ -927,12 +926,12 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     }
     curr_engine.precision_mode = params.precision_mode;
     if (params.use_calibration && params.precision_mode != INT8MODE) {
-      return tensorflow::errors::Unimplemented(
-           "Calibration with FP32 or FP16 is not implemented. ");
+      return errors::InvalidArgument(
+          "Calibration with FP32 or FP16 is not supported.");
     }
     curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration)
-             ? EngineInfo::EngineType::TRTDynamic
-             : EngineInfo::EngineType::TRTStatic);
+                                   ? EngineInfo::EngineType::TRTDynamic
+                                   : EngineInfo::EngineType::TRTStatic);
     curr_engine.use_calibration = params.use_calibration;
     curr_engine.cached_engine_batches = params.cached_engine_batches;
     curr_engine.maximum_cached_engines = params.max_cached_engines;
@@ -952,7 +951,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     converted_segments.push_back(std::move(curr_segment));
 
     if (VLOG_IS_ON(8)) {
-      string fname = curr_engine.engine_name;
+      string fname = engine_segments.back().engine_name;
       StrAppend(&fname, ".pb");
       std::fstream f;
       f.open(fname.c_str(), std::fstream::out | std::fstream::binary);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 366d69115b2ee000d77e088539a06ac8c88134ee..2904e73abc01b11400f73cd5779c1aeceb0d7e0b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -53,6 +53,9 @@ class TrtCandidateSelector {
   // GraphProperties of the graph whose nodes are to be validated by
   // IsTensorRTCandidate().
   const grappler::GraphProperties& graph_properties_;
+
+  // Quantization ops are only converted when using quantized precisions.
+  const int precision_mode_;
 };
 
 struct ConversionParams {
@@ -101,8 +104,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
     int precision_mode = 1, int minimum_segment_size = 3,
     bool is_dyn_op = false, int max_cached_engines = 1,
-    std::vector<int> cached_engine_batches = {},
-    bool use_calibration = true);
+    std::vector<int> cached_engine_batches = {}, bool use_calibration = true);
 
 // Method to call from optimization pass
 tensorflow::Status ConvertAfterShapes(ConversionParams& params);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
index e2a7c40f301891b99c7cadb9f526233c0b81f461..2d2bfeb192c1893824c7b30bfad593c62c203392 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
@@ -85,27 +85,42 @@ TEST(TrtCandidateSelector, Basics) {
       ops::MatMul(s.WithOpName("matmul_with_incompatible_input"),
                   incompatible_feed, const_2);
 
+  // Quantize ops.
+  auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f);
+  auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("quantize"), feed,
+                                               quantize_attrs);
+
+  // Get GrapplerItem and GraphProperties.
   grappler::GrapplerItem item;
   TF_EXPECT_OK(s.ToGraphDef(&item.graph));
   Tensor feed_tensor(DT_FLOAT, input_shape);
   item.feed.push_back(std::make_pair("feed", feed_tensor));
-
   grappler::GraphProperties graph_properties(item);
   TF_EXPECT_OK(graph_properties.InferStatically(true));
 
-  TrtCandidateSelector selector(graph_properties, FP32MODE);
-  TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
-  ExpectStatus(
-      selector.IsTensorRTCandidate(incompatible_matmul.operation.node()),
-      error::INVALID_ARGUMENT,
-      "transpose_a is not supported for TensorRT FullyConnected "
-      "(op: MatMul), at: incompatible_matmul");
-  ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
-               error::UNIMPLEMENTED, "Op type Sin is not supported");
-  ExpectStatus(selector.IsTensorRTCandidate(
-                   matmul_with_incompatible_input.operation.node()),
-               error::INTERNAL,
-               "Failed to convert input with index 0 to a TRT_TensorOrWeights");
+  for (const int precision_mode : {FP32MODE, INT8MODE}) {
+    TrtCandidateSelector selector(graph_properties, precision_mode);
+    TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
+    ExpectStatus(
+        selector.IsTensorRTCandidate(incompatible_matmul.operation.node()),
+        error::INVALID_ARGUMENT,
+        "transpose_a is not supported for TensorRT FullyConnected "
+        "(op: MatMul), at: incompatible_matmul");
+    ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
+                 error::UNIMPLEMENTED, "Op type Sin is not supported");
+    ExpectStatus(
+        selector.IsTensorRTCandidate(
+            matmul_with_incompatible_input.operation.node()),
+        error::INTERNAL,
+        "Failed to convert input with index 0 to a TRT_TensorOrWeights");
+    if (precision_mode == INT8MODE) {
+      TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node()));
+    } else {
+      ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()),
+                   error::UNIMPLEMENTED,
+                   "Op type FakeQuantWithMinMaxArgs is not supported");
+    }
+  }
 }
 
 class FakeCluster : public grappler::Cluster {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index bc29bf8ffacf53492da2591fdc56513ac8f8c694..cb2a1ca87ac7434e7480ee09f14071b67f107410 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -54,10 +54,10 @@ limitations under the License.
 // would work!
 #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 
-#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                               \
-  do {                                                                   \
-    return tensorflow::errors::Internal(                                 \
-        "TFTRT::", __FUNCTION__, "failed to add TRT layer, at: ", node); \
+#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                                \
+  do {                                                                    \
+    return tensorflow::errors::Internal(                                  \
+        "TFTRT::", __FUNCTION__, " failed to add TRT layer, at: ", node); \
   } while (0)
 
 #define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \
@@ -187,10 +187,49 @@ Status ValidateTensorProperties(const string& producer_node_type,
   return Status::OK();
 }
 
+string DebugString(const nvinfer1::DimensionType type) {
+  switch (type) {
+    case nvinfer1::DimensionType::kSPATIAL:
+      return "kSPATIAL";
+    case nvinfer1::DimensionType::kCHANNEL:
+      return "kCHANNEL";
+    case nvinfer1::DimensionType::kINDEX:
+      return "kINDEX";
+    case nvinfer1::DimensionType::kSEQUENCE:
+      return "kSEQUENCE";
+    default:
+      return StrCat(static_cast<int>(type), "=unknown");
+  }
+}
+
+string DebugString(const nvinfer1::DataType trt_dtype) {
+  switch (trt_dtype) {
+    case nvinfer1::DataType::kFLOAT:
+      return "kFLOAT";
+    case nvinfer1::DataType::kHALF:
+      return "kHALF";
+    case nvinfer1::DataType::kINT8:
+      return "kINT8";
+    case nvinfer1::DataType::kINT32:
+      return "kINT32";
+    default:
+      return "Invalid TRT data type";
+  }
+}
+
 string DebugString(const nvinfer1::Dims& dims) {
   string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d=");
   for (int i = 0; i < dims.nbDims; ++i) {
-    StrAppend(&out, dims.d[i], ",");
+    StrAppend(&out, dims.d[i], "[", DebugString(dims.type[i]), "],");
+  }
+  StrAppend(&out, ")");
+  return out;
+}
+
+string DebugString(const nvinfer1::Permutation& permutation, int len) {
+  string out = "nvinfer1::Permutation(";
+  for (int i = 0; i < len; ++i) {
+    StrAppend(&out, permutation.order[i], ",");
   }
   StrAppend(&out, ")");
   return out;
@@ -198,16 +237,15 @@ string DebugString(const nvinfer1::Dims& dims) {
 
 string DebugString(const nvinfer1::ITensor& tensor) {
   return StrCat("nvinfer1::ITensor(@", reinterpret_cast<uintptr_t>(&tensor),
-                ", shape=", DebugString(tensor.getDimensions()), ")");
+                ", name=", tensor.getName(),
+                ", dtype=", DebugString(tensor.getType()),
+                ", dims=", DebugString(tensor.getDimensions()), ")");
 }
 
-// Return whether or not the broadcast is feasible;
-bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
-                               const bool operand_l_is_tensor,
-                               const nvinfer1::Dims& operand_r,
-                               const bool operand_r_is_tensor,
-                               nvinfer1::Dims* operand_l_new_shape,
-                               nvinfer1::Dims* operand_r_new_shape) {
+Status Converter::GetTrtBroadcastShape(
+    const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r,
+    nvinfer1::Dims* operand_l_new_dims,
+    nvinfer1::Dims* operand_r_new_dims) const {
   // ***************************************************************************
   // TensorRT Elementwise op supports broadcast but requires both tensor to be
   // of Identical rank
@@ -232,52 +270,59 @@ bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
   // -> T: 1 1 1 -1 3 5 1
   // -> W: 1 1 1  1 3 5 1
   // ***************************************************************************
-  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
-  const size_t element_size = sizeof(operand_l.d[0]);
-
-  // fill in dimensions
-  int l_s[max_nb_dims];
-  std::fill(l_s, l_s + max_nb_dims, 1);
-  int l_d = operand_l_is_tensor ? operand_l.nbDims + 1 : operand_l.nbDims;
-  int r_s[max_nb_dims];
-  std::fill(r_s, r_s + max_nb_dims, 1);
-  int r_d = operand_r_is_tensor ? operand_r.nbDims + 1 : operand_r.nbDims;
-
-  int max_d = std::max(l_d, r_d);
-  std::memcpy(l_s + max_d - operand_l.nbDims, operand_l.d,
-              operand_l.nbDims * element_size);
-  std::memcpy(r_s + max_d - operand_r.nbDims, operand_r.d,
-              operand_r.nbDims * element_size);
-
-  // set -1 for batch dimension, since batch size is not supposed to be
-  // broadcasted
-  if (operand_l_is_tensor) {
-    if (max_d != l_d) {  // if broadcast beyond batch dimension, fail
-      return false;
-    }
-    l_s[0] = -1;
-  }
-  if (operand_r_is_tensor) {
-    if (max_d != r_d) {  // if broadcast beyond batch dimension, fail
-      return false;
-    }
-    r_s[0] = -1;
+  if (!operand_l.is_tensor() && !operand_r.is_tensor()) {
+    return errors::InvalidArgument(
+        "Broadcasting requires at least one of the operands be tensors");
   }
 
-  // compare broadcast feasibility
-  for (int i = max_d - 1; i >= 0; i--) {
-    if ((l_s[i] != r_s[i]) && (l_s[i] != 1) && (r_s[i] != 1)) {
-      return false;
+  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
+  auto compute_output_dims =
+      [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims,
+                    int* output_dims_array, nvinfer1::Dims* output_dims) {
+        const nvinfer1::Dims input_dims = input.GetTrtDims();
+        std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
+        std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
+                  output_dims_array + broadcast_num_dims - input_dims.nbDims);
+        if (input.is_tensor()) {
+          const int true_input_dims = input_dims.nbDims + 1;
+          if (true_input_dims < broadcast_num_dims) {
+            return errors::InvalidArgument(
+                "Broadcasting beyond batch dimension is not supported ",
+                "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
+                broadcast_num_dims, ")");
+          }
+          // Set the batch dimension to -1, since batch size is not supposed to
+          // be broadcasted.
+          output_dims_array[0] = -1;
+        }
+        // Copy to output dimensions (stripping the batch dimension).
+        output_dims->nbDims = broadcast_num_dims - 1;
+        std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
+                  output_dims->d);
+        return Status::OK();
+      };
+
+  // Compute the output dimensions.
+  const int broadcast_num_dims =
+      std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0),
+               operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 1 : 0));
+  int output_l[max_nb_dims], output_r[max_nb_dims];
+  TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims,
+                                         output_l, operand_l_new_dims));
+  TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims,
+                                         output_r, operand_r_new_dims));
+
+  // Compare broadcast feasibility
+  for (int i = 0; i < broadcast_num_dims; ++i) {
+    if ((output_l[i] != output_r[i]) && (output_l[i] != 1) &&
+        (output_r[i] != 1)) {
+      return errors::InvalidArgument(
+          "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ",
+          DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", output_r[0],
+          ", ", DebugString(*operand_r_new_dims), ")");
     }
   }
-
-  // output new TensorRT Dimension (stripping the batch dimension)
-  operand_l_new_shape->nbDims = max_d - 1;
-  std::memcpy(operand_l_new_shape->d, l_s + 1, (max_d - 1) * element_size);
-  operand_r_new_shape->nbDims = max_d - 1;
-  std::memcpy(operand_r_new_shape->d, r_s + 1, (max_d - 1) * element_size);
-
-  return true;
+  return Status::OK();
 }
 
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
@@ -381,7 +426,7 @@ size_t TRT_ShapedWeights::size_bytes() const {
 
 string TRT_ShapedWeights::DebugString() const {
   return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_),
-                ", type=", type_,
+                ", type=", DataTypeString(type_),
                 ", values=", reinterpret_cast<uintptr_t>(GetValues()), ")");
 }
 
@@ -491,8 +536,7 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const {
 string TRT_TensorOrWeights::DebugString() const {
   string output = "TRT_TensorOrWeights(type=";
   if (is_tensor()) {
-    StrAppend(&output, "tensor @", reinterpret_cast<uintptr_t>(tensor()),
-              ", shape=", convert::DebugString(tensor()->getDimensions()),
+    StrAppend(&output, "tensor=", convert::DebugString(*tensor()),
               ", batch_size=", batch_size_);
   } else {
     StrAppend(&output, "weights=", weights_.DebugString());
@@ -755,8 +799,9 @@ Status TrtNodeValidator::ValidateNode(
     Status status = ConvertToTensorOrWeights(
         *pair.first, pair.second, graph_properties, &tensor_or_weights);
     if (!status.ok()) {
-      return errors::Internal("Failed to convert input with index ", i,
-                              " to a TRT_TensorOrWeights");
+      return errors::Internal(
+          "Failed to convert input with index ", i,
+          " to a TRT_TensorOrWeights: ", status.error_message());
     }
     inputs.push_back(tensor_or_weights);
   }
@@ -789,8 +834,7 @@ Status TrtNodeValidator::ConvertConstToWeights(
 }
 
 Converter::Converter(nvinfer1::INetworkDefinition* trt_network,
-                     int precision_mode,
-                     bool use_calibration)
+                     int precision_mode, bool use_calibration)
     : trt_network_(trt_network),
       precision_mode_(precision_mode),
       use_calibration_(use_calibration) {
@@ -947,6 +991,8 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
   for (int32_t i = 0; i < dims.nbDims; ++i) {
     permutation.order[i] = order_with_batch_dim[i + 1] - 1;
   }
+  VLOG(1) << "TransposeTensor permutation: "
+          << DebugString(permutation, dims.nbDims);
   layer->setFirstTranspose(permutation);
 
   nvinfer1::Dims reshape_dims;
@@ -963,24 +1009,23 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
 }
 
 Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
-                                 float* out_min,
-                                 float* out_max) const {
+                                 float* out_min, float* out_max) const {
   switch (weights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
+    case DataType::DT_FLOAT: {
       auto inp = static_cast<float const*>(weights.GetValues());
       auto result = std::minmax_element(inp, inp + weights.count());
       *out_min = *result.first;
       *out_max = *result.second;
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
+    case DataType::DT_HALF: {
       auto inp = static_cast<Eigen::half const*>(weights.GetValues());
       auto result = std::minmax_element(inp, inp + weights.count());
       *out_min = Eigen::half_impl::half_to_float(*result.first);
       *out_max = Eigen::half_impl::half_to_float(*result.second);
       break;
     }
-    case tensorflow::DataType::DT_INT32: {
+    case DataType::DT_INT32: {
       auto inp = static_cast<int const*>(weights.GetValues());
       auto result = std::minmax_element(inp, inp + weights.count());
       *out_min = static_cast<float>(*result.first);
@@ -988,11 +1033,11 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
       break;
     }
     default:
-      return tensorflow::errors::Unimplemented(
-          "Data type not supported for GetWeightRange: " +
-          tensorflow::DataTypeString(weights.type_));
+      return errors::Unimplemented(
+          "Data type not supported for GetWeightRange: ",
+          DataTypeString(weights.type_));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
@@ -1009,8 +1054,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
   }
   if (can_check_shapes &&
       TrtDimsNumElements(input.GetTrtDims()) != TrtDimsNumElements(dims)) {
-    return tensorflow::errors::InvalidArgument(
-        "Reshape shapes are not compatible.");
+    return errors::InvalidArgument("Reshape shapes are not compatible (",
+                                   DebugString(input.GetTrtDims()), " vs ",
+                                   DebugString(dims), ")");
   }
 
   if (input.is_tensor()) {
@@ -1038,15 +1084,15 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
       float max_range = 0.0f;
       TF_RETURN_IF_ERROR(
           GetWeightRange(input.weights(), &min_range, &max_range));
-      // Avoid setting range to 0 because TRT will throw an error. If the weights
-      // are zero then the range doesn't matter: using 127.0f should ensure the
-      // quantized weight will be exactly zero.
+      // Avoid setting range to 0 because TRT will throw an error. If the
+      // weights are zero then the range doesn't matter: using 127.0f should
+      // ensure the quantized weight will be exactly zero.
       if (min_range == 0.0f && max_range == 0.0f) {
         min_range = -127.0f;
         max_range = 127.0f;
       }
       ProvideQuantizationRange(const_cast<nvinfer1::ITensor*>(*tensor),
-          min_range, max_range);
+                               min_range, max_range);
     }
   }
   return tensorflow::Status::OK();
@@ -1064,25 +1110,29 @@ void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor,
   quantization_ranges_[tensor] = symmetric_range;
 }
 
-void Converter::ApplyQuantizationRanges(bool warn_missing_ranges) {
-  // Infer ranges across marked ops
+void Converter::MaybeApplyQuantizationRanges() {
+  if (precision_mode() != INT8MODE) return;
+
+  // Infer ranges across marked ops.
   PropagateQuantizationRanges();
-  // Apply ranges
+  // Apply ranges.
+#if NV_TENSORRT_MAJOR >= 5
   for (auto pair : quantization_ranges_) {
     nvinfer1::ITensor* tensor = pair.first;
     const float range = pair.second;
-#if NV_TENSORRT_MAJOR >= 5
     VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
+    // TODO(laigd): if 'tensor' already has a range set which doesn't match
+    // 'range', it should report error.
     tensor->setDynamicRange(-range, range);
-#endif
   }
+#endif
 
   // Warn user about tensors that are missing ranges. If TRT fuses some layers
   // then these tensors may not actually be required, which is why this is
   // just a warning. If we are still missing ranges even after fusion,
   // Builder::buildCudaEngine() will return nullptr and we will catch the
   // error at that point.
-  if (warn_missing_ranges) {
+  if (!use_calibration()) {
     // Get all tensors from network
     std::set<nvinfer1::ITensor*> all_tensors;
     for (int i = 0; i < this->network()->getNbLayers(); i++) {
@@ -1096,15 +1146,15 @@ void Converter::ApplyQuantizationRanges(bool warn_missing_ranges) {
     }
     // Find tensors with no ranges
     for (auto tensor : all_tensors) {
-      if (quantization_ranges_.find(tensor) == quantization_ranges_.end()) {
+      if (!quantization_ranges_.count(tensor)) {
         // Note: there may be some warnings for "(Unnamed ITensor* N)". These
         // are tensors which are created internally by TF-TRT. The ranges for
         // these unnamed ITensors are always inferred from user provided ranges,
         // thus there will also be a warning for the range(s) the user missed.
         LOG(WARNING) << "Quantization range was not found for "
-                      << tensor->getName() << ". "
-                      << "This is okay if TensorRT does not need the range "
-                      << "(e.g. due to node fusion).";
+                     << tensor->getName() << ". "
+                     << "This is okay if TensorRT does not need the range "
+                     << "(e.g. due to node fusion).";
       }
     }
   }
@@ -1120,20 +1170,21 @@ void Converter::PropagateQuantizationRanges() {
   while (information_added) {
     information_added = false;
     for (auto it = quantization_infer_.begin();
-        it != quantization_infer_.end();) {
+         it != quantization_infer_.end();) {
       auto input_tensor_range = quantization_ranges_.find(it->first);
       auto output_tensor_range = quantization_ranges_.find(it->second);
       if (input_tensor_range != quantization_ranges_.end() &&
           output_tensor_range == quantization_ranges_.end()) {
         // Input has range but output doesn't: copy range
+        // TODO(laigd): consider reporting error if it a different range is
+        // already set.
         quantization_ranges_[it->second] = input_tensor_range->second;
         information_added = true;
-        VLOG(1) << "Copy quantization range: "
-                << it->first->getName() << " -> " << it->second->getName();
+        VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> "
+                << it->second->getName();
       }
       // We can remove edges when the output range is known
-      if (quantization_ranges_.find(it->second) !=
-          quantization_ranges_.end()) {
+      if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) {
         it = quantization_infer_.erase(it);
       } else {
         ++it;
@@ -1198,12 +1249,11 @@ TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store,
 }
 
 // ****************************************************************************
-// Constant folding functions
-// TODO(jie): once optimizer kicks in, we should have done constant folding
-// there.
+// Constant folding functions for weights.
+// TODO(laigd): we should probably use eigen directly.
 // *****************************************************************************
 struct LambdaFactory {
-  enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB, RECIP };
+  enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP };
   OP_CATEGORY op;
 
   template <typename T>
@@ -1218,84 +1268,10 @@ struct LambdaFactory {
       case OP_CATEGORY::RECIP:
         return [](T t) -> T { return 1.0 / t; };
       default:
-        VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+        LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
         return nullptr;
     }
   }
-
-  template <typename T>
-  std::function<T(T, T)> binary() {
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [](T l, T r) -> T { return l + r; };
-      case OP_CATEGORY::SUB:
-        return [](T l, T r) -> T { return l - r; };
-      case OP_CATEGORY::MUL:
-        return [](T l, T r) -> T { return l * r; };
-      default:
-        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [](T l, T r) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
-
-  template <typename T>
-  std::function<T(T)> broadcast_r(T val) {
-    VLOG(2) << "LAMBDA VAL : " << val;
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l + val;
-        };
-      case OP_CATEGORY::SUB:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l - val;
-        };
-      case OP_CATEGORY::MUL:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l * val;
-        };
-      default:
-        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [val](T l) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
-
-  template <typename T>
-  std::function<T(T)> broadcast_l(T val) {
-    VLOG(2) << "LAMBDA VAL : " << val;
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val + l;
-        };
-      case OP_CATEGORY::SUB:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val - l;
-        };
-      case OP_CATEGORY::MUL:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val * l;
-        };
-      default:
-        LOG(ERROR) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [val](T l) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
 };
 
 template <>
@@ -1303,15 +1279,18 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
   switch (op) {
     case OP_CATEGORY::RSQRT: {
       VLOG(2) << "RSQRT GETS DONE";
-      return [](Eigen::half t) -> Eigen::half {
+      return [](Eigen::half t) {
         return Eigen::half(1.0 / sqrt(static_cast<float>(t)));
       };
     }
     case OP_CATEGORY::NEG:
-      return [](Eigen::half t) -> Eigen::half { return -t; };
-    // TODO(aaroey): can we support RECIP?
+      return [](Eigen::half t) { return -t; };
+    case OP_CATEGORY::RECIP:
+      return [](Eigen::half t) {
+        return Eigen::half(1.0 / static_cast<float>(t));
+      };
     default:
-      VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+      LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
       return nullptr;
   }
 }
@@ -1343,50 +1322,48 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
   return tensorflow::Status::OK();
 }
 
+// If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the
+// right operand. If swapped_inputs is true, those two are swapped.
+//
 // TODO(jie): broadcast is needed yet not implemented.
-// Only implemented channel wise for the time being
-tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
-                                        const nvinfer1::ITensor* tensor,
-                                        TRT_ShapedWeights weights,
-                                        bool swapped_inputs) {
+// Only implemented channel wise for the time being.
+Status BinaryTensorOpWeight(OpConverterParams* params,
+                            const nvinfer1::ITensor* tensor,
+                            TRT_ShapedWeights weights, bool swapped_inputs) {
+  static const std::unordered_set<string> supported_ops = {"Sub", "Add", "Mul",
+                                                           "Div", "RealDiv"};
   const auto& node_def = params->node_def;
-  // tensor is the left operand while weights is the right operand;
-  // when swapped_inputs set to true, those two are swapped.
-  // TODO(aaroey): use a set.
-  if (node_def.op() != "Sub" && node_def.op() != "Add" &&
-      node_def.op() != "Mul" && node_def.op() != "Div" &&
-      node_def.op() != "RealDiv") {
-    return tensorflow::errors::Unimplemented(
-        "op not supported: " + node_def.op() + ", at: " + node_def.name());
+  if (!supported_ops.count(node_def.op())) {
+    return errors::Unimplemented(node_def.op(), " is not supported, at ",
+                                 node_def.name());
   }
 
-  // Check type consistency
-  nvinfer1::DataType ttype;
-  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype));
+  // Check type consistency.
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &trt_dtype));
 
-  // Check scale mode
+  // Check scale mode.
   auto dims_w = weights.shape_;
-  auto dims_t = tensor->getDimensions();
+  const auto dims_t = tensor->getDimensions();
 
   // TODO(jie): addScale checks for input tensor dimension
   if (dims_t.nbDims != 3) {
-    return tensorflow::errors::InvalidArgument(
-        "addScale requires tensor with rank 3, " + node_def.name());
+    return errors::InvalidArgument("addScale requires tensor with rank 3, at ",
+                                   node_def.name());
   }
 
-  // default to element-wise
+  // Default to element-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
   // TODO(jie): maybe use a permutation instead to support more cases;
-  bool permutation_flag = false;
+  bool need_to_permute = false;
 
   if (weights.count() == 1) {
-    VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // no broadcasting on Batch dimension;
-    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
-            << " tensor DIM: " << dims_t.nbDims;
+    VLOG(2) << "weights dims: " << DebugString(dims_w)
+            << "; tensor dims: " << DebugString(dims_t);
+    // Make sure no broadcasting on batch dimension.
     if (dims_w.nbDims == dims_t.nbDims + 1) {
       if (dims_w.d[0] == 1) {
         for (int i = 1; i < dims_w.nbDims; i++) {
@@ -1394,72 +1371,70 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
         }
         dims_w.nbDims--;
       } else {
-        return tensorflow::errors::InvalidArgument(
-            "Binary op cannot operate on batch, " + node_def.name());
+        return errors::InvalidArgument("Binary op cannot operate on batch, at ",
+                                       node_def.name());
       }
     }
 
     if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
       scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      // default is element;
+      // Default is element-wise
       for (int i = 1; i < dims_w.nbDims; i++) {
         if (dims_w.d[i] != dims_t.d[i]) {
-          // if dimension does not match, switch back to channel;
-          VLOG(2) << "channel";
+          // If dimension does not match, switch back to per-channel
           scale_mode = nvinfer1::ScaleMode::kCHANNEL;
           break;
         }
       }
-      // if channel as candidate, validate it
+      // If the mode is per-channel, since channel dimension is assumed to be
+      // the third to last dimension, we need to make sure all other dimensions
+      // have size 1.
       if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
         for (int i = 1; i < dims_w.nbDims; i++) {
           if (dims_w.d[i] != 1)
-            return tensorflow::errors::InvalidArgument(
-                "Weight shape not compatible at, " + node_def.name());
+            return errors::InvalidArgument(
+                "Weight dims not compatible for channel-wise broadcast at ",
+                node_def.name());
         }
-      } else {
-        VLOG(2) << "elementwise";
       }
     } else if (dims_w.nbDims == 1 &&
                dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
-      // channel wise and broadcast required;
-      permutation_flag = true;
+      // Channel wise and broadcast required. We compare the last dimension of
+      // the tensor shape because of tensorflow default broadcasting rules.
+      need_to_permute = true;
       scale_mode = nvinfer1::ScaleMode::kCHANNEL;
     } else {
-      return tensorflow::errors::InvalidArgument(
-          "Weight shape not compatible at, " + node_def.name());
+      return errors::InvalidArgument("Weight dims not compatible at ",
+                                     node_def.name());
     }
   }
+  // TODO(laigd): we should add validation_only support in TransposeTensor() and
+  // PrepareTensorForShape().
+  if (params->validation_only) return Status::OK();
 
-  // transpose last dimension
+  // Transpose last dimension.
   std::vector<int> permutation(dims_t.nbDims + 1);
-  if (permutation_flag) {
-    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-      // we swap the last dimension into channel for trt.
-      // because of tensorflow default broadcasting rules.
-      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
-        permutation[i] = i;
-      }
-      permutation[1] = dims_t.nbDims;
-      permutation[dims_t.nbDims] = 1;
-      TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
-          const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Transpose cannot be applied, " + node_def.name());
-    }
+  if (need_to_permute) {
+    // We swap the last dimension into channel for trt, because of tensorflow
+    // default broadcasting rules.
+    for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
+      permutation[i] = i;
+    }
+    permutation[1] = dims_t.nbDims;
+    permutation[dims_t.nbDims] = 1;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
   }
 
   if (params->converter->precision_mode() == FP16MODE) {
     weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
 
-  // prepare weights
+  // Prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
 
-  // Maybe I should do a switch
   if (node_def.op() == "Sub") {
     if (swapped_inputs) {
       shift_weights = weights;
@@ -1482,19 +1457,21 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
     }
   } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") {
     if (swapped_inputs) {
-      // We need to infer the quantization range for this intermediate
-      // tensor.
-      // x -> [Recip] -> 1/x -> [Scale] -> s/x
-      //                  ^
-      //          need range for this
+      // We need to infer the quantization range for this intermediate tensor.
+      //
+      //   x -> [Recip] -> 1/x -> [Scale] -> s/x
+      //                    ^
+      //            need range for this
+      //
       // We have the quantization scales for x and s/x - can we divide the scale
-      // for s/x by s? Only if it was a scalar...
+      // for s/x by s? Only if it is a scalar.
+      //
       // Because of this issue, fall back to BinaryTensorOpTensor if we are
       // doing INT8 with no calibration. There is most likely no performance
       // penalty by falling back here.
       if (params->converter->precision_mode() == INT8MODE &&
           !params->converter->use_calibration()) {
-        return tensorflow::errors::Unimplemented(
+        return errors::Unimplemented(
             "Intermediate quantization range cannot be determined without"
             " calibration. Falling back to BinaryTensorOpTensor for ",
             node_def.op(), ", at ", node_def.name());
@@ -1518,8 +1495,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
   } else if (node_def.op() == "Add") {
     shift_weights = weights;
   } else {
-    return tensorflow::errors::Unimplemented("Binary op not supported: " +
-                                             node_def.op());
+    // This should not happen.
+    return errors::Unimplemented("Binary op not supported at ", node_def.op());
   }
 
   nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
@@ -1529,8 +1506,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // transpose back dimension
-  if (permutation_flag) {
+  // Transpose back dimension
+  if (need_to_permute) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation,
         &output_tensor));
@@ -1664,9 +1641,9 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params,
                                            params->node_def.name());
 }
 
-tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params,
-                                        const TRT_TensorOrWeights& operand_l,
-                                        const TRT_TensorOrWeights& operand_r) {
+Status BinaryTensorOpTensor(OpConverterParams* params,
+                            const TRT_TensorOrWeights& operand_l,
+                            const TRT_TensorOrWeights& operand_r) {
   const auto& node_def = params->node_def;
   static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
       {"Add", nvinfer1::ElementWiseOperation::kSUM},
@@ -1677,50 +1654,52 @@ tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params,
       {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
       {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
   };
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end()) {
+    return errors::Unimplemented("Binary op ", node_def.op(),
+                                 " not supported at: ", node_def.name());
+  }
 
-  const nvinfer1::ITensor* tensor_l;
-  const nvinfer1::ITensor* tensor_r;
-
-  nvinfer1::Dims dim_l;
-  nvinfer1::Dims dim_r;
-
-  if (!TensorRTGetBroadcastShape(operand_l.GetTrtDims(), operand_l.is_tensor(),
-                                 operand_r.GetTrtDims(), operand_r.is_tensor(),
-                                 &dim_l, &dim_r)) {
-    return tensorflow::errors::InvalidArgument(
-        "Binary op broadcast scheme not supported by TensorRT op: " +
-        node_def.op() + ", at: " + node_def.name());
+  nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
+  Status status = params->converter->GetTrtBroadcastShape(
+      operand_l, operand_r, &broadcasted_dims_l, &broadcasted_dims_r);
+  if (!status.ok()) {
+    return errors::InvalidArgument(
+        "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ",
+        status.error_message());
   }
+  if (params->validation_only) return Status::OK();
 
-  TF_RETURN_IF_ERROR(
-      params->converter->PrepareTensorForShape(operand_l, dim_l, &tensor_l));
-  TF_RETURN_IF_ERROR(
-      params->converter->PrepareTensorForShape(operand_r, dim_r, &tensor_r));
+  const nvinfer1::ITensor* tensor_l = nullptr;
+  const nvinfer1::ITensor* tensor_r = nullptr;
+  status = params->converter->PrepareTensorForShape(
+      operand_l, broadcasted_dims_l, &tensor_l);
+  if (status.ok()) {
+    status = params->converter->PrepareTensorForShape(
+        operand_r, broadcasted_dims_r, &tensor_r);
+  }
+  if (!status.ok()) {
+    return errors::Internal("Failed to convert binary op ", node_def.name(),
+                            ": ", status.error_message());
+  }
 
-  // get trt type & shape
+  // Check type consistency.
   TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
   nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+  TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype)
+      << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype);
+  TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype)
+      << DebugString(tensor_r->getType()) << " vs " << DebugString(dtype);
 
-  // check type consistency
-  TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end()) {
-    return tensorflow::errors::Unimplemented(
-        "binary op: ", node_def.op(), " not supported at: ", node_def.name());
-  }
-
+  // Add ElementWise layer.
   nvinfer1::IElementWiseLayer* layer =
       params->converter->network()->addElementWise(
-          // TODO(aaroey): will tensor_l/tensor_r get modified?
           *const_cast<nvinfer1::ITensor*>(tensor_l),
           *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // pass the output
+  // Pass the output
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
@@ -1730,6 +1709,7 @@ tensorflow::Status ConvertPlugin(OpConverterParams* params) {
   const auto& node_def = params->node_def;
   // prepare input
   std::vector<nvinfer1::ITensor*> all_inputs;
+  all_inputs.reserve(inputs.size());
   for (auto input : inputs) {
     all_inputs.emplace_back(const_cast<nvinfer1::ITensor*>(input.tensor()));
   }
@@ -2008,23 +1988,22 @@ tensorflow::Status ConvertActivation(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertQuantize(OpConverterParams* params) {
+Status ConvertQuantize(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   if ((inputs.size() == 0) ||
-      (inputs.size() != 1 && node_def.op() == "FakeQuantWithMinMaxArgs") ||
-      (inputs.size() != 3 && node_def.op() == "FakeQuantWithMinMaxVars") ||
-      (inputs.size() != 3 && node_def.op() == "QuantizeAndDequantizeV2") ||
-      (inputs.size() != 4 && node_def.op() == "QuantizeAndDequantizeV3")) {
-    return tensorflow::errors::InvalidArgument(
-        "Invalid number of inputs for ", node_def.op(), ", at ",
-        node_def.name());
+      (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) ||
+      (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) ||
+      (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) ||
+      (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) {
+    return errors::InvalidArgument("Invalid number of inputs for ",
+                                   node_def.op(), ", at ", node_def.name());
   }
   if (inputs.at(0).is_weights()) {
     // TensorRT will automatically quantize weights, so we will ignore ranges
     // for weights.
     params->outputs->push_back(inputs.at(0));
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
   float min_range = 0.0f;
   float max_range = 0.0f;
@@ -2032,9 +2011,8 @@ tensorflow::Status ConvertQuantize(OpConverterParams* params) {
     // Get ranges via node attributes.
     TFAttrs attrs(node_def);
     if (attrs.count("min") == 0 || attrs.count("max") == 0) {
-      return tensorflow::errors::InvalidArgument(
-          "Min or max attribute not found for ", node_def.op(), " at ",
-          node_def.name());
+      return errors::InvalidArgument("Min or max attribute not found for ",
+                                     node_def.op(), " at ", node_def.name());
     }
     min_range = attrs.get<float>("min");
     max_range = attrs.get<float>("max");
@@ -2043,29 +2021,26 @@ tensorflow::Status ConvertQuantize(OpConverterParams* params) {
              node_def.op() == "QuantizeAndDequantizeV3") {
     // Get ranges via inputs.
     if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) {
-      return tensorflow::errors::InvalidArgument(
-          "Min and max inputs for ", node_def.op(),
-          " must be weights not tensors, at ", node_def.name());
-    }
-    // Min
-    TRT_ShapedWeights weights_min = inputs.at(1).weights();
-    auto weights_min_ptr = static_cast<float*>(const_cast<void*>(
-        weights_min.GetValues()));
-    min_range = weights_min_ptr[0];
-    // Max
-    TRT_ShapedWeights weights_max = inputs.at(2).weights();
-    auto weights_max_ptr = static_cast<float*>(const_cast<void*>(
-        weights_max.GetValues()));
-    max_range = weights_max_ptr[0];
+      return errors::InvalidArgument("Min and max inputs for ", node_def.op(),
+                                     " must be weights not tensors, at ",
+                                     node_def.name());
+    }
+    auto get_weights_value = [&inputs](int index) {
+      auto raw_weights = static_cast<float*>(
+          const_cast<void*>(inputs.at(index).weights().GetValues()));
+      return raw_weights[0];
+    };
+    min_range = get_weights_value(1);
+    max_range = get_weights_value(2);
   } else {
-    return tensorflow::errors::InvalidArgument(
-        "Unknown quantization op \"", node_def.op(), "\", at ",
-        node_def.name());
+    return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
+                                   ", at ", node_def.name());
   }
+  if (params->validation_only) return Status::OK();
+
   // Store ranges for tensor
   params->converter->ProvideQuantizationRange(
-      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()),
-      min_range,
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()), min_range,
       max_range);
   // Sometimes, TRT may not quantize a tensor, either because it chooses to
   // execute a higher precision kernel or because of op fusion. In these cases,
@@ -2077,7 +2052,7 @@ tensorflow::Status ConvertQuantize(OpConverterParams* params) {
   // possible (i.e. not quantizing in place where fusion will occur), then there
   // is no problem with the current implementation.
   params->outputs->push_back(inputs.at(0));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 // TODO(pdavoodi): we should update relu6 implementation once TensorRT supports
@@ -2087,14 +2062,14 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   const auto& node_def = params->node_def;
   if (inputs.size() != 1) {
     return tensorflow::errors::InvalidArgument(
-        "Invalid number of inputs for Relu6, at ",
-        node_def.name());
+        "Invalid number of inputs for Relu6, at ", node_def.name());
   }
   if (inputs.at(0).is_weights()) {
     return tensorflow::errors::Unimplemented(
         "Relu6 is only implemented for tensors, not weights, at ",
         node_def.name());
   }
+  if (params->validation_only) return Status::OK();
   // ***************************************************************************
   // TensorRT does not implement Relu6 natively. This function converts Relu6 op
   // to available TensorRT ops: Relu6(x) = min(Relu(x), 6)
@@ -2110,12 +2085,12 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
           nvinfer1::ActivationType::kRELU);
   TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name());
 
-  // Large range of relu is problematic during quantization in INT8 precision mode.
-  // Setting dynamic range of relu = [0.f, 6.0f] helps with quantization.
+  // Large range of relu is problematic during quantization in INT8 precision
+  // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization.
   // TRT only uses dynamic ranges in INT8 precision mode,
   // and this does not affect the FP32 path.
-  params->converter->ProvideQuantizationRange(
-      relu_layer->getOutput(0), 0.0f, 6.0f);
+  params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f,
+                                              6.0f);
 
   // Create a constant layer to store the floating point weight i.e. 6.0f This
   // tensor will be broadcasted uniformly during elementwise `min` operation.
@@ -2128,14 +2103,14 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   }
   TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
       tensorflow::DataType::DT_FLOAT, dims);
-  auto weights_ptr = static_cast<float*>(const_cast<void*>(
-      weights.GetValues()));
-  weights_ptr[0] = 6.f;
+  auto weights_ptr =
+      static_cast<float*>(const_cast<void*>(weights.GetValues()));
+  weights_ptr[0] = 6.0f;
   nvinfer1::IConstantLayer* const6_layer =
       params->converter->network()->addConstant(dims, weights.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name());
-  params->converter->ProvideQuantizationRange(
-      const6_layer->getOutput(0), 0.0f, 6.0f);
+  params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f,
+                                              6.0f);
 
   // ElementWise Min Operation
   // Min op is a nop for INT8 execution path, as the input tensor
@@ -2152,107 +2127,110 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f);
 
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertScale(OpConverterParams* params) {
+tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "ConvertScale only supports tensor<op>weight: ", node_def.name());
-  }
-
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  TRT_ShapedWeights weights = inputs.at(1).weights();
-  if (params->converter->precision_mode() == FP16MODE) {
-    weights = ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
+    return errors::InvalidArgument("Input expects tensor and weights, at ",
+                                   node_def.name());
   }
+  if (params->validation_only) return Status::OK();
 
-  TRT_ShapedWeights empty_weights(weights.type_);
+  nvinfer1::ITensor* tensor =
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
+  const nvinfer1::Dims original_dims = tensor->getDimensions();
   TFAttrs attrs(node_def);
-
-  const auto data_format = attrs.get<string>("data_format");
-  int channel_index;
-  const auto dims = tensor->getDimensions();
-  if (data_format == "NHWC") {
-    //  1). NHWC is really N+C
-    channel_index = dims.nbDims - 1;  // batch dimension is implicit here!
-  } else {
-    //  2). NCHW is really N+CHW
-    channel_index = 0;  // batch dimension is implicit here!
-  }
+  const string data_format = attrs.get<string>("data_format");
+  const int channel_index =
+      (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
 
   nvinfer1::Permutation permutation;
-  for (int32_t i = 0; i < dims.nbDims; ++i) {
-    permutation.order[i] = i;
-  }
-
-  if (channel_index >= 0) {
+  if (channel_index != 0) {
+    // Permute the dimensions so that the channel dimension is the first
+    // dimension.
+    for (int i = 0; i < original_dims.nbDims; ++i) {
+      permutation.order[i] = i;
+    }
     permutation.order[0] = channel_index;
     permutation.order[channel_index] = 0;
-  } else {
-    return tensorflow::errors::Unimplemented(
-        "TFTRT::BiasAdd cannot apply on batch dimension, at ", node_def.name());
+    VLOG(1) << "ConvertBiasAdd permutation: "
+            << DebugString(permutation, original_dims.nbDims);
   }
 
   // TensorRT addScale requires input to be of rank 3, we need to apply
-  // transpose as well as reshape
-  if (channel_index != 0 || dims.nbDims != 3) {
+  // transpose as well as reshape.
+  // TODO(laigd): this doesn't match what the TRT doc says, fix the doc?
+  if (channel_index != 0 || original_dims.nbDims != 3) {
     nvinfer1::IShuffleLayer* shuffle_layer =
-        params->converter->network()->addShuffle(
-            *const_cast<nvinfer1::ITensor*>(tensor));
+        params->converter->network()->addShuffle(*tensor);
     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
     params->converter->MarkQuantizationRangesAsInferrable(
-        const_cast<nvinfer1::ITensor*>(tensor), shuffle_layer->getOutput(0));
+        tensor, shuffle_layer->getOutput(0));
 
+    // NOTE(laigd): for some reason we need to apply the reshape
+    // unconditionally. The default shape has nbDims==-1 and it seems the
+    // behavior is undefined in some cases.
     nvinfer1::Dims reshape_dims;
     reshape_dims.nbDims = 3;
-    reshape_dims.d[0] = 0;                          // 0 copy from the input
-    reshape_dims.d[1] = dims.nbDims >= 2 ? 0 : 1;   // 0 copy from the input
-    reshape_dims.d[2] = dims.nbDims >= 3 ? -1 : 1;  // -1 infer from the rest
+    // 0 means copying from input; -1 means inferring from the rest.
+    reshape_dims.d[0] = 0;
+    reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1;
+    reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1;
+    shuffle_layer->setReshapeDimensions(reshape_dims);
+
     if (channel_index != 0) {
-      // maybe we do not need this check. concerned about TRT optimization
       shuffle_layer->setFirstTranspose(permutation);
     }
-    shuffle_layer->setReshapeDimensions(reshape_dims);
     tensor = shuffle_layer->getOutput(0);
   }
 
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (params->converter->precision_mode() == FP16MODE) {
+    weights = ConvertFP32ToFP16(params->weight_store, weights);
+  }
   nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
   if (weights.shape_.d[0] == 1) {
     mode = nvinfer1::ScaleMode::kUNIFORM;
   }
 
+  TRT_ShapedWeights empty_weights(weights.type_);
   nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), mode, weights.GetTrtWeights(),
-      empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights());
+      *tensor, mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(),
+      empty_weights.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // restore transpose & reshape
-  if (channel_index != 0 || dims.nbDims != 3) {
+  // Restore transpose & reshape.
+  if (channel_index != 0 || original_dims.nbDims != 3) {
     nvinfer1::IShuffleLayer* shuffle_layer =
-        params->converter->network()->addShuffle(
-            *const_cast<nvinfer1::ITensor*>(output_tensor));
+        params->converter->network()->addShuffle(*output_tensor);
     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
-    nvinfer1::Dims reshape_dims = dims;
-    int tmp = reshape_dims.d[channel_index];
-    reshape_dims.d[channel_index] = reshape_dims.d[0];
-    reshape_dims.d[0] = tmp;
+    // NOTE: for same reason as mentioned above we need to apply the reshape
+    // unconditionally.
+    nvinfer1::Dims reshape_dims = original_dims;
+    if (channel_index != 0) {
+      // NOTE: according to NVIDIA dimension types are deprecated, so we don't
+      // need to copy them back.
+      reshape_dims.d[channel_index] = original_dims.d[0];
+      reshape_dims.d[0] = original_dims.d[channel_index];
+    }
     shuffle_layer->setReshapeDimensions(reshape_dims);
+
     if (channel_index != 0) {
       shuffle_layer->setSecondTranspose(permutation);
     }
     params->converter->MarkQuantizationRangesAsInferrable(
-        const_cast<nvinfer1::ITensor*>(output_tensor), shuffle_layer->getOutput(0));
+        output_tensor, shuffle_layer->getOutput(0));
     output_tensor = shuffle_layer->getOutput(0);
   }
 
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status GetTensorDimsWithProtoShape(const Tensor& tensor,
@@ -2413,18 +2391,17 @@ tensorflow::Status ConvertIdentity(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertBinary(OpConverterParams* params) {
+Status ConvertBinary(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   if (inputs.size() != 2) {
-    return tensorflow::errors::FailedPrecondition(
-        "Binary ops require two tensor input, at ", node_def.name());
+    return errors::InvalidArgument("Binary ops require two inputs, at ",
+                                   node_def.name());
   }
 
   // Constant folding should have been done by TensorFlow
-
   if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Constant folding is falled back to TensorFlow, binary op received "
         "both input as constant at: ",
         node_def.name());
@@ -2436,11 +2413,12 @@ tensorflow::Status ConvertBinary(OpConverterParams* params) {
   // can be fused in more situations. However, most of the benefits of
   // IScaleLayer are when the layer performs both a shift and a scale, which we
   // don't do except for convolutions.
-  // Try to convert into Scale layer first (for better performance)
+  //
+  // Try to convert into Scale layer first (for better performance).
   // Since scale layer supports restricted broadcast policy and op types, we
   // allow failure and try to handle it through Elementwise op
-  // (BinaryTensorOpTensor)
-  Status status = tensorflow::Status::OK();
+  // (BinaryTensorOpTensor).
+  Status status = Status::OK();
   if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) {
     status = BinaryTensorOpWeight(params, inputs.at(0).tensor(),
                                   inputs.at(1).weights(), false);
@@ -2448,7 +2426,10 @@ tensorflow::Status ConvertBinary(OpConverterParams* params) {
     status = BinaryTensorOpWeight(params, inputs.at(1).tensor(),
                                   inputs.at(0).weights(), true);
   }
+  // If both input are tensors, or one of them is weights but the conversion
+  // above failed, try the conversion using BinaryTensorOpTensor.
   if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) {
+    if (!status.ok()) VLOG(1) << status;
     status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1));
   }
   return status;
@@ -2478,17 +2459,19 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) {
 
   nvinfer1::IUnaryLayer* layer;
   if (node_def.op() == "Rsqrt") {
-    // We will need a quantization range for intermediate tensor
-    // if not using calibration.
-    // x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
-    //                   ^
-    //             need range here
+    // We will need a quantization range for intermediate tensor if not using
+    // calibration.
+    //
+    //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
+    //                     ^
+    //               need range here
     if (params->converter->precision_mode() == INT8MODE &&
         !params->converter->use_calibration()) {
-        return tensorflow::errors::Unimplemented(
-            "Intermediate quantization range cannot be determined without"
-            " calibration for Rsqrt, consider replacing with "
-            "Sqrt -> FakeQuant -> Reciprocal ops, at ", node_def.name());
+      return errors::Unimplemented(
+          "Intermediate quantization range cannot be determined without"
+          " calibration for Rsqrt, consider replacing with "
+          "Sqrt -> FakeQuant -> Reciprocal ops, at ",
+          node_def.name());
     }
     layer = params->converter->network()->addUnary(
         *const_cast<nvinfer1::ITensor*>(tensor),
@@ -3091,40 +3074,49 @@ tensorflow::Status ConvertTopK(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
-void TrtNodeValidator::RegisterOpValidators() {
+static void RegisterValidatableOpConverters(
+    std::unordered_map<string, OpConverter>* registration) {
   // TODO(laigd): support all op types.
-  op_validators_["Const"] = ConvertConst;
-  op_validators_["Transpose"] = ConvertTranspose;
-  op_validators_["Reshape"] = ConvertReshape;
-  op_validators_["MatMul"] = ConvertMatMul;
+  (*registration)["BiasAdd"] = ConvertBiasAdd;
+  (*registration)["Const"] = ConvertConst;
+  (*registration)["Transpose"] = ConvertTranspose;
+  (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["MatMul"] = ConvertMatMul;
+  (*registration)["Relu6"] = ConvertRelu6;
+
+  for (auto quantization_op_type :
+       {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3",
+        "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) {
+    (*registration)[quantization_op_type] = ConvertQuantize;
+  }
+  for (auto binary_op_type :
+       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) {
+    (*registration)[binary_op_type] = ConvertBinary;
+  }
+}
+
+void TrtNodeValidator::RegisterOpValidators() {
+  RegisterValidatableOpConverters(&op_validators_);
 }
 
 void Converter::RegisterOpConverters() {
-  // vgg_16 slim implementation
+  RegisterValidatableOpConverters(&op_registry_);
+
   op_registry_["Conv2D"] = ConvertConv2D;
   op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
   op_registry_["AvgPool"] = ConvertPool;
-  op_registry_["BiasAdd"] = ConvertScale;
-  op_registry_["Const"] = ConvertConst;
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
   op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
-  // resnet_50_v1 slim implementation
-  op_registry_["Add"] = ConvertBinary;
-  op_registry_["Mul"] = ConvertBinary;
-  op_registry_["Sub"] = ConvertBinary;
   op_registry_["Pad"] = ConvertPad;
 
   op_registry_["ConcatV2"] = ConvertConcat;
   op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
   op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
 
-  op_registry_["Div"] = ConvertBinary;
-  op_registry_["RealDiv"] = ConvertBinary;
-
   op_registry_["Rsqrt"] = ConvertUnary;
   op_registry_["Reciprocal"] = ConvertUnary;
   op_registry_["Exp"] = ConvertUnary;
@@ -3133,27 +3125,14 @@ void Converter::RegisterOpConverters() {
   op_registry_["Abs"] = ConvertUnary;
   op_registry_["Neg"] = ConvertUnary;
 
-  op_registry_["Transpose"] = ConvertTranspose;
-  op_registry_["Reshape"] = ConvertReshape;
-
   op_registry_["Sum"] = ConvertReduce;
   op_registry_["Prod"] = ConvertReduce;
   op_registry_["Max"] = ConvertReduce;
   op_registry_["Min"] = ConvertReduce;
   op_registry_["Mean"] = ConvertReduce;
-  op_registry_["Maximum"] = ConvertBinary;
-  op_registry_["Minimum"] = ConvertBinary;
   op_registry_["Softmax"] = ConvertSoftmax;
-  op_registry_["MatMul"] = ConvertMatMul;
   op_registry_["BatchMatMul"] = ConvertBatchMatMul;
   op_registry_["TopKV2"] = ConvertTopK;
-  op_registry_["Relu6"] = ConvertRelu6;
-# if NV_TENSORRT_MAJOR >= 5
-  op_registry_["QuantizeAndDequantizeV2"] = ConvertQuantize;
-  op_registry_["QuantizeAndDequantizeV3"] = ConvertQuantize;
-  op_registry_["FakeQuantWithMinMaxVars"] = ConvertQuantize;
-  op_registry_["FakeQuantWithMinMaxArgs"] = ConvertQuantize;
-#endif
 
   plugin_converter_ = ConvertPlugin;
 }
@@ -3164,8 +3143,7 @@ tensorflow::Status ConvertGraphDefToEngine(
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
-    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
-    bool use_calibration,
+    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully) {
   engine->reset();
   if (convert_successfully) *convert_successfully = false;
@@ -3254,9 +3232,7 @@ tensorflow::Status ConvertGraphDefToEngine(
   if (convert_successfully) *convert_successfully = true;
 
   // Apply user provided quantization ranges to tensors
-  const bool warn_missing_ranges = (precision_mode == INT8MODE &&
-                                    !use_calibration);
-  converter.ApplyQuantizationRanges(warn_missing_ranges);
+  converter.MaybeApplyQuantizationRanges();
 
   // Build the engine.
   VLOG(1) << "Starting engine creation";
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 78749124a2690fe029e0a8b503bf6916efb0cae2..54e19b73957bccdae2b23bd3556de9ad00b864e5 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -147,8 +147,7 @@ tensorflow::Status ConvertGraphDefToEngine(
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
-    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
-    bool use_calibration,
+    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully);
 
 // Helper class for the segmenter to determine whether an output edge from the
@@ -395,8 +394,7 @@ class TrtNodeValidator {
 // Class to convert TF nodes to TRT network.
 class Converter {
  public:
-  Converter(nvinfer1::INetworkDefinition* trt_network,
-            int precision_mode,
+  Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode,
             bool use_calibration);
 
   //////////////////////////////////////////////////////////////////////////////
@@ -442,12 +440,12 @@ class Converter {
   // This function should be called when we know the quantization range of a
   // tensor, either from a quantize/dequantize node or when the output is a
   // fixed range (e.g. SoftMax, Relu6, Sigmoid).
-  void ProvideQuantizationRange(nvinfer1::ITensor* tensor,
-                                float min_range, float max_range);
+  void ProvideQuantizationRange(nvinfer1::ITensor* tensor, float min_range,
+                                float max_range);
 
   // Should be called when full TRT network has been constructed and before
   // building the engine.
-  void ApplyQuantizationRanges(bool warn_missing_ranges);
+  void MaybeApplyQuantizationRanges();
 
   // Below are helper methods for op converters to add different layers to the
   // TRT network.
@@ -464,6 +462,13 @@ class Converter {
                                const nvinfer1::Dims& dims,
                                const nvinfer1::ITensor** tensor);
 
+  // Return OK if the broadcast scheme is supported and compute the shapes after
+  // broadcasting.
+  Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l,
+                              const TRT_TensorOrWeights& operand_r,
+                              nvinfer1::Dims* operand_l_new_dims,
+                              nvinfer1::Dims* operand_r_new_dims) const;
+
  private:
   // Verify the provided batch_size is consistent with batch_size_ and update it
   // if necessary.
@@ -482,10 +487,10 @@ class Converter {
   void RegisterOpConverters();
 
   void PropagateQuantizationRanges();
-  
+
   // Gets the min and max value in a TRT_ShapedWeights
-  Status GetWeightRange(const TRT_ShapedWeights& weights,
-                        float* out_min, float* out_max) const;
+  Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min,
+                        float* out_max) const;
 
   // Registered op converters by op type.
   std::unordered_map<string, OpConverter> op_registry_;
@@ -503,7 +508,7 @@ class Converter {
   TrtWeightStore weight_store_;
 
   // During conversion, this table is populated with quantization ranges per
-  // tensor. ApplyQuantizationRanges() will use this table to set the TensorRT
+  // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT
   // quantization ranges. Since TRT only supports symmetric ranges, we will
   // store the range as a single float = max(abs(min_range), abs(max_range)).
   // Range refers to the floating point values, e.g. min_range = 0.0f, max_range
@@ -514,8 +519,8 @@ class Converter {
   // first tensor to second tensor. PropagateQuantizationRanges() will propagate
   // known ranges from quantization_ranges_ across these edges, adding the new
   // ranges to quantization_ranges_ so that they can be applied in
-  // ApplyQuantizationRanges().
-  std::vector<std::pair<nvinfer1::ITensor*,nvinfer1::ITensor*>>
+  // MaybeApplyQuantizationRanges().
+  std::vector<std::pair<nvinfer1::ITensor*, nvinfer1::ITensor*>>
       quantization_infer_;
 
   const int precision_mode_;
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
index 257394c0a308cdeced07b81929a69d583fbe0c40..603c4f7b5e5af8df7f81484c715675968f5da695 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
@@ -35,7 +35,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
+#include "tensorflow/core/public/session.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -47,7 +50,9 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
+using ::tensorflow::strings::StrCat;
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 
 // TODO(laigd): put this into some test utils file.
 void ExpectStatus(Status status, error::Code code = error::OK,
@@ -69,6 +74,32 @@ nvinfer1::Dims GetTestDims(const std::vector<int>& d) {
   return dims;
 }
 
+nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) {
+  switch (tf_dtype) {
+    case DT_FLOAT:
+      return nvinfer1::DataType::kFLOAT;
+    case DT_HALF:
+      return nvinfer1::DataType::kHALF;
+    case DT_INT32:
+      return nvinfer1::DataType::kINT32;
+    default:
+      QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype);
+  }
+}
+
+DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) {
+  switch (trt_dtype) {
+    case nvinfer1::DataType::kFLOAT:
+      return DT_FLOAT;
+    case nvinfer1::DataType::kHALF:
+      return DT_HALF;
+    case nvinfer1::DataType::kINT32:
+      return DT_INT32;
+    default:
+      QCHECK(false) << "Unexpected data type " << static_cast<int>(trt_dtype);
+  }
+}
+
 NodeDef MakeNodeDef(const string& name, const string& op,
                     const std::vector<string>& inputs) {
   NodeDef node_def;
@@ -111,6 +142,15 @@ bool TrtDimsEqualsArray(const std::vector<int>& lhs,
   return TrtDimsEquals(GetTestDims(lhs), rhs);
 }
 
+// TODO(laigd): define a parameterized matcher that can compare against the
+// vector.
+void ExpectTrtDimsEqualsArray(const std::vector<int>& lhs,
+                              const nvinfer1::Dims& rhs) {
+  EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs))
+      << "expected: " << DebugString(GetTestDims(lhs)) << "\n"
+      << "  actual: " << DebugString(rhs);
+}
+
 bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs,
                             const TRT_ShapedWeights& rhs) {
   return TrtDimsEquals(lhs.shape_, rhs.shape_) && lhs.type_ == rhs.type_ &&
@@ -121,8 +161,7 @@ template <typename T>
 void ValidateWeights(const TRT_ShapedWeights& weights,
                      const std::vector<int>& expected_dims,
                      const std::vector<T>& expected_value) {
-  EXPECT_TRUE(TrtDimsEqualsArray(expected_dims, weights.shape_))
-      << weights.DebugString();
+  ExpectTrtDimsEqualsArray(expected_dims, weights.shape_);
   ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString();
   const T* actual_values = static_cast<const T*>(weights.GetValues());
   for (int i = 0; i < expected_value.size(); ++i) {
@@ -272,9 +311,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
           EXPECT_EQ(1, ptr->batch_size());
         }
         EXPECT_EQ(&itensor, ptr->tensor());
-        EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims()))
-            << "- expected: " << DebugString(dims)
-            << "\n        vs\n-   actual: " << DebugString(ptr->GetTrtDims());
+        ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims());
       }
     }
   }
@@ -293,9 +330,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
       EXPECT_EQ(false, ptr->is_weights());
       EXPECT_EQ(1, ptr->batch_size());
       EXPECT_NE(nullptr, ptr->tensor());
-      EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims()))
-          << "- expected: " << DebugString(dims)
-          << "\n        vs\n-   actual: " << DebugString(ptr->GetTrtDims());
+      ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims());
     }
   }
   // Test constructor with TRT_ShapedWeights argument.
@@ -312,9 +347,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
 
       nvinfer1::Dims dims;
       dims.nbDims = 0;
-      EXPECT_TRUE(TrtDimsEqualsArray({}, ptr->GetTrtDims()))
-          << "- expected: " << DebugString(dims)
-          << "\n        vs\n-   actual: " << DebugString(ptr->GetTrtDims());
+      ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims());
     }
   }
 }
@@ -348,34 +381,50 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) {
                                           graph_properties, &output));
     ValidateWeights<float>(output.weights(), {2}, {1.0, 2.0});
   }
-  // Convert non-Const. We test the case where the non-batch dimemsion is
-  // unknown as well, to make sure the validator allows that.
-  for (const int32 non_batch_dim : {-1, 2}) {
-    const int32 batch_size = 12;
 
+  // Helper method to run ConvertToTensorOrWeights() with predefined parameters.
+  auto convert_to_tensor_or_weights = [this](const std::vector<int64>& dims,
+                                             TRT_TensorOrWeights* output) {
     Scope s = Scope::NewRootScope();
-    ops::Placeholder::Attrs attrs;
-    TF_EXPECT_OK(TensorShapeUtils::MakeShape(
-        std::vector<int32>{batch_size, non_batch_dim}, &attrs.shape_));
+    const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims});
     auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT, attrs);
     auto add = ops::Add(s.WithOpName("add"), feed, feed);
 
     grappler::GrapplerItem item;
     TF_EXPECT_OK(s.ToGraphDef(&item.graph));
-
     grappler::GraphProperties graph_properties(item);
     TF_EXPECT_OK(graph_properties.InferStatically(true));
-
-    auto& node_def = add.operation.node()->def();
+    const NodeDef& node_def = add.operation.node()->def();
+    return this->ConvertToTensorOrWeights(node_def, /*output_port=*/0,
+                                          graph_properties, output);
+  };
+  // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1.
+  {
     TRT_TensorOrWeights output;
-    ExpectStatus(ConvertToTensorOrWeights(node_def, /*output_port=*/0,
-                                          graph_properties, &output));
+    ExpectStatus(
+        convert_to_tensor_or_weights(
+            std::vector<int64>(nvinfer1::Dims::MAX_DIMS + 2, 1), &output),
+        error::OUT_OF_RANGE, "Input tensor rank is greater than 9");
+  }
+  // Convert non-Const with #dims < 2.
+  {
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights({1}, &output), error::INVALID_ARGUMENT,
+        "Input tensor with rank<2 is not supported since the first dimension "
+        "is treated as batch dimension by TRT");
+  }
+  // Convert non-Const. We test the case where the non-batch dimemsion is
+  // unknown as well, to make sure the validator allows that.
+  for (const int32 non_batch_dim : {-1, 2}) {
+    const int32 batch_size = 12;
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output));
     EXPECT_EQ(true, output.is_tensor());
     EXPECT_EQ(batch_size, output.batch_size());
     EXPECT_NE(nullptr, output.tensor());
-    EXPECT_TRUE(TrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims()))
-        << "- expected: {" << non_batch_dim << "} \n        vs\n"
-        << "-   actual: " << DebugString(output.GetTrtDims());
+    ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims());
   }
 }
 
@@ -526,9 +575,9 @@ TEST_F(ConverterTest, AddAndGetInputs) {
   EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType());
   EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType());
   EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType());
-  EXPECT_TRUE(TrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()));
-  EXPECT_TRUE(TrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()));
-  EXPECT_TRUE(TrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()));
+  ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions());
+  ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions());
+  ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions());
 }
 
 TEST_F(ConverterTest, RenameAndMarkOutputTensors) {
@@ -574,7 +623,7 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) {
       {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}}));
   EXPECT_EQ(2, output_tensors.size());
   for (auto output_tensor : output_tensors) {
-    EXPECT_TRUE(TrtDimsEqualsArray({2, 1}, output_tensor->getDimensions()));
+    ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions());
   }
   EXPECT_EQ("my_output", string(output_tensors[0]->getName()));
   EXPECT_EQ("my_output_1", string(output_tensors[1]->getName()));
@@ -599,8 +648,7 @@ TEST_F(ConverterTest, TransposeTensor) {
   // OK.
   TF_EXPECT_OK(
       converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions());
 }
 
 TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
@@ -612,7 +660,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
   // Shape size doesn't match.
   ExpectStatus(converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}),
                                                  &output_tensor),
-               error::INVALID_ARGUMENT, "Reshape shapes are not compatible.");
+               error::INVALID_ARGUMENT, "Reshape shapes are not compatible");
 
   // TODO(aaroey): we should check the case where uninferred dimensions are not
   // an exact divisor of input dim ensions, e.g. for dims {-1, 7}.
@@ -620,14 +668,12 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
   // Infer shape, ok.
   TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({-1, 2}),
                                                  &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({15, 2}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions());
 
   // Regular shape.
   TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
                                                  &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
 }
 
 TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
@@ -637,8 +683,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
   const nvinfer1::ITensor* output_tensor = nullptr;
   TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
                                                  &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
 }
 
 TEST_F(ConverterTest, MaybeUpdateBatchSize) {
@@ -678,51 +723,57 @@ TEST_F(ConverterTest, AddAndGetTensorOrWeights) {
                "tensor/weights my_tensor already exist");
 }
 
-TEST_F(ConverterTest, GetWeightRange) {
+template <typename T>
+void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) {
   TRT_ShapedWeights weights =
-      weight_store_->GetTempWeights(DT_FLOAT, GetTestDims({2, 3}));
-  const std::vector<float> values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+      weight_store->GetTempWeights(DataTypeToEnum<T>::v(), GetTestDims({2, 3}));
+  const std::vector<T> values = {T(3), T(1), T(2), T(6), T(5), T(4)};
   memcpy(const_cast<void*>(weights.GetValues()), values.data(),
          weights.size_bytes());
 
   float out_min = 0.0f;
   float out_max = 0.0f;
-  TF_EXPECT_OK(GetWeightRange(weights, &out_min, &out_max));
+  TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max));
   EXPECT_EQ(1.0f, out_min);
   EXPECT_EQ(6.0f, out_max);
 }
 
+TEST_F(ConverterTest, GetWeightRange) {
+  TestGetWeightRange<float>(this, weight_store_);
+  TestGetWeightRange<Eigen::half>(this, weight_store_);
+  TestGetWeightRange<int32>(this, weight_store_);
+}
+
 TEST_F(ConverterTest, ProvideQuantizationRange) {
   FakeITensor fake_tensor;
   // Assymetric range
   converter_->ProvideQuantizationRange(&fake_tensor, 0.0f, 6.0f);
-  EXPECT_EQ(quantization_ranges()[&fake_tensor], 6.0f);
+  EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]);
   converter_->ProvideQuantizationRange(&fake_tensor, 1.0f, 6.0f);
-  EXPECT_EQ(quantization_ranges()[&fake_tensor], 6.0f);
+  EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]);
   converter_->ProvideQuantizationRange(&fake_tensor, -8.0f, 6.0f);
-  EXPECT_EQ(quantization_ranges()[&fake_tensor], 8.0f);
+  EXPECT_EQ(8.0f, quantization_ranges()[&fake_tensor]);
   converter_->ProvideQuantizationRange(&fake_tensor, -8.123f, -6.123f);
-  EXPECT_EQ(quantization_ranges()[&fake_tensor], 8.123f);
+  EXPECT_EQ(8.123f, quantization_ranges()[&fake_tensor]);
   // Symmetric range
   converter_->ProvideQuantizationRange(&fake_tensor, -6.123f, 6.123f);
-  EXPECT_EQ(quantization_ranges()[&fake_tensor], 6.123f);
+  EXPECT_EQ(6.123f, quantization_ranges()[&fake_tensor]);
 }
 
-TEST_F(ConverterTest, ApplyQuantizationRanges) {
+TEST_F(ConverterTest, MaybeApplyQuantizationRanges) {
   // input -> infer1 -> infer2 -> infer3
-  FakeITensor input;
-  FakeITensor infer_1;
-  FakeITensor infer_2;
-  FakeITensor infer_3;
+  FakeITensor input, infer_1, infer_2, infer_3;
   FakeITensor not_infer;
-  converter_->ProvideQuantizationRange(&input, -5.0f, 5.0f);
-  converter_->ProvideQuantizationRange(&not_infer, -100.0f, 100.0f);
-  converter_->MarkQuantizationRangesAsInferrable(&input, &infer_1);
-  converter_->MarkQuantizationRangesAsInferrable(&infer_1, &infer_2);
-  converter_->MarkQuantizationRangesAsInferrable(&infer_2, &infer_3);
+  Converter int8_converter(/*trt_network=*/nullptr, INT8MODE,
+                           /*use_calibration=*/true);
+  int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f);
+  int8_converter.ProvideQuantizationRange(&not_infer, -100.0f, 100.0f);
+  int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1);
+  int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2);
+  int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3);
 
   // Input range should be inferred along the chain and applied to tensors.
-  converter_->ApplyQuantizationRanges(/*warn_missing_ranges=*/false);
+  int8_converter.MaybeApplyQuantizationRanges();
 #if NV_TENSORRT_MAJOR >= 5
   EXPECT_EQ(input.getDynamicRange(), 5.0f);
   EXPECT_EQ(infer_1.getDynamicRange(), 5.0f);
@@ -733,27 +784,117 @@ TEST_F(ConverterTest, ApplyQuantizationRanges) {
 }
 
 TEST_F(ConverterTest, PropagateQuantizationRanges) {
-  // input <-> infer1 <-> infer2 <-> infer3
-  FakeITensor input;
-  FakeITensor infer_1;
-  FakeITensor infer_2;
-  FakeITensor infer_3;
+  // infer0 <-> infer1 <-> infer2 <-> infer3
+  //              |
+  //            infer4 <-> infer5
+  FakeITensor infer[6];
   FakeITensor not_infer;
-  converter_->ProvideQuantizationRange(&input, -5.0f, 5.0f);
-  converter_->MarkQuantizationRangesAsInferrable(&input, &infer_1);
-  converter_->MarkQuantizationRangesAsInferrable(&infer_1, &infer_2);
-  converter_->MarkQuantizationRangesAsInferrable(&infer_3, &infer_2);
+  converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]);
 
   // Input range should be inferred along the chain.
   PropagateQuantizationRanges();
   auto ranges = quantization_ranges();
-  EXPECT_EQ(ranges[&input], 5.0f);
-  EXPECT_EQ(ranges[&infer_1], 5.0f);
-  EXPECT_EQ(ranges[&infer_2], 5.0f);
-  EXPECT_EQ(ranges[&infer_3], 5.0f);
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(5.0f, ranges[&infer[i]]);
+  }
   EXPECT_EQ(ranges.count(&not_infer), 0);
 }
 
+TEST_F(ConverterTest, GetTrtBroadcastShape) {
+  const bool kIsTensor = true;
+  const bool kIsNotTensor = false;
+  auto symmetric_test = [this](const std::vector<int>& operand_1_shape,
+                               const std::vector<int>& operand_2_shape,
+                               const bool operand_1_is_tensor,
+                               const bool operand_2_is_tensor,
+                               const std::vector<int>& expected_operand_1_shape,
+                               const std::vector<int>& expected_operand_2_shape,
+                               error::Code expected_code = error::OK,
+                               const char* expected_error_msg_substr = nullptr,
+                               const int operand_1_batch_size = -1,
+                               const int operand_2_batch_size = -1) {
+    auto create_tensor_or_weights = [](const std::vector<int>& shape,
+                                       bool is_tensor, int batch_size = -1) {
+      if (is_tensor) {
+        return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT,
+                                   GetTestDims(shape), batch_size};
+      }
+      TRT_ShapedWeights weights;
+      weights.shape_ = GetTestDims(shape);
+      return TRT_TensorOrWeights(weights);
+    };
+
+    nvinfer1::Dims operand_1_new_dims, operand_2_new_dims;
+    TRT_TensorOrWeights operand_1 = create_tensor_or_weights(
+        operand_1_shape, operand_1_is_tensor, operand_1_batch_size);
+    TRT_TensorOrWeights operand_2 = create_tensor_or_weights(
+        operand_2_shape, operand_2_is_tensor, operand_2_batch_size);
+
+    // operand_1 broadcast operand_2
+    ExpectStatus(
+        this->converter_->GetTrtBroadcastShape(
+            operand_1, operand_2, &operand_1_new_dims, &operand_2_new_dims),
+        expected_code, expected_error_msg_substr);
+    if (expected_code == error::OK) {
+      ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims);
+      ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims);
+    }
+    // operand_2 broadcast operand_1
+    ExpectStatus(
+        this->converter_->GetTrtBroadcastShape(
+            operand_2, operand_1, &operand_2_new_dims, &operand_1_new_dims),
+        expected_code, expected_error_msg_substr);
+    if (expected_code == error::OK) {
+      ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims);
+      ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims);
+    }
+  };
+
+  // Both inputs are weights.
+  symmetric_test(
+      {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT,
+      "Broadcasting requires at least one of the operands be tensors");
+
+  // One tensor and one weights.
+  symmetric_test({1, 1, 1}, {2}, kIsTensor, kIsNotTensor, {1, 1, 1}, {1, 1, 2});
+  symmetric_test({1, 1, 2}, {2}, kIsTensor, kIsNotTensor, {1, 1, 2}, {1, 1, 2});
+  symmetric_test({1, 3, 2}, {1}, kIsTensor, kIsNotTensor, {1, 3, 2}, {1, 1, 1});
+  symmetric_test({1, 1, 1}, {2, 3}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {1, 2, 3});
+  symmetric_test({1, 1, 1}, {2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {2, 3, 4});
+  symmetric_test({1, 1, 1}, {1, 2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {2, 3, 4});
+  symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4},
+                 {2, 1, 4});
+  symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT, "Infeasible broadcast scheme");
+  symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT, "Infeasible broadcast scheme",
+                 /*operand_1_batch_size=*/2);
+  symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 4 vs broadcast #dims 5)");
+
+  // Both inputs are tensors.
+  symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 3 vs broadcast #dims 4)");
+  symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4},
+                 {2, 1, 4});
+  symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 4 vs broadcast #dims 5)");
+}
+
 // Class to test various op converters, using both a TrtNodeValidator and
 // Converter.
 class OpConverterTest : public ::testing::Test {
@@ -791,8 +932,12 @@ class OpConverterTest : public ::testing::Test {
     validator_inputs_.clear();
   }
 
-  void BuildAndRun(const char* input_name, const std::vector<float>& input_data,
-                   const char* output_name, std::vector<float>* output_data) {
+  // TODO(laigd): test fp16 and int8 support.
+  template <typename T>
+  void BuildAndRun(
+      const std::vector<std::pair<const char*, const std::vector<T>>>&
+          input_data,
+      const char* output_name, std::vector<T>* output_data) {
     // Mark the output tensor as TRT engine output.
     TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(
         {{string(output_name), string(output_name)}}));
@@ -803,25 +948,33 @@ class OpConverterTest : public ::testing::Test {
     CHECK_NOTNULL(engine_.get());
 
     // Execute the TRT engine.
-    const int input_size = input_data.size() * sizeof(float);
-    const int output_size = output_data->size() * sizeof(float);
-    const int input_index = engine_->getBindingIndex(input_name);
-    const int output_index = engine_->getBindingIndex(output_name);
+    ASSERT_LE(input_data.size() + 1, 3);
+    void* buffers[3];
+    for (const auto name_and_data : input_data) {
+      const int input_size = name_and_data.second.size() * sizeof(T);
+      const int input_index = engine_->getBindingIndex(name_and_data.first);
+      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
+      ASSERT_EQ(
+          0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(),
+                             input_size, cudaMemcpyHostToDevice, stream_));
+    }
 
-    ASSERT_EQ(engine_->getNbBindings(), 2);
-    void* buffers[2];
-    ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
+    const int output_size = output_data->size() * sizeof(T);
+    const int output_index = engine_->getBindingIndex(output_name);
     ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size));
-    ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input_data.data(),
-                                 input_size, cudaMemcpyHostToDevice, stream_));
+
+    ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1);
+
     TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context(
         engine_->createExecutionContext());
     execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr);
     ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index],
                                  output_size, cudaMemcpyDeviceToHost, stream_));
     cudaStreamSynchronize(stream_);
-    ASSERT_EQ(0, cudaFree(buffers[input_index]));
-    ASSERT_EQ(0, cudaFree(buffers[output_index]));
+
+    for (int i = 0; i < input_data.size() + 1; ++i) {
+      ASSERT_EQ(0, cudaFree(buffers[i]));
+    }
   }
 
   bool HasStaticShape(const nvinfer1::Dims& dims) const {
@@ -836,18 +989,7 @@ class OpConverterTest : public ::testing::Test {
   void AddTestTensor(
       const char* name, const std::vector<int32>& dims, int batch_size = 1,
       nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) {
-    DataType tf_dtype = DT_FLOAT;
-    switch (trt_dtype) {
-      case nvinfer1::DataType::kFLOAT:
-        tf_dtype = DT_FLOAT;
-        break;
-      case nvinfer1::DataType::kINT32:
-        tf_dtype = DT_INT32;
-        break;
-      default:
-        ASSERT_TRUE(false) << "Unexpected data type "
-                           << static_cast<int>(trt_dtype);
-    }
+    DataType tf_dtype = TrtDataTypeToTf(trt_dtype);
     ops::Placeholder::Attrs attrs;
     TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_));
     attrs.shape_.InsertDim(0, batch_size);
@@ -940,6 +1082,11 @@ class OpConverterTest : public ::testing::Test {
   TrtUniquePtrType<nvinfer1::INetworkDefinition> network_;
   TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
   cudaStream_t stream_;
+  // Used to create placeholders with shape and data type information. The
+  // created placeholders will be used as inputs to the node to be verified,
+  // thus we need the shape and data type information to get a non-empty
+  // GraphProperties.
+  // TODO(laigd): consider use this Scope to create the NodeDef to verify.
   Scope scope_;
   std::unordered_map<string, NodeDef> validator_inputs_;
 };
@@ -1063,15 +1210,15 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     Reset();
     AddTestTensor("input", {1, 2, 3});
     AddTestWeights<int32>("weights", {4}, {0, 3, 1, 2});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output));
     EXPECT_TRUE(output.is_tensor());
-    EXPECT_TRUE(TrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()))
-        << output.DebugString();
+    ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions());
 
     std::vector<float> output_data(6);
-    BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_transpose", &output_data);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose",
+                       &output_data);
     EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6));
   }
 }
@@ -1153,15 +1300,15 @@ TEST_F(OpConverterTest, ConvertReshape) {
     Reset();
     AddTestTensor("input", ok_params[i].tensor_dims, ok_params[i].batch_size);
     AddTestWeights<int32>("weights", {4}, ok_params[i].shape);
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output));
     EXPECT_TRUE(output.is_tensor());
-    EXPECT_TRUE(TrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions()))
-        << output.DebugString();
+    ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions());
 
     std::vector<float> output_data(6);
-    BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_reshape", &output_data);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape",
+                       &output_data);
     EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
@@ -1175,15 +1322,14 @@ TEST_F(OpConverterTest, ConvertMatMul) {
         "Input expects tensor and weights, at my_matmul");
   }
 
-  // Get the NodeDef for Reshape.
+  // Get the NodeDef for MatMul.
   auto get_matmul_nodedef = [](DataType dtype, bool transpose_a,
                                bool transpose_b) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), dtype);
     auto weights = ops::Placeholder(s.WithOpName("weights"), dtype);
-    ops::MatMul::Attrs matmul_attrs;
-    matmul_attrs.transpose_a_ = transpose_a;
-    matmul_attrs.transpose_b_ = transpose_b;
+    const auto matmul_attrs =
+        ops::MatMul::TransposeA(transpose_a).TransposeB(transpose_b);
     auto matmul =
         ops::MatMul(s.WithOpName("my_matmul"), input, weights, matmul_attrs);
     return matmul.operation.node()->def();
@@ -1199,82 +1345,499 @@ TEST_F(OpConverterTest, ConvertMatMul) {
         node_def, error::UNIMPLEMENTED,
         "Data type is not supported, for node my_matmul got int32");
   }
-  {
-    // transpose_a is set.
-    for (bool transpose_b : {false, true}) {
-      Reset();
-      NodeDef node_def =
-          get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b);
-      AddTestTensor("input", {2}, /*batch_size=*/1);
-      AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
-      RunValidationAndConversion(
-          node_def, error::INVALID_ARGUMENT,
-          "transpose_a is not supported for TensorRT FullyConnected");
+  // transpose_a is set.
+  for (bool transpose_b : {false, true}) {
+    Reset();
+    NodeDef node_def =
+        get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b);
+    AddTestTensor("input", {2}, /*batch_size=*/1);
+    AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "transpose_a is not supported for TensorRT FullyConnected");
+  }
+  // OK.
+  for (bool transpose_b : {false, true}) {
+    Reset();
+    NodeDef node_def =
+        get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b);
+    AddTestTensor("input", {2}, /*batch_size=*/1);
+    AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions());
+
+    std::vector<float> output_data(2);
+    BuildAndRun<float>({{"input", {0, 1}}}, "my_matmul", &output_data);
+    if (transpose_b) {
+      EXPECT_THAT(output_data, ElementsAre(1, 3));
+    } else {
+      EXPECT_THAT(output_data, ElementsAre(2, 3));
     }
   }
-  {
-    // OK.
-    for (bool transpose_b : {false, true}) {
-      Reset();
-      NodeDef node_def =
-          get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b);
-      AddTestTensor("input", {2}, /*batch_size=*/1);
-      AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
-      RunConversion(node_def);
+}
+
+template <DataType dtype>
+void TestConvertBiasAdd(OpConverterTest* test) {
+  // Get the NodeDef for BiasAdd.
+  auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+    auto weights = ops::Placeholder(s.WithOpName("weights"), dtype);
+    const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format);
+    auto biasadd =
+        ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs);
+    return biasadd.operation.node()->def();
+  };
+
+  typedef typename EnumToDataType<dtype>::Type CType;
+  for (const string& data_format : {"NHWC", "NCHW"}) {
+    for (const int trt_input_rank : {1, 2, 3, 4}) {
+      test->Reset();
+      NodeDef node_def = get_biasadd_nodedef(data_format);
+
+      // Add input, dims_array will be like {2, 1, ..., 1, 3}
+      std::vector<int32> dims_array(trt_input_rank, 1);
+      if (trt_input_rank == 1) {
+        dims_array[0] = (data_format == "NHWC" ? 3 : 2);
+      } else {
+        dims_array[0] = 2;
+        dims_array[trt_input_rank - 1] = 3;
+      }
+      test->AddTestTensor("input", dims_array, /*batch_size=*/1,
+                          TfDataTypeToTrt(dtype));
+
+      // Add bias weights.
+      const int channel_size = (data_format == "NHWC" ? 3 : 2);
+      std::vector<CType> bias(channel_size);
+      for (int i = 0; i < channel_size; ++i) {
+        bias[i] = CType(i + 1);  // bias will be {1, 2, 3, ...}
+      }
+      test->AddTestWeights<CType>("weights", {channel_size}, bias);
+
+      // Run the conversion.
+      test->RunValidationAndConversion(node_def);
       TRT_TensorOrWeights output;
-      TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output));
+      TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output));
       EXPECT_TRUE(output.is_tensor());
-      EXPECT_TRUE(TrtDimsEqualsArray({2}, output.tensor()->getDimensions()))
-          << output.DebugString();
-
-      std::vector<float> output_data(2);
-      BuildAndRun("input", {0, 1}, "my_matmul", &output_data);
-      if (transpose_b) {
-        EXPECT_THAT(output_data, ElementsAre(1, 3));
+      ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions());
+
+      // Build and run the engine.
+      const int num_input = TrtDimsNumElements(GetTestDims(dims_array));
+      ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2),
+                num_input);
+      std::vector<CType> output_data(num_input);
+      test->BuildAndRun<CType>(
+          {{"input", std::vector<CType>(num_input, CType(0))}}, "my_biasadd",
+          &output_data);
+      if (trt_input_rank == 1) {
+        if (data_format == "NHWC") {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3)));
+        } else {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2)));
+        }
       } else {
-        EXPECT_THAT(output_data, ElementsAre(2, 3));
+        if (data_format == "NHWC") {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3),
+                                               CType(1), CType(2), CType(3)));
+        } else {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1),
+                                               CType(2), CType(2), CType(2)));
+        }
       }
     }
   }
 }
 
-TEST_F(OpConverterTest, ConvertQuantize) {
+TEST_F(OpConverterTest, ConvertBiasAdd) {
   {
     // Input list is empty, should fail.
-    NodeDef node_def =
-        MakeNodeDef("my_quantize", "QuantizeAndDequantizeV2", {});
-    RunConversion(
+    NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {});
+    RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Invalid number of inputs for QuantizeAndDequantizeV2, at my_quantize");
+        "Input expects tensor and weights, at my_biasadd");
+  }
+
+  // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test
+  // DT_INT32 type here.
+  TestConvertBiasAdd<DT_FLOAT>(this);
+  TestConvertBiasAdd<DT_HALF>(this);
+}
+
+template <typename OpType>
+NodeDef GetBinaryOpNodeDef(const string& input_name_l,
+                           const string& input_name_r, DataType dtype) {
+  Scope s = Scope::NewRootScope();
+  auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype);
+  auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype);
+  auto op = OpType(s.WithOpName("my_binary"), input_l, input_r);
+  return op.operation.node()->def();
+}
+
+void CheckAddedLayers(OpConverterTest* test, bool expect_scale_layer) {
+  bool element_wise_layer_found = false;
+  bool scale_layer_found = false;
+  for (int i = 0; i < test->converter_->network()->getNbLayers(); i++) {
+    nvinfer1::ILayer* layer = test->converter_->network()->getLayer(i);
+    if (dynamic_cast<nvinfer1::IScaleLayer*>(layer)) {
+      scale_layer_found = true;
+    } else if (dynamic_cast<nvinfer1::IElementWiseLayer*>(layer)) {
+      element_wise_layer_found = true;
+    }
+  }
+  EXPECT_EQ(expect_scale_layer, scale_layer_found);
+  EXPECT_NE(expect_scale_layer, element_wise_layer_found);
+}
+
+template <typename OpType, DataType dtype>
+void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  for (auto swap_inputs : {false, true}) {
+    test->Reset();
+    NodeDef node_def;
+    if (swap_inputs) {
+      node_def = GetBinaryOpNodeDef<OpType>("weights", "input", dtype);
+    } else {
+      node_def = GetBinaryOpNodeDef<OpType>("input", "weights", dtype);
+    }
+
+    const std::vector<CType> operand1{CType(3), CType(7.5)};
+    const std::vector<CType> operand2{CType(2), CType(3)};
+
+    // It requires the dims to be at least of rank 3 to apply an IScaleLayer.
+    test->AddTestTensor("input", /*dims=*/{1, 1, 2}, /*batch_size=*/1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("weights", /*dims=*/{1, 1, 2},
+                                /*values=*/swap_inputs ? operand1 : operand2);
+    test->RunValidationAndConversion(node_def);
+
+    // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+    CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+    // Check the dims of the output ITensor.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<CType> output_data(2);
+    test->BuildAndRun<CType>(
+        {{"input",
+          /*input_data=*/swap_inputs ? operand2 : operand1}},
+        "my_binary", &output_data);
+    if (node_def.op() == "Add") {
+      EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5)));
+    } else if (node_def.op() == "Sub") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5)));
+    } else if (node_def.op() == "Mul") {
+      EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5)));
+    } else if (node_def.op() == "Div") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+    } else if (node_def.op() == "RealDiv") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+    } else {
+      ASSERT_TRUE(false);
+    }
+  }
+}
+
+template <DataType dtype>
+void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<ops::Add>("input", "weights", dtype);
+  const std::vector<CType> input{CType(1), CType(2), CType(3), CType(4)};
+  const std::vector<CType> weights{CType(10), CType(20)};
+  // There are two types of valid dim pairs which requires channel-wise
+  // broadcasting:
+  // - input dims (X Y Z) vs weights dims (X 1 1)
+  // - input dims (X Y Z) vs weights dims (Z)
+  // Here X=Z=2 and Y=1.
+  for (auto weights_dims : std::vector<std::vector<int>>{{2, 1, 1}, {2}}) {
+    test->Reset();
+    test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("weights", weights_dims, weights);
+    test->RunValidationAndConversion(node_def);
+
+    // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+    CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+    // Check the dims of the output ITensor.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<CType> output_data(4);
+    test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+    if (weights_dims.size() == 1) {
+      EXPECT_THAT(output_data,
+                  ElementsAre(CType(11), CType(22), CType(13), CType(24)));
+    } else {
+      EXPECT_THAT(output_data,
+                  ElementsAre(CType(11), CType(12), CType(23), CType(24)));
+    }
+  }
+}
+
+template <DataType dtype>
+void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<ops::Add>("input", "weights", dtype);
+  const std::vector<CType> input{CType(1), CType(2), CType(3), CType(4)};
+  const std::vector<CType> weights{CType(10)};
+  test->Reset();
+  test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestWeights<CType>("weights", {1, 1, 1, 1}, weights);
+  test->RunValidationAndConversion(node_def);
+
+  // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+  CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+  // Check the dims of the output ITensor.
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
+
+  std::vector<CType> output_data(4);
+  test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+  EXPECT_THAT(output_data,
+              ElementsAre(CType(11), CType(12), CType(13), CType(14)));
+}
+
+template <typename OpType>
+void TestBinaryTensorOpWeightFallback(OpConverterTest* test,
+                                      const std::vector<int32>& input_dims,
+                                      const std::vector<int>& weights_dims,
+                                      error::Code code = error::OK,
+                                      const char* error_msg_substr = nullptr,
+                                      const int input_batch_size = 1) {
+  const DataType dtype = DT_FLOAT;
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const size_t num_inputs = TrtDimsNumElements(GetTestDims(input_dims));
+  const size_t num_weights = TrtDimsNumElements(GetTestDims(weights_dims));
+
+  test->Reset();
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<OpType>("input", "weights", dtype);
+  test->AddTestTensor("input", /*dims=*/input_dims, input_batch_size,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestWeights<CType>(
+      "weights", /*dims=*/weights_dims,
+      /*values=*/std::vector<CType>(num_weights, CType(1)));
+  test->RunValidationAndConversion(node_def, code, error_msg_substr);
+  if (code != error::OK) return;
+
+  // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight.
+  CheckAddedLayers(test, /*expect_scale_layer=*/false);
+
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+
+  // Check the dims of the output ITensor.
+  std::vector<int> expected_output_dims = input_dims;
+  for (int i = expected_output_dims.size() - 1, j = weights_dims.size() - 1;
+       i >= 0 && j >= 0; --i, --j) {
+    if (expected_output_dims[i] == 1) {
+      expected_output_dims[i] = weights_dims[j];
+    }
+  }
+  ExpectTrtDimsEqualsArray(expected_output_dims,
+                           output.tensor()->getDimensions());
+
+  // Check the result of running the engine.
+  const int expected_num_outputs =
+      TrtDimsNumElements(GetTestDims(expected_output_dims));
+  std::vector<CType> output_data(expected_num_outputs);
+  test->BuildAndRun<CType>(
+      {{"input",
+        /*input_data=*/std::vector<CType>(num_inputs, CType(2))}},
+      "my_binary", &output_data);
+  if (node_def.op() == "Add") {
+    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
+                                 expected_num_outputs, CType(3))));
+  } else if (node_def.op() == "Minimum") {
+    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
+                                 expected_num_outputs, CType(1))));
+  } else {
+    ASSERT_TRUE(false);
+  }
+}
+
+template <typename OpType, DataType dtype>
+void TestBinaryTensorOpTensor(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  test->Reset();
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<OpType>("input1", "input2", dtype);
+  test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->RunValidationAndConversion(node_def);
+
+  // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight.
+  CheckAddedLayers(test, /*expect_scale_layer=*/false);
+
+  // Check output dims.
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions());
+
+  std::vector<CType> output_data(4);
+  // After broadcasting first input becomes {3, 6, 3, 6} and second input
+  // becomes {2, 3, 2, 3}.
+  test->BuildAndRun<CType>(
+      {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}},
+      "my_binary", &output_data);
+  if (node_def.op() == "Add") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(5), CType(8), CType(6), CType(9)));
+  } else if (node_def.op() == "Sub") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1), CType(4), CType(0), CType(3)));
+  } else if (node_def.op() == "Mul") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(6), CType(12), CType(9), CType(18)));
+  } else if (node_def.op() == "Div") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
+  } else if (node_def.op() == "RealDiv") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
+  } else if (node_def.op() == "Minimum") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(2), CType(2), CType(3), CType(3)));
+  } else if (node_def.op() == "Maximum") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(3), CType(6), CType(3), CType(6)));
+  } else {
+    ASSERT_TRUE(false);
+  }
+}
+
+TEST_F(OpConverterTest, ConvertBinary) {
+  // Input size doesn't match, should fail.
+  for (size_t num_inputs = 0; num_inputs < 2; ++num_inputs) {
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"});
+    AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT);
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Binary ops require two inputs, at my_add");
+  }
+  {
+    // Both inputs are weights.
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_add", "Add", {"weights1", "weights2"});
+    AddTestWeights<float>("weights1", {1}, {1});
+    AddTestWeights<float>("weights2", {1}, {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Constant folding is falled back to TensorFlow, binary op received "
+        "both input as constant at: my_add");
+  }
+
+  // Test BinaryTensorOpWeight() without broadcasting.
+  TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_FLOAT>(this);
+#if 0
+  // TODO(b/119560144): it doesn't support FP16 constants and the following test
+  // will fail.
+  TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_HALF>(this);
+#endif
+
+  // Test BinaryTensorOpWeight() with channel-wise broadcasting.
+  TestBinaryTensorOpWeightWithChannelWiseBroadcast<DT_FLOAT>(this);
+
+  // Test BinaryTensorOpWeight() with uniformly broadcasting.
+  TestBinaryTensorOpWeightWithUniformlyBroadcast<DT_FLOAT>(this);
+
+  // Test BinaryTensorOpWeight() falling back to BinaryTensorOpTensor().
+  // Unsupported op.
+  TestBinaryTensorOpWeightFallback<ops::Minimum>(this, {1, 1, 1}, {1});
+  // Rank of input tensor dimension <3.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 1}, {1});
+  // Broadcast on batch dimension, should fail.
+  TestBinaryTensorOpWeightFallback<ops::Add>(
+      this, {1, 1, 1}, {2, 1, 1, 1}, error::INVALID_ARGUMENT,
+      "Unsupported binary op broadcast scheme for op my_binary",
+      /*input_batch_size=*/2);
+  // Incompatible dims with per-channel mode.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 1, 1}, {1, 2, 1});
+  // Incompatible dims.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 2, 1}, {2});
+
+  // Test BinaryTensorOpTensor() with broadcasting.
+  TestBinaryTensorOpTensor<ops::Add, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Sub, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Mul, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Div, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::RealDiv, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Minimum, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Maximum, DT_FLOAT>(this);
+
+  TestBinaryTensorOpTensor<ops::Add, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Sub, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Mul, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Div, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::RealDiv, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Minimum, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Maximum, DT_HALF>(this);
+}
+
+TEST_F(OpConverterTest, ConvertQuantize) {
+  for (const string& op :
+       {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars",
+        "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_quantize", op, {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        StrCat("Invalid number of inputs for ", op, ", at my_quantize")
+            .c_str());
   }
   {
     // FakeQuantWithMinMaxArgs attributes are empty, should fail.
     NodeDef node_def =
         MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"});
     AddTestTensor("input", {1, 2, 3});
-    RunConversion(node_def, error::INVALID_ARGUMENT,
-                  "Min or max attribute not found for FakeQuantWithMinMaxArgs "
-                  "at my_quantize");
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Min or max attribute not found for FakeQuantWithMinMaxArgs "
+        "at my_quantize");
   }
   {
     // FakeQuantWithMinMaxArgs ranges set via attributes, ok.
     Reset();
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
-    ops::FakeQuantWithMinMaxArgs::Attrs quantize_attrs;
-    quantize_attrs.min_ = -6.0f;
-    quantize_attrs.max_ = 6.0f;
+    auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f);
     auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("my_quantize"),
                                                  input, quantize_attrs);
     const NodeDef& node_def = quantize.operation.node()->def();
     AddTestTensor("input", {1, 2, 3});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
     EXPECT_TRUE(output.is_tensor());
     auto ranges = quantization_ranges();
-    EXPECT_EQ(ranges.count(output.tensor()), 1);
-    EXPECT_EQ(ranges[output.tensor()], 6.0f);
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
   }
   {
     // FakeQuantWithMinMaxVars ranges set via inputs, ok.
@@ -1289,13 +1852,13 @@ TEST_F(OpConverterTest, ConvertQuantize) {
     AddTestTensor("input", {1, 2, 3});
     AddTestWeights<float>("weights_min", {1}, {-6.0f});
     AddTestWeights<float>("weights_max", {1}, {6.0f});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
     EXPECT_TRUE(output.is_tensor());
     auto ranges = quantization_ranges();
-    EXPECT_EQ(ranges.count(output.tensor()), 1);
-    EXPECT_EQ(ranges[output.tensor()], 6.0f);
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
   }
   {
     // QuantizeAndDequantizeV2 ranges set via inputs, ok.
@@ -1310,13 +1873,31 @@ TEST_F(OpConverterTest, ConvertQuantize) {
     AddTestTensor("input", {1, 2, 3});
     AddTestWeights<float>("weights_min", {1}, {-6.0f});
     AddTestWeights<float>("weights_max", {1}, {6.0f});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
     EXPECT_TRUE(output.is_tensor());
     auto ranges = quantization_ranges();
-    EXPECT_EQ(ranges.count(output.tensor()), 1);
-    EXPECT_EQ(ranges[output.tensor()], 6.0f);
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // QuantizeAndDequantizeV2 Range inputs are tensors, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::QuantizeAndDequantizeV2(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights_min", {1});
+    AddTestTensor("weights_max", {1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Min and max inputs for QuantizeAndDequantizeV2 must be weights not "
+        "tensors, at my_quantize");
   }
   {
     // QuantizeAndDequantizeV3 ranges set via inputs, ok.
@@ -1333,31 +1914,13 @@ TEST_F(OpConverterTest, ConvertQuantize) {
     AddTestWeights<float>("weights_min", {1}, {-6.0f});
     AddTestWeights<float>("weights_max", {1}, {6.0f});
     AddTestWeights<int>("num_bits", {1}, {8});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
     EXPECT_TRUE(output.is_tensor());
     auto ranges = quantization_ranges();
-    EXPECT_EQ(ranges.count(output.tensor()), 1);
-    EXPECT_EQ(ranges[output.tensor()], 6.0f);
-  }
-  {
-    // QuantizeAndDequantizeV2 Range inputs are tensors, should fail.
-    Reset();
-    Scope s = Scope::NewRootScope();
-    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
-    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
-    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
-    auto quantize = ops::QuantizeAndDequantizeV2(
-        s.WithOpName("my_quantize"), input, weights_min, weights_max);
-    const NodeDef& node_def = quantize.operation.node()->def();
-    AddTestTensor("input", {1, 2, 3});
-    AddTestTensor("weights_min", {1});
-    AddTestTensor("weights_max", {1});
-    RunConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Min and max inputs for QuantizeAndDequantizeV2 must be weights not "
-        "tensors, at my_quantize");
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
   }
 }
 
@@ -1365,21 +1928,29 @@ TEST_F(OpConverterTest, ConvertRelu6) {
   {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {});
-    RunConversion(node_def, error::INVALID_ARGUMENT,
-                  "Invalid number of inputs for Relu6, at my_relu6");
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Invalid number of inputs for Relu6, at my_relu6");
   }
 
   // Get the NodeDef for Relu6.
   Scope s = Scope::NewRootScope();
   auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
   auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input);
-  const NodeDef& node_def = relu6.operation.node()->def();
-
+  const NodeDef node_def = relu6.operation.node()->def();
+  {
+    // Input is weights, should fail.
+    Reset();
+    AddTestWeights<float>("input", {1}, {1.0f});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Relu6 is only implemented for tensors, not weights, at my_relu6");
+  }
   {
     // Clip tensor values and set quantization ranges, ok.
     Reset();
     AddTestTensor("input", {1, 2, 3});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output));
     EXPECT_TRUE(output.is_tensor());
@@ -1387,17 +1958,10 @@ TEST_F(OpConverterTest, ConvertRelu6) {
     EXPECT_EQ(ranges[output.tensor()], 6.0f);
 
     std::vector<float> output_data(6);
-    BuildAndRun("input", {-100, -1, 0, 3, 5, 9}, "my_relu6", &output_data);
+    BuildAndRun<float>({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6",
+                       &output_data);
     EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6));
   }
-  {
-    // Input is weights, should fail.
-    Reset();
-    AddTestWeights<float>("input", {1, 2, 3}, {-100, -1, 0, 3, 5, 9});
-    RunConversion(
-        node_def, error::UNIMPLEMENTED,
-        "Relu6 is only implemented for tensors, not weights, at my_relu6");
-  }
 }
 
 }  // namespace convert
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 780343d6623ceb0cf3d0f0ebfc30aa669c280f44..1e907e0d2a669b2bef5fc6ca0822c1e6049c7018 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -126,8 +126,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
   OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_));
   OP_REQUIRES_OK(context,
                  context->GetAttr("use_calibration", &use_calibration_));
-  calibration_mode_ = (use_calibration_ &&
-      (precision_mode_ == INT8MODE && calibration_data.size() == 0));
+  calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE &&
+                       calibration_data.size() == 0);
   if (calibration_data.size()) {
     calibrator_.reset(new TRTInt8Calibrator(calibration_data));
     calibration_data.resize(0);
@@ -499,8 +499,8 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
         segment_graph_, precision_mode_, batch_size, workspace_size_, shapes,
-        &logger, allocator, calibrator_.get(), &engine,
-        use_calibration_, &convert_successfully); 
+        &logger, allocator, calibrator_.get(), &engine, use_calibration_,
+        &convert_successfully);
     if (!status.ok()) {
       if (convert_successfully) {
         // This means it fail to build the engine even when the network is built
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 95f47c90148a1ab9773164fce1cb040235ecad5b..f0945087d92cbc08940699b760b1d06e5539bf7a 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -63,20 +63,20 @@ class TrtPrecisionMode(object):
     return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
 
 
-def tensorrt_rewriter_config(rewriter_config=None,
-                             max_batch_size=1,
-                             max_workspace_size_bytes=2 << 20,
-                             precision_mode=TrtPrecisionMode.FP32,
-                             minimum_segment_size=3,
-                             is_dynamic_op=False,
-                             maximum_cached_engines=1,
-                             cached_engine_batch_sizes=None,
-                             use_calibration=True):
+def get_tensorrt_rewriter_config(rewriter_config=None,
+                                 max_batch_size=1,
+                                 max_workspace_size_bytes=2 << 20,
+                                 precision_mode=TrtPrecisionMode.FP32,
+                                 minimum_segment_size=3,
+                                 is_dynamic_op=False,
+                                 maximum_cached_engines=1,
+                                 cached_engine_batch_sizes=None,
+                                 use_calibration=True):
   """Returns a RewriterConfig proto for TRT transformation.
 
   Args:
-    rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to.
-      If None, it will create one with default settings.
+    rewriter_config: a template RewriterConfig proto used to create a
+      TRT-enabled RewriterConfig. If None, it will use a default one.
     max_batch_size: max size for the input batch
     max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
       engine can use at execution time. This corresponds to the 'workspaceSize'
@@ -96,15 +96,15 @@ def tensorrt_rewriter_config(rewriter_config=None,
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
-    use_calibration: this argument is ignored if precision_mode is not INT8.
-      if set to True, a calibration graph will be created to calibrate the
-      missing ranges. The calibration graph must be converted to an inference
-      graph using calib_graph_to_infer_graph() after running calibration.
-      if set to False, quantization nodes will be expected for every tensor in
-      the graph (exlcuding those which will be fused). If a range is missing,
-      an error will occur. Please note that accuracy may be negatively affected
-      if there is a mismatch between which tensors TRT quantizes and which
-      tensors were trained with fake quantization.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
 
   Returns:
     A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
@@ -117,13 +117,16 @@ def tensorrt_rewriter_config(rewriter_config=None,
       rewriter_config, rewriter_config_pb2.RewriterConfig):
     raise TypeError("rewriter_config should be a RewriterConfig proto.")
 
+  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
   if rewriter_config is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
     # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
     # need to run constant folding again.
-    rewriter_config.optimizers.extend(["constfold", "layout", "constfold"])
-    rewriter_config.meta_optimizer_iterations = (
+    rewriter_config_with_trt.optimizers.extend(
+        ["constfold", "layout", "constfold"])
+    rewriter_config_with_trt.meta_optimizer_iterations = (
         rewriter_config_pb2.RewriterConfig.ONE)
+  else:
+    rewriter_config_with_trt.CopyFrom(rewriter_config)
 
   if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
     raise ValueError(("precision mode '{}' is not supported."
@@ -131,7 +134,7 @@ def tensorrt_rewriter_config(rewriter_config=None,
                           precision_mode,
                           TrtPrecisionMode.supported_precision_modes))
 
-  optimizer = rewriter_config.custom_optimizers.add()
+  optimizer = rewriter_config_with_trt.custom_optimizers.add()
   optimizer.name = "TensorRTOptimizer"
   optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
   optimizer.parameter_map["max_batch_size"].i = max_batch_size
@@ -149,7 +152,7 @@ def tensorrt_rewriter_config(rewriter_config=None,
     optimizer.parameter_map["cached_engine_batches"].list.i.extend(
         cached_engine_batch_sizes)
   optimizer.parameter_map["use_calibration"].b = use_calibration
-  return rewriter_config
+  return rewriter_config_with_trt
 
 
 def create_inference_graph(input_graph_def,
@@ -161,7 +164,6 @@ def create_inference_graph(input_graph_def,
                            is_dynamic_op=False,
                            maximum_cached_engines=1,
                            cached_engine_batch_sizes=None,
-                           rewriter_config=None,
                            use_calibration=True,
                            input_saved_model_dir=None,
                            input_saved_model_tags=None,
@@ -194,17 +196,15 @@ def create_inference_graph(input_graph_def,
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
-    rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to.
-      If None, it will create one with default settings.
-    use_calibration: this argument is ignored if precision_mode is not INT8.
-      if set to True, a calibration graph will be created to calibrate the
-      missing ranges. The calibration graph must be converted to an inference
-      graph using calib_graph_to_infer_graph() after running calibration.
-      if set to False, quantization nodes will be expected for every tensor in
-      the graph (exlcuding those which will be fused). If a range is missing,
-      an error will occur. Please note that accuracy may be negatively affected
-      if there is a mismatch between which tensors TRT quantizes and which
-      tensors were trained with fake quantization.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
     input_saved_model_dir: the directory to load the SavedModel which contains
       the input graph to transforms. Used only when input_graph_def is None.
     input_saved_model_tags: list of tags to load the SavedModel.
@@ -212,8 +212,9 @@ def create_inference_graph(input_graph_def,
       returned GraphDef and save it to the specified directory. This option only
       works when the input graph is loaded from a SavedModel, i.e. when
       input_saved_model_dir is specified and input_graph_def is None.
-    session_config: the ConfigProto used to create a Session. If not specified,
-      a default ConfigProto will be used.
+    session_config: the ConfigProto used to create a Session. It's also used as
+      a template to create a TRT-enabled ConfigProto for conversion. If not
+      specified, a default ConfigProto will be used.
 
   Returns:
     A GraphDef transformed from input_graph_def (or the SavedModel graph def
@@ -343,21 +344,30 @@ def create_inference_graph(input_graph_def,
       grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
           output_collection)
 
-  # Create RewriterConfig.
-  rewriter_config = tensorrt_rewriter_config(
+  # Create TRT-enabled ConfigProto.
+  session_config_with_trt = config_pb2.ConfigProto()
+  session_config_with_trt.CopyFrom(session_config)
+  rewriter_config = None
+  if (session_config_with_trt.HasField("graph_options") and
+      session_config_with_trt.graph_options.HasField("rewrite_options")):
+    rewriter_config = session_config_with_trt.graph_options.rewrite_options
+  rewriter_config_with_trt = get_tensorrt_rewriter_config(
       rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
       minimum_segment_size, is_dynamic_op, maximum_cached_engines,
       cached_engine_batch_sizes, use_calibration)
+  session_config_with_trt.graph_options.rewrite_options.CopyFrom(
+      rewriter_config_with_trt)
 
   # Run Grappler.
   transformed_graph_def = tf_optimizer.OptimizeGraph(
-      rewriter_config, grappler_meta_graph_def, graph_id=b"tf_graph")
+      session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph")
 
   # Optionally write the transformed graphdef as SavedModel.
   if output_saved_model_dir is not None:
     saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
     with ops.Graph().as_default():
       importer.import_graph_def(transformed_graph_def, name="")
+      # We don't use TRT here.
       with session.Session(config=session_config) as sess:
         saved_model_builder.add_meta_graph_and_variables(
             sess,
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
index 9f2eeac990dcacb547d336b68bc042016c3e6171..aa82f4207f5fa9c646cadbc4ca4fd7ab40c089ff 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
@@ -47,9 +47,9 @@ from tensorflow.python.tools import saved_model_utils
 class TrtConvertTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration python API."""
 
-  def testTensorrtRewriterConfig(self):
-    """Test case for trt_convert.tensorrt_rewriter_config()."""
-    rewriter_cfg = trt_convert.tensorrt_rewriter_config(
+  def testGetTensorrtRewriterConfig(self):
+    """Test case for trt_convert.get_tensorrt_rewriter_config()."""
+    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
         rewriter_config=None,
         max_batch_size=128,
         max_workspace_size_bytes=1234,
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
index cbff661f99df0e6f6d1a2b0f8806849e7e5ca454..b325d76edfabce25f165a6b23c5f39bb6ac84247 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -56,8 +56,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
             strides=[1, 2, 2, 1],
             padding="SAME",
             name="conv")
-        bias = constant_op.constant(
-            [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
+        bias = constant_op.constant([4., 1.5, 2., 3., 5., 7.],
+                                    name="bias",
+                                    dtype=dtype)
         added = nn.bias_add(conv, bias, name="bias_add")
         relu = nn.relu(added, "relu")
         identity = array_ops.identity(relu, "identity")
@@ -73,11 +74,12 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-    # breaks the connection check, fix it.
-    # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add",
-    #   "relu", "identity", "max_pool"]
-    return ["my_trt_op_0"]
+    return {
+        "my_trt_op_0": [
+            "weights", "conv", "bias", "bias_add", "relu", "identity",
+            "max_pool"
+        ]
+    }
 
 
 class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -92,7 +94,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
-          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+          dtype=dtype, shape=input_dims, name=input_name)
       with g.device("/GPU:0"):
         conv_filter = constant_op.constant(
             [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
@@ -105,10 +107,10 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
             padding="SAME",
             name="conv")
         c1 = constant_op.constant(
-            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c1")
+            np.random.randn(12, 12, 6), dtype=dtype, name="c1")
         p = math_ops.mul(conv, c1, name="mul")
         c2 = constant_op.constant(
-            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c2")
+            np.random.randn(12, 12, 6), dtype=dtype, name="c2")
         q = math_ops.div(conv, c2, name="div")
 
         edge = self.trt_incompatible_op(q, name="incompatible")
@@ -129,22 +131,21 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-    # breaks the connection check, fix it.
-    # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1",
-    #   "add", "sub1"];
-    # - my_trt_op_1 should have ["weights","conv", "div"]
-    return ["my_trt_op_0", "my_trt_op_1"]
+    return {
+        "my_trt_op_0": [
+            "add", "add1", "c1", "div1", "mul", "mul1", "sub", "sub1"
+        ],
+        "my_trt_op_1": ["c2", "conv", "div", "weights"]
+    }
 
-  def ShouldRunTest(self, run_params):
-    # TODO(aaroey): LayoutOptimizer adds Transpose(Const, Const) to the graph
-    # which breaks the conversion. We should fix it as:
-    # - Detect the invalid NodeDef earlier before adding them to segment
-    # - Let it able to change the RewriterConfig when calling
-    #   create_inference_graph().
-    # It will be good to add debugging feature for Grappler to print the graph
-    # after running each optimizer.
-    return False
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    return super(
+        SimpleMultiEnginesTest, self
+    ).GetConversionParams(run_params)._replace(
+        # Disable layout optimizer, since it'll add Transpose(Const, Const) to
+        # the graph and breaks the conversion check.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
 
 class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
@@ -199,7 +200,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     # can cause overflow.
     return ((run_params.precision_mode != "FP16") and
             not (trt_test.IsQuantizationMode(run_params.precision_mode) and
-            not run_params.use_calibration))
+                 not run_params.use_calibration))
 
 
 class PartiallyConvertedTestB(PartiallyConvertedTestA):
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
index 7545bb9df20f295a8fdbc82b573cdb3407f8c5e4..6546ef64778e0ee3638b3aea08c61a9b32e0dc7b 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -41,6 +41,7 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     input_name = "input"
     input_matrix_rows = 4
     input_matrix_columns = 144
+    # Note that tf.nn.bias_add supports up to 5 dimensions.
     input_dims = [input_matrix_rows, input_matrix_columns]
     output_name = "output"
     g = ops.Graph()
@@ -74,18 +75,18 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
       x5 = nn.bias_add(x5, b)
       x5 = gen_array_ops.reshape(x5, [4, -1])
 
-      x6 = gen_array_ops.reshape(x, [4, 12, 12])
-      b = self._ConstOp((12,))
+      x6 = gen_array_ops.reshape(x, [4, 24, 6])
+      b = self._ConstOp((6,))
       x6 = nn.bias_add(x6, b, data_format="NHWC")
       x6 = gen_array_ops.reshape(x6, [4, -1])
 
-      x7 = gen_array_ops.reshape(x, [4, 12, 3, 4])
-      b = self._ConstOp((4,))
+      x7 = gen_array_ops.reshape(x, [4, 12, 4, 3])
+      b = self._ConstOp((3,))
       x7 = nn.bias_add(x7, b, data_format="NHWC")
       x7 = gen_array_ops.reshape(x7, [4, -1])
 
-      x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
-      b = self._ConstOp((2,))
+      x8 = gen_array_ops.reshape(x, [4, 4, 3, 2, 6])
+      b = self._ConstOp((6,))
       x8 = nn.bias_add(x8, b, data_format="NHWC")
       x8 = gen_array_ops.reshape(x8, [4, -1])
 
@@ -94,13 +95,13 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
       x9 = nn.bias_add(x9, b, data_format="NCHW")
       x9 = gen_array_ops.reshape(x9, [4, -1])
 
-      x10 = gen_array_ops.reshape(x, [4, 12, 3, 4])
-      b = self._ConstOp((12,))
+      x10 = gen_array_ops.reshape(x, [4, 3, 4, 12])
+      b = self._ConstOp((3,))
       x10 = nn.bias_add(x10, b, data_format="NCHW")
       x10 = gen_array_ops.reshape(x10, [4, -1])
 
-      x11 = gen_array_ops.reshape(x, [4, 12, 12])
-      b = self._ConstOp((12,))
+      x11 = gen_array_ops.reshape(x, [4, 6, 24])
+      b = self._ConstOp((6,))
       x11 = nn.bias_add(x11, b, data_format="NCHW")
       x11 = gen_array_ops.reshape(x11, [4, -1])
 
@@ -116,9 +117,14 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
-    return super(BiasaddMatMulTest,
-                 self).GetConversionParams(run_params)._replace(
-                     max_batch_size=4, maximum_cached_engines=1)
+    conversion_params = super(BiasaddMatMulTest,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        max_batch_size=4,
+        maximum_cached_engines=1,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
index 2586d936ef120c4548543ef82d2c7db3425d9c94..e7d6ec4ad395d38a06f97020f2f363009f2286c7 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
+++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
@@ -12,208 +12,279 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Script to test TF-TRT INT8 conversion without calibration on Mnist model."""
 
-import numpy as np
-import os
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
-import tensorflow as tf
-from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
-from tensorflow.core.protobuf import config_pb2 
-from tensorflow.python.keras.datasets import mnist
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import test
-from tensorflow.python import estimator as tf_estimator
+from tensorflow.contrib.tensorrt.python import trt_convert
+# pylint: disable=unused-import
+from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+# pylint: enable=unused-import
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import data
+from tensorflow.python import keras
 from tensorflow.python.estimator.estimator import Estimator
+from tensorflow.python.estimator.model_fn import EstimatorSpec
+from tensorflow.python.estimator.model_fn import ModeKeys
 from tensorflow.python.estimator.run_config import RunConfig
-from tensorflow.python.estimator.model_fn import ModeKeys, EstimatorSpec
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.datasets import mnist
+from tensorflow.python.layers import layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
+from tensorflow.python.training import saver
+from tensorflow.python.training.adam import AdamOptimizer
+from tensorflow.python.training.checkpoint_management import latest_checkpoint
+from tensorflow.python.training.training_util import get_global_step
 
 INPUT_NODE_NAME = 'input'
 OUTPUT_NODE_NAME = 'output'
 
-def build_graph(x):
-  def quantize(x, r):
-    x = tf.fake_quant_with_min_max_args(x, -r, r)
-    return x
 
-  def dense_layer(x, num_inputs, num_outputs, quantization_range, name='dense'):
-    """Equivalent to tf.layers.dense but with a quantization range between
-    the MatMul and BiasAdd."""
-    with tf.variable_scope(name) as scope:
-      kernel = tf.get_variable('kernel', shape=[num_inputs, num_outputs],
-          dtype=tf.float32, initializer=tf.keras.initializers.glorot_uniform())
-      bias = tf.get_variable('bias', shape=[num_outputs,],
-          dtype=tf.float32, initializer=tf.keras.initializers.zeros())
-      x = tf.matmul(x, kernel)
-      x = quantize(x, quantization_range)
-      x = tf.nn.bias_add(x, bias)
+class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
+
+  def _BuildGraph(self, x):
+
+    def _Quantize(x, r):
+      x = gen_array_ops.quantize_and_dequantize_v2(x, -r, r)
+      return x
+
+    def _DenseLayer(x, num_inputs, num_outputs, quantization_range, name):
+      """Dense layer with quantized outputs.
+
+      Args:
+        x: input to the dense layer
+        num_inputs: number of input columns of x
+        num_outputs: number of output columns
+        quantization_range: the min/max range for quantization
+        name: name of the variable scope
+
+      Returns:
+        The output of the layer.
+      """
+      with variable_scope.variable_scope(name):
+        kernel = variable_scope.get_variable(
+            'kernel',
+            shape=[num_inputs, num_outputs],
+            dtype=dtypes.float32,
+            initializer=keras.initializers.glorot_uniform())
+        bias = variable_scope.get_variable(
+            'bias',
+            shape=[num_outputs],
+            dtype=dtypes.float32,
+            initializer=keras.initializers.zeros())
+        x = math_ops.matmul(x, kernel)
+        x = _Quantize(x, quantization_range)
+        x = nn.bias_add(x, bias)
+        x = _Quantize(x, quantization_range)
+      return x
+
+    x = _Quantize(x, 1)
+    # Conv + Bias + Relu6
+    x = layers.conv2d(x, filters=32, kernel_size=3, use_bias=True)
+    x = nn.relu6(x)
+    # Conv + Bias + Relu6
+    x = layers.conv2d(x, filters=64, kernel_size=3, use_bias=True)
+    x = nn.relu6(x)
+    # Reduce
+    x = math_ops.reduce_mean(x, [1, 2])
+    x = _Quantize(x, 6)
+    # FC1
+    x = _DenseLayer(x, 64, 512, 6, name='dense')
+    x = nn.relu6(x)
+    # FC2
+    x = _DenseLayer(x, 512, 10, 25, name='dense_1')
+    x = array_ops.identity(x, name=OUTPUT_NODE_NAME)
     return x
 
-  x = quantize(x, 1)
-  # Conv + Bias + Relu6
-  x = tf.layers.conv2d(x, filters=32, kernel_size=3, use_bias=True)
-  x = tf.nn.relu6(x)
-  # Conv + Bias + Relu6
-  x = tf.layers.conv2d(x, filters=64, kernel_size=3, use_bias=True)
-  x = tf.nn.relu6(x)
-  x = tf.reduce_mean(x, [1, 2])
-  x = quantize(x, 6)
-  # FC1
-  x = dense_layer(x, 64, 512, 6, name='dense')
-  x = quantize(x, 6)
-  x = tf.nn.relu6(x)
-  # FC2
-  x = dense_layer(x, 512, 10, 25, name='dense_1')
-  x = quantize(x, 25)
-  x = tf.identity(x, name=OUTPUT_NODE_NAME)
-  return x
-
-def preprocess_fn(x, y):
-  x = tf.cast(x, tf.float32)
-  x = tf.expand_dims(x, axis=2)
-  x = 2.0 * (x / 255.0) - 1.0
-  y = tf.cast(y, tf.int32)
-  return x, y
-
-def run(is_training, use_trt, batch_size, num_epochs, model_dir):
-  """Train or evaluate the model.
-
-  Args:
-    is_training: Whether to train or evaluate the model. In training mode,
-      quantization will be simulated where the fake_quant_with_min_max_args
-      are placed.
-    use_trt: If true, use TRT INT8 mode for evaluation, which will perform real
-      quantization. Otherwise use native TensorFlow which will perform
-      simulated quantization. Ignored if is_training is True.
-    batch_size: Batch size.
-    num_epochs: How many epochs to train. Ignored if is_training is False.
-    model_dir: Where to save or load checkpoint.
-  """
-  # Get dataset
-  train, test = mnist.load_data()
-  
-  def eval_input_fn():
-    mnist_x, mnist_y = test
-    dataset = tf.data.Dataset.from_tensor_slices((mnist_x, mnist_y))
-    dataset = dataset.apply(tf.data.experimental.map_and_batch(
-        map_func=preprocess_fn,
-        batch_size=batch_size,
-        num_parallel_calls=8))
-    dataset = dataset.repeat(count=1)
-    iterator = dataset.make_one_shot_iterator()
-    features, labels = iterator.get_next()
-    return features, labels
-
-  def train_input_fn():
-    mnist_x, mnist_y = train
-    dataset = tf.data.Dataset.from_tensor_slices((mnist_x, mnist_y))
-    dataset = dataset.shuffle(2*len(mnist_x))
-    dataset = dataset.apply(tf.data.experimental.map_and_batch(
-        map_func=preprocess_fn,
-        batch_size=batch_size,
-        num_parallel_calls=8))
-    dataset = dataset.repeat(count=num_epochs)
-    iterator = dataset.make_one_shot_iterator()
-    features, labels = iterator.get_next()
-    return features, labels
-
-  def model_fn(features, labels, mode):
-    if is_training:
-      logits_out = build_graph(features)
-    else:
-      graph_def = get_graph_def(use_trt, batch_size, model_dir)
-      logits_out = tf.import_graph_def(graph_def,
-          input_map={INPUT_NODE_NAME: features},
-          return_elements=[OUTPUT_NODE_NAME+':0'],
-          name='')[0]
-    loss = tf.losses.sparse_softmax_cross_entropy(
-        labels=labels,
-        logits=logits_out)
-    tf.summary.scalar('loss', loss)
-    classes_out = tf.argmax(logits_out, axis=1, name='classes_out')
-    accuracy = tf.metrics.accuracy(
-        labels=labels,
-        predictions=classes_out,
-        name='acc_op')
-    tf.summary.scalar('accuracy', accuracy[1])
-    if mode == ModeKeys.EVAL:
-      return EstimatorSpec(
-          mode,
-          loss=loss,
-          eval_metric_ops={'accuracy': accuracy})
-    elif mode == ModeKeys.TRAIN:
-      optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
-      train_op = optimizer.minimize(
-          loss,
-          global_step=tf.train.get_global_step())
-      return EstimatorSpec(
-          mode,
-          loss=loss,
-          train_op=train_op)
-
-  tf_config = config_pb2.ConfigProto()
-  tf_config.gpu_options.allow_growth = True
-  estimator = Estimator(
-      model_fn=model_fn,
-      model_dir=None,
-      config=RunConfig(session_config=tf_config))
-  if is_training:
-    estimator.train(train_input_fn)
-  results = estimator.evaluate(eval_input_fn)
-  print('accuracy:', results['accuracy'])
-  return results
-
-def get_graph_def(use_trt, batch_size, model_dir):
-  # Load graph and freeze
-  with tf.Graph().as_default() as graph:
-    with tf.Session() as sess:
-      x = tf.placeholder(shape=(None, 28, 28, 1),
-                         dtype=tf.float32,
-                         name=INPUT_NODE_NAME)
-      logits_out = build_graph(x)
+  def _GetGraphDef(self, use_trt, max_batch_size, model_dir):
+    """Get the frozen mnist GraphDef.
+
+    Args:
+      use_trt: whether use TF-TRT to convert the graph.
+      max_batch_size: the max batch size to apply during TF-TRT conversion.
+      model_dir: the model directory to load the checkpoints.
+
+    Returns:
+      The frozen mnist GraphDef.
+    """
+    graph = ops.Graph()
+    with self.session(graph=graph) as sess:
+      with graph.device('/GPU:0'):
+        x = array_ops.placeholder(
+            shape=(None, 28, 28, 1), dtype=dtypes.float32, name=INPUT_NODE_NAME)
+        self._BuildGraph(x)
       # Load weights
-      saver = tf.train.Saver()
-      checkpoint_file = tf.train.latest_checkpoint(model_dir)
-      saver.restore(sess, checkpoint_file)
+      mnist_saver = saver.Saver()
+      checkpoint_file = latest_checkpoint(model_dir)
+      mnist_saver.restore(sess, checkpoint_file)
       # Freeze
-      graph_def = tf.graph_util.convert_variables_to_constants(
-          sess,
-          sess.graph_def,
-          output_node_names=[OUTPUT_NODE_NAME]
+      graph_def = graph_util.convert_variables_to_constants(
+          sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME])
+    # Convert with TF-TRT
+    if use_trt:
+      logging.info('Number of nodes before TF-TRT conversion: %d',
+                   len(graph_def.node))
+      graph_def = trt_convert.create_inference_graph(
+          graph_def,
+          outputs=[OUTPUT_NODE_NAME],
+          max_batch_size=max_batch_size,
+          precision_mode='INT8',
+          max_workspace_size_bytes=4096 << 19,
+          minimum_segment_size=2,
+          use_calibration=False,
       )
-  # Convert with TF-TRT
-  if use_trt:
-    print('nodes before:', len(graph_def.node))
-    graph_def = create_inference_graph(graph_def,
-        outputs=[OUTPUT_NODE_NAME],
-        max_batch_size=batch_size,
-        precision_mode='int8',
-        max_workspace_size_bytes=4096 << 19,
-        minimum_segment_size=2,
-        use_calibration=False,
-    )
-    print('tftrt total nodes:', len(graph_def.node))
-    print('trt only nodes',
-        len([1 for n in graph_def.node if str(n.op)=='TRTEngineOp']))
-  return graph_def
+      logging.info('Number of nodes after TF-TRT conversion: %d',
+                   len(graph_def.node))
+      num_engines = len(
+          [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp'])
+      self.assertEqual(1, num_engines)
+    return graph_def
 
+  def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir):
+    """Train or evaluate the model.
 
-class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
+    Args:
+      is_training: whether to train or evaluate the model. In training mode,
+        quantization will be simulated where the quantize_and_dequantize_v2 are
+        placed.
+      use_trt: if true, use TRT INT8 mode for evaluation, which will perform
+        real quantization. Otherwise use native TensorFlow which will perform
+        simulated quantization. Ignored if is_training is True.
+      batch_size: batch size.
+      num_epochs: how many epochs to train. Ignored if is_training is False.
+      model_dir: where to save or load checkpoint.
+
+    Returns:
+      The Estimator evaluation result.
+    """
+    # Get dataset
+    train_data, test_data = mnist.load_data()
+
+    def _PreprocessFn(x, y):
+      x = math_ops.cast(x, dtypes.float32)
+      x = array_ops.expand_dims(x, axis=2)
+      x = 2.0 * (x / 255.0) - 1.0
+      y = math_ops.cast(y, dtypes.int32)
+      return x, y
+
+    def _EvalInputFn():
+      mnist_x, mnist_y = test_data
+      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
+      dataset = dataset.apply(
+          data.experimental.map_and_batch(
+              map_func=_PreprocessFn,
+              batch_size=batch_size,
+              num_parallel_calls=8))
+      dataset = dataset.repeat(count=1)
+      iterator = dataset.make_one_shot_iterator()
+      features, labels = iterator.get_next()
+      return features, labels
+
+    def _TrainInputFn():
+      mnist_x, mnist_y = train_data
+      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
+      dataset = dataset.shuffle(2 * len(mnist_x))
+      dataset = dataset.apply(
+          data.experimental.map_and_batch(
+              map_func=_PreprocessFn,
+              batch_size=batch_size,
+              num_parallel_calls=8))
+      dataset = dataset.repeat(count=num_epochs)
+      iterator = dataset.make_one_shot_iterator()
+      features, labels = iterator.get_next()
+      return features, labels
+
+    def _ModelFn(features, labels, mode):
+      if is_training:
+        logits_out = self._BuildGraph(features)
+      else:
+        graph_def = self._GetGraphDef(use_trt, batch_size, model_dir)
+        logits_out = importer.import_graph_def(
+            graph_def,
+            input_map={INPUT_NODE_NAME: features},
+            return_elements=[OUTPUT_NODE_NAME + ':0'],
+            name='')[0]
+
+      loss = losses.sparse_softmax_cross_entropy(
+          labels=labels, logits=logits_out)
+      summary.scalar('loss', loss)
+
+      classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
+      accuracy = metrics.accuracy(
+          labels=labels, predictions=classes_out, name='acc_op')
+      summary.scalar('accuracy', accuracy[1])
 
+      if mode == ModeKeys.EVAL:
+        return EstimatorSpec(
+            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
+      elif mode == ModeKeys.TRAIN:
+        optimizer = AdamOptimizer(learning_rate=1e-2)
+        train_op = optimizer.minimize(loss, global_step=get_global_step())
+        return EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+    config_proto = config_pb2.ConfigProto()
+    config_proto.gpu_options.allow_growth = True
+    estimator = Estimator(
+        model_fn=_ModelFn,
+        model_dir=model_dir if is_training else None,
+        config=RunConfig(session_config=config_proto))
+
+    if is_training:
+      estimator.train(_TrainInputFn)
+    results = estimator.evaluate(_EvalInputFn)
+    logging.info('accuracy: %s', str(results['accuracy']))
+    return results
+
+  # To generate the checkpoint, set a different model_dir and call self._Run()
+  # by setting is_training=True and num_epochs=1000, e.g.:
+  # model_dir = '/tmp/quantization_mnist'
+  # self._Run(
+  #     is_training=True,
+  #     use_trt=False,
+  #     batch_size=128,
+  #     num_epochs=100,
+  #     model_dir=model_dir)
   def testEval(self):
-    model_dir = test.test_src_dir_path(
-        'contrib/tensorrt/test/quantization_mnist_test_data')
-    acc_tf = run(is_training=False,
+    if not trt_convert.is_tensorrt_enabled():
+      return
+    model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata')
+
+    accuracy_tf_native = self._Run(
+        is_training=False,
         use_trt=False,
         batch_size=128,
         num_epochs=None,
         model_dir=model_dir)['accuracy']
-    acc_tftrt = run(is_training=False,
+    logging.info('accuracy_tf_native: %f', accuracy_tf_native)
+    self.assertAllClose(accuracy_tf_native, 0.9662)
+
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return
+
+    accuracy_tf_trt = self._Run(
+        is_training=False,
         use_trt=True,
         batch_size=128,
         num_epochs=None,
         model_dir=model_dir)['accuracy']
-    self.assertAllClose(acc_tf, 0.9717)
-    self.assertAllClose(acc_tftrt, 0.9744)
+    logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
+    self.assertAllClose(accuracy_tf_trt, 0.9677)
+
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/contrib/tensorrt/test/quantization_test.py
index 83295ce2bd3a7392f5837b4e10bbf73c81d91255..28353273edec4a2b0fd4300f87b0b1a4dbe37652 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_test.py
+++ b/tensorflow/contrib/tensorrt/test/quantization_test.py
@@ -20,88 +20,86 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_impl
-from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-def build_graph(input_name, input_dims, output_name,
-                add_quantization_nodes=False, dtype=dtypes.float32):
-  def quantize(x, r):
+def _GetParams(add_quantization_nodes, dtype=dtypes.float32):
+  input_name = "input"
+  input_dims = [8, 8]
+  output_name = "output"
+
+  def _Quantize(x, r):
     if add_quantization_nodes:
       x = gen_array_ops.fake_quant_with_min_max_vars(x, -r, r)
     return x
+
   g = ops.Graph()
   with g.as_default():
     x = array_ops.placeholder(
         dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
-
-    x = quantize(x, 10.0)
+    x = _Quantize(x, 10.0)
     x = x + 5
-    x = quantize(x, 15.0)
+    x = _Quantize(x, 15.0)
     x = x - 5
-    x = quantize(x, 10.0)
+    x = _Quantize(x, 10.0)
     x = x * 0.1
-    x = quantize(x, 1.0)
-    w = constant_op.constant(np.ones((10, 1)), dtype=dtypes.float32)
+    x = _Quantize(x, 1.0)
+    w = constant_op.constant(np.ones((8, 1)), dtype=dtypes.float32)
     x = math_ops.matmul(x, w)
-    x = quantize(x, 10.0)
+    x = _Quantize(x, 10.0)
     x = array_ops.identity(x, name=output_name)
-  return g
+
+  return trt_test.TfTrtIntegrationTestParams(
+      gdef=g.as_graph_def(),
+      input_names=[input_name],
+      input_dims=[input_dims],
+      output_names=[output_name],
+      expected_output_dims=[(8, 1)])
+
 
 class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
 
   def GetParams(self):
     """Create a graph containing single segment with no quantization ranges."""
-    input_name = "input"
-    input_dims = [128, 10]
-    output_name = "output"
-    g = build_graph(input_name, input_dims, output_name,
-                    add_quantization_nodes=False)
-    return trt_test.TfTrtIntegrationTestParams(
-        gdef=g.as_graph_def(),
-        input_names=[input_name],
-        input_dims=[input_dims],
-        output_names=[output_name],
-        expected_output_dims=[(128, 1)])
+    return _GetParams(add_quantization_nodes=False)
 
   def ShouldRunTest(self, run_params):
-    return (run_params.precision_mode == "INT8" and
-            not run_params.use_optimizer and
-            not run_params.dynamic_engine)
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return False
+    # Only test static engine mode, with or without calibration.
+    return (trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.use_optimizer and not run_params.dynamic_engine)
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     if run_params.use_calibration:
+      # In static engine mode with calibration, it should build a calibration
+      # engine.
       return ["my_trt_op_0"]
+    # In static engine mode without calibration, the engine building will fail
+    # since no quantization ranges are set, which results in no TRT nodes.
     return []
 
+
 class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
 
   def GetParams(self):
     """Create a graph containing single segment with no quantization ranges."""
-    input_name = "input"
-    input_dims = [128, 10]
-    output_name = "output"
-    g = build_graph(input_name, input_dims, output_name,
-                    add_quantization_nodes=True)
-    return trt_test.TfTrtIntegrationTestParams(
-        gdef=g.as_graph_def(),
-        input_names=[input_name],
-        input_dims=[input_dims],
-        output_names=[output_name],
-        expected_output_dims=[(128, 1)])
+    return _GetParams(add_quantization_nodes=True)
 
   def ShouldRunTest(self, run_params):
-    return (run_params.precision_mode == "INT8" and
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return False
+    # Test static/dynamic engine with/without calibration.
+    return (trt_test.IsQuantizationMode(run_params.precision_mode) and
             not run_params.use_optimizer)
 
   def ExpectedEnginesToBuild(self, run_params):
@@ -116,30 +114,23 @@ class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
     """The relative tolerance to compare floating point results."""
     return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
 
+
 class NonQuantizedPrecisionsWithRangesTest(trt_test.TfTrtIntegrationTestBase):
 
   def GetParams(self):
     """Create a graph containing single segment with no quantization ranges."""
-    input_name = "input"
-    input_dims = [128, 10]
-    output_name = "output"
-    g = build_graph(input_name, input_dims, output_name,
-                    add_quantization_nodes=True)
-    return trt_test.TfTrtIntegrationTestParams(
-        gdef=g.as_graph_def(),
-        input_names=[input_name],
-        input_dims=[input_dims],
-        output_names=[output_name],
-        expected_output_dims=[(128, 1)])
+    return _GetParams(add_quantization_nodes=True)
 
   def ShouldRunTest(self, run_params):
-    return (run_params.precision_mode == "FP32" or
-            run_params.precision_mode == "FP16")
+    # Only test FP32/FP16 mode.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
+    # The fake quant ops are not supported in FP32/FP16 mode, and will split the
+    # graph into three TRT segments.
     return ["my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3"]
-  
+
   def ExpectedAbsoluteTolerance(self, run_params):
     """The absolute tolerance to compare floating point results."""
     return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
@@ -148,5 +139,6 @@ class NonQuantizedPrecisionsWithRangesTest(trt_test.TfTrtIntegrationTestBase):
     """The relative tolerance to compare floating point results."""
     return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/contrib/tensorrt/test/testdata/checkpoint
new file mode 100644
index 0000000000000000000000000000000000000000..a603e1aec91adab04fd9801ba05a2ee9adfbb6e8
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/testdata/checkpoint
@@ -0,0 +1,3 @@
+model_checkpoint_path: "model.ckpt-46900"
+all_model_checkpoint_paths: "model.ckpt-0"
+all_model_checkpoint_paths: "model.ckpt-46900"
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..88a998f184b275121e1e76eb51d2310da149f10a
Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 differ
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index
new file mode 100644
index 0000000000000000000000000000000000000000..537976571337508ab1798d33646c51d62a146ecc
Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index differ
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
index 8804f2bc8f73e085982d10b8dba2a54d30eed608..80eb8552fd01531be76c228c10830c2fa33a2dec 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
 # pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -66,6 +67,34 @@ class GraphState(object):
   INFERENCE = 2
 
 
+def OptimizerDisabledRewriterConfig():
+  """Returns a RewriterConfig with all default Grappler optimizers disabled."""
+  rewriter_config = rewriter_config_pb2.RewriterConfig()
+
+  # Turn off all default Grappler optimizers.
+  off = rewriter_config_pb2.RewriterConfig.OFF
+  rewriter_config.layout_optimizer = off
+  rewriter_config.constant_folding = off
+  rewriter_config.shape_optimization = off
+  rewriter_config.remapping = off
+  rewriter_config.arithmetic_optimization = off
+  rewriter_config.dependency_optimization = off
+  rewriter_config.loop_optimization = off
+  rewriter_config.function_optimization = off
+  rewriter_config.debug_stripper = off
+  rewriter_config.disable_model_pruning = True
+  rewriter_config.scoped_allocator_optimization = off
+  rewriter_config.memory_optimization = (
+      rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)
+  rewriter_config.pin_to_host_optimization = off
+  rewriter_config.auto_parallel.enable = False
+
+  # Run only once for each enabled optimizer.
+  rewriter_config.meta_optimizer_iterations = (
+      rewriter_config_pb2.RewriterConfig.ONE)
+  return rewriter_config
+
+
 class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration."""
 
@@ -203,11 +232,16 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration")
     trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment")
 
+  def _GetGPUOptions(self):
+    gpu_options = config_pb2.GPUOptions()
+    gpu_options.allow_growth = True
+    return gpu_options
+
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
     if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
       conversion_params = self.GetConversionParams(run_params)
-      rewriter_cfg = trt_convert.tensorrt_rewriter_config(
+      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
           conversion_params.rewriter_config, conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
           conversion_params.precision_mode,
@@ -221,13 +255,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     else:
       graph_options = config_pb2.GraphOptions()
 
-    gpu_options = config_pb2.GPUOptions()
-    gpu_options.allow_growth = True
-    if trt_convert.get_linked_tensorrt_version()[0] == 3:
-      gpu_options.per_process_gpu_memory_fraction = 0.50
-
     config = config_pb2.ConfigProto(
-        gpu_options=gpu_options, graph_options=graph_options)
+        gpu_options=self._GetGPUOptions(), graph_options=graph_options)
     return config
 
   def _ExpectTestValue(self, engine_name, method, expected_value):
@@ -297,6 +326,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     params = self._GetParamsCached()
     conversion_params = self.GetConversionParams(run_params)
     logging.info(conversion_params)
+
+    config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions())
+    if conversion_params.rewriter_config is not None:
+      config_for_trt.graph_options.rewrite_options.CopyFrom(
+          conversion_params.rewriter_config)
     return trt_convert.create_inference_graph(
         input_graph_def=gdef,
         outputs=params.input_names + params.output_names,
@@ -307,8 +341,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         is_dynamic_op=conversion_params.is_dynamic_op,
         maximum_cached_engines=conversion_params.maximum_cached_engines,
         cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes,
-        rewriter_config=conversion_params.rewriter_config,
-        use_calibration=conversion_params.use_calibration)
+        use_calibration=conversion_params.use_calibration,
+        session_config=config_for_trt)
 
   def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
@@ -408,13 +442,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         self.assertEqual(run_params.dynamic_engine, is_dynamic_engine,
                          node.name)
         self.assertEqual(node.attr["use_calibration"].b,
-                         run_params.use_calibration,
-                         node.name)
+                         run_params.use_calibration, node.name)
 
         has_calibration_data = len(node.attr["calibration_data"].s)
         if (IsQuantizationMode(run_params.precision_mode) and
-            run_params.use_calibration and 
-            graph_state == GraphState.INFERENCE):
+            run_params.use_calibration and graph_state == GraphState.INFERENCE):
           self.assertTrue(has_calibration_data, node.name)
         else:
           self.assertFalse(has_calibration_data, node.name)
@@ -449,6 +481,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
       # types.
       scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
       dims = params.input_dims[i]
+      # TODO(laigd): add debug options. E.g. we can set the input data to be
+      # continuous natural numbers:
+      # seq = np.arange(np.prod(dims))
+      # seq.resize(dims)
+      # input_data.append(scale * seq.astype(dtype))
       input_data.append((scale * np.random.random_sample(dims)).astype(dtype))
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
@@ -541,7 +578,7 @@ def _AddTests(test_class):
         # graphdef using custom python wrapper class, which is not currently
         # supported yet.
         continue
-      if not dynamic_engine and use_calibration:
+      if use_calibration and not dynamic_engine:
         # Static engine with use_calibration=False will be static, so we want to
         # test that. If use_calibration=True, only dynamic op is supported.
         # TODO(aaroey): construction of static calibration engine is not
@@ -553,8 +590,10 @@ def _AddTests(test_class):
         continue
 
     conversion = "OptimizerConversion" if use_optimizer else "ToolConversion"
-    engine_type = ("DynamicEngine" if dynamic_engine else "StaticEngine")
-    test_name = "%s_%s_%s" % (conversion, precision_mode, engine_type)
+    engine_type = "DynamicEngine" if dynamic_engine else "StaticEngine"
+    calibration_type = "UseCalibration" if use_calibration else "NoCalibration"
+    test_name = "%s_%s_%s_%s" % (conversion, engine_type, precision_mode,
+                                 calibration_type)
     run_params = RunParams(
         use_optimizer=use_optimizer,
         precision_mode=precision_mode,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index c230919168b937b26c68e141e15f0762ad70f3e6..ae7db35b47b326272dd2c7bc76e18047cec59865 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -106,6 +106,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "no_mac",
         "no_pip_gpu",  # b/63391119
         "nomsan",  # Takes too long to run.
         "notsan",  # b/67865658
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index af68aa03cf6583dc474eda6cda2e648fa1c3d08d..146ed9f27134e3e2a6c74627b6b78e53d65155f0 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -32,7 +32,7 @@ from tensorflow.contrib.timeseries.python.timeseries.state_space_models.filterin
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.estimator.export import export_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index ffd838be40ed6267109fe36d95a681496fb2f964..7d780559f976516823611f3fe0ded056e4be088c 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -30,7 +30,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 90c7d8ac1a9c69216ece74af458cd750667f51ee..8f692d94da45bfaed6c72cf75d525346865aea34 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -38,7 +38,7 @@ from tensorflow.core.example import example_pb2
 
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 43c5267e632e464d43ffcbcf6c551ff83d3c5767..aab330643862c1ccf073d2a0e34e1c475b1ec15f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -802,7 +802,7 @@ class InputStatisticsFromMiniBatch(object):
             array_ops.shape(times)[1] - 1, self._dtype))
     # Co-locate updates with their variables to minimize race conditions when
     # updating statistics.
-    with ops.colocate_with(auxiliary_variables.max_time_seen):
+    with ops.device(auxiliary_variables.max_time_seen.device):
       # There is a race condition if this value is being updated from multiple
       # workers. However, it should eventually reach the correct value if the
       # last chunk is presented enough times.
@@ -810,16 +810,16 @@ class InputStatisticsFromMiniBatch(object):
           auxiliary_variables.max_time_seen,
           gen_math_ops.maximum(auxiliary_variables.max_time_seen,
                                math_ops.reduce_max(times)))
-    with ops.colocate_with(auxiliary_variables.chunk_count):
+    with ops.device(auxiliary_variables.chunk_count.device):
       chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count,
                                                 array_ops.shape(
                                                     times,
                                                     out_type=dtypes.int64)[0])
-    with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum):
+    with ops.device(auxiliary_variables.inter_observation_duration_sum.device):
       inter_observation_duration_assign = state_ops.assign_add(
           auxiliary_variables.inter_observation_duration_sum,
           math_ops.reduce_sum(batch_inter_observation_duration))
-    with ops.colocate_with(auxiliary_variables.example_count):
+    with ops.device(auxiliary_variables.example_count.device):
       example_count_assign = state_ops.assign_add(
           auxiliary_variables.example_count,
           array_ops.size(times, out_type=dtypes.int64))
@@ -829,11 +829,11 @@ class InputStatisticsFromMiniBatch(object):
     # the series are then members of fewer chunks. For series which are much
     # longer than the chunk size (the usual/expected case), this effect becomes
     # irrelevant.
-    with ops.colocate_with(auxiliary_variables.overall_feature_sum):
+    with ops.device(auxiliary_variables.overall_feature_sum.device):
       overall_feature_sum_assign = state_ops.assign_add(
           auxiliary_variables.overall_feature_sum,
           math_ops.reduce_sum(values, axis=[0, 1]))
-    with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares):
+    with ops.device(auxiliary_variables.overall_feature_sum_of_squares.device):
       overall_feature_sum_of_squares_assign = state_ops.assign_add(
           auxiliary_variables.overall_feature_sum_of_squares,
           math_ops.reduce_sum(values**2, axis=[0, 1]))
@@ -869,7 +869,7 @@ class InputStatisticsFromMiniBatch(object):
             state_ops.assign(statistics.series_start_moments.mean, mean),
             state_ops.assign(statistics.series_start_moments.variance,
                              variance))
-      with ops.colocate_with(statistics.start_time):
+      with ops.device(statistics.start_time.device):
         series_start_update = control_flow_ops.cond(
             # Update moments whenever we even match the lowest time seen so far,
             # to ensure that series start statistics are eventually updated to
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
index edd97b2a4c131dbce0a5111dbac7d40eddea2bae..a8cd4287e0003de300b7114cf3f88d21d3239e6e 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -27,7 +27,7 @@ from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
 
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 3c07a74ed8af9e3ab70408f9b43cb62b6bd4c7f2..125750e7639ad40c481472a93353e6fb7055be96 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,7 +40,10 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_mac",
+        "no_windows",  # TODO: needs investigation on Windows
+    ],
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 67327d32000caea5db75f4d83e5743e8bde70a92..a0a9cb3f31a945a00eb3f6a5fd1402aab9a2df5f 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -246,6 +246,7 @@ py_library(
         "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
         "python/tpu/session_support.py",
+        "python/tpu/tensor_tracer.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
         "python/tpu/tpu_feed.py",
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 38d1c3049ef7185f2f9f448361029d066678cdae..541fbf33a302a4d850422885fdbbc438bd6b9b7b 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -94,13 +94,6 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
-tf_proto_library(
-    name = "tf_op_stats_proto",
-    srcs = ["tf_op_stats.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
 tf_proto_library(
     name = "tpu_profiler_analysis_proto",
     srcs = ["tpu_profiler_analysis.proto"],
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
deleted file mode 100644
index 1e66801efd4b2a997ed85289b9b1690bb5d07737..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ /dev/null
@@ -1,261 +0,0 @@
-// This proto describes the format of tensorflow operation level stats for
-// profiling (in tensorboard) purpose.
-
-syntax = "proto2";
-
-package tensorflow.tpu;
-
-// Result proto for OpMetrics.
-message OpMetricsResult {
-  // True if this OP is executed on the device; False if it is executed on the
-  // host.
-  optional bool on_device = 1;
-  reserved 2;  // was uint32 id.
-  // Name of this OP.
-  optional string name = 3;
-  // Rank of this OP.
-  optional uint64 rank = 4;
-  // The starting time in cycles of the last instance of this OP executed.
-  optional double last_starttime_in_cycles = 5;
-  // The ending time in cycles of the last instance of this OP executed.
-  optional double last_endtime_in_cycles = 6;
-  // If this OP (say A), is an immediate child of another OP (say B), this field
-  // stores the sum of duration in microseconds of A inside B. If A appears more
-  // than once in B, the duration of all A's appearances will be added together.
-  // This sum will be reset after the self-time of B is calculated so that it
-  // can be reused for a new parent OP.
-  optional double sum_of_duration_in_us_as_children = 7;
-  // Number of instances that this OP occurred.
-  optional uint64 occurrences = 8;
-  // Total time in microseconds spent in this OP (accumulated
-  // over all of its occurrences).
-  optional double total_time_in_us = 9;
-  // Total self time in microseconds spent in this OP
-  // (accumulated over all of its occurrences).
-  optional double total_self_time_in_us = 10;
-  // The total self time as a fraction of sum of all OP's
-  // total self time on the host.
-  optional double host_total_self_time_as_fraction_of_all_op_time = 11;
-  // Cumulative total self time in fraction on the host.
-  optional double host_cumulative_total_self_time_as_fraction_of_all_op_time =
-      12;
-  // The total self time as a fraction of sum of all OP's
-  // total self time on the device.
-  optional double device_total_self_time_as_fraction_of_all_op_time = 13;
-  // Cumulative total self time in fraction on the device.
-  optional double device_cumulative_total_self_time_as_fraction_of_all_op_time =
-      14;
-  // Total number of FLOPs incurred by this OP.
-  optional double total_flops = 15;
-  // Total number of bytes accessed by this OP.
-  optional double total_bytes_accessed = 16;
-  // Total time in microseconds that special hw unit 1 is occupied by this OP.
-  optional double unit1_occupancy_in_us = 17;
-  // Total time in microseconds that special hw unit 2 is occupied by this OP.
-  optional double unit2_occupancy_in_us = 18;
-  // Total memory stall time in microseconds.
-  optional double total_memory_stall_in_us = 19;
-}
-
-// Result proto for OpMetricsDb.
-message OpMetricsDbResult {
-  // A bunch of OpMetricsResults.
-  repeated OpMetricsResult metrics_db = 1;
-  // The total host infeed-enqueue duration in picoseconds.
-  optional uint64 total_host_infeed_enq_duration_ps = 2;
-  // The total of the difference between the start times of two
-  // consecutive infeed-enqueues (per host) in picoseconds.
-  optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
-  // The total device time in microseconds.
-  optional double total_device_time_in_us = 4;
-  // The total host time in microseconds.
-  optional double total_host_time_in_us = 5;
-}
-
-// Result proto for StepInfo.
-message StepInfoResult {
-  // The (micro) step number.
-  optional uint32 step_num = 1;
-  // The step duration in picoseconds.
-  optional uint64 duration_ps = 2;
-  // The infeed duration in picoseconds.
-  optional uint64 infeed_duration_ps = 3;
-  // The outfeed duration in picoseconds.
-  optional uint64 host_outfeed_ps = 8;
-  // The start time of this step in picoseconds.
-  optional uint64 begin_ps = 4;
-  // The waiting time within this step in picoseconds.
-  optional uint64 wait_duration_ps = 5;
-  // The unit b outfeed duration in picoseconds.
-  optional uint64 unit_b_outfeed_ps = 9;
-  // The time spent on cross-replica-sum in picoseconds.
-  optional uint64 crs_duration_ps = 6;
-  // Percentage of unit b time spent on infeed.
-  optional double unit_b_infeed_percent = 7;
-}
-
-// Result proto for a sequence of steps.
-message StepSequenceResult {
-  // A sequence of StepInfoResults.
-  repeated StepInfoResult step_sequence = 1;
-}
-
-// Result proto for a StepDatabase.
-message StepDatabaseResult {
-  // A map from core_id to StepSequenceResult.
-  map<uint32, StepSequenceResult> step_sequence_per_core = 1;
-}
-
-// Result proto for looping-related metrics.
-message LoopingResult {
-  // The total iteration time in nanoseconds.
-  optional double iteration_time_ns = 1;
-  // The total number of iterations.
-  optional int32 num_iterations = 2;
-  // The total computation time in nanoseconds.
-  optional double computation_time_ns = 3;
-  // The total number of computations.
-  optional int32 num_computations = 4;
-}
-
-// Result proto for HloExtraInfo.
-message HloExtraInfoResult {
-  // Category of the HLO op given by the compiler.
-  optional string category = 1;
-  // The long name of the HLO that includes the dimensions.
-  optional string long_name = 2;
-  // The per-TPU-core batch size inferred from this HLO.
-  optional int64 per_core_batch_size = 3;
-}
-
-// Result proto for HloExtraInfoMap.
-message HloExtraInfoMapResult {
-  // A map from HLO name to HloExtraInfo.
-  map<string, HloExtraInfoResult> hlo_extrainfo_map = 1;
-}
-
-// Result proto for host-independent job information.
-message HostIndependentJobInfoResult {
-  // The change-list number of this build.
-  optional int64 change_list = 1;
-  // The time of this build.
-  optional int64 build_time = 2;
-  // The target of this build.
-  optional string build_target = 3;
-}
-
-// Result proto for host-dependent job information.
-message HostDependentJobInfoResult {
-  // This ID of the host where the job was run on.
-  optional string host_id = 1;
-  // The command line used to run the job.
-  optional string command_line = 2;
-  // The start time of the job on this host.
-  optional int64 start_time = 3;
-}
-
-// Result proto for RunEnvironment (the run environment of a profiling session).
-message RunEnvironmentResult {
-  // Number of hosts used.
-  optional int32 host_count = 1;
-  // The type of TPU used.
-  optional string tpu_type = 2;
-  // The number of TPU cores used.
-  optional int32 tpu_core_count = 3;
-  // The per-TPU-core batch size.
-  optional int32 per_core_batch_size = 4;
-  // Host-independent job information.
-  optional HostIndependentJobInfoResult host_independent_job_info = 5;
-  // Host-dependent job information.
-  repeated HostDependentJobInfoResult host_dependent_job_info = 6;
-  // The number of replicas, corresponds to input parallelism.
-  // If there is no model parallelism, replica_count = tpu_core_count
-  optional int32 replica_count = 7;
-  // The number of cores used for a single replica, e.g. model parallelism.
-  // If there is no model parallelism, then num_cores_per_replica = 1
-  optional int32 num_cores_per_replica = 8;
-}
-
-// The types of host operations that are tracked.
-enum HostOp {
-  // Invalid host op.
-  kINVALIDHostOp = 0;
-  // Each of host op type has two parts:
-  // (1) the stage where the op happens and (2) the op name.
-  // stage = Input Data Producer, op = Get Next Batch.
-  kInputDataProducerGetNextBatch = 1;
-  // stage = Input Data Producer, op = Session Run.
-  kInputDataProducerSessionRun = 2;
-  // stage = Input Data Producer, op = Forward Batch.
-  kInputDataProducerForwardBatch = 3;
-  // stage = Infeed Thread, op = Get Next Batch.
-  kInfeedThreadGetNextBatch = 4;
-  // stage = Infeed Thread, op = Session Run.
-  kInfeedThreadSessionRun = 5;
-  // stage = Infeed Thread, op = Forward Batch.
-  kInfeedThreadForwardBatch = 6;
-  // stage = Outfeed Thread, op = Get Next Batch.
-  kOutfeedThreadGetNextBatch = 7;
-  // stage = Outfeed Thread, op = Session Run.
-  kOutfeedThreadSessionRun = 8;
-  // stage = Outfeed Thread, op = Forward Batch.
-  kOutfeedThreadForwardBatch = 9;
-}
-
-// Result proto for the host ops per TPU step.
-message HostOpsPerTpuStep {
-  // Whether the data in this message is valid.
-  optional bool valid = 1 [default = false];
-  // The current TPU step number.
-  optional uint32 tpu_step_num = 2;
-  // The beginning time of the current TPU step on the device in picoseconds.
-  optional uint64 tpu_step_begin_ps = 3;
-  // The ending time of the current TPU step on the device in picoseconds.
-  optional uint64 tpu_step_end_ps = 4;
-  // For each possible host operation, maps to the difference between the TPU
-  // step number that the host op targets and the current TPU step number.
-  // The key is HostOp, value is the step difference.
-  map<int32, int32> step_diffs = 5;
-}
-
-message HostOpsDetailsPerCore {
-  // Map from core id to HostOpsPerTpuStep.
-  map<int32, HostOpsPerTpuStep> core_map = 1;
-}
-
-message HostOpsDetailsPerHost {
-  // Map from hostname to a map from core id to HostOpsPerTpuStep.
-  map<string, HostOpsDetailsPerCore> host_map = 1;
-}
-
-// Result proto for the host ops for all TPU steps.
-message HostOpsResult {
-  reserved 1;  // (was repeated HostOpsPerTpuStep host_op_sequence)
-  // A sequence of records with one for each TPU step. Each record
-  // is a map from hostname to a map from core id to HostOpsPerTpuStep.
-  repeated HostOpsDetailsPerHost hostops_details = 2;
-}
-
-// Result proto for TfStatsHelper.
-message TfOpStats {
-  // The result for the TF-metric database.
-  optional OpMetricsDbResult tf_metrics_db = 1;
-  // The result for the HLO-metric database.
-  optional OpMetricsDbResult hlo_metrics_db = 2;
-  // The result for the step database.
-  optional StepDatabaseResult step_db = 3;
-  // The result for the looping-related metrics.
-  optional LoopingResult looping = 4;
-  // The result for the HloExtraInfoMap.
-  optional HloExtraInfoMapResult hlo_extrainfo_map = 5;
-  // Overall matrix unit utilization in percentage.
-  optional double matrix_unit_utilization_percent = 6;
-  // The run environment of this profiling session.
-  optional RunEnvironmentResult run_environment = 7;
-  // The result for the host operations.
-  optional HostOpsResult host_ops = 8;
-  // A map from core ID to name.
-  map<uint32, string> core_id_to_name_map = 9;
-  // The result for hw unit b stats.
-  optional bytes unit_b_stats = 10;
-}
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index c2e3be03db0e4cca1a664f9e79aa9107384de312..aae1ab1d37a166303883e3a07a7a01efe2feab51 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -154,6 +154,14 @@ message OptimizationParameters {
   // updates; not present means no limits are applied.
   ClippingLimits gradient_clipping_limits = 7;
 
+  // Amount of weight decay to apply; see weight_decay_optimizers.py for
+  // details. Almost all optimizers are supported with this option (MDL Adagrad
+  // Light does not work, and SGD does not behave as expected if it is enabled).
+  // Although there is no check, users who want weight decay will probably also
+  // want to enable gradient accumulation as well so that the decay will happen
+  // once per minibatch.
+  float weight_decay_factor = 16;
+
   // Whether to use gradient accumulation (do two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
   // apply them using the actual optimization algorithm). This feature is
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index c32bd5997c1493594d253650d42ae2215b2862a2..1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -80,6 +80,8 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
     self._summary_writer = None
     self._global_step_tensor = None
 
+    self._last_checkpoint_step = None
+
   def _set_steps_per_run(self, steps_per_run):
     self._steps_per_run = steps_per_run
 
@@ -137,8 +139,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
 
     last_step = session.run(self._global_step_tensor)
 
-    # Save the last checkpoint synchronously if needed.
-    if last_step != self._timer.last_triggered_step():
+    if self._last_checkpoint_step != last_step:
       self._save(session, last_step, asynchronous=False)
 
     for l in self._listeners:
@@ -164,15 +165,17 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
           SessionLog(
               status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
           step)
+
+      for l in self._listeners:
+        l.after_save(session, step)
+
       end_time = time.time()
       logging.info("Checkpoint actual writing time: (%.3f sec)",
                    end_time - start_time)
       logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
 
-    for l in self._listeners:
-      l.before_save(session, step)
-
     if not asynchronous:
+      self._last_checkpoint_step = step
       _save_fn()
       return
 
@@ -182,6 +185,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
         logging.info("Saver thread still in progress, skipping checkpoint.")
         return
 
+    self._last_checkpoint_step = step
     self._save_thread = threading.Thread(target=_save_fn)
     self._save_thread.start()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index abf9dc810fda97e5617f3be7fb85b6e782e3ca86..73753cd9181403d97b18f117a17e3e75e1f3b974 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -978,7 +978,7 @@ class TPUFunction(object):
 
             # When running on more than one core, concatenate outputs at the end
             # of processing. In backprop stage, the gradients will be
-            # calculdated according to the local inputs as gradient of
+            # calculated according to the local inputs as gradient of
             # cross-replica-concat being zero for any outputs other than those
             # from mlocal core so the loss calculation is identical.
             num_towers = self.model._tpu_assignment.num_towers
@@ -1005,14 +1005,17 @@ class TPUFunction(object):
                   for tensor in tpu_targets
               ]
 
-            if is_training or is_test:
+          if is_training or is_test:
+            with variable_scope.variable_scope(
+                'metrics', reuse=variable_scope.AUTO_REUSE):
               self._cloned_model.compile(
                   optimizer=_replicated_optimizer(self._cloned_optimizer),
                   loss=self.model.loss,
                   loss_weights=self.model.loss_weights,
-                  metrics=metrics_module.clone_metrics(self.model.metrics),
+                  metrics=metrics_module.clone_metrics(
+                      self.model._compile_metrics),
                   weighted_metrics=metrics_module.clone_metrics(
-                      self.model.weighted_metrics),
+                      self.model._compile_weighted_metrics),
                   target_tensors=tpu_targets,
               )
 
@@ -1024,29 +1027,29 @@ class TPUFunction(object):
           # the Momentum optimizer) when _make_train_function is invoked.
           with keras_tpu_variables.replicated_variable_for_optimizer(
               self._tpu_assignment.num_towers):
-            self._cloned_model._make_train_function()
+            self._cloned_model._make_fit_function()
         else:
-          self._cloned_model._make_train_function()
+          self._cloned_model._make_fit_function()
 
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
-            for tensor in self._cloned_model.train_function.outputs
+            for tensor in self._cloned_model._fit_function.outputs
         ]
         return [
-            self._cloned_model.train_function.updates_op,
+            self._cloned_model._fit_function.updates_op,
             tpu_ops.outfeed_enqueue_tuple(
-                self._cloned_model.train_function.outputs,
+                self._cloned_model._fit_function.outputs,
                 name='outfeed-enqueue-train')
         ]
       elif is_test:
-        self._cloned_model._make_test_function()
+        self._cloned_model._make_eval_function()
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
-            for tensor in self._cloned_model.test_function.outputs
+            for tensor in self._cloned_model._eval_function.outputs
         ]
         return [
             tpu_ops.outfeed_enqueue_tuple(
-                self._cloned_model.test_function.outputs,
+                self._cloned_model._eval_function.outputs,
                 name='outfeed-enqueue-test')
         ]
       elif is_predict:
@@ -1182,13 +1185,9 @@ class TPUFunction(object):
       # pipelined loop.
       return None, None
 
-    if (self.model.uses_learning_phase and
-        not isinstance(K.learning_phase(), int)):
+    if isinstance(inputs[-1], int):
       # Remove the learning_phase flag at the end. We currently hard code the
       # learning_phase in TPUFunction.
-      assert isinstance(inputs[-1], int), (
-          'Expect the final element be learning_phase flag. Got {}'.format(
-              inputs[-1]))
       inputs = inputs[:-1]
 
     if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or
@@ -1376,6 +1375,9 @@ class KerasTPUModel(models.Model):
     self.predict_function = None
     self.test_function = None
     self.train_function = None
+    self._fit_function = None
+    self._eval_function = None
+    self._stateful_metric_functions = []
 
     cluster_resolver = strategy._tpu_cluster_resolver
     self._tpu_name_or_address = cluster_resolver.get_master()
@@ -1390,10 +1392,10 @@ class KerasTPUModel(models.Model):
       self.compile(
           self._cpu_model.optimizer,
           self._cpu_model.loss,
-          self._cpu_model.metrics,
+          self._cpu_model._compile_metrics,
           self._cpu_model.loss_weights,
           self._cpu_model.sample_weight_mode,
-          self._cpu_model.weighted_metrics,
+          self._cpu_model._compile_weighted_metrics,
           self._cpu_model.target_tensors,
       )
 
@@ -1647,7 +1649,7 @@ class KerasTPUModel(models.Model):
     self._make_train_function()
     sample_weights = sample_weights or []
     val_sample_weights = val_sample_weights or []
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+    if not isinstance(K.learning_phase(), int):
       ins = inputs + targets + sample_weights + [1]
     else:
       ins = inputs + targets + sample_weights
@@ -1697,7 +1699,7 @@ class KerasTPUModel(models.Model):
     callbacks.on_train_begin()
     for epoch in range(initial_epoch, epochs):
       # Reset stateful metrics
-      for m in self.stateful_metric_functions:
+      for m in self.metrics:
         m.reset_states()
       # Update callbacks
       callbacks.on_epoch_begin(epoch)
@@ -1994,10 +1996,21 @@ class KerasTPUModel(models.Model):
   def optimizer(self, optimizer):
     self._optimizer = optimizer
 
+  @property
+  def metrics(self):
+    if self._tpu_model:
+      return self._tpu_model.metrics
+    return self._stateful_metric_functions
+
+  @metrics.setter
+  def metrics(self, metrics):
+    self._stateful_metric_functions = metrics
+
   def _make_train_function(self):
     if not self.train_function:
       self.train_function = TPUFunction(
-          self, model_fn_lib.ModeKeys.TRAIN,
+          self,
+          model_fn_lib.ModeKeys.TRAIN,
           tpu_assignment=self._tpu_assignment)
 
     return self.train_function
@@ -2008,6 +2021,21 @@ class KerasTPUModel(models.Model):
           self, model_fn_lib.ModeKeys.EVAL, tpu_assignment=self._tpu_assignment)
     return self.test_function
 
+  def _make_fit_function(self):
+    if not self._fit_function:
+      self._fit_function = TPUFunction(
+          self,
+          model_fn_lib.ModeKeys.TRAIN,
+          tpu_assignment=self._tpu_assignment)
+
+    return self._fit_function
+
+  def _make_eval_function(self):
+    if not self._eval_function:
+      self._eval_function = TPUFunction(
+          self, model_fn_lib.ModeKeys.EVAL, tpu_assignment=self._tpu_assignment)
+    return self._eval_function
+
   def _make_predict_function(self):
     if not self.predict_function:
       self.predict_function = TPUFunction(
@@ -2201,10 +2229,10 @@ def tpu_model(model, strategy=None):
     cpu_model.compile(
         _clone_optimizer(model.optimizer, optimizer_config),
         model.loss,
-        metrics_module.clone_metrics(model.metrics),
+        metrics_module.clone_metrics(model._compile_metrics),
         model.loss_weights,
         model.sample_weight_mode,
-        metrics_module.clone_metrics(model.weighted_metrics),
+        metrics_module.clone_metrics(model._compile_weighted_metrics),
     )
 
   if model_weights:
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..70baea203cc6174bebc7d90646045efae5f2391d
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -0,0 +1,553 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========================================================================
+"""A utility to trace tensor values on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path
+import re
+
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+
+_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
+_DEVICE_TYPE_TPU = 'tpu'
+_DEVICE_TYPE_CPU = 'cpu'
+_GLOBAL_STEP_OP_NAME = 'GLOBAL-STEP'
+_TRACE_MODE_NAN_INF = 'nan-inf'
+_TRACE_MODE_PART_TENSOR = 'part-tensor'
+_TRACE_MODE_PART_TENSOR_SIZE = 3
+_TRACE_MODE_FULL_TENSOR = 'full-tensor'
+_RECORD_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
+_RECORD_SHOULD_NOT_TRACE = 'not-traced-should-not-trace'
+_RECORD_FILTERED_OUT = 'not-traced-filtered-out'
+_RECORD_SCALAR = 'not-traced-scalar'
+_RECORD_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
+_RECORD_GET_TRACED = 'get-traced'
+_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
+_MARKER_SECTION_END = '!!!!!!! section-end:'
+_SECTION_NAME_CONFIG = 'configuration'
+_SECTION_NAME_REASON = 'reason'
+_SECTION_NAME_OP_LIST = 'op-list'
+_SECTION_NAME_GRAPH = 'graph'
+_FIELD_NAME_VERSION = 'version:'
+_FIELD_NAME_DEVICE = 'device:'
+_FIELD_NAME_TRACE_MODE = 'trace-mode:'
+_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
+_FIELD_NAME_NUM_OPS = 'number-of-ops:'
+_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
+_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
+_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
+_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
+_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
+_FLAG_NAME_ENABLE = 'enable'
+_FLAG_NAME_TRACE_MODE = 'trace_mode'
+_FLAG_NAME_INTERESTING_OPS = 'interesting_ops'
+_FLAG_NAME_TRACE_FILE = 'trace_file_path'
+_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
+_FLAG_NAME_OP_RANGE = 'op_range'
+_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
+_OUTPUT_STREAM_ESCAPE = 'file://'
+_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
+
+
+class TensorTracer(object):
+  """A software construct for tracing tensor values in a TF graph on TPU.
+
+  This utility is disabled by default. It can be enabled by setting
+  the TENSOR_TRACER_FLAGS env variable as:
+    export TENSOR_TRACER_FLAGS="--enable=1"
+  If it is enabled, it will trace the output tensor values of
+  selected Ops in the graph. It has two outputs: (1) the traces and (2)
+  a report. The traces are dumped to a specified local file on the TPU
+  host. The report is printed to the log.info of the TPU job.
+  By passing options via the env variable, users can change:
+     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
+         full tensor values)
+     (2) which Ops to be traced (via op.name or op.type)
+     (3) output trace file path.
+  """
+
+  @staticmethod
+  def _match_next_flag(flags, pos):
+    """Returns the match for the next TensorTracer flag."""
+
+    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match
+    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match
+    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
+    return match
+
+  @staticmethod
+  def print_flag_values():
+    """Prints all TensorTracer flags passed via environment variables."""
+
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
+    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
+                                                   tensor_tracer_flags)
+    result += 'Individual flag value:\n'
+    pos = 0
+    while True:
+      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      flag_value = match.group(2)
+      result += '  %s: %s\n'%(flag_name, flag_value)
+      pos = match.end()
+    result += '\n'
+    return result
+
+  @staticmethod
+  def get_flag_value(wanted_flag_name):
+    """Returns the value of a TensorTracer flags."""
+
+    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return ''
+    pos = 0
+    while True:
+      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        return ''
+      flag_name = match.group(1)
+      flag_value = match.group(2)
+      if flag_name == wanted_flag_name:
+        return flag_value
+      pos = match.end()
+    return ''
+
+  @staticmethod
+  def is_enabled():
+    """Returns True if TensorTracer is enabled."""
+
+    flag_value = TensorTracer.get_flag_value(_FLAG_NAME_ENABLE)
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def use_test_undeclared_outputs_dir():
+    """Decides the output directory of the trace file.
+
+    Args:
+       None.
+
+    Returns:
+       True if the output trace file should be written to the
+       test-undeclared-outputs-directory defined via an
+       env variable.
+    """
+
+    flag_value = TensorTracer.get_flag_value(
+        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def check_device_type(device_type):
+    """Checks if the given device type is valid."""
+
+    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
+      raise ValueError('Invalid device_type "%s"'%device_type)
+
+  @staticmethod
+  def check_trace_mode(trace_mode):
+    """Checks if the given trace mode is valid."""
+
+    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
+                         _TRACE_MODE_FULL_TENSOR]
+    if trace_mode not in valid_trace_modes:
+      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
+                       'Valid trace modes are: %s'%(trace_mode,
+                                                    valid_trace_modes))
+
+  @staticmethod
+  def should_trace(device_type, op):
+    """Returns True if the given Op should be traced."""
+
+    if device_type != _DEVICE_TYPE_TPU:
+      raise ValueError('Non TPU device type is not supported')
+    if control_flow_util.IsInCond(op):
+      return False
+    if op.type in ['Reshape', 'ArgMin', 'ArgMax']:
+      return False
+    # pylint: disable=protected-access
+    return tpu._TPU_REPLICATE_ATTR in op.node_def.attr
+    # pylint: enable=protected-access
+
+  @staticmethod
+  def reason(op_idx, details):
+    """Returns why the Op at op_idx is traced or not."""
+    return '%d %s'%(op_idx, details)
+
+  @staticmethod
+  def topological_sort(g):
+    """Performs topological sort on the given graph.
+
+    Args:
+       g: the graph.
+
+    Returns:
+       A pair where the first element indicates if the topological
+       sort succeeded (True if there is no cycle found; False if a
+       cycle is found) and the second element is either the sorted
+       list of nodes or the cycle of nodes found.
+    """
+
+    def visit(op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops):
+      """Recursively visits all Ops in a graph.
+
+      Args:
+         op: the current Op being visited.
+         cycle: a cycle of Ops found.
+         permanently_marked_ops: the set of Ops that were already visited.
+         temporarily_marked_ops: the set of Ops that we have visited during
+                                 the current descent.
+         sorted_ops: the list of Ops sorted in topological order.
+      """
+
+      if cycle:
+        return
+      if op in permanently_marked_ops:
+        return
+      if op in temporarily_marked_ops:
+        cycle = temporarily_marked_ops
+        return
+      temporarily_marked_ops.add(op)
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        for consumer_op in out_tensor.consumers():
+          visit(consumer_op, cycle, permanently_marked_ops,
+                temporarily_marked_ops, sorted_ops)
+      # pylint: disable=protected-access
+      for ctrl_output_op in op._control_outputs:
+      # pylint: enable=protected-access
+        visit(ctrl_output_op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops)
+      temporarily_marked_ops.remove(op)
+      permanently_marked_ops.add(op)
+      sorted_ops.insert(0, op)
+
+    graph_cycle = set([])
+    sorted_ops = []
+    permanently_marked_ops = set([])
+    temporarily_marked_ops = set([])
+    unsorted_ops = g.get_operations()
+    for op in unsorted_ops:
+      visit(op, graph_cycle, permanently_marked_ops,
+            temporarily_marked_ops, sorted_ops)
+    if graph_cycle:
+      return (False, graph_cycle)
+    else:
+      assert len(unsorted_ops) == len(sorted_ops)
+      return (True, sorted_ops)
+
+  def __init__(self):
+    """Initializes a TensorTracer.
+
+    Sets the various member fields from the flags (if given) or the defaults.
+    """
+    self._version = 'use-outside-compilation'
+    self._device_type = None
+    self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
+    if not self._trace_mode:
+      self._trace_mode = _TRACE_MODE_NAN_INF
+    TensorTracer.check_trace_mode(self._trace_mode)
+    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
+    self._instrument_records = {}
+    interesting_ops = TensorTracer.get_flag_value(_FLAG_NAME_INTERESTING_OPS)
+    self._selected_ops = interesting_ops.split()
+    self._set_trace_file_path()
+    self._set_op_range()
+    self._num_replicas = None
+    self._replica_id = None
+
+  def _add_replica_id_to_graph(self, num_replicas, result_tensor):
+    """Adds nodes for computing the replica ID to the graph."""
+
+    if not num_replicas:
+      self._replica_id = 'unknown'
+      return result_tensor
+
+    self._num_replicas = num_replicas
+
+    with ops.control_dependencies(None):
+      # Uses None as dependency to run outside of TPU graph rewrites.
+      self._replica_id = tpu_ops.tpu_replicated_input(
+          list(range(self._num_replicas)),
+          name='tt_replica_id')
+    use_replica_id = array_ops.identity(self._replica_id).op
+    with ops.control_dependencies([use_replica_id]):
+      # Adds a control dependency from the result_tensor to
+      # the replica_id to ensure that replica_id will be added to the graph.
+      return array_ops.identity(result_tensor)
+
+  def _set_trace_file_path(self):
+    """Sets the path of the output trace file."""
+
+    self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE)
+    if not self._trace_file_path:
+      raise ValueError('--%s is not set in the environment variable %s'
+                       %(_FLAG_NAME_TRACE_FILE, _FLAGS_ENV_VAR))
+    elif TensorTracer.use_test_undeclared_outputs_dir():
+      if os.path.isabs(self._trace_file_path):
+        raise ValueError('If use_test_undeclared_outputs_dir is set,'
+                         'trace_file_path cannot be an absolute path (%s)'
+                         %self._trace_file_path)
+      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+      self._trace_file_path = os.path.join(outputs_dir,
+                                           self._trace_file_path)
+
+  def _set_op_range(self):
+    """Sets the index range of the Ops that we will consider tracing."""
+
+    op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
+    if not op_range:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    match = _OP_RANGE_PAT.match(op_range)
+    if not match:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    self._op_range = (int(match.group(1)), int(match.group(2)))
+
+  def _inside_op_range(self, idx):
+    """Return True if the given index is inside the selected range."""
+
+    if idx < self._op_range[0]:
+      return False
+    return self._op_range[1] < 0 or idx <= self._op_range[1]
+
+  def _write_report(self, content):
+    """Writes the given content to the report."""
+
+    logging.info('%s %s'%(_TRACER_LOG_PREFIX, content))
+
+  def _is_selected_op(self, op_name):
+    """Returns True if the Op with op_name is selected to be traced."""
+
+    if not self._selected_ops:
+      return True
+    if op_name in self._selected_ops:
+      return True
+    return False
+
+  def _write_config_section(self):
+    """Writes the config section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
+    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
+    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
+
+  def _write_reason_section(self):
+    """Writes the reason section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
+    for key in sorted(self._instrument_records):
+      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
+
+  def _write_op_list_section(self, op_list):
+    """Writes the Op-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
+    for i in range(0, len(op_list)):
+      self._write_report('%d "%s" %s\n'%(i, op_list[i].name, op_list[i].type))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
+
+  def _write_graph_section(self, succeed, sorted_or_cycle):
+    """Writes the graph section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
+                                  succeed))
+    l = list(sorted_or_cycle)
+    for i in range(0, len(l)):
+      self._write_report('%d "%s"\n'%(i, l[i].name))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
+
+  def _make_tensor_trace_fun(self, op_name, output_idx):
+    """Makes the tensor tracing function called by outside compilation.
+
+    Args:
+      op_name: the name of the Op that outputs the tensor to be traced.
+      output_idx: which output of the Op it is (0 means the first output).
+
+    Returns:
+      A function to be passed as the first argument to outside compilation.
+
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _print_tensor(op_name, output_idx, num_elements, tensor, output_tensor):
+      """Prints a tensor value to a file.
+
+      Args:
+        op_name: the name of the Op that outputs the tensor to be printed.
+        output_idx: which output of the Op it is (0 means the first output).
+        num_elements: number of elements to print.
+        tensor: the tensor needs to be returned.
+        output_tensor: the tensor needs to be printed.
+
+      Returns:
+        The same tensor passed via the "tensor" argument.
+      """
+      msg = '"%s:%d" '%(op_name, output_idx)
+      output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
+      print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
+                                      ' @', self._replica_id,
+                                      '\n', output_tensor,
+                                      summarize=num_elements,
+                                      output_stream=output_stream)
+      with ops.control_dependencies([print_op]):
+        return array_ops.identity(tensor).op
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        # Since host can't handle bf16, always convert tensor to f32.
+        tensor = math_ops.cast(tensor, dtypes.float32)
+        output_tensor = math_ops.reduce_any(
+            gen_math_ops.logical_or(gen_math_ops.is_nan(tensor),
+                                    gen_math_ops.is_inf(tensor)))
+      else:
+        output_tensor = constant_op.constant(0)
+      return _print_tensor(op_name, output_idx, 1, tensor, output_tensor)
+
+    def _show_global_step(tensor):
+      """Trace function for printing the global step count."""
+
+      return _print_tensor(op_name, output_idx, 1, tensor, tensor)
+
+    def _show_part_tensor(tensor):
+      """Trace function for printing part of the tensor."""
+
+      return _print_tensor(op_name, output_idx, self._part_tensor_size,
+                           tensor, tensor)
+
+    def _show_full_tensor(tensor):
+      """Trace function for printing the entire tensor."""
+
+      return _print_tensor(op_name, output_idx, -1, tensor, tensor)
+
+    if op_name == _GLOBAL_STEP_OP_NAME:
+      return _show_global_step
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return _show_part_tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return _show_full_tensor
+
+    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
+                       %self._trace_mode)
+
+  def trace_tpu(self, graph, result_tensor, num_replicas=None):
+    """Traces the tensors generated by TPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops.
+      result_tensor: a result tensor of evaluating the graph.
+      num_replicas: number of replicas used on the TPU.
+
+    Returns:
+      A tuple (result_tensor_copy, tracing_ops), where:
+        result_tensor_copy: an exact copy of result_tensor
+        tracing_ops: a list of tracing ops. If this list
+                     is non empty, the caller of this function
+                     should pose control dependencies upon these
+                     Ops so that they will be executed when the
+                     graph is evaluated.
+    """
+
+    self._device_type = _DEVICE_TYPE_TPU
+    TensorTracer.check_device_type(self._device_type)
+    result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
+                                                       result_tensor)
+    self._write_config_section()
+    tracing_ops = []
+    operations = graph.get_operations()
+    self._write_op_list_section(operations)
+    # Does the topological sort before adding any nodes to the graph.
+    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    for op_id, op in enumerate(operations):
+      if not self._inside_op_range(op_id):
+        self._instrument_records[op.name] = TensorTracer.reason(
+            op_id, _RECORD_OUTSIDE_OP_RANGE)
+        continue
+      if not TensorTracer.should_trace(self._device_type, op):
+        self._instrument_records[op.name] = TensorTracer.reason(
+            op_id, _RECORD_SHOULD_NOT_TRACE)
+        continue
+      if not self._is_selected_op(op.name):
+        self._instrument_records[op.name] = TensorTracer.reason(
+            op_id, _RECORD_FILTERED_OUT)
+        continue
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        if not out_tensor.get_shape().is_fully_defined():
+          self._instrument_records[out_tensor.name] = TensorTracer.reason(
+              op_id, _RECORD_DYNAMIC_SHAPE)
+          continue  # cannot trace tensors with dynamic shape.
+        rank = len(out_tensor.shape)
+        if rank < 1:
+          self._instrument_records[out_tensor.name] = TensorTracer.reason(
+              op_id, _RECORD_SCALAR)
+          continue  # cannot trace scalar.
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _RECORD_GET_TRACED)
+        consumers = out_tensor.consumers()
+        trace_op = tpu.outside_compilation(
+            self._make_tensor_trace_fun(op.name, i), out_tensor)
+        if consumers:
+          for consumer_op in consumers:
+            # pylint: disable=protected-access
+            consumer_op._add_control_input(trace_op)
+            # pylint: enable=protected-access
+        else:
+          # if there is no consumer, we will add the control dependence later
+          # when we add the control dependency to the output operations.
+          tracing_ops.append(trace_op)
+
+    self._write_reason_section()
+    self._write_graph_section(succeed, sorted_or_cycle)
+
+    return (result_tensor_copy, tracing_ops)
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
index b6bb5c6e56c74003ed8ceafe9246fb6a05d928dd..6ae718cc2c9716587849aeee8abcd0a1de82a9ae 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology.py
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -189,12 +189,13 @@ class Topology(object):
   def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
     """Returns the CPU device attached to a logical core."""
     return _tpu_host_device_name(
-        job, self._topology_tasks[device_coordinates])
+        job, self._topology_tasks[tuple(device_coordinates)])
 
   def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
     """Returns the name of the TPU device assigned to a logical core."""
-    return _tpu_device_name(job, self._topology_tasks[device_coordinates],
-                            self._topology_devices[device_coordinates])
+    return _tpu_device_name(job,
+                            self._topology_tasks[tuple(device_coordinates)],
+                            self._topology_devices[tuple(device_coordinates)])
 
   @property
   def num_tasks(self):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index e3e791faacb9b3c1fedbd83d3740e35351e38abb..a02361241cec5d16c4b05406c8b53bfd58156f56 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -1001,8 +1001,8 @@ def rewrite(computation,
       `rewrite` is a list of tensors corresponding to the tensors from the
       output of `computation`.
 
-      All `Operation`s returned from `computation` will be executed when
-      evaluating any of the returned output tensors.
+      All `Operation`s constructed during `computation` will be executed when
+      evaluating any of the returned output tensors, not just the ones returned.
     inputs: A list of input tensors or `None` (equivalent to an empty list).
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index da6bdf67d686fba09d66386de982b57aa28d4dd4..672462447944b777375331d49727c4d5366cf295 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -41,7 +41,7 @@ _NUM_CORES_TO_COMPUTATION_SHAPE = {
 
 
 class TPUContext(object):
-  """The context of current input_fn invocation."""
+  """A context that holds the current configuration of the TPU computation."""
 
   def __init__(self,
                internal_ctx,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 555ad0f1fdbe36f078c7d2fdcc67571f28c8b723..932367f4dd546c7867ea75eba1ae36813c9080da 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -31,6 +31,7 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import error_handling
 from tensorflow.contrib.tpu.python.tpu import session_support
@@ -108,6 +109,15 @@ ops.register_proto_function(
     from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
 
 
+def _is_iterable(obj):
+  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
+  try:
+    iter(obj)
+    return True
+  except TypeError:
+    return False
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -288,9 +298,9 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
       host_calls['host_call'] = host_call
     _OutfeedHostCall.validate(host_calls)
 
-    training_hooks = list(training_hooks or [])
-    evaluation_hooks = list(evaluation_hooks or [])
-    prediction_hooks = list(prediction_hooks or [])
+    training_hooks = tuple(training_hooks or [])
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    prediction_hooks = tuple(prediction_hooks or [])
 
     for hook in training_hooks + evaluation_hooks + prediction_hooks:
       if not isinstance(hook, session_run_hook.SessionRunHook):
@@ -325,7 +335,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
     hooks = None
     if self.host_call is not None:
       hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    hooks = list(hooks or [])
+    hooks = tuple(hooks or [])
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
         mode=self.mode,
@@ -1317,9 +1327,15 @@ class _ModelFnWrapper(object):
 
       captured_training_hooks.capture(estimator_spec.training_hooks)
 
+      tracing_ops = []
+      if tensor_tracer.TensorTracer.is_enabled():
+        tt = tensor_tracer.TensorTracer()
+        loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
+                                         self._ctx.num_replicas)
+
       # We must run train_op to update the variables prior to running the
       # outfeed.
-      with ops.control_dependencies([train_op]):
+      with ops.control_dependencies([train_op]+tracing_ops):
         host_call_outfeed_ops = []
         if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
             and estimator_spec.host_call is not None):
@@ -2250,8 +2266,7 @@ class TPUEstimator(estimator_lib.Estimator):
         # Only fetching `tpu_tensors_on_cpu` does not trigger
         # TPU computation and blocks, so we add the control dependency here.
         control_inputs = (
-            tpu_tensors_on_cpu if isinstance(tpu_tensors_on_cpu,
-                                             (list, tuple)) else
+            tpu_tensors_on_cpu if _is_iterable(tpu_tensors_on_cpu) else
             (tpu_tensors_on_cpu,))
         with ops.control_dependencies(control_inputs):
           new_tensors.append(array_ops.identity(t))
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index e75a09492ec12b95bad32b221a8e78a1b79f3a6b..cf36103277de2e3b055ae89c66b198fb55bb4522 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -26,7 +26,6 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
-from tensorflow.compiler.xla.python_api import xla_shape
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_sharding
@@ -92,8 +91,7 @@ class InfeedQueue(object):
       else:
         raise ValueError(
             "number of tuple elements cannot be inferred from InfeedQueue "
-            "constructor"
-        )
+            "constructor")
     if number_of_tuple_elements <= 0:
       raise ValueError("number_of_tuple_elements %d must be > 0" %
                        number_of_tuple_elements)
@@ -293,9 +291,8 @@ class InfeedQueue(object):
         self.number_of_tuple_elements
     """
     if len(input_tensors) != self.number_of_tuple_elements:
-      raise ValueError(
-          "input_tensors is %s, but should be a list of %d Tensors", (
-              str(input_tensors), self.number_of_tuple_elements))
+      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
+                       % (str(input_tensors), self.number_of_tuple_elements))
     self.set_tuple_shapes([t.shape for t in input_tensors])
     self.set_tuple_types([t.dtype for t in input_tensors])
 
@@ -451,8 +448,8 @@ class InfeedQueue(object):
       for i in xrange(1, self.number_of_tuple_elements):
         if devices[0] != devices[i]:
           raise ValueError(
-              "input devices for shard %d are %s, but should all be the same",
-              index, str(devices))
+              "input devices for shard %d are %s, but should all be the same" %
+              (index, str(devices)))
       with ops.colocate_with(inputs[0]):
         return tpu_ops.infeed_enqueue_tuple(
             inputs=inputs,
@@ -792,18 +789,14 @@ class _PartitionedInfeedQueue(InfeedQueue):
 
     Args:
       tensor: Input tensor for partitioning.
-      dims: A list of integer describes how to partition the input tensor.
+      dims: 1-D np.array of the list of integer describes how to partition the
+        input tensor.
 
     Raises:
       ValueError: If the tensor can't be partitioned by dims or the
         num_cores_per_replica doesn't match the number of
         partitions(dims.prod()).
     """
-    if dims is None:
-      return
-
-    dims = np.array(dims)
-
     if (dims < 1).any():
       raise ValueError("All input partition dims must be >= 1.")
 
@@ -823,11 +816,6 @@ class _PartitionedInfeedQueue(InfeedQueue):
           "partition dims = {}).".format(tensor.shape.as_list(), dims))
 
     tensor.shape.assert_is_fully_defined()
-    if (np.array(tensor.shape.as_list()) % dims != 0).any():
-      raise ValueError(
-          "All input partition dims must divide exactly into the `Tensor` "
-          "shape (tensor shape = {}, input partition dims = {}).".format(
-              tensor.shape.as_list(), dims))
 
   def _partition_or_replicate_on_host(self, tensor, dims):
     """Partitions or replicates the input tensor.
@@ -840,16 +828,33 @@ class _PartitionedInfeedQueue(InfeedQueue):
     Returns:
       An iterator of `Tensor`s or a list of partioned tensors.
     """
-    self._check_input_partition_dims(tensor, dims)
     if dims is None:
       return itertools.repeat(tensor)
-    else:
-      output = [tensor]
-      for axis, dim in enumerate(dims):
-        if dim > 1:
-          output = [array_ops.split(x, dim, axis=axis) for x in output]
-          output = nest.flatten(output)
-      return output
+    dims = np.array(dims)
+    self._check_input_partition_dims(tensor, dims)
+    output = [tensor]
+    divds, remainders = np.divmod(np.array(tensor.shape.as_list()), dims)
+    for axis, (divd, remainder, dim) in enumerate(
+        np.dstack((divds, remainders, dims))[0]):
+      if dim <= 1:
+        continue
+      if remainder > 0:
+        # For each dimension, when it cannot be evenly partitioned, XLA assumes
+        # the size of last parts are smaller by 1. E.g. 2D tensor with shape
+        # (5, 14) and dims are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14]
+        # => [[(3, 3), (3, 3), (2, 3), (2, 3)],
+        # [(2, 3), (2, 3), (2, 2), (2, 2)]]
+        output = [
+            array_ops.split(
+                x,
+                num_or_size_splits=[divd + 1] * remainder +
+                [divd] * (dim - remainder),
+                axis=axis) for x in output
+        ]
+      else:
+        output = [array_ops.split(x, dim, axis=axis) for x in output]
+      output = nest.flatten(output)
+    return output
 
   def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims):
     """Tags appropriate XLA sharding attribute to the dequeued tensor.
@@ -866,13 +871,9 @@ class _PartitionedInfeedQueue(InfeedQueue):
     elif np.prod(dims) == 1:
       return xla_sharding.assign_device(tensor, 0)
     else:
-      tile_shape = np.array(tensor.shape.as_list()) // dims
       tile_assignment = np.arange(np.prod(dims)).reshape(dims)
       return xla_sharding.tile(
           tensor=tensor,
-          tile_shape=xla_shape.CreateShapeFromDtypeAndTuple(
-              dtype=np.dtype(tensor.dtype.as_numpy_dtype),
-              shape_tuple=tile_shape),
           tile_assignment=tile_assignment)
 
   def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims):
diff --git a/tensorflow/contrib/util/__init__.py b/tensorflow/contrib/util/__init__.py
index 338acef63f244613cbd14a2da04c7ec4d811a0af..acc5a049aa87649e4f8bf3a00be605616ea7b630 100644
--- a/tensorflow/contrib/util/__init__.py
+++ b/tensorflow/contrib/util/__init__.py
@@ -15,8 +15,6 @@
 
 """Utilities for dealing with Tensors.
 
-See [Contrib Util](https://tensorflow.org/api_guides/python/contrib.util) guide.
-
 @@constant_value
 @@make_tensor_proto
 @@make_ndarray
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index f7c979e86320d59ad033e2b8d7fcdff89ce0d133..9db80f6b5736d849d88e1e41ea467a5ff11844f5 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -1028,7 +1027,10 @@ Status RdmaTensorResponse::PrepareRecvTensor(
     return errors::Aborted(
         "RecvTensor expects a different device incarnation: ",
         parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
-        ". Your worker job was probably restarted. Check your "
+        ". Your worker job (\"",
+        channel_->adapter_->worker_env_->session_mgr->LegacySession()
+            ->worker_name,
+        "\") was probably restarted. Check your "
         "worker job for the reason why it was restarted.");
   }
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index afe4c46c8efc59da3da07777ee1fd38be015753d..2a8c2718edd7faa844d2efb7e7ea007db48d846b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -300,6 +300,7 @@ filegroup(
         "platform/env_time.h",
         "platform/logging.h",
         "platform/macros.h",
+        "platform/platform_strings.h",
         "platform/types.h",
     ],
     visibility = ["//visibility:private"],
@@ -383,6 +384,7 @@ cc_library(
         ":lib_platform",
         ":platform_base",
         "//tensorflow/core/platform/default/build_config:port",
+        "@com_google_absl//absl/base",
         "@snappy",
     ],
 )
@@ -518,6 +520,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "platform_strings",
+    srcs = tf_platform_srcs([
+        "platform/platform_strings.cc",
+        "platform/platform_strings_computed.h",
+    ]),
+    hdrs = [
+        "platform/platform_strings.h",
+    ],
+    visibility = ["//tensorflow/core:__subpackages__"],
+    deps = [":lib"],
+)
+
 filegroup(
     name = "platform_other_hdrs",
     srcs = [
@@ -1037,6 +1052,7 @@ tf_gen_op_libs(
         "batch_ops",
         "bitwise_ops",
         "boosted_trees_ops",
+        "tensor_forest_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
         "collective_ops",
@@ -1057,6 +1073,7 @@ tf_gen_op_libs(
         "logging_ops",
         "manip_ops",
         "math_ops",
+        "mkl_nn_ops",
         "nccl_ops",
         "nn_ops",
         "no_op",
@@ -1185,6 +1202,7 @@ cc_library(
         ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
         ":boosted_trees_ops_op_lib",
+        ":tensor_forest_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
         ":collective_ops_op_lib",
@@ -1229,7 +1247,7 @@ cc_library(
         ":training_ops_op_lib",
         ":user_ops_op_lib",
         ":word2vec_ops",
-    ] + tf_additional_cloud_op_deps(),
+    ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(),
     alwayslink = 1,
 )
 
@@ -1285,7 +1303,9 @@ cc_library(
         ":framework",
         ":lib",
         ":nn_ops_op_lib",
-    ],
+    ] + if_mkl([
+        ":mkl_nn_ops_op_lib",
+    ]),
     alwayslink = 1,
 )
 
@@ -1336,6 +1356,7 @@ cc_library(
         "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
         "//tensorflow/core/kernels:boosted_trees_ops",
+        "//tensorflow/core/kernels:tensor_forest_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
         "//tensorflow/core/kernels:collective_ops",
@@ -1667,6 +1688,7 @@ cc_library(
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -1771,6 +1793,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
@@ -1795,6 +1818,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
@@ -2168,6 +2192,7 @@ cc_library(
             "lib/**/*.cc",
             "platform/*.cc",
             "platform/profile_utils/**/*.cc",
+        ] + [
             "framework/resource_handle.cc",
             "util/env_var.cc",
         ],
@@ -2635,6 +2660,7 @@ tf_cuda_library(
         ":stats_calculator_portable",
         ":version_lib",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
@@ -2811,7 +2837,6 @@ tf_cuda_library(
         ":functional_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:required",
-        ":core_cpu_impl",
     ]),
     alwayslink = 1,
 )
@@ -3048,7 +3073,9 @@ tf_cuda_library(
     ],
     copts = tf_copts(),
     cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()),
-    visibility = ["//visibility:private"],
+    visibility = [
+        "//tensorflow:internal",
+    ],
     deps = [
         ":core_cpu_internal",
         ":lib",
@@ -3402,6 +3429,16 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "platform_strings_test",
+    size = "small",
+    srcs = ["platform/platform_strings_test.cc"],
+    deps = [
+        ":lib",
+        ":platform_strings",
+    ],
+)
+
 tf_cc_test(
     name = "platform_env_test",
     size = "small",
@@ -4080,6 +4117,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:immutable_constant_op",
         "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:topk_op",
         "//third_party/eigen3",
     ],
 )
@@ -4852,6 +4890,7 @@ transitive_hdrs(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:platform_strings",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor",
     ],
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index 6f9885691595368ab50cfe660b1b5c75673063cf..d38a8424eb13009fbf84d7511fb1325085d8b809 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -182,11 +182,14 @@ void TestDeprecationVersionSetCorrectly(
   for (const auto& name_and_api_def : api_defs_map) {
     const auto& name = name_and_api_def.first;
     const auto& api_def = name_and_api_def.second;
-    ASSERT_TRUE(api_def.deprecation_version() == 0 ||
-                api_def.deprecation_message().empty())
-        << "ApiDef that includes deprecation_version > 0 must also specify "
-        << "a deprecation_message. Op " << name
-        << " has deprecation_version > 0 but deprecation_message is not set.";
+    if (api_def.deprecation_version() != 0) {
+      ASSERT_TRUE(api_def.deprecation_version() > 0)
+          << "Found ApiDef with negative deprecation_version";
+      ASSERT_FALSE(api_def.deprecation_message().empty())
+          << "ApiDef that includes deprecation_version > 0 must also specify "
+          << "a deprecation_message. Op " << name
+          << " has deprecation_version > 0 but deprecation_message is not set.";
+    }
   }
 }
 }  // namespace
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
index cdaeb5091c7b407addec2811bbf0cb79e61db2d2..bfaf3d2ea5912bf5fde34a91ec51ad42f66b6adb 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "float_values"
     description: <<END
-float; List of Rank 2 Tensor each containing float values for a single feature.
+float; List of Rank 1 Tensor each containing float values for a single feature.
 END
   }
   in_arg {
@@ -17,7 +17,7 @@ END
   out_arg {
     name: "buckets"
     description: <<END
-int; List of Rank 2 Tensors each containing the bucketized values for a single feature.
+int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
index ca111af312daa6e7696203762cdd979345dc9bcf..e7a3ca3d9fd051a0fc08ef2a02a72bf3f9dcfaca 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "float_values"
     description: <<END
-float; List of Rank 2 Tensors each containing values for a single feature.
+float; List of Rank 1 Tensors each containing values for a single feature.
 END
   }
   in_arg {
@@ -22,8 +22,8 @@ END
   out_arg {
     name: "summaries"
     description: <<END
-float; List of Rank 2 Tensors each containing the quantile summary (value, weight,
-min_rank, max_rank) of a single feature.
+float; List of Rank 2 Tensors each containing the quantile summary
+(value, weight, min_rank, max_rank) of a single feature.
 END
   }
   attr {
@@ -35,6 +35,7 @@ END
   }
   summary: "Makes the summary of quantiles for the batch."
   description: <<END
-An op that takes a list of tensors and outputs the quantile summaries for each tensor.
+An op that takes a list of tensors (one tensor per feature) and outputs the
+quantile summaries for each tensor.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e61e5fa93aae47924dc7d4306f478e2adcfe9d6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceDeserialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "quantile_stream_resource_handle"
+    description: <<END
+resource handle referring to a QuantileStreamResource.
+END
+  }
+  in_arg {
+    name: "bucket_boundaries"
+    description: <<END
+float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+END
+  }
+  attr {
+    name: "num_streams"
+    description: <<END
+inferred int; number of features to get bucket boundaries for.
+END
+  }
+  summary: "Deserialize bucket boundaries and ready flag into current QuantileAccumulator."
+  description: <<END
+An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9619edcac1cce1bf8ab73ab271b647f902539bb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalMapDataset"
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalNonSerializableDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalNonSerializableDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08632aa262a35b0f33bd4bdb82783dc7643c5c6d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalNonSerializableDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalNonSerializableDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
index 4e48d6c169b6641ece5f11d5add478ce25611ee8..0ba2327371a4ba0f5f553815fc9e8c991f62b424 100644
--- a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most
+A complex tensor of the same shape as `input`. The inner-most
   dimension of `input` is replaced with its 1D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
index 555f8e60673d71e43dbb5d4dc17ae345606a2089..c7b780a56f04298bc7906955cb17bc335ec4e8d5 100644
--- a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most 2
+A complex tensor of the same shape as `input`. The inner-most 2
   dimensions of `input` are replaced with their 2D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad82eddb587b40e8ab61dd55aa3dc277aefd03d5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "FixedLengthRecordDatasetV2"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
index b793c99cf74408305b48dbbf1c9df7b03d09b2f3..c17a84000560e9e14e10326e42e84dd49d924bf2 100644
--- a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most
+A complex tensor of the same shape as `input`. The inner-most
   dimension of `input` is replaced with its inverse 1D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
index 7f38f14308de70fb0ebc229064d010762055c458..7458d233ec8bd385e7976095d0cf89dfa0b36ace 100644
--- a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most 2
+A complex tensor of the same shape as `input`. The inner-most 2
   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_SinkDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SinkDataset.pbtxt
deleted file mode 100644
index b5758ddbfb0542cbbdf85ff278ae8e3ce833403a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SinkDataset.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-op {
-  graph_op_name: "SinkDataset"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  summary: "A placeholder for input pipeline graph optimizations."
-  description: <<END
-A placeholder for input pipeline graph optimizations.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe2ccd9da62db86c2204cad8be7ed0d7588eb47a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestCreateTreeVariable"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be created.
+END
+  }
+  in_arg {
+    name: "tree_config"
+    description: <<END
+Serialized proto string of the boosted_trees.Tree.
+END
+  }
+  summary: "Creates a tree resource and returns a handle to it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43dbcb7b42d3bc72077292a765fe71d6393286ae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeDeserialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be restored.
+END
+  }
+  in_arg {
+    name: "tree_config"
+    description: <<END
+Serialied proto string of the boosted_trees.Tree proto.
+END
+  }
+  summary: "Deserializes a proto into the tree handle"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9c7a67888e21cbc025750bce66a8b85da5f2519
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeIsInitializedOp"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+Whether the tree is initialized.
+END
+  }
+  summary: "Checks whether a tree has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8d92702748299dbf38b187f412ad72920374dfb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "TensorForestTreePredict"
+  visibility: HIDDEN
+  attr {
+    name: "logits_dimension"
+    description: <<END
+Scalar, dimension of the logits.
+END
+  }
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource.
+END
+  }
+  in_arg {
+    name: "dense_features"
+    description: <<END
+Rank 2 dense features tensor.
+END
+  }
+  out_arg {
+    name: "logits"
+    description: <<END
+The logits predictions from the tree for each instance in the batch.
+END
+  }
+  summary: "Output the logits for the given input data"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbf5c51d647ca76e6af49af66c4e732a70d76472
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorForestTreeResourceHandleOp"
+  visibility: HIDDEN
+  summary: "Creates a handle to a TensorForestTreeResource"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aac2afa0f85958012abb336d0c853cc2ad6d2c90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeSerialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be serialized.
+END
+  }
+  out_arg {
+    name: "tree_config"
+    description: <<END
+Serialied proto string of the tree resource.
+END
+  }
+  summary: "Serializes the tree handle to a proto"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b85b0ed6cf59bf69d9e48583ad39666aa21d6c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeSize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource.
+END
+  }
+  out_arg {
+    name: "tree_size"
+    description: <<END
+The size of the tree.
+END
+  }
+  summary: "Get the number of nodes in a tree"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15fc8747af14b4ee139fd5a6781ff6126ab95a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "UnicodeDecodeWithOffsets"
+  in_arg {
+    name: "input"
+    description: <<END
+The text to be decoded. Can have any shape. Note that the output is flattened
+to a vector of char values.
+END
+  }
+  out_arg {
+    name: "row_splits"
+    description: <<END
+A 1D int32 tensor containing the row splits.
+END
+  }
+  out_arg {
+    name: "char_values"
+    description: <<END
+A 1D int32 Tensor containing the decoded codepoints.
+END
+  }
+  out_arg {
+    name: "char_to_byte_starts"
+    description: <<END
+A 1D int32 Tensor containing the byte index in the input string where each
+character in `char_values` starts.
+END
+  }
+  attr {
+    name: "input_encoding"
+    description: <<END
+Text encoding of the input strings. This is any of the encodings supported
+by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD or U+65533.)
+END
+  }
+  attr {
+    name: "replace_control_characters"
+    description: <<END
+Whether to replace the C0 control characters (00-1F) with the
+`replacement_char`. Default is false.
+END
+  }
+  summary: <<END
+Decodes each string in `input` into a sequence of Unicode code points.
+END
+  description: <<END
+The character codepoints for all strings are returned using a single vector
+`char_values`, with strings expanded to characters in row-major order.
+Similarly, the character start byte offsets are returned using a single vector
+`char_to_byte_starts`, with strings expanded in row-major order.
+
+The `row_splits` tensor indicates where the codepoints and start offsets for
+each input string begin and end within the `char_values` and
+`char_to_byte_starts` tensors.  In particular, the values for the `i`th
+string (in row-major order) are stored in the slice
+`[row_splits[i]:row_splits[i+1]]`. Thus:
+
+* `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+  character in the `i`th string (in row-major order).
+* `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+  character in the `i`th string (in row-major order).
+* `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+  string (in row-major order).
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt b/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
index 33110d8c9ec3ff6e8aa2ba094011b6a5b1339058..cf7a56ec782360076a18aa9ab7959e0de4a20987 100644
--- a/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
@@ -1,5 +1,7 @@
 op {
   graph_op_name: "CheckNumerics"
+  deprecation_version: 2
+  deprecation_message: "Use debugging.assert_all_finite instead"
   endpoint {
     name: "debugging.check_numerics"
   }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
index 2ae75d6da222d84245bb2a912942522eb52047bc..1f4bc6d22e3e9aa6e5923bd4fccf6caec322921d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2D"
-  endpoint {
-    name: "nn.conv2d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
index 6f21d8c8802f9a18c9357dbe68d3c65407bff923..1a9d96f3ab184d22ee999f727cb0f8f33e86841d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2DBackpropFilter"
-  endpoint {
-    name: "nn.conv2d_backprop_filter"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
index ea976799cbc73bc9164a15e781a051f03e14275b..1505a307658786b2c9d68263d7b50e87348d5027 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2DBackpropInput"
-  endpoint {
-    name: "nn.conv2d_backprop_input"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
index ba8d178263c94574c0aaac8f1f24fb1424a50275..cb463dd0d8d725ca4851d93e37d1f6b63e4117c8 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv3D"
-  endpoint {
-    name: "nn.conv3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 1da8ee3a25f36a0b44f6458a351854190fe7830f..590b37c95fb2a43e49d5c5ae4dcfe8cc499a4c6d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -1,6 +1,10 @@
 op {
   graph_op_name: "Conv3DBackpropFilterV2"
+  endpoint {
+    name: "nn.conv3d_backprop_filter"
+  }
   endpoint {
     name: "nn.conv3d_backprop_filter_v2"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
index ce65f8172ddfea2ae08750cf37bba8e3e012f5f5..2559a6c80b812475ef5b6ca5d0a0cc35bffc4d4b 100644
--- a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "CropAndResize"
-  endpoint {
-    name: "image.crop_and_resize"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
index 1bb17e548d1cd0ca77d6415b7fa165b1a6b7cae3..e26d029212e3bc421987f6d203b2e6ce5a95c7ac 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -1,6 +1,8 @@
 op {
   graph_op_name: "DepthwiseConv2dNative"
+  deprecation_message: "Use nn.depthwise_conv2d instead"
   endpoint {
     name: "nn.depthwise_conv2d_native"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
index 6f9df4b1a11459c252f2961fb1caacaad64021ae..01c4a50ca6fa31f65feb9d5a65fbf105525772e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -2,5 +2,10 @@ op {
   graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
   endpoint {
     name: "nn.depthwise_conv2d_native_backprop_filter"
+    deprecated: true
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "nn.depthwise_conv2d_backprop_filter"
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
index 0bd72539e932f597e86f63ef52519652f0e8efd7..f32aa8a69f24db4abc3f8e1aef514ee84d73c23f 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -2,5 +2,10 @@ op {
   graph_op_name: "DepthwiseConv2dNativeBackpropInput"
   endpoint {
     name: "nn.depthwise_conv2d_native_backprop_input"
+    deprecated: true
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "nn.depthwise_conv2d_backprop_input"
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
index 0bd8b1c11aa15b49f45960abfa43ca1c7e947c49..17921dea4d5e19ef960100a72709a2311da66f3d 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "ExtractImagePatches"
-  endpoint {
-    name: "image.extract_image_patches"
-  }
-  endpoint {
-    name: "extract_image_patches"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
index 7f4a2add4e713133a7042540467db3d9e08795a8..33f87caa38c38a7522e43104276b033a6ea5609a 100644
--- a/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "FFT"
+  endpoint {
+    name: "signal.fft"
+  }
   endpoint {
     name: "spectral.fft"
+    deprecation_version: 2
   }
   endpoint {
     name: "fft"
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
index 9ed1341dfe2d0c4f57e0fa3c2d14378bce452be3..2273a757898bcd4c3b10fbee3bac272396bfb092 100644
--- a/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "FFT2D"
+  endpoint {
+    name: "signal.fft2d"
+  }
   endpoint {
     name: "spectral.fft2d"
+    deprecation_version: 2
   }
   endpoint {
     name: "fft2d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
index 5a4e1d6adf9b9c2bf68c6375de6aebfdfcf5bfb3..6a43b86e3d388c3aca752d7d61413bce1d2f4989 100644
--- a/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "FFT3D"
+  endpoint {
+    name: "signal.fft3d"
+  }
   endpoint {
     name: "spectral.fft3d"
+    deprecation_version: 2
   }
   endpoint {
     name: "fft3d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..def9f85e02d9d34412ed42d7774d77e8b6a328e0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordDatasetV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
index 16ed9b56f2b662b6cca44f5c955e579c2f9d7971..cbe87777a7fec7557b5153df8cd7689f22aa961e 100644
--- a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "FractionalAvgPool"
-  endpoint {
-    name: "nn.fractional_avg_pool"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
index 695559520805abd02e0575f7f85937d00f0dc5fd..02470b43454cdcb44ee624ecab4486fa36caa7da 100644
--- a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "FractionalMaxPool"
-  endpoint {
-    name: "nn.fractional_max_pool"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
index 0124721e1cb18524c5dabcf9f3b7a1fc34c68a10..68ef4833949f37384ff24662fe204903608fefd6 100644
--- a/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IFFT"
+  endpoint {
+    name: "signal.ifft"
+  }
   endpoint {
     name: "spectral.ifft"
+    deprecation_version: 2
   }
   endpoint {
     name: "ifft"
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
index d6b36a314b8d8a197651ee3c68b1376a9bbed669..47fb6fa191f68f75e09846b6b26479cf46505eac 100644
--- a/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "IFFT2D"
+  endpoint {
+    name: "signal.ifft2d"
+  }
   endpoint {
     name: "spectral.ifft2d"
+    deprecation_version: 2
   }
   endpoint {
     name: "ifft2d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
index 6def5b36da17766c5342703fcefe2b377028f330..aff598314b21bba23883d4fffdeecdc2096099eb 100644
--- a/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "IFFT3D"
+  endpoint {
+    name: "signal.ifft3d"
+  }
   endpoint {
     name: "spectral.ifft3d"
+    deprecation_version: 2
   }
   endpoint {
     name: "ifft3d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
index 91160bd8bfa7760c4529c028df178755d35c49db..ccd736a483ef3e927e270a33639f6f38856312b8 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IsFinite"
+  endpoint {
+    name: "math.is_finite"
+  }
   endpoint {
     name: "debugging.is_finite"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_finite"
diff --git a/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
index 7f029ee8cf0c7cd85a2bf75f9302469dd8174deb..3cbfb7317c1383db74317080d1dfe93628aab3b4 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IsInf"
+  endpoint {
+    name: "math.is_inf"
+  }
   endpoint {
     name: "debugging.is_inf"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_inf"
diff --git a/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
index f2b8862c28d4968289f5d0c6a2a85d9cf632487d..b01536664e5111217c7d1e5fb415c8e791cbaa34 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IsNan"
+  endpoint {
+    name: "math.is_nan"
+  }
   endpoint {
     name: "debugging.is_nan"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_nan"
diff --git a/tensorflow/core/api_def/python_api/api_def_PlaceholderWithDefault.pbtxt b/tensorflow/core/api_def/python_api/api_def_PlaceholderWithDefault.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1a722c1036db085968c911ebcb697b0aeed8d55
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PlaceholderWithDefault.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PlaceholderWithDefault"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
index dfa793a16e18ab30891bcb9a997d7bed02410e54..6aceba3b1188919d4b0318f560ed32921e823343 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedAvgPool"
   endpoint {
     name: "nn.quantized_avg_pool"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
index 2409d12abeff922cca92f9ae609764a27f651356..4b5a04f45ef014ad328fea26e613f227d1821e71 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedConv2D"
   endpoint {
     name: "nn.quantized_conv2d"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
index 3a58590f5773a3d886ace95108ee63a659362de2..cd1c7fdbf22ec746a080566b20daa7b100e5cb65 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedMaxPool"
   endpoint {
     name: "nn.quantized_max_pool"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
index 926ec98eeb468e7fa4846ae013a112cc865bb82c..d83d71c65cabf7a00d65c9dc87c6465f7c1ae9f5 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedReluX"
   endpoint {
     name: "nn.quantized_relu_x"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
index 2f1b4aee00d90221d659daa34a7eb3462f42fa0c..e1a1f883d8ba6850f429ca5ebc8ab89789a2df90 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeArea"
-  endpoint {
-    name: "image.resize_area"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
index 3ec8e0ad6359307eab1b166801474817d8c5282b..e0bec8c116db961f873e1aa961d32d9422311696 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeBicubic"
-  endpoint {
-    name: "image.resize_bicubic"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
index eb3b8d6f458fff6163932457ef6c73a8fbbd721e..6121c1128c9060914723beb9d056d51a212b54bc 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeBilinear"
-  endpoint {
-    name: "image.resize_bilinear"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
index 25c5d5701feefd6f8270236f29e1c187fa3cf06a..0e86e4ce3ea33515947eae08705d5ea6c6860faa 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeNearestNeighbor"
-  endpoint {
-    name: "image.resize_nearest_neighbor"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SerializeTensor.pbtxt b/tensorflow/core/api_def/python_api/api_def_SerializeTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..219b125da3ddf97bd5b1eca0adb5660362f08a03
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SerializeTensor.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "SerializeTensor"
+  endpoint {
+    name: "io.serialize_tensor"
+  }
+  endpoint {
+    name: "serialize_tensor"
+    deprecation_version: 2
+  }
+}
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index f90fb174344d4bb588578f749de9cd71b7c7359d..624d3f228982e0828fce102dec697747158f69c0 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -511,7 +511,7 @@ void CollectiveParamResolverLocal::FindInstanceRec(
         if (irec->is_init) {
           exit_outside_locks = true;
         } else {
-          irec->init_waiters.push_back([this, gr, cp, done](InstanceRec* irec) {
+          irec->init_waiters.push_back([this, done](InstanceRec* irec) {
             CallbackWithStatus(done, irec);
           });
           return;
@@ -708,7 +708,16 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
       return;
     }
     CHECK_EQ(ir->known_count, ir->shared.group.group_size);
-    CHECK_GE(ir->source_rank, 0);
+    if (ir->source_rank < 0) {
+      // NOTE(ayushd): changing the error message below would also require
+      // updating CompleteParamsBroadcastForgotSend test in
+      // CollectiveParamResolverLocalTest.
+      ir->status =
+          errors::Internal("Instance ", cp->instance.instance_key,
+                           " found no source for broadcast.  This "
+                           "could mean that there were group_size=",
+                           ir->known_count, " BcastRecvs but no BcastSend.");
+    }
     if (!ir->known_waiters.empty()) {
       ready_waiters = std::move(ir->known_waiters);
     }
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 2b43adbac693592fabac283426cdfd3cc4e55d9f..9a501b329818938f8fde828d73daecb8a0a46b5e 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -200,28 +200,35 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
   }
 }
 
+void InitializeCollectiveParamsForBroadcast(int instance_key, int device_idx,
+                                            bool is_source,
+                                            CollectiveParams* cp) {
+  cp->group.group_key = 1;
+  cp->group.group_size = 3;
+  cp->group.device_type = DeviceType("CPU");
+  cp->group.num_tasks = 1;
+  cp->instance.instance_key = instance_key;
+  cp->instance.type = BROADCAST_COLLECTIVE;
+  cp->instance.data_type = DataType(DT_FLOAT);
+  cp->instance.shape = TensorShape({5});
+  cp->instance.device_names.push_back(strings::StrCat(
+      "/job:localhost/replica:0/task:0/device:CPU:", device_idx));
+  cp->instance.impl_details.subdiv_offsets.push_back(0);
+  cp->is_source = is_source;
+}
+
 TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
+  constexpr int kInstanceKey = 5;
   CollectiveParams cps[NUM_DEVS];
   Status statuses[NUM_DEVS];
   Notification note[NUM_DEVS];
   for (int i = 0; i < NUM_DEVS; ++i) {
     CollectiveParams* cp = &cps[i];
-    cp->group.group_key = 1;
-    cp->group.group_size = 3;
-    cp->group.device_type = DeviceType("CPU");
-    cp->group.num_tasks = 1;
-    cp->instance.instance_key = 3;
-    cp->instance.type = BROADCAST_COLLECTIVE;
-    cp->instance.data_type = DataType(DT_FLOAT);
-    cp->instance.shape = TensorShape({5});
-    cp->instance.device_names.push_back(
-        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
-    cp->instance.impl_details.subdiv_offsets.push_back(0);
-    cp->is_source = (i == 1);
+    InitializeCollectiveParamsForBroadcast(kInstanceKey, i, i == 1, cp);
     Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
       prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
                                 nullptr /*CancellationManager*/,
-                                [this, &statuses, &note, i](const Status& s) {
+                                [&statuses, &note, i](const Status& s) {
                                   statuses[i] = s;
                                   note[i].Notify();
                                 });
@@ -245,4 +252,38 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
   }
 }
 
+// If we don't mark any participant in a broadcast as the source, we essentially
+// create a collective group with only broadcast recvs.  In that case, we should
+// get an internal error from param resolution.
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcastForgotSender) {
+  constexpr int kInstanceKey = 8;
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    InitializeCollectiveParamsForBroadcast(kInstanceKey, i, false, cp);
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [&statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    EXPECT_EQ(statuses[i].code(), error::INTERNAL);
+    EXPECT_EQ(statuses[i].error_message(),
+              strings::StrCat(
+                  "Instance ", kInstanceKey,
+                  " found no source for broadcast.  This could mean that there"
+                  " were group_size=",
+                  NUM_DEVS, " BcastRecvs but no BcastSend."));
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
index 288ae9d794a2547d7837e1311e71c4681236704a..d99565b49abde95ca2fa28293771970b19620dd5 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -38,7 +38,7 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
     return;
   }
   buf_rendezvous_.ConsumeBuf(
-      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr,
+      key, [to_tensor, to_device_ctx, to_device, to_alloc_attr,
             dev_to_dev_stream_index,
             done](const Status& s, BufRendezvous::Hook* hook) {
         if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 6d5c7f951e36d9d1fa728a5ad5463dad59a550ac..5c226ec56e13fbb398d852ff6287910d2347785e 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -471,10 +471,10 @@ bool ReplaceTensorWithConstant(
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) Do not replace another constant.
-  // 2) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
-  // constraint, do not replace it.
-  // 3) If the destination tensor is an int32 tensor, and has DEVICE_MEMORY
-  // constraint, do not replace it.
+  // 2) If the destination tensor or any other tensor from the same node is not
+  // an int32 tensor, and has HOST_MEMORY constraint, do not replace it.
+  // 3) If the destination tensor or any other tensor from the same node is an
+  // int32 tensor, and has DEVICE_MEMORY constraint, do not replace it.
   // 4) If the size of the constant in bytes is too large (>
   // max_constant_in_bytes), do not replace it. This prevents the size of the
   // Graph from growing too large.
@@ -490,16 +490,20 @@ bool ReplaceTensorWithConstant(
                                ? DeviceType{partition_device->device_type()}
                                : DEVICE_CPU;
   if (partition_device && device_type != DEVICE_CPU) {
-    MemoryType memory_type;
-    if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
-                             &memory_type)
+    MemoryTypeVector input_mvec;
+    MemoryTypeVector output_mvec;
+    if (!MemoryTypesForNode(graph->op_registry(), device_type,
+                            tensor.first->def(), &input_mvec, &output_mvec)
              .ok()) {
       return false;
     }
-    bool is_int32 = tensor.first->output_type(tensor.second) == DT_INT32;
-    if ((memory_type == HOST_MEMORY && !is_int32) ||
-        (memory_type == DEVICE_MEMORY && is_int32)) {
-      return false;
+    for (int i = 0; i < output_mvec.size(); i++) {
+      MemoryType memory_type = output_mvec[i];
+      bool is_int32 = tensor.first->output_type(i) == DT_INT32;
+      if ((memory_type == HOST_MEMORY && !is_int32) ||
+          (memory_type == DEVICE_MEMORY && is_int32)) {
+        return false;
+      }
     }
   }
   if (constant.TotalBytes() > max_constant_size_in_bytes) {
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 98aefcde27fc9589b09cfb8af6a1e8734e13af24..1d4586f3da84f0beabe440dca51105826feb197c 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -18,13 +18,16 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/core/common_runtime/constant_folding.h"
 
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -90,6 +93,24 @@ class ConstantFoldingTest : public ::testing::Test {
   }
 };
 
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
+
+  static std::unique_ptr<Device> Make(const string& name, const string& type) {
+    DeviceAttributes device_attributes;
+    device_attributes.set_name(name);
+    device_attributes.set_device_type(DeviceType(type).type());
+    return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+  }
+};
+
 TEST_F(ConstantFoldingTest, Basic) {
   Scope s = Scope::NewRootScope();
   BuildSimpleGraph(&s);
@@ -610,6 +631,31 @@ TEST_F(ConstantFoldingTest, ConstShapeKnown) {
   }
 }
 
+TEST_F(ConstantFoldingTest, NoReplacePartialOutput) {
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope().ExitOnError().WithAssignedDevice("/gpu:0");
+
+    auto c0 = ops::Const<float>(s.WithOpName("c0"), {5.0, 2.0, 8.0, 1.0}, {4});
+    auto k = ops::Const<int>(s.WithOpName("k"), 3);
+    auto topK =
+        ops::TopK(s.WithOpName("topK"), c0, k, ops::TopK::Sorted(false));
+    auto send_values = ops::_Send(s.WithOpName("send_values"), topK.values,
+                                  "send_values", "sender", 0, "receiver");
+    auto send_indices = ops::_Send(s.WithOpName("send_indices"), topK.indices,
+                                   "send_indices", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+  bool was_mutated;
+  TF_EXPECT_OK(ConstantFold(
+      ConstantFoldingOptions{}, nullptr, Env::Default(),
+      FakeDevice::Make("/job:tpu_worker/replica:0/task:0/device:GPU:0",
+                       DEVICE_GPU)
+          .get(),
+      &g, &was_mutated));
+  EXPECT_FALSE(was_mutated);
+}
+
 namespace {
 
 const char kTestMemRegionName[] = "test://test";
diff --git a/tensorflow/core/common_runtime/device.cc b/tensorflow/core/common_runtime/device.cc
index 8fc64fff69a6252ed9860f8dcb75814cfd0785ff..9925814a48acf19162a39f07666a909db56e39e4 100644
--- a/tensorflow/core/common_runtime/device.cc
+++ b/tensorflow/core/common_runtime/device.cc
@@ -36,6 +36,8 @@ Device::~Device() {
   }
 }
 
+void Device::Sync(const DoneCallback& done) { done(Sync()); }
+
 // static
 DeviceAttributes Device::BuildDeviceAttributes(
     const string& name, DeviceType device, Bytes memory_limit,
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 2ef1547cd9a56de0750eac1583568a06720acb99..8dfbb21eda641ff9f70c58f1f4bf150ba4cceef3 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -55,6 +55,9 @@ class DeviceMgr;
 
 class Device : public DeviceBase {
  public:
+  // Callback type that takes a Status and returns void.
+  typedef std::function<void(const Status&)> DoneCallback;
+
   Device(Env* env, const DeviceAttributes& device_attributes);
   ~Device() override;
 
@@ -112,6 +115,13 @@ class Device : public DeviceBase {
   // at completion.
   virtual Status Sync() = 0;
 
+  // Calls the given callback when all operations queued on the device at the
+  // time of the call have completed. The callback is passed any error pending
+  // on the device at completion.
+  // TODO(b/112409994): Consolidate these two APIs, removing the synchronous
+  // version.
+  virtual void Sync(const DoneCallback& done);
+
   // Override this to return true for devices that require a Sync() call before
   // session completion.
   virtual bool RequiresSyncOnCompletion() const { return false; }
diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index b43c718817558f0e44eff5f5e5d5ec3a81d25ddd..b94900114c580e4a7ee68c1175d0b91a1abd5df6 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -127,7 +127,12 @@ Device* DeviceFactory::NewDevice(const string& type,
   (*opt.config.mutable_device_count())[type] = 1;
   std::vector<Device*> devices;
   TF_CHECK_OK(device_factory->CreateDevices(opt, name_prefix, &devices));
-  CHECK_EQ(devices.size(), size_t{1});
+  int expected_num_devices = 1;
+  auto iter = options.config.device_count().find(type);
+  if (iter != options.config.device_count().end()) {
+    expected_num_devices = iter->second;
+  }
+  DCHECK_EQ(devices.size(), static_cast<size_t>(expected_num_devices));
   return devices[0];
 }
 
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 52c1cd269189c6fdc629397376b260f95a457aa1..40b7071f40a20ea7f99f983173e49664124cef7f 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -253,11 +253,19 @@ static RunHandlerPool* GetOrCreateRunHandlerPool(
   return pool;
 }
 
-bool DirectSession::ShouldUseRunHandlerPool() const {
-  if (options_.config.session_inter_op_thread_pool_size() > 0 ||
-      options_.config.use_per_session_threads()) {
+bool DirectSession::ShouldUseRunHandlerPool(
+    const RunOptions& run_options) const {
+  if (options_.config.use_per_session_threads()) return false;
+  if (options_.config.session_inter_op_thread_pool_size() > 0 &&
+      run_options.inter_op_thread_pool() > 0)
     return false;
-  }
+  // Only use RunHandlerPool when:
+  // a. Single global thread pool is used for inter-op parallelism.
+  // b. When multiple inter_op_thread_pool(s) are created, use it only while
+  // running sessions on the default inter_op_thread_pool=0. Typically,
+  // servo-team uses inter_op_thread_pool > 0 for model loading.
+  // TODO(crk): Revisit whether we'd want to create one (static) RunHandlerPool
+  // per entry in session_inter_op_thread_pool() in the future.
   return true;
 }
 
@@ -603,9 +611,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   }
 
   std::unique_ptr<RunHandler> handler;
-  if (ShouldUseRunHandlerPool() &&
+  if (ShouldUseRunHandlerPool(run_options) &&
       run_options.experimental().use_run_handler_pool()) {
-    // Non-null only when a global inter-op pool is used.
     VLOG(1) << "Using RunHandler to scheduler inter-op closures.";
     handler = GetOrCreateRunHandlerPool(options_)->Get();
   }
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 3a168bbe3fcb08167465ab75a155e2d2b4038046..6754e9cfb71700090049107cf4dd122175527ffe 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -247,8 +247,10 @@ class DirectSession : public Session {
                                    ExecutorsAndKeys* executors_and_keys,
                                    RunMetadata* run_metadata);
 
-  // Returns whether inter-op execution uses a global pool.
-  bool ShouldUseRunHandlerPool() const;
+  // Returns whether inter-op execution uses a global pool or the input
+  // `run_options` requests being run on inter_op_thread_pool = 0 in case
+  // multiple pools are configured.
+  bool ShouldUseRunHandlerPool(const RunOptions& run_options) const;
 
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 2c63b8704ee1d08d643c9e90940c3897fbb1358b..6a265c468c1fe617d38e539fac20fb0cba294afe 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -107,26 +107,20 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim_size());
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
+        // if MKL is used, it goes through additional
+        // graph rewrite pass on top of Tensorflow.
+        // In TF, every time a graph pass
+        // happens, "constant" nodes are allocated
+        // and deallocated. Each allocation calls the
+        // (FindChunkPtr of BFCAllocator),
+        // which increments the value of AllocationId.
+        // Thus AllocationId of MKL can differ with TF if
+        // someone changes the relevant codes in BFCAllocator.
+        // Currently they are the same.
         if (node->name() == y->name()) {
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
-          // if MKL is used, it goes through various additional
-          // graph rewrite pass. In TF, everytime a graph pass
-          // happens, "constant" nodes are allocated
-          // and deallocated. Each allocation calls the
-          // (FindChunkPtr of BFCAllocator),
-          // which increments the value of AllocationId.
-          // Thus AllocationId becomes more than TF if MKL
-          // is used. Now IDs for MKL are 8 more than TF.
-          EXPECT_EQ(21, cm->AllocationId(node, 0));
-#else
           EXPECT_EQ(13, cm->AllocationId(node, 0));
-#endif  // INTEL_MKL && ENABLE_MKL
         } else {
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
-          EXPECT_EQ(22, cm->AllocationId(node, 0));
-#else
           EXPECT_EQ(14, cm->AllocationId(node, 0));
-#endif  // INTEL_MKL && ENABLE_MKL
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index 29edc4e3b8f5e456c5b6e420487252235c67ae4e..aae3392d0e64319cdd539904b2271df1598921b3 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -39,6 +39,18 @@ std::unordered_map<string, const AttrTypeMap*>* OpNameToAttrTypeMap() {
 
 const uint32 kIsList = 1U << 31;
 
+AttrTypeMap* DefaultFunctionAttrTypeMap() {
+  AttrTypeMap* map = new AttrTypeMap();
+  (*map)["executor_type"] = TF_ATTR_STRING;
+  (*map)["config"] = TF_ATTR_STRING;
+  return map;
+}
+
+const AttrTypeMap* GetDefaultFunctionAttrTypeMap() {
+  static const AttrTypeMap* map = DefaultFunctionAttrTypeMap();
+  return map;
+}
+
 }  // namespace
 
 Status OpDefForOp(const char* op_name, const OpDef** op_def) {
@@ -50,13 +62,27 @@ Status OpDefForOp(const char* op_name, const OpDef** op_def) {
   return s;
 }
 
-Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
+Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
+                        bool* is_function) {
   mutex_lock l(g_op_name_to_attr_type_map_lock);
+  *is_function = false;
   *out = gtl::FindPtrOrNull(*OpNameToAttrTypeMap(), op_name);
   if (*out != nullptr) return Status::OK();
   const OpDef* op_def = nullptr;
   Status s = OpDefForOp(op_name, &op_def);
-  if (!s.ok()) return s;
+  if (errors::IsNotFound(s)) {
+    // If we did not find the op def, we assume `op_name` is a function.
+    // If it is actually a misspelled op, user will get another error when
+    // trying to run it.
+    // TODO(iga): If we ever have a use case for different attribute specs
+    // in different functions, we will need to look at the OpDef in the
+    // function def to retrieve their types.
+    *out = GetDefaultFunctionAttrTypeMap();
+    *is_function = true;
+    return Status::OK();
+  } else if (!s.ok()) {
+    return s;
+  }
   std::unique_ptr<AttrTypeMap> m(new AttrTypeMap);
   // TODO(agarwal): Avoid having to create this "registry" at runtime,
   // perhaps can be done at op registration time?
@@ -184,7 +210,7 @@ namespace {
 inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
                                                const tensorflow::Fprint128& b) {
   return {tensorflow::FingerprintCat64(a.low64, b.low64),
-          tensorflow::FingerprintCat64(a.low64, b.low64)};
+          tensorflow::FingerprintCat64(a.high64, b.high64)};
 }
 
 void CombineUnordered(const tensorflow::Fprint128& a,
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index af5b7d80c324d986102ec66b750644e203c92d83..41dd275a668d2694397ec415cf05ddca03b258dc 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -43,7 +43,11 @@ typedef std::unordered_map<string, uint32> AttrTypeMap;
 Status OpDefForOp(const char* op_name, const OpDef** op_def);
 
 // Returns the AttrTypeMap for the TensorFlow operation named op_name.
-Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
+// If op_name is not registered in global op registry, AttrTypeMapForOp assumes
+// the op to be a function and returns the default attributes for a function.
+// `is_function` is set to true in this case.
+Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
+                        bool* is_function);
 
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 79b094f2e008786661b0236bc7bcdb3f37a23946..220cc6f5ce0bff32cfdc8d4e837c6900c773728e 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -35,9 +35,18 @@ namespace {
 
 TEST(AttrTypeMap, Lookup) {
   const AttrTypeMap* m = nullptr;
-  Status s = AttrTypeMapForOp("ThisOpCannotPossiblyExist", &m);
-  EXPECT_FALSE(s.ok());
-  s = AttrTypeMapForOp("MatMul", &m);
+  // Unknown ops are assumed to be functions.
+  // Their maps are filled with default attributes.
+  bool is_function = false;
+  Status s = AttrTypeMapForOp("SomeFunctionName", &m, &is_function);
+  EXPECT_TRUE(s.ok());
+  EXPECT_TRUE(is_function);
+  EXPECT_EQ(TF_ATTR_STRING, m->find("executor_type")->second);
+  EXPECT_EQ(TF_ATTR_STRING, m->find("config")->second);
+
+  is_function = true;
+  s = AttrTypeMapForOp("MatMul", &m, &is_function);
+  EXPECT_FALSE(is_function);
   ASSERT_TRUE(s.ok()) << s;
 
   TF_AttrType t;
@@ -50,7 +59,7 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_EQ(TF_ATTR_BOOL, t);
   EXPECT_EQ(is_list, 0);
 
-  s = AttrTypeMapForOp("Squeeze", &m);
+  s = AttrTypeMapForOp("Squeeze", &m, &is_function);
   ASSERT_TRUE(s.ok()) << s;
   s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index f23cefb33d755dc4dfba9b67d1e8963f8198bd21..583ae64edd16af7b86c4a2c9f708f0d3d0b8c843 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -32,18 +32,6 @@ bool ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val) {
   return default_val;
 }
 
-std::unique_ptr<thread::ThreadPool> EagerThreadPool(
-    const SessionOptions& opts) {
-  SessionOptions opts_copy(opts);
-  if (opts_copy.config.inter_op_parallelism_threads() == 0) {
-    // Eager defaults to a single thread when no threads are specified.
-    opts_copy.config.set_inter_op_parallelism_threads(1);
-  }
-
-  return std::unique_ptr<thread::ThreadPool>(
-      NewThreadPoolFromSessionOptions(opts_copy));
-}
-
 }  // namespace
 
 EagerContext::EagerContext(const SessionOptions& opts,
@@ -61,7 +49,7 @@ EagerContext::EagerContext(const SessionOptions& opts,
     : policy_(default_policy),
       devices_(device_mgr->ListDevices()),
       rendezvous_(rendezvous),
-      thread_pool_(EagerThreadPool(opts)),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
           device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_, {},
           thread_pool_.get())),
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 4de807bde31ec14b1571948a91fe2f930d50f427..51109f8f1ae67cf1a64e6c520dd063744cf8abce 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -206,6 +206,8 @@ class EagerContext {
   bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
   bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; }
 
+  tensorflow::Env* TFEnv() const { return env_; }
+
  private:
   void InitDeviceMapAndAsync();
   Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef);
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
index 8a887540b066055fc1f59e64e0cead9f2512178e..5bc3bb689e076467672af85d28bb340b56e7ee79 100644
--- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -30,7 +30,7 @@ class CopyToDeviceNode : public EagerNode {
         src_(src),
         dstd_(dstd),
         ctx_(ctx),
-        dst_(new TensorHandle(id, src_->dtype, ctx)) {
+        dst_(new TensorHandle(id, dstd_, dstd_, src->dtype, ctx)) {
     src_->Ref();
     dst_->Ref();
   }
@@ -44,13 +44,11 @@ class CopyToDeviceNode : public EagerNode {
     TensorHandle* temp = nullptr;
     TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &temp));
     const Tensor* tensor = nullptr;
-    Device* device = nullptr;
-    Device* op_device = nullptr;
-    Status status = temp->TensorAndDevice(&tensor, &device, &op_device);
+    Status status = temp->Tensor(&tensor);
     // `temp` is a ready handle. So the following call should return OK.
     TF_DCHECK_OK(status) << status.error_message();
     DCHECK(tensor);
-    dst_->SetTensorAndDevice(*tensor, device, op_device);
+    dst_->SetTensor(*tensor);
     temp->Unref();
     return Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index fcf62c7715320466a49c707e31cf7a5045f16b8e..935ca7f9aa766a69582b4c94fec6c508e3f5a369 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -22,11 +22,14 @@ limitations under the License.
 namespace tensorflow {
 class EagerOperation {
  public:
-  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
-  // instead of a primitive operation.
   EagerOperation(tensorflow::EagerContext* ctx, const char* op,
-                 const tensorflow::AttrTypeMap* t)
-      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+                 bool is_function, const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx),
+        name_(op),
+        attrs_(op),
+        attr_types_(t),
+        device_(nullptr),
+        is_function_(is_function) {}
 
   ~EagerOperation() {
     for (tensorflow::TensorHandle* h : inputs_) {
@@ -34,7 +37,7 @@ class EagerOperation {
     }
   }
 
-  bool is_function() const { return attr_types_ == nullptr; }
+  bool is_function() const { return is_function_; }
 
   tensorflow::EagerContext* EagerContext() { return ctx_; }
 
@@ -68,6 +71,7 @@ class EagerOperation {
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
   tensorflow::Device* device_;
   bool use_xla_ = false;
+  const bool is_function_;
 };
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 53f0ba1c818bf95792ca385a33a6828641aa14e2..5bf7888fad5043ac9a02f0d9e2fc4362d6567661 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/lib/core/errors.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h"
@@ -85,8 +86,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
                                       RunMetadata* run_metadata,
                                       TensorHandle** handle) {
   EagerContext* ctx = op->EagerContext();
-  Device* handle_device = nullptr;
-  TF_RETURN_IF_ERROR((*handle)->Device(&handle_device));
+  Device* handle_device = (*handle)->device();
   const Device* actual_device =
       handle_device == nullptr ? ctx->HostCPU() : handle_device;
   const Device* op_device =
@@ -193,7 +193,7 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
 }
 
 Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
-  DeviceTypeVector final_devices;
+  PrioritizedDeviceTypeVector final_devices;
   TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
       ctx->prioritized_device_type_list(), ndef, &final_devices));
   if (final_devices.empty()) {
@@ -203,7 +203,7 @@ Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
                             " :\n", KernelsRegisteredForOp(ndef.op()));
   }
   for (Device* d : *ctx->devices()) {
-    if (d->device_type() == final_devices[0].type_string()) {
+    if (d->device_type() == final_devices[0].first.type_string()) {
       *device = d;
       return Status::OK();
     }
@@ -334,7 +334,9 @@ Status EagerLocalExecute(EagerOperation* op,
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
     tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
-      (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx);
+      (*retvals)[i] = new TensorHandle(id, /* d= */ kernel->OutputDevice(i),
+                                       /* op_device= */ kernel->device(),
+                                       output_dtypes[i], ctx);
     }
     EagerNode* node = new ExecuteNode(
         id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
@@ -417,8 +419,7 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
   request.set_op_id(ctx->NextId());
   request.set_device_name(recv_device->name());
 
-  Device* tensor_handle_device;
-  TF_RETURN_IF_ERROR(h->Device(&tensor_handle_device));
+  Device* tensor_handle_device = h->device();
 
   // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence copy
   // it to the CPU before copying it out.
@@ -485,8 +486,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   auto* remote_op = request->add_queue()->mutable_operation();
 
   for (int i = 0; i < op->Inputs().size(); i++) {
-    tensorflow::Device* input_device;
-    TF_RETURN_IF_ERROR(op->Inputs()[i]->Device(&input_device));
+    tensorflow::Device* input_device = op->Inputs()[i]->device();
     if (op->Device() != input_device &&
         // If the expected and actual devices are on the same task, don't
         // explicitly copy, and instead depend on the copy to happen locally
@@ -622,8 +622,7 @@ Status MaybeUpdateOpDevice(EagerOperation* op) {
       ctx->PinSmallOpsToCPU() && IsPinnableOp(op->Name());
 
   for (int i = 0; i < op->Inputs().size(); ++i) {
-    Device* input_op_device = nullptr;
-    TF_RETURN_IF_ERROR(op->Inputs()[i]->OpDevice(&input_op_device));
+    Device* input_op_device = op->Inputs()[i]->op_device();
     VLOG(2) << "for op " << op->Name() << " input " << i << " "
             << DataTypeString(op->Inputs()[i]->dtype) << " "
             << (input_op_device == nullptr ? "cpu" : input_op_device->name())
@@ -767,17 +766,19 @@ Status EagerExecute(EagerContext* ctx, Device* device,
     }
   }
   DCHECK_EQ(num_retvals, outputs.size());
-  Device* op_device = device;
   for (int i = 0; i < num_retvals; ++i) {
-    Device* d = op_device;
-    if (d != nullptr && output_memory_types != nullptr &&
-        (*output_memory_types)[i] == HOST_MEMORY) {
-      d = nullptr;
-    }
     if (retvals[i] == nullptr) {
-      retvals[i] = new TensorHandle(outputs[i], d, op_device, ctx);
+      retvals[i] =
+          new TensorHandle(outputs[i], /* d= */ kernel->OutputDevice(i),
+                           /* op_device= */ device, ctx);
     } else {
-      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
+      // In the async case, the retval is not a nullptr, and its device is
+      // already set since all TensorHandles always have their device set during
+      // construction.
+      DCHECK_EQ(device, retvals[i]->op_device());
+      DCHECK_EQ(kernel->OutputDevice(i), retvals[i]->device());
+
+      retvals[i]->SetTensor(outputs[i]);
     }
   }
   return Status::OK();
@@ -827,8 +828,11 @@ Status ExecuteSend(EagerContext* ctx, tensorflow::Device* device,
                    TensorHandle* h, StringPiece wire_id,
                    const string& recv_device) {
   const tensorflow::AttrTypeMap* types;
-  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Send", &types));
-  tensorflow::EagerOperation op(ctx, "_Send", types);
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(
+      tensorflow::AttrTypeMapForOp("_Send", &types, &is_function));
+  DCHECK(!is_function);
+  tensorflow::EagerOperation op(ctx, "_Send", /*is_function=*/false, types);
 
   op.AddInput(h);
 
@@ -855,8 +859,11 @@ Status ExecuteRecv(EagerContext* ctx, tensorflow::Device* device,
                    const string& send_device, int64 send_device_incarnation,
                    TensorHandle** result) {
   const tensorflow::AttrTypeMap* types;
-  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Recv", &types));
-  tensorflow::EagerOperation op(ctx, "_Recv", types);
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(
+      tensorflow::AttrTypeMapForOp("_Recv", &types, &is_function));
+  DCHECK(!is_function);
+  tensorflow::EagerOperation op(ctx, "_Recv", /*is_function=*/false, types);
 
   op.SetDevice(device);
 
@@ -892,8 +899,7 @@ string GetUniqueWireID() {
 
 Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
                          const char* device_name, TensorHandle** result) {
-  tensorflow::Device* send_device;
-  TF_RETURN_IF_ERROR(h->Device(&send_device));
+  tensorflow::Device* send_device = h->device();
 
   if (send_device == nullptr) {
     send_device = ctx->HostCPU();
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index e1ff45d6dd0aef596bd3a50ea3c91abdc3768acc..192d22dfd5a105a31ab19a33c29ddc83ecd04142 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -131,7 +131,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     outputs->push_back(Tensor(*context.mutable_output(i)));
   }
   if (stats != nullptr) {
-    for (const auto& allocator_pair : context.wrapped_allocators()) {
+    for (const auto& allocator_pair : context.ConsumeWrappedAllocators()) {
       AllocatorMemoryUsed* memory = stats->add_memory();
       memory->set_allocator_name(allocator_pair.first->Name());
       auto sizes = allocator_pair.second->GetSizes();
@@ -156,4 +156,12 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   return Status::OK();
 }
 
+tensorflow::Device* KernelAndDevice::OutputDevice(int idx) const {
+  if (device_ != nullptr &&
+      kernel_->output_memory_types()[idx] == HOST_MEMORY) {
+    return nullptr;
+  }
+  return device_;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 4b0f5182a0e4d2a6a7419ca89c3c84df94e1c26b..52dac94ccca0cc987751400778c3c1c6e95272d6 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -69,6 +69,8 @@ class KernelAndDevice {
              std::vector<Tensor>* outputs, NodeExecStats* stats,
              StepStats* step_stats, GraphCollector* graph_collector);
 
+  Device* OutputDevice(int idx) const;
+
   const OpKernel* kernel() const { return kernel_.get(); }
 
   Device* device() const { return device_; }
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index d58724cbfacf6fec6b097dead53d9bd373cd2e7f..d8d6b7a63b6f7189d4db66846a2f48982a20e610 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -79,20 +79,6 @@ Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
   return Status::OK();
 }
 
-Status TensorHandle::Device(tensorflow::Device** d) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *d = device_;
-  return Status::OK();
-}
-
-Status TensorHandle::OpDevice(tensorflow::Device** d) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *d = op_device_;
-  return Status::OK();
-}
-
 Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
                                      tensorflow::Device** device,
                                      tensorflow::Device** op_device) {
@@ -178,17 +164,12 @@ Status TensorHandle::RemoteAddress(int64* op_id, int32* output_num) {
   return Status::OK();
 }
 
-void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
-                                      tensorflow::Device* device,
-                                      tensorflow::Device* op_device) {
+void TensorHandle::SetTensor(const tensorflow::Tensor& tensor) {
   mutex_lock l(ctx_mutex_);
-  DCHECK(node_id_ > 0 && !is_ready_)
-      << "SetTensorAndDevice should be only called  "
-      << "on non-ready handles.";
+  DCHECK(node_id_ > 0 && !is_ready_) << "SetTensor should be only called  "
+                                     << "on non-ready handles.";
   is_ready_ = true;
   tensor_ = tensor;
-  device_ = device;
-  op_device_ = op_device;
 }
 
 Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index e55f1a03385f2dffa4d55961b6df502e17a1b474..0fdd31ab5fcfe99c92074fc69d831d17f46d607e 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -61,12 +61,13 @@ class TensorHandle : public core::RefCounted {
         ctx_(ctx),
         is_ready_(true) {}
 
-  TensorHandle(uint64 node_id, DataType dtype, EagerContext* ctx)
+  TensorHandle(uint64 node_id, Device* d, Device* op_device, DataType dtype,
+               EagerContext* ctx)
       : dtype(dtype),
         node_id_(node_id),
         tensor_(dtype),
-        device_(nullptr),
-        op_device_(nullptr),
+        device_(d),
+        op_device_(op_device),
         remote_op_id_(-1),
         remote_output_num_(-1),
         remote_shape_node_id_(-1),
@@ -101,9 +102,9 @@ class TensorHandle : public core::RefCounted {
 
   Status Tensor(const tensorflow::Tensor** t);
 
-  Status Device(tensorflow::Device** d);
+  tensorflow::Device* device() const { return device_; }
 
-  Status OpDevice(tensorflow::Device** d);
+  tensorflow::Device* op_device() const { return op_device_; }
 
   Status TensorAndDevice(const tensorflow::Tensor** tensor,
                          tensorflow::Device** device,
@@ -120,9 +121,7 @@ class TensorHandle : public core::RefCounted {
 
   // Note that this can be called at most once, and only on non-ready handles,
   // and makes them ready.
-  void SetTensorAndDevice(const tensorflow::Tensor& tensor,
-                          tensorflow::Device* device,
-                          tensorflow::Device* op_device);
+  void SetTensor(const tensorflow::Tensor& tensor);
 
   Status CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
                       TensorHandle** output);
@@ -172,11 +171,11 @@ class TensorHandle : public core::RefCounted {
   //
   // TODO(ashankar): Reference count TFE_Context to ensure that 'device_' of a
   // TFE_TensorHandle does not outlive the TFE_Context from which it came?
-  tensorflow::Device* device_;
+  tensorflow::Device* const device_;
 
   // Device in which the op producing this tensor was executed. Equals to
   // device_ for constant tensors.
-  tensorflow::Device* op_device_;
+  tensorflow::Device* const op_device_;
 
   // IDs required when this class is representing a remote tensor handle.
   const int64 remote_op_id_;
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 1e68954827f3c5fad781fa8bb3ca821abae53ee4..77b249c2b49bb5bc2465a8ddf84e1835b3fe66a3 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1713,7 +1713,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
         auto done = [this, state]() {
           Device* device = impl_->params_.device;
           NodeExecStatsInterface* stats = state->stats;  // Shorthand
-          Entry* first_input = state->first_input;     // Shorthand
+          Entry* first_input = state->first_input;       // Shorthand
 
           nodestats::SetOpEnd(stats);
           EntryVector outputs;
@@ -2046,6 +2046,23 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
 void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node,
                                      const NodeItem* item, EntryVector* outputs,
                                      TaggedNodeSeq* ready) {
+  auto activity_handle =
+      [&]() -> std::unique_ptr<tracing::TraceCollector::Handle> {
+    if (TF_PREDICT_FALSE(trace_collector_ != nullptr &&
+                         trace_collector_->IsEnabledForActivities(
+                             false /* is_expensive */))) {
+      const string& op_name = item->kernel->name();
+      // Intentionally using ExecutorPropagateOutputs as the first key so that
+      // users are aware that it's not the op invocation.
+      return trace_collector_->CreateActivityHandle(
+          "ExecutorPropagateOutputs",
+          strings::StrCat(op_name, "#id=", step_id_, "#"),
+          false /* is_expensive */);
+    } else {
+      return nullptr;
+    }
+  }();
+
   const Node* node = tagged_node.node;
   FrameState* input_frame = tagged_node.input_frame;
   const int64 input_iter = tagged_node.input_iter;
@@ -2377,18 +2394,23 @@ void ExecutorState::Finish() {
   auto done_cb = std::move(done_cb_);
   auto runner = std::move(runner_);
   mu_.unlock();
+  CHECK(done_cb != nullptr);
   Device* device = impl_->params_.device;
+
   if ((sync_on_finish_ && status.ok()) || device->RequiresSyncOnCompletion()) {
     // Block until the device has finished all queued operations. For
     // devices like GPUs that continue to execute Ops after their Compute
     // methods have completed, this ensures that control is not returned to
     // the user until the step (and its side-effects) has actually completed.
-    status.Update(device->Sync());
+    device->Sync([=](Status new_status) mutable {
+      status.Update(new_status);
+      delete this;
+      runner([=]() { done_cb(status); });
+    });
+  } else {
+    delete this;
+    runner([=]() { done_cb(status); });
   }
-
-  delete this;
-  CHECK(done_cb != nullptr);
-  runner([=]() { done_cb(status); });
 }
 
 void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 286386e04c2632f8fad58f24fc476a8b982fc9f4..7eb622dc117f40a68079e6cea1a829227acfed7a 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -46,7 +46,11 @@ namespace tensorflow {
 
 // A few string constant used throughout this module.
 static constexpr const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static constexpr const char* const kDeviceArgOp =
+    FunctionLibraryDefinition::kDeviceArgOp;
 static constexpr const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
+static constexpr const char* const kDeviceRetOp =
+    FunctionLibraryDefinition::kDeviceRetOp;
 static constexpr const char* const kGradientOp =
     FunctionLibraryDefinition::kGradientOp;
 static constexpr const char* const kNodeLabel = "Func";
@@ -382,8 +386,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs,
                            const FunctionLibraryDefinition* lib_def,
                            FunctionBody** fbody);
-  Status CreateItem(Handle handle, Item** item);
-  Status GetOrCreateItem(Handle handle, Item** item);
+  Status CreateItem(Item** item);
+  Status GetOrCreateItem(LocalHandle local_handle, Item** item);
   Status InstantiateSymbolicGradient(const NameAttrList& func,
                                      const FunctionLibraryDefinition* lib_def,
                                      FunctionBody** g_body);
@@ -391,7 +395,11 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   AttrValueMap FixAttrs(const AttrSlice& attrs);
   void RunRemote(const Options& opts, Handle handle,
                  gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                 Executor::Args* exec_args, Item* item, DoneCallback done);
+                 Item* item, DoneCallback done);
+
+  void ExecutorArgsFromOptions(const FunctionLibraryRuntime::Options& run_opts,
+                               CallFrameInterface* frame,
+                               Executor::Args* exec_args);
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryRuntimeImpl);
 };
@@ -687,13 +695,14 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
     TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, attrs, lib_def, &fbody));
   }
 
+  LocalHandle local_handle;
   {
     mutex_lock l(mu_);
     *handle = parent_->GetHandle(key);
     if (*handle != kInvalidHandle) {
       delete fbody;
-      ++items_[parent_->GetHandleOnDevice(device_name_, *handle)]
-            ->instantiation_counter;
+      local_handle = parent_->GetHandleOnDevice(device_name_, *handle);
+      ++items_[local_handle]->instantiation_counter;
     } else {
       *handle = parent_->AddHandle(key, device_name_, next_handle_);
       Item* item = new Item;
@@ -705,26 +714,24 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
         item->overlay_flr =
             new FunctionLibraryRuntimeOverlay(this, options.overlay_lib);
       }
-      items_.emplace(next_handle_, std::unique_ptr<Item>(item));
-      next_handle_++;
+      local_handle = next_handle_++;
+      items_.emplace(local_handle, std::unique_ptr<Item>(item));
     }
   }
 
   if (options.create_kernels_eagerly) {
     Item* item;
-    TF_RETURN_IF_ERROR(GetOrCreateItem(*handle, &item));
+    TF_RETURN_IF_ERROR(GetOrCreateItem(local_handle, &item));
   }
 
   return Status::OK();
 }
 
 Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
+  if (h == kInvalidLocalHandle) {
     return parent_->ReleaseHandle(handle);
   }
-
-  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
-  CHECK_NE(h, kInvalidLocalHandle);
   mutex_lock l(mu_);
   CHECK_EQ(1, items_.count(h));
   std::unique_ptr<Item>& item = items_[h];
@@ -785,7 +792,7 @@ void PruneFunctionBody(Graph* g) {
 }
 }  // namespace
 
-Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
+Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) {
   const FunctionBody* fbody;
   const FunctionLibraryDefinition* lib_def;
   string executor_type;
@@ -839,13 +846,13 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   return Status::OK();
 }
 
-Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
-  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+Status FunctionLibraryRuntimeImpl::GetOrCreateItem(LocalHandle local_handle,
+                                                   Item** item) {
   {
     tf_shared_lock l(mu_);
     auto iter = items_.find(local_handle);
     if (iter == items_.end()) {
-      return errors::NotFound("Function handle ", handle,
+      return errors::Internal("Local function handle ", local_handle,
                               " is not valid. Likely an internal error.");
     }
     *item = iter->second.get();
@@ -855,22 +862,37 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
   }
   // NOTE: We need to call CreateItem out of mu_ because creating an
   // executor needs to call CreateKernel.
-  return CreateItem(handle, item);
+  return CreateItem(item);
+}
+
+void FunctionLibraryRuntimeImpl::ExecutorArgsFromOptions(
+    const FunctionLibraryRuntime::Options& run_opts, CallFrameInterface* frame,
+    Executor::Args* exec_args) {
+  // Inherit the step_id from the caller.
+  exec_args->step_id = run_opts.step_id;
+  exec_args->rendezvous = run_opts.rendezvous;
+  exec_args->stats_collector = run_opts.stats_collector;
+  exec_args->cancellation_manager = run_opts.cancellation_manager;
+  exec_args->step_container = run_opts.step_container;
+  if (run_opts.runner) {
+    exec_args->runner = *run_opts.runner;
+  } else {
+    exec_args->runner = default_runner_;
+  }
+  exec_args->collective_executor = run_opts.collective_executor;
+  exec_args->call_frame = frame;
 }
 
 void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                                            gtl::ArraySlice<Tensor> args,
                                            std::vector<Tensor>* rets,
-                                           Executor::Args* exec_args,
                                            Item* item, DoneCallback done) {
-  DCHECK(exec_args->call_frame == nullptr);
   string target_device = parent_->GetDeviceName(handle);
   string source_device = opts.source_device;
   Rendezvous* rendezvous = opts.rendezvous;
   DeviceContext* device_context;
   Status s = parent_->GetDeviceContext(target_device, &device_context);
   if (!s.ok()) {
-    delete exec_args;
     done(s);
     return;
   }
@@ -878,7 +900,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   s = parent_->GetDeviceIncarnation(source_device, &src_incarnation);
   s.Update(parent_->GetDeviceIncarnation(target_device, &target_incarnation));
   if (!s.ok()) {
-    delete exec_args;
     done(s);
     return;
   }
@@ -886,13 +907,8 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   const FunctionBody* fbody = GetFunctionBody(handle);
   FunctionCallFrame* frame =
       new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
-  exec_args->call_frame = frame;
-  if (!s.ok()) {
-    delete frame;
-    delete exec_args;
-    done(s);
-    return;
-  }
+  Executor::Args* exec_args = new Executor::Args;
+  ExecutorArgsFromOptions(opts, frame, exec_args);
 
   std::vector<AllocatorAttributes> args_alloc_attrs, rets_alloc_attrs;
   args_alloc_attrs.reserve(fbody->arg_types.size());
@@ -938,10 +954,10 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           return;
         }
         item->exec->RunAsync(
-            *exec_args, [frame, rets, done, source_device, target_device,
-                         target_incarnation, rendezvous, device_context,
-                         remote_args, exec_args, rets_alloc_attrs,
-                         allow_dead_tensors](const Status& status) {
+            *exec_args,
+            [frame, rets, done, source_device, target_device,
+             target_incarnation, rendezvous, device_context, remote_args,
+             rets_alloc_attrs, allow_dead_tensors](const Status& status) {
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets, allow_dead_tensors);
@@ -949,7 +965,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
               delete frame;
               if (!s.ok()) {
                 delete remote_args;
-                delete exec_args;
                 done(s);
                 return;
               }
@@ -957,9 +972,9 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                   target_device, source_device, "ret_", target_incarnation,
                   *rets, device_context, rets_alloc_attrs, rendezvous);
               delete remote_args;
-              delete exec_args;
               done(s);
             });
+        delete exec_args;
       });
 }
 
@@ -982,7 +997,8 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     };
   }
 
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+  if (local_handle == kInvalidLocalHandle) {
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
@@ -992,54 +1008,43 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   }
   DCHECK(run_opts.runner != nullptr);
 
-  Executor::Args* exec_args = new Executor::Args;
-  // Inherit the step_id from the caller.
-  exec_args->step_id = run_opts.step_id;
-  exec_args->rendezvous = run_opts.rendezvous;
-  exec_args->stats_collector = run_opts.stats_collector;
-  exec_args->cancellation_manager = run_opts.cancellation_manager;
-  exec_args->step_container = run_opts.step_container;
-  exec_args->runner = *run_opts.runner;
-  exec_args->collective_executor = run_opts.collective_executor;
-
   Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
+  Status s = GetOrCreateItem(local_handle, &item);
   if (!s.ok()) {
-    delete exec_args;
     done(s);
     return;
   }
 
   if (run_opts.remote_execution) {
     // NOTE(mrry): `RunRemote()` will set `exec_args->call_frame` for us.
-    RunRemote(run_opts, handle, args, rets, exec_args, item, done);
+    RunRemote(run_opts, handle, args, rets, item, done);
     return;
   }
 
   const FunctionBody* fbody = GetFunctionBody(handle);
   FunctionCallFrame* frame =
       new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
-  exec_args->call_frame = frame;
   s = frame->SetArgs(args);
   if (!s.ok()) {
     delete frame;
-    delete exec_args;
     done(s);
     return;
   }
 
-  bool allow_dead_tensors = opts.allow_dead_tensors;
+  Executor::Args exec_args;
+  ExecutorArgsFromOptions(run_opts, frame, &exec_args);
+
+  bool allow_dead_tensors = run_opts.allow_dead_tensors;
   item->exec->RunAsync(
       // Executor args
-      *exec_args,
+      exec_args,
       // Done callback.
-      [frame, rets, done, exec_args, allow_dead_tensors](const Status& status) {
+      [frame, rets, done, allow_dead_tensors](const Status& status) {
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets, allow_dead_tensors);
         }
         delete frame;
-        delete exec_args;
         done(s);
       });
 }
@@ -1051,8 +1056,8 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     done(errors::Cancelled(""));
     return;
   }
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle) ||
-      opts.remote_execution) {
+  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+  if (local_handle == kInvalidLocalHandle || opts.remote_execution) {
     done(errors::Unimplemented("Remote calling with CallFrameInterface"));
     return;
   }
@@ -1073,7 +1078,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   }
 
   Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
+  Status s = GetOrCreateItem(local_handle, &item);
   if (!s.ok()) {
     done(s);
     return;
@@ -1084,16 +1089,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   DCHECK(run_opts.runner != nullptr);
 
   Executor::Args exec_args;
-  // Inherit the step_id from the caller.
-  exec_args.step_id = run_opts.step_id;
-  exec_args.rendezvous = run_opts.rendezvous;
-  exec_args.stats_collector = run_opts.stats_collector;
-  exec_args.cancellation_manager = run_opts.cancellation_manager;
-  exec_args.collective_executor = run_opts.collective_executor;
-  exec_args.step_container = run_opts.step_container;
-  exec_args.runner = *run_opts.runner;
-  exec_args.call_frame = frame;
-
+  ExecutorArgsFromOptions(run_opts, frame, &exec_args);
   item->exec->RunAsync(exec_args, std::move(done));
 }
 
@@ -1105,7 +1101,8 @@ bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) {
 
 string FunctionLibraryRuntimeImpl::DebugString(Handle handle) {
   Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
+  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+  Status s = GetOrCreateItem(local_handle, &item);
   if (s.ok()) {
     return tensorflow::DebugString(item->graph);
   } else {
@@ -1640,9 +1637,9 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
   this->ret_nodes.resize(ret_types.size());
   for (Node* n : this->graph->op_nodes()) {
     gtl::InlinedVector<Node*, 4>* node_vec;
-    if (n->type_string() == kRetOp) {
+    if (n->type_string() == kRetOp || n->type_string() == kDeviceRetOp) {
       node_vec = &this->ret_nodes;
-    } else if (n->type_string() == kArgOp) {
+    } else if (n->type_string() == kArgOp || n->type_string() == kDeviceArgOp) {
       node_vec = &this->arg_nodes;
     } else {
       continue;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index d8ebdeff5d21c9efcd5cc30d1e4324f11a81d4b7..81fea311e13c766b1fdb79d5fdc63e21940dd2bd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -84,13 +84,13 @@ namespace tensorflow {
 // corresponding stream have completed.  The following two classes
 // serve this purpose in two different compilation environments.
 
-class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
+class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
  public:
-  EigenCudaStreamDevice()
+  EigenGpuStreamDevice()
       : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
     Eigen::initializeDeviceProp();
   }
-  ~EigenCudaStreamDevice() override {}
+  ~EigenGpuStreamDevice() override {}
   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                     TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
                     char* scratch) {
@@ -101,7 +101,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
     context_ = context;
     scratch_ = scratch;
     semaphore_ =
-        reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
+        reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
     stream_ = cuda_stream;
     allocator_ = alloc;
     PlatformGpuId platform_gpu_id;
@@ -185,7 +185,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   mutable unsigned int* semaphore_;
   OpKernelContext* context_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
+  TF_DISALLOW_COPY_AND_ASSIGN(EigenGpuStreamDevice);
 };
 
 // This factory helps to ensure that different GPU device objects that refer to
@@ -292,7 +292,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
       DCHECK(streams_[i]);
       if (scratch_.size() > i && scratch_[i]) continue;
       size_t scratch_buffer_size =
-          Eigen::kCudaScratchSize + sizeof(unsigned int);
+          Eigen::kGpuScratchSize + sizeof(unsigned int);
       void* scratch_buffer = gpu_allocator_->AllocateRaw(
           Allocator::kAllocatorAlignment, scratch_buffer_size);
       if (scratch_buffer == nullptr) {
@@ -304,7 +304,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
           se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
 
       bool ok = executor_->SynchronousMemZero(
-          &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+          &mem, Eigen::kGpuScratchSize + sizeof(unsigned int));
       if (!ok) {
         return errors::FailedPrecondition(
             "Failed to memcopy into scratch buffer for device ",
@@ -692,7 +692,7 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
   const Eigen::GpuDevice& device() const override { return device_; }
 
  private:
-  EigenCudaStreamDevice stream_device_;
+  EigenGpuStreamDevice stream_device_;
   Eigen::GpuDevice device_;
 };
 
@@ -1169,6 +1169,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
     LocalityMap* localities) {
   std::vector<TfGpuId> all_tf_gpu_ids;
+  all_tf_gpu_ids.reserve(num_tf_gpus);
   for (int i = 0; i < num_tf_gpus; ++i) {
     all_tf_gpu_ids.push_back(TfGpuId(i));
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 3e95374fda89cd14660fa6974789c17be522bb03..a9a19f0fe04d1535e442ea37e51aba26eab69dc8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -55,30 +55,17 @@ bool useCudaMemoryGuardAllocator() {
 
 }  // namespace
 
-GPUProcessState* GPUProcessState::instance_ = nullptr;
-
-/*static*/ GPUProcessState* GPUProcessState::singleton() {
-  if (instance_ == nullptr) {
-    instance_ = new GPUProcessState;
-  }
-  CHECK(instance_->process_state_);
-
-  return instance_;
+/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
+  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
+  DCHECK((!ps) || (ps == instance))
+      << "Multiple calls to GPUProcessState with non-null ps";
+  return instance;
 }
 
 GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
-  CHECK(instance_ == nullptr);
-  instance_ = this;
   process_state_ = ProcessState::singleton();
 }
 
-// Normally the GPUProcessState singleton is never explicitly deleted.
-// This function is defined for debugging problems with the allocators.
-GPUProcessState::~GPUProcessState() {
-  CHECK_EQ(this, instance_);
-  instance_ = nullptr;
-}
-
 int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   // Return the NUMA node associated with the GPU's StreamExecutor.
   se::StreamExecutor* se =
@@ -166,7 +153,9 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
       !process_state_->ProcessState::FLAGS_brain_mem_reg_cuda_dma) {
     return process_state_->GetCPUAllocator(numa_node);
   }
-  CHECK_GE(numa_node, 0);
+  if (numa_node == port::kNUMANoAffinity) {
+    numa_node = 0;
+  }
   {
     // Here we optimize the most common use case where cuda_host_allocators_
     // and cuda_al_ have already been populated and since we're only reading
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index 43e9a316604006bb20f5ff171730f4b2ddc7e3d6..df51c10c8065fa94d736c8f4dfa76faebdc8bc62 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -37,7 +37,19 @@ class PoolAllocator;
 // Singleton that manages per-process state when GPUs are present.
 class GPUProcessState {
  public:
-  static GPUProcessState* singleton();
+  // If ps == nullptr, returns pointer to the single instance of this class to
+  // be used within this process.
+  //
+  // If ps != nullptrs, accepts a value to be returned by all subsequent calls.
+  // A non-null ps may ONLY be provided during program static storage
+  // initialization.  Must not be called more than once with a non-null ps.
+  //
+  // If a derived class of GPUProcessState is ever used in a process, it must
+  // always be used in place of this class.  In order to ensure that existing
+  // calls to GPUProcessState::singleton() all resolve to the derived instance
+  // instead, this function must be called once during startup, supplying the
+  // derived instance value, prior to any accessor call to this function.
+  static GPUProcessState* singleton(GPUProcessState* ps = nullptr);
 
   // Query whether any GPU device has been created so far.
   // Disable thread safety analysis since a race is benign here.
@@ -97,7 +109,11 @@ class GPUProcessState {
   virtual int BusIdForGPU(TfGpuId tf_gpu_id);
 
  protected:
+  // GPUProcessState is a singleton that should not normally be deleted except
+  // at process shutdown.
   GPUProcessState();
+  virtual ~GPUProcessState() {}
+  friend class GPUDeviceTest;
 
   // Helper method for unit tests to reset the ProcessState singleton by
   // cleaning up everything. Never use in production.
@@ -127,10 +143,6 @@ class GPUProcessState {
       GUARDED_BY(mu_);
   std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
       GUARDED_BY(mu_);
-
-  virtual ~GPUProcessState();
-
-  friend class GPUDeviceTest;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index ab619ef619acab090b5a9d3597e874c23f3b7830..0d36930324a3a4a5f14a64dabfd954dc101dca72 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -552,10 +552,7 @@ Status GraphExecutionState::OptimizeGraph(
     return errors::InvalidArgument("Can't optimize a pruned graph");
   }
 
-  const RewriterConfig& rewrite_options =
-      session_options_->config.graph_options().rewrite_options();
-
-  if (grappler::MetaOptimizerEnabled(rewrite_options)) {
+  if (grappler::MetaOptimizerEnabled(session_options_->config)) {
     // Adding this functionality in steps. The first step is to make sure
     // we don't break dependencies. The second step will be to turn the
     // functionality on by default.
@@ -638,7 +635,7 @@ Status GraphExecutionState::OptimizeGraph(
     grappler::VirtualCluster cluster(device_set_);
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
-        item, rewrite_options, cpu_device, &cluster, &new_graph));
+        item, session_options_->config, cpu_device, &cluster, &new_graph));
 
     // Merge optimized graph function library with an original library.
     // Optimized graph might have new functions specialized for it's
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 873182371e097cf0929cd6886b3ec70dfb9b3ab2..f1fcca194e9ef56bf7b96e6c73717db7620b9812 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -18,11 +18,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -30,23 +32,52 @@ namespace tensorflow {
 
 /* static */
 bool LocalDevice::use_global_threadpool_ = true;
+mutex LocalDevice::global_tp_mu_;
+gtl::InlinedVector<LocalDevice::EigenThreadPoolInfo*, 4>
+    LocalDevice::global_tp_info_;
 
 struct LocalDevice::EigenThreadPoolInfo {
-  explicit EigenThreadPoolInfo(const SessionOptions& options) {
+  // Wrapper so we can provide the CPUAllocator to Eigen for use
+  // when ops need extra tmp memory.
+  class EigenAllocator : public Eigen::Allocator {
+   public:
+    explicit EigenAllocator(tensorflow::Allocator* a) : allocator_(a) {}
+    void* allocate(size_t num_bytes) const override {
+      return allocator_->AllocateRaw(64, num_bytes);
+    }
+    void deallocate(void* buffer) const override {
+      allocator_->DeallocateRaw(buffer);
+    }
+    tensorflow::Allocator* allocator_;
+  };
+
+  explicit EigenThreadPoolInfo(const SessionOptions& options, int numa_node,
+                               Allocator* allocator) {
     int32 intra_op_parallelism_threads =
         options.config.intra_op_parallelism_threads();
     if (intra_op_parallelism_threads == 0) {
       intra_op_parallelism_threads = port::NumSchedulableCPUs();
+      if (numa_node != port::kNUMANoAffinity) {
+        // Assume that CPUs are equally distributed over available NUMA nodes.
+        // This may not be true, but there isn't currently a better way of
+        // determining the number of CPUs specific to the requested node.
+        intra_op_parallelism_threads /= port::NUMANumNodes();
+      }
     }
-    VLOG(1) << "Local device intra op parallelism threads: "
-            << intra_op_parallelism_threads;
+    ThreadOptions thread_opts;
+    thread_opts.numa_node = numa_node;
     eigen_worker_threads_.num_threads = intra_op_parallelism_threads;
     eigen_worker_threads_.workers = new thread::ThreadPool(
-        options.env, "Eigen", intra_op_parallelism_threads);
+        options.env, thread_opts, strings::StrCat("numa_", numa_node, "_Eigen"),
+        intra_op_parallelism_threads);
     eigen_threadpool_wrapper_.reset(
         new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    if (allocator) {
+      eigen_allocator_.reset(new EigenAllocator(allocator));
+    }
     eigen_device_.reset(new Eigen::ThreadPoolDevice(
-        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads,
+        eigen_allocator_.get()));
   }
 
   ~EigenThreadPoolInfo() {
@@ -58,6 +89,7 @@ struct LocalDevice::EigenThreadPoolInfo {
   DeviceBase::CpuWorkerThreads eigen_worker_threads_;
   std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
   std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+  std::unique_ptr<EigenAllocator> eigen_allocator_;
 };
 
 LocalDevice::LocalDevice(const SessionOptions& options,
@@ -68,15 +100,34 @@ LocalDevice::LocalDevice(const SessionOptions& options,
   port::InfoAboutUnusedCPUFeatures();
   LocalDevice::EigenThreadPoolInfo* tp_info;
   if (use_global_threadpool_) {
-    // All ThreadPoolDevices in the process will use this single fixed
-    // sized threadpool for numerical computations.
-    static LocalDevice::EigenThreadPoolInfo* global_tp_info =
-        new LocalDevice::EigenThreadPoolInfo(options);
-    tp_info = global_tp_info;
+    mutex_lock l(global_tp_mu_);
+    if (options.config.experimental().use_numa_affinity()) {
+      int numa_node = attributes.locality().numa_node();
+      int num_numa_nodes = port::NUMANumNodes();
+      DCHECK_LT(numa_node, num_numa_nodes);
+      Allocator* numa_allocator =
+          ProcessState::singleton()->GetCPUAllocator(numa_node);
+      while (numa_node >= global_tp_info_.size()) {
+        global_tp_info_.push_back(nullptr);
+      }
+      if (!global_tp_info_[numa_node]) {
+        global_tp_info_[numa_node] = new LocalDevice::EigenThreadPoolInfo(
+            options, numa_node, numa_allocator);
+      }
+      tp_info = global_tp_info_[numa_node];
+    } else {
+      if (global_tp_info_.empty()) {
+        global_tp_info_.push_back(new LocalDevice::EigenThreadPoolInfo(
+            options, port::kNUMANoAffinity, nullptr));
+      }
+      tp_info = global_tp_info_[0];
+    }
   } else {
     // Each LocalDevice owns a separate ThreadPoolDevice for numerical
     // computations.
-    owned_tp_info_.reset(new LocalDevice::EigenThreadPoolInfo(options));
+    // TODO(tucker): NUMA for these too?
+    owned_tp_info_.reset(new LocalDevice::EigenThreadPoolInfo(
+        options, port::kNUMANoAffinity, nullptr));
     tp_info = owned_tp_info_.get();
   }
   set_tensorflow_cpu_worker_threads(&tp_info->eigen_worker_threads_);
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index 226f121bf32e0259d13dca633627174d5cdab917..f305c212c5a331be7992188d2b2e4c323ab6d403 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -47,6 +47,13 @@ class LocalDevice : public Device {
   struct EigenThreadPoolInfo;
   std::unique_ptr<EigenThreadPoolInfo> owned_tp_info_;
 
+  // All ThreadPoolDevices in the process associated with the same
+  // NUMA node will share a single fixed sized threadpool for numerical
+  // computations.
+  static mutex global_tp_mu_;
+  static gtl::InlinedVector<EigenThreadPoolInfo*, 4> global_tp_info_
+      GUARDED_BY(global_tp_mu_);
+
   friend class test::Benchmark;
 
   TF_DISALLOW_COPY_AND_ASSIGN(LocalDevice);
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index 44a2478e3f9809e174ec3ef49c193b14daae9a62..9738006f5ca9eb821439a9ad507aec3db434946c 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -76,7 +76,7 @@ class CondBuilder {
   // The identity node with the same outputs as the original If op.
   Node* lowered_if_output_;
   // The predicate of the conditional.
-  Node* pred_;
+  OutputTensor pred_;
   // Node corresponding to pivot_f branch of predicate switch which is
   // the pivot node that dominates all nodes in the false/else branch.
   Node* pivot_f_;
@@ -102,7 +102,7 @@ CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
       name_(if_op->name()),
       then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
       else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
-  TF_CHECK_OK(if_op_->input_node(0, &pred_));
+  TF_CHECK_OK(if_op_->input_tensor(0, &pred_));
   then_call_builder_.Device(if_op_->requested_device());
   else_call_builder_.Device(if_op_->requested_device());
 }
@@ -113,8 +113,8 @@ Status CondBuilder::CreatePivotNodes() {
   Node* switch_pred;
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
-          .Input(NodeOut(pred_, 0))
-          .Input(NodeOut(pred_, 0))
+          .Input(NodeOut(pred_))
+          .Input(NodeOut(pred_))
           .Device(if_op_->requested_device())
           .Finalize(graph_, &switch_pred));
   control_predecessor_ = switch_pred;
@@ -140,7 +140,7 @@ Status CondBuilder::AddInput(Node* src, int src_output) {
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
           .Input(src, src_output)
-          .Input(pred_, 0)
+          .Input(pred_)
           .Device(if_op_->requested_device())
           .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 305d6a3b1bddcaca694d9d85402077a73ae68338..01e4072f60323904099af5a75a302c45a84e2fd9 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/port.h"
 
 namespace tensorflow {
 
@@ -46,42 +47,51 @@ const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
 // returned list is sorted by preferred type (higher numeric type is preferred).
 std::vector<Device*> FilterSupportedDevices(
     const std::vector<Device*>& devices,
-    const DeviceTypeVector& supported_device_types,
+    const PrioritizedDeviceTypeVector& supported_device_types,
     const Device* default_device) {
   Device* filtered_default_device = nullptr;
-  std::vector<Device*> filtered_devices;
-  for (const DeviceType& d : supported_device_types) {
+  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
+  for (const auto& supported_device_type : supported_device_types) {
     for (Device* device : devices) {
-      if (DeviceType(device->attributes().device_type()) == d) {
+      if (DeviceType(device->attributes().device_type()) ==
+          supported_device_type.first) {
         if (device == default_device) {
           filtered_default_device = device;
         } else {
-          filtered_devices.emplace_back(device);
+          prioritized_filtered_devices.emplace_back(
+              device, supported_device_type.second);
         }
       }
     }
   }
 
-  auto device_sort = [](const Device* a, const Device* b) {
-    auto a_priority = DeviceSet::DeviceTypeOrder(DeviceType(a->device_type()));
-    auto b_priority = DeviceSet::DeviceTypeOrder(DeviceType(b->device_type()));
+  auto device_sort = [](const std::pair<Device*, int32>& a,
+                        const std::pair<Device*, int32>& b) {
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+
+    auto a_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
+    auto b_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
     // First sort by prioritized device type (higher is preferred) and
     // then by device name (lexicographically).
     if (a_priority != b_priority) {
       return a_priority > b_priority;
     }
-    return StringPiece(a->name()) < StringPiece(b->name());
+    return StringPiece(a.first->name()) < StringPiece(b.first->name());
   };
-  std::vector<Device*>::iterator sort_start;
+  std::sort(prioritized_filtered_devices.begin(),
+            prioritized_filtered_devices.end(), device_sort);
+
+  std::vector<Device*> filtered_devices;
   if (filtered_default_device != nullptr) {
-    // Put the default device first outside of the normal ordering.
     filtered_devices.emplace_back(filtered_default_device);
-    std::iter_swap(filtered_devices.begin(), std::prev(filtered_devices.end()));
-    sort_start = std::next(filtered_devices.begin());
-  } else {
-    sort_start = filtered_devices.begin();
   }
-  std::sort(sort_start, filtered_devices.end(), device_sort);
+  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
+    filtered_devices.push_back(prioritized_filtered_device.first);
+  }
   return filtered_devices;
 }
 
@@ -378,11 +388,20 @@ class ColocationGraph {
             }
             std::sort(device_names.begin(), device_names.end());
 
+            string gpu_msg = "";
+            if (!IsGoogleCudaEnabled() &&
+                str_util::Lowercase(specified_device_name.type) == "gpu") {
+              gpu_msg =
+                  " The requested device appears to be a GPU, but CUDA is not "
+                  "enabled.";
+            }
+
             return errors::InvalidArgument(
-                "Operation was explicitly assigned to ",
-                node->requested_device(), " but available devices are [ ",
+                errors::FormatNodeNameForError(node->name()),
+                "was explicitly assigned to ", node->requested_device(),
+                " but available devices are [ ",
                 str_util::Join(device_names, ", "), " ]. Make sure ",
-                "the device specification refers to a valid device.");
+                "the device specification refers to a valid device.", gpu_msg);
           } else if (specified_device_name.has_type) {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
@@ -462,7 +481,7 @@ class ColocationGraph {
     // The intersection of all device types supported by this node,
     // and those of all of its children, in priority order
     // of the preferred device.
-    DeviceTypeVector supported_device_types;
+    PrioritizedDeviceTypeVector supported_device_types;
 
     // The merged form of the device requested for this node, with
     // those of all of its children.
@@ -501,8 +520,8 @@ class ColocationGraph {
       const string& op_type = node->type_string();
       string devices_registered;
       for (const auto& device_type : members_[id].supported_device_types) {
-        strings::StrAppend(&devices_registered, DeviceTypeString(device_type),
-                           " ");
+        strings::StrAppend(&devices_registered,
+                           DeviceTypeString(device_type.first), " ");
       }
 
       type_to_devices[op_type] = std::move(devices_registered);
@@ -555,8 +574,9 @@ class ColocationGraph {
                                 "' does not match any device");
       }
 
-      for (const DeviceType& d : member->supported_device_types) {
-        if (DeviceType(assigned_device->attributes().device_type()) == d) {
+      for (const auto& d : member->supported_device_types) {
+        if (DeviceType(assigned_device->attributes().device_type()) ==
+            d.first) {
           return Status::OK();
         }
       }
@@ -613,24 +633,102 @@ class ColocationGraph {
     return Status::OK();
   }
 
+  static bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
+    for (const auto& prioritized_device_type : device_types) {
+      if (prioritized_device_type.second != 0) return true;
+    }
+    return false;
+  }
+
+  static bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
+                                const PrioritizedDeviceTypeVector& b_types) {
+    if (a_types.size() != b_types.size()) {
+      return false;
+    }
+    for (int i = 0; i < a_types.size(); ++i) {
+      if (a_types[i].first != b_types[i].first) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   // Updates target to contain the intersection of the device types in
   // "target" and "other".
-  static void MergeSupportedDevices(DeviceTypeVector* target,
-                                    const DeviceTypeVector& other) {
-    DeviceTypeVector temp = *target;
+  static void MergeSupportedDevices(PrioritizedDeviceTypeVector* target,
+                                    const PrioritizedDeviceTypeVector& other) {
+    PrioritizedDeviceTypeVector temp = *target;
     target->clear();
 
-    // Iterate in priority order.
-    for (const DeviceType& device_type : temp) {
+    // Generate intersection with priorities.
+    PrioritizedDeviceTypeVector target_intersection;
+    PrioritizedDeviceTypeVector other_intersection;
+    for (const auto& prioritized_device_type : temp) {
       bool found = false;
-      for (const DeviceType& other_device_type : other) {
-        if (device_type == other_device_type) {
+      for (const auto& other_prioritized_device_type : other) {
+        if (prioritized_device_type.first ==
+            other_prioritized_device_type.first) {
           found = true;
+          other_intersection.push_back(other_prioritized_device_type);
           break;
         }
       }
       if (found) {
-        target->push_back(device_type);
+        target_intersection.push_back(prioritized_device_type);
+      }
+    }
+
+    // Sort the devices by priority order.
+    auto device_sort = [](const std::pair<DeviceType, int32>& a,
+                          const std::pair<DeviceType, int32>& b) {
+      // First look at set priorities.
+      if (a.second != b.second) {
+        return a.second > b.second;
+      }
+      // Then fallback to default priorities.
+      auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
+      auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
+      if (a_priority != b_priority) {
+        return a_priority > b_priority;
+      }
+      // Finally just look at the Device type strings.
+      return a.first.type_string() < b.first.type_string();
+    };
+
+    std::sort(target_intersection.begin(), target_intersection.end(),
+              device_sort);
+    std::sort(other_intersection.begin(), other_intersection.end(),
+              device_sort);
+
+    bool is_target_prioritized = HasPriorities(target_intersection);
+    bool is_other_prioritized = HasPriorities(other_intersection);
+    // If neither are prioritized then we just return the original i.e. target
+    // prioritization.
+    if (!is_target_prioritized && !is_other_prioritized) {
+      *target = target_intersection;
+    }
+    // If only one is prioritized, then we respect priorities of that in the
+    // intersection.
+    if (is_target_prioritized && !is_other_prioritized) {
+      *target = target_intersection;
+    }
+    if (!is_target_prioritized && is_other_prioritized) {
+      *target = other_intersection;
+    }
+    // If both have priorities and agree then we go with that. If the
+    // prioritization order is different, then we just fallback to the default
+    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
+    // merged priorities to 0, so that downstream merges work correctly as well.
+    if (is_target_prioritized && is_other_prioritized) {
+      bool priorities_agree =
+          ArePrioritiesSame(target_intersection, other_intersection);
+      if (priorities_agree) {
+        *target = target_intersection;
+      } else {
+        for (const auto& prioritized_device : target_intersection) {
+          target->push_back(std::make_pair(prioritized_device.first, 0));
+        }
+        std::sort(target->begin(), target->end(), device_sort);
       }
     }
   }
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index d5e98b8d9e81b628f168b9a53b717269f78214f5..009f905f108c365d0be91005d8cb00b00c07ad0e 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -164,6 +164,13 @@ REGISTER_KERNEL_BUILDER(Name("TestDeviceEnforce").Device("FakeGPU"), DummyOp);
 REGISTER_KERNEL_BUILDER(Name("Shape").Device("FakeCPU"), DummyOp);
 REGISTER_KERNEL_BUILDER(Name("Shape").Device("FakeGPU"), DummyOp);
 
+// Op that has kernels with device priorities specified.
+REGISTER_OP("TestDatasetOp").Input("a: float").Output("b: float");
+REGISTER_KERNEL_BUILDER(Name("TestDatasetOp").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("TestDatasetOp").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // A PlacerTest method has three phases:
@@ -285,6 +292,251 @@ TEST_F(PlacerTest, TestNoConstraints) {
   EXPECT_DEVICE_TYPE(g, "n2", "FakeGPU");
 }
 
+// Test that a graph with no constraints but using kernels that have a specified
+// device priority will successfully assign nodes to the device with higher
+// priority
+TEST_F(PlacerTest, TestNoConstraintsWithPrioritizedKernels) {
+  Graph g(OpRegistry::Global());
+  {  // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 1),
+                 b.opts().WithName("n2"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "n2", "FakeCPU");
+}
+
+TEST_F(PlacerTest, TestGPUInputIntoPrioritizedKernel) {
+  Graph g(OpRegistry::Global());
+  {
+    // Scope for temp variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestGPUOutput", b.opts().WithName("in"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeCPU");
+}
+
+// Tests that a GPU kernel colocated with prioritized kernel respects it.
+TEST_F(PlacerTest, TestGPUInputColocatedWithPrioritizedKernel) {
+  Graph g(OpRegistry::Global());
+  {
+    // Scope for temp variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestGPUOutput", b.opts().WithName("in"));
+    // We colocate n1 with in.
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1").WithAttr("_class", {"loc:@in"}));
+    // We don't colocate n2 with in.
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n2"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n2", "FakeCPU");
+}
+
+REGISTER_OP("CreateDatasetCPU").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetCPU").Device("FakeCPU"), DummyOp);
+
+REGISTER_OP("CreateDatasetSP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetSP").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetSP").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
+REGISTER_OP("CreateDatasetRP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetRP").Device("FakeCPU").Priority(1),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetRP").Device("FakeGPU").Priority(2),
+                        DummyOp);
+
+REGISTER_OP("CreateDatasetNP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetNP").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetNP").Device("FakeGPU"), DummyOp);
+
+REGISTER_OP("IteratorNP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorNP").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorNP").Device("FakeGPU"), DummyOp);
+
+REGISTER_OP("IteratorSP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorSP").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorSP").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
+REGISTER_OP("IteratorRP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorRP").Device("FakeCPU").Priority(1),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorRP").Device("FakeGPU").Priority(2),
+                        DummyOp);
+
+REGISTER_OP("IteratorGPU").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorGPU").Device("FakeGPU"), DummyOp);
+
+// Test reference edges with one node having prioritized kernels and the other
+// has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestDSWithPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorNP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test reference edges with one node having kernels with regular priority and
+// the other has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestDSWithGPUPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorNP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and the other
+// has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestITWithPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetNP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test reference edges with one node having kernels with regular priority and
+// the other has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestITWithGPUPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetNP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and other node
+// can only be placed on GPU. We should respect the constraint then.
+TEST_F(PlacerTest, TestITGPU) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorGPU", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and other node
+// can only be placed on CPU. We should respect the constraint then.
+TEST_F(PlacerTest, TestSimpleIteratorOnlyGPU) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetCPU", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test constraints with agreeing priorities.
+TEST_F(PlacerTest, TestAgreeingPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test constraints with agreeing regular priorities.
+TEST_F(PlacerTest, TestAgreeingRegularPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test constraints with different priorities. In this case, we should bail
+// and just revert to default.
+TEST_F(PlacerTest, TestConflictingPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test constraints with different priorities. In this case, we should bail
+// and just revert to default.
+TEST_F(PlacerTest, TestConflictingPrioritiesReversed) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
 // Test that a graph with device type and reference constraints on
 // some of the ops will successfully assign nodes to the constrained
 // device, and colocate nodes with reference connections.
@@ -1194,14 +1446,37 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
+  SessionOptions options;
+  Status s = Place(&g, &options);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+  LOG(WARNING) << s.error_message();
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "was explicitly assigned to /job:foo/replica:17"));
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(), "but available devices"));
+}
+
+#if !GOOGLE_CUDA
+// Test that we inform the user if they appear to be explicitly placing nodes
+// on a GPU when CUDA is not available
+TEST_F(PlacerTest, TestUseGpuWithNoCuda) {
+  Graph g(OpRegistry::Global());
+  {  // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    ops::SourceOp("VariableGPU",
+                  b.opts().WithName("var").WithDevice("/device:gpu:0"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(
       s.error_message(),
-      "was explicitly assigned to /job:foo/replica:17 but available devices"));
+      "The requested device appears to be a GPU, but CUDA is not enabled."));
 }
+#endif
 
 TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
   Graph g(OpRegistry::Global());
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index 66dc8f332217c30a3b3a1745a7c90a1880e3e068..6b40fcc4c70f50ba5bc643855a8035d73b92bfb0 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -258,7 +259,12 @@ void PoolAllocator::EvictOne() {
 void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
   void* ptr = nullptr;
   if (num_bytes > 0) {
-    ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    if (numa_node_ == port::kNUMANoAffinity) {
+      ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    } else {
+      ptr =
+          port::NUMAMalloc(numa_node_, num_bytes, static_cast<int>(alignment));
+    }
     VisitAlloc(ptr, numa_node_, num_bytes);
   }
   return ptr;
@@ -267,7 +273,11 @@ void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
 void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
   if (num_bytes > 0) {
     VisitFree(ptr, numa_node_, num_bytes);
-    port::AlignedFree(ptr);
+    if (numa_node_ == port::kNUMANoAffinity) {
+      port::AlignedFree(ptr);
+    } else {
+      port::NUMAFree(ptr, num_bytes);
+    }
   }
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 5b4623ba10fe684f6399a244e30ecafd55003c95..8be9c7b678e2bbe7659c9e22e31cb595ce704307 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -150,7 +150,6 @@ class Pow2Rounder : public RoundUpInterface {
 
 class BasicCPUAllocator : public SubAllocator {
  public:
-  // Argument numa_node is currently ignored.
   BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
                     const std::vector<Visitor>& free_visitors)
       : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index bcaa37fc8a156a63fcc76f9b8bb39ac8fd75f15a..3d8ac9b1344d8f2ca210451194adf4607dd52b7d 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -32,28 +32,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-ProcessState* ProcessState::instance_ = nullptr;
-
 /*static*/ ProcessState* ProcessState::singleton() {
-  if (instance_ == nullptr) {
-    instance_ = new ProcessState;
-  }
-
-  return instance_;
+  static ProcessState* instance = new ProcessState;
+  return instance;
 }
 
 ProcessState::ProcessState() : numa_enabled_(false) {
-  CHECK(instance_ == nullptr);
-}
-
-// Normally the ProcessState singleton is never explicitly deleted.
-// This function is defined for debugging problems with the allocators.
-ProcessState::~ProcessState() {
-  CHECK_EQ(this, instance_);
-  instance_ = nullptr;
-  for (Allocator* a : cpu_allocators_) {
-    delete a;
-  }
 }
 
 string ProcessState::MemDesc::DebugString() {
@@ -72,8 +56,7 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
 }
 
 Allocator* ProcessState::GetCPUAllocator(int numa_node) {
-  CHECK_GE(numa_node, 0);
-  if (!numa_enabled_) numa_node = 0;
+  if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
     // If visitors have been defined we need an Allocator built from
@@ -90,8 +73,9 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
     Allocator* allocator = nullptr;
     SubAllocator* sub_allocator =
         (alloc_visitors_defined || use_bfc_allocator)
-            ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
-                                    cpu_alloc_visitors_, cpu_free_visitors_)
+            ? new BasicCPUAllocator(
+                  numa_enabled_ ? numa_node : port::kNUMANoAffinity,
+                  cpu_alloc_visitors_, cpu_free_visitors_)
             : nullptr;
     if (use_bfc_allocator) {
       // TODO(reedwm): evaluate whether 64GB by default is the best choice.
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index cac312d8496d3d4e454291405bcd16c432af8852..6849d305b3c5577485e83ed7d2e9521dce20a452 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -63,7 +63,7 @@ class ProcessState {
   MemDesc PtrType(const void* ptr);
 
   // Returns the one CPUAllocator used for the given numa_node.
-  // TEMPORARY: ignores numa_node.
+  // Treats numa_node == kNUMANoAffinity as numa_node == 0.
   Allocator* GetCPUAllocator(int numa_node);
 
   // Registers alloc visitor for the CPU allocator(s).
@@ -87,19 +87,19 @@ class ProcessState {
 
   // Helper method for unit tests to reset the ProcessState singleton by
   // cleaning up everything. Never use in production.
-  virtual void TestOnlyReset();
+  void TestOnlyReset();
 
   static ProcessState* instance_;
   bool numa_enabled_;
 
   mutex mu_;
 
+  // Indexed by numa_node.  If we want numa-specific allocators AND a
+  // non-specific allocator, maybe should index by numa_node+1.
   std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
 
-  virtual ~ProcessState();
-
   // Optional RecordingAllocators that wrap the corresponding
   // Allocators for runtime attribute use analysis.
   MDMap mem_desc_map_;
diff --git a/tensorflow/core/common_runtime/session_options.cc b/tensorflow/core/common_runtime/session_options.cc
index aacd57000cfb143a99bc79fa9767a228ed31ef0b..57c3b605575b925e6f4a131f076cfe6f25c92fc1 100644
--- a/tensorflow/core/common_runtime/session_options.cc
+++ b/tensorflow/core/common_runtime/session_options.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index a70ab93d4ad7f7aac1221560c7fc124f2e5a29ed..49265445659ff1daa30b632f60c03845d4a6a7f7 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -139,7 +139,7 @@ void NodeExecStatsWrapper::SetScheduled(int64 nanos) {
 }
 
 void NodeExecStatsWrapper::SetMemory(OpKernelContext* ctx) {
-  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
+  for (const auto& allocator_pair : ctx->ConsumeWrappedAllocators()) {
     AddAllocation(allocator_pair.first, allocator_pair.second);
   }
   auto* ms = stats_->mutable_memory_stats();
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 4a1bb44f4b899d5c77e9f364ad10325e1487344d..7d34383ce8209c9f4b889410a96bce02f6702a64 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -74,8 +74,7 @@ class NodeExecStatsInterface {
   // Records information about the memory allocated during the execution of this
   // node.
   //
-  // Takes ownership of the `TrackingAllocator` objects in
-  // `ctx->wrapped_allocators()`.
+  // Takes ownership of any `TrackingAllocator` objects stored in `ctx`.
   virtual void SetMemory(OpKernelContext* ctx) = 0;
 
   // Records information about the tensor produced by this node at the given
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 6404d8bc6a209997afbe33c547679ebb2cb5cbf5..ca7ca5443c954a6cdcb5d25324ea84163bb4291e 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -93,7 +93,7 @@ Status ThreadPoolDevice::MakeTensorFromProto(
     Tensor* tensor) {
   if (tensor_proto.dtype() > 0 && tensor_proto.dtype() <= DataType_MAX) {
     Tensor parsed(tensor_proto.dtype());
-    if (parsed.FromProto(cpu_allocator(), tensor_proto)) {
+    if (parsed.FromProto(allocator_, tensor_proto)) {
       *tensor = std::move(parsed);
       return Status::OK();
     }
diff --git a/tensorflow/core/common_runtime/threadpool_device_factory.cc b/tensorflow/core/common_runtime/threadpool_device_factory.cc
index 6a900c02c00e976fdef2e4b5f6673f27affb3069..c06a4035a75d0564760481507fddf73e1f8c206a 100644
--- a/tensorflow/core/common_runtime/threadpool_device_factory.cc
+++ b/tensorflow/core/common_runtime/threadpool_device_factory.cc
@@ -18,7 +18,9 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -28,8 +30,7 @@ class ThreadPoolDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
                        std::vector<Device*>* devices) override {
-    // TODO(zhifengc/tucker): Figure out the number of available CPUs
-    // and/or NUMA configuration.
+    int num_numa_nodes = port::NUMANumNodes();
     int n = 1;
     auto iter = options.config.device_count().find("CPU");
     if (iter != options.config.device_count().end()) {
@@ -37,8 +38,26 @@ class ThreadPoolDeviceFactory : public DeviceFactory {
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:CPU:", i);
-      devices->push_back(new ThreadPoolDevice(
-          options, name, Bytes(256 << 20), DeviceLocality(), cpu_allocator()));
+      ThreadPoolDevice* tpd = nullptr;
+      if (options.config.experimental().use_numa_affinity()) {
+        int numa_node = i % num_numa_nodes;
+        if (numa_node != i) {
+          LOG(INFO) << "Only " << num_numa_nodes
+                    << " NUMA nodes visible in system, "
+                    << " assigning device " << name << " to NUMA node "
+                    << numa_node;
+        }
+        DeviceLocality dev_locality;
+        dev_locality.set_numa_node(numa_node);
+        tpd = new ThreadPoolDevice(
+            options, name, Bytes(256 << 20), dev_locality,
+            ProcessState::singleton()->GetCPUAllocator(numa_node));
+      } else {
+        tpd = new ThreadPoolDevice(
+            options, name, Bytes(256 << 20), DeviceLocality(),
+            ProcessState::singleton()->GetCPUAllocator(port::kNUMANoAffinity));
+      }
+      devices->push_back(tpd);
     }
 
     return Status::OK();
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 37029f3f1a797f8879a5475acc53d17840768a4e..818324746f9773d79e20348181ac4d25cfc2ad71 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -15,7 +15,7 @@ filegroup(
     ]),
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
@@ -189,7 +189,7 @@ cc_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "worker",
     srcs = ["worker.cc"],
     hdrs = [
@@ -204,6 +204,7 @@ cc_library(
         ":worker_interface",
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:device_tracer",
         "//tensorflow/core:lib_internal",
     ],
 )
@@ -466,6 +467,17 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "server_lib_test",
+    srcs = ["server_lib_test.cc"],
+    deps = [
+        ":server_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "rpc_collective_executor_mgr",
     srcs = ["rpc_collective_executor_mgr.cc"],
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index b8af63724aa1dbe1a20dbc18bd6115c9aab78a0c..5b0a420fada133a18549f43c222aba1fc6411875 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/host_info.h"
 
 namespace tensorflow {
 namespace eager {
@@ -152,20 +153,19 @@ Status EagerServiceImpl::ExecuteOp(const Operation& operation,
   std::unique_ptr<tensorflow::EagerOperation> op;
   const char* name = operation.name().c_str();  // Shorthand
   const tensorflow::AttrTypeMap* types;
-  auto status = tensorflow::AttrTypeMapForOp(name, &types);
-  if (status.ok()) {
-    op.reset(
-        new tensorflow::EagerOperation(server_context->Context(), name, types));
-  } else if (errors::IsNotFound(status)) {
-    if (server_context->Context()->FindFunctionByName(name)) {
-      op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
-                                              nullptr));
-    } else {
-      return status;
-    }
-  } else {
-    return status;
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp(name, &types, &is_function));
+  if (is_function && !server_context->Context()->FindFunctionByName(name)) {
+    return errors::NotFound(
+        "'", name,
+        "' is neither a type of a primitive operation nor a name "
+        "of a function registered in binary running on ",
+        port::Hostname(),
+        ". Make sure the operation or function is "
+        "registered in the binary running in this process.");
   }
+  op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
+                                          is_function, types));
 
   TF_RETURN_IF_ERROR(op->SetDevice(operation.device().c_str()));
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index 5c9b33b345b8b3f8efec8ac14720a11867e1d5cd..5ba522c2a2e9ce650b7823bbb2d4959531874d98 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -345,8 +345,7 @@ TEST_F(EagerServiceImplTest, SendTensorTest) {
       response.context_id(), RemoteTensorHandleInternal(2, 0), &tensor_handle));
   TF_ASSERT_OK(tensor_handle->Tensor(&t));
 
-  Device* device = nullptr;
-  TF_ASSERT_OK(tensor_handle->Device(&device));
+  Device* device = tensor_handle->device();
   EXPECT_NE(device, nullptr);
   EXPECT_EQ(device->name(), "/job:localhost/replica:0/task:0/device:CPU:0");
 
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 8e9eec1ed926fb72887ec50e58ae8e505abad807..bc8ba6e47d5c66eab72eacd1f4d9a65a4b9cae6c 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/master_session.h"
 
+#include <memory>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -64,27 +65,33 @@ namespace tensorflow {
 class MasterSession::ReffedClientGraph : public core::RefCounted {
  public:
   ReffedClientGraph(const string& handle, const BuildGraphOptions& bopts,
-                    std::unique_ptr<ClientGraph> cg,
+                    std::unique_ptr<ClientGraph> client_graph,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
                     bool is_partial, WorkerCacheInterface* worker_cache,
                     bool should_deregister)
       : session_handle_(handle),
         bg_opts_(bopts),
-        client_graph_(std::move(cg)),
+        client_graph_before_register_(std::move(client_graph)),
         session_opts_(session_opts),
         is_partial_(is_partial),
         callable_opts_(bopts.callable_options),
         worker_cache_(worker_cache),
-        should_deregister_(should_deregister) {
+        should_deregister_(should_deregister),
+        collective_graph_key_(
+            client_graph_before_register_->collective_graph_key) {
     VLOG(1) << "Created ReffedClientGraph for node with "
-            << client_graph()->graph.num_node_ids();
+            << client_graph_before_register_->graph.num_node_ids();
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
     // Initialize a name to node map for processing device stats.
-    for (Node* n : client_graph_->graph.nodes()) {
-      name_to_node_.insert({n->name(), n});
+    for (Node* n : client_graph_before_register_->graph.nodes()) {
+      name_to_node_details_.emplace(
+          n->name(),
+          NodeDetails(n->type_string(),
+                      strings::StrCat(
+                          "(", str_util::Join(n->requested_inputs(), ", "))));
     }
   }
 
@@ -98,12 +105,12 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     }
   }
 
-  const ClientGraph* client_graph() { return client_graph_.get(); }
-
   const CallableOptions& callable_options() { return callable_opts_; }
 
   const BuildGraphOptions& build_graph_options() { return bg_opts_; }
 
+  int64 collective_graph_key() { return collective_graph_key_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
@@ -187,7 +194,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   // Partitions the graph into subgraphs and registers them on
   // workers.
-  Status RegisterPartitions(const PartitionOptions& popts);
+  Status RegisterPartitions(PartitionOptions popts);
 
   // Runs one step of all partitions.
   Status RunPartitions(const MasterEnv* env, int64 step_id,
@@ -214,29 +221,28 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                       const RunState* run_state,
                       GraphExecutionState* execution_state);
 
-  string DetailText(const Node& node, const NodeExecStats& ns) {
-    int64 tot = 0;
-    for (auto& no : ns.output()) {
-      tot += no.tensor_description().allocation_description().requested_bytes();
-    }
-    string bytes;
-    if (tot >= 0.1 * 1048576.0) {
-      bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
-    }
-    return strings::StrCat(bytes, node.name(), " = ", node.type_string(), "(",
-                           str_util::Join(node.requested_inputs(), ", "), ")");
-  }
-
  private:
   const string session_handle_;
   const BuildGraphOptions bg_opts_;
-  const std::unique_ptr<ClientGraph> client_graph_;
+
+  // NOTE(mrry): This pointer will be null after `RegisterPartitions()` returns.
+  std::unique_ptr<ClientGraph> client_graph_before_register_ GUARDED_BY(mu_);
   const SessionOptions session_opts_;
   const bool is_partial_;
   const CallableOptions callable_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
-  std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
+
+  struct NodeDetails {
+    explicit NodeDetails(string type_string, string detail_text)
+        : type_string(std::move(type_string)),
+          detail_text(std::move(detail_text)) {}
+    const string type_string;
+    const string detail_text;
+  };
+  std::unordered_map<string, NodeDetails> name_to_node_details_;
+
   const bool should_deregister_;
+  const int64 collective_graph_key_;
   std::atomic<int64> execution_count_ = {0};
 
   // Graph partitioned into per-location subgraphs.
@@ -268,9 +274,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   mutable mutex mu_;
 
   // Partition initialization and registration only needs to happen
-  // once. init_started_ && !init_done_ indicates the initialization
-  // is on going.
-  bool init_started_ GUARDED_BY(mu_) = false;
+  // once. `!client_graph_before_register_ && !init_done_.HasBeenNotified()`
+  // indicates the initialization is ongoing.
   Notification init_done_;
 
   // init_result_ remembers the initialization error if any.
@@ -278,6 +283,19 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   std::unique_ptr<StatsPublisherInterface> stats_publisher_;
 
+  string DetailText(const NodeDetails& details, const NodeExecStats& stats) {
+    int64 tot = 0;
+    for (auto& no : stats.output()) {
+      tot += no.tensor_description().allocation_description().requested_bytes();
+    }
+    string bytes;
+    if (tot >= 0.1 * 1048576.0) {
+      bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
+    }
+    return strings::StrCat(bytes, stats.node_name(), " = ",
+                           details.type_string, details.detail_text);
+  }
+
   // Send/Recv nodes that are the result of client-added
   // feeds and fetches must be tracked so that the tensors
   // can be added to the local rendezvous.
@@ -286,7 +304,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   // The actual graph partitioning and registration implementation.
   Status DoBuildPartitions(
-      PartitionOptions pots,
+      PartitionOptions popts, ClientGraph* client_graph,
       std::unordered_map<string, GraphDef>* out_partitions);
   Status DoRegisterPartitions(
       const PartitionOptions& popts,
@@ -311,14 +329,20 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 };
 
 Status MasterSession::ReffedClientGraph::RegisterPartitions(
-    const PartitionOptions& popts) {
+    PartitionOptions popts) {
   {  // Ensure register once.
     mu_.lock();
-    if (!init_started_) {
-      init_started_ = true;
+    if (client_graph_before_register_) {
+      // The `ClientGraph` is no longer needed after partitions are registered.
+      // Since it can account for a large amount of memory, we consume it here,
+      // and it will be freed after concluding with registration.
+
+      std::unique_ptr<ClientGraph> client_graph;
+      std::swap(client_graph_before_register_, client_graph);
       mu_.unlock();
       std::unordered_map<string, GraphDef> graph_defs;
-      Status s = DoBuildPartitions(popts, &graph_defs);
+      popts.flib_def = client_graph->flib_def.get();
+      Status s = DoBuildPartitions(popts, client_graph.get(), &graph_defs);
       if (s.ok()) {
         // NOTE(mrry): The pointers in `graph_defs_for_publishing` do not remain
         // valid after the call to DoRegisterPartitions begins, so
@@ -394,19 +418,19 @@ void MasterSession::ReffedClientGraph::TrackFeedsAndFetches(
 }
 
 Status MasterSession::ReffedClientGraph::DoBuildPartitions(
-    PartitionOptions popts,
+    PartitionOptions popts, ClientGraph* client_graph,
     std::unordered_map<string, GraphDef>* out_partitions) {
   if (popts.need_to_record_start_times) {
     CostModel cost_model(true);
-    cost_model.InitFromGraph(client_graph()->graph);
+    cost_model.InitFromGraph(client_graph->graph);
     // TODO(yuanbyu): Use the real cost model.
     // execution_state_->MergeFromGlobal(&cost_model);
-    SlackAnalysis sa(&client_graph()->graph, &cost_model);
+    SlackAnalysis sa(&client_graph->graph, &cost_model);
     sa.ComputeAsap(&popts.start_times);
   }
 
   // Partition the graph.
-  return Partition(popts, &client_graph_->graph, out_partitions);
+  return Partition(popts, &client_graph->graph, out_partitions);
 }
 
 Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
@@ -415,7 +439,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
   partitions_.reserve(graph_partitions.size());
   Status s;
   for (auto& name_def : graph_partitions) {
-    partitions_.resize(partitions_.size() + 1);
+    partitions_.emplace_back();
     Part* part = &partitions_.back();
     part->name = name_def.first;
     TrackFeedsAndFetches(part, name_def.second, popts);
@@ -449,7 +473,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
     *c->req.mutable_debug_options() =
         callable_opts_.run_options().debug_options();
-    c->req.set_collective_graph_key(client_graph()->collective_graph_key);
+    c->req.set_collective_graph_key(collective_graph_key_);
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -915,8 +939,8 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
       ph->RecordOneOp(dev_name, ns, true /*is_copy*/, "", ns.node_name(),
                       ns.timeline_label());
     } else {
-      const Node* node = name_to_node_[ns.node_name()];
-      const bool found_node_in_graph = node != nullptr;
+      auto iter = name_to_node_details_.find(ns.node_name());
+      const bool found_node_in_graph = iter != name_to_node_details_.end();
       if (!found_node_in_graph && ns.timeline_label().empty()) {
         // The counter incrementing is not thread-safe. But we don't really
         // care.
@@ -930,13 +954,13 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
         }
         continue;
       }
-      string optype =
-          found_node_in_graph ? node->type_string() : ns.node_name();
+      const string& optype =
+          found_node_in_graph ? iter->second.type_string : ns.node_name();
       string details;
       if (!ns.timeline_label().empty()) {
         details = ns.timeline_label();
       } else if (found_node_in_graph) {
-        details = DetailText(*node, ns);
+        details = DetailText(iter->second, ns);
       } else {
         // Leave details string empty
       }
@@ -1545,14 +1569,13 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   // Registers subgraphs if haven't done so.
   PartitionOptions popts;
   popts.node_to_loc = SplitByWorker;
-  // The closures potps.{new_name,get_incarnation} are called synchronously in
+  // The closures popts.{new_name,get_incarnation} are called synchronously in
   // RegisterPartitions() below, so do not need a Ref()/Unref() pair to keep
   // "this" alive during the closure.
   popts.new_name = [this](const string& prefix) {
     mutex_lock l(mu_);
     return strings::StrCat(prefix, "_S", next_node_id_++);
   };
-  popts.flib_def = rcg->client_graph()->flib_def.get();
   popts.get_incarnation = [this](const string& name) -> int64 {
     Device* d = devices_->FindDeviceByName(name);
     if (d == nullptr) {
@@ -1580,7 +1603,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
     popts.need_to_record_start_times = true;
   }
 
-  TF_RETURN_IF_ERROR(rcg->RegisterPartitions(popts));
+  TF_RETURN_IF_ERROR(rcg->RegisterPartitions(std::move(popts)));
 
   return Status::OK();
 }
@@ -1784,10 +1807,10 @@ Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
   Status s = run_status;
   if (s.ok()) {
     pss->end_micros = Env::Default()->NowMicros();
-    if (rcg->client_graph()->collective_graph_key !=
+    if (rcg->collective_graph_key() !=
         BuildGraphOptions::kNoCollectiveGraphKey) {
-      env_->collective_executor_mgr->RetireStepId(
-          rcg->client_graph()->collective_graph_key, step_id);
+      env_->collective_executor_mgr->RetireStepId(rcg->collective_graph_key(),
+                                                  step_id);
     }
     // Schedule post-processing and cleanup to be done asynchronously.
     rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
@@ -1846,7 +1869,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = NewStepId(rcg->client_graph()->collective_graph_key);
+  uint64 step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);
 
   std::unique_ptr<ProfileHandler> ph;
@@ -1854,6 +1877,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
                                 &cancellation_manager_, false);
+
   cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
   return PostRunCleanup(rcg, step_id, req.options(), &pss, ph, s,
                         resp->mutable_metadata());
@@ -1910,7 +1934,7 @@ Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
   // Prepare.
   int64 count = rcg->get_and_increment_execution_count();
 
-  const uint64 step_id = NewStepId(rcg->client_graph()->collective_graph_key);
+  const uint64 step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);
 
   const RunOptions& run_options = rcg->callable_options().run_options();
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 4a10d99a6070d18acc127a519e0b1b852bc82497..d122016d3ee9ba8d152b430f0f9a62bb95e417d0 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -87,6 +87,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache_logger",
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index 181422118cd9f01658c1601a1779355f127c6fac..3626a48171e0b628b2630c35a17826b8713dc9d1 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -40,7 +40,7 @@ class GrpcEagerClient : public EagerClient {
       override {                                                          \
     new RPCState<protobuf::Message>(                                      \
         &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \
-        response, std::move(done), nullptr);                              \
+        response, std::move(done), nullptr, nullptr);                     \
   }
 
   CLIENT_METHOD(CreateContext);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 6008462d0448130ed05393dd438d01002d243167..2daefcb399c79324f80278340967b679be5c6574 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -30,9 +30,11 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/protobuf/transport_options.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
@@ -41,10 +43,12 @@ class GrpcRemoteWorker : public WorkerInterface {
  public:
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
+                            thread::ThreadPool* callback_threadpool,
                             WorkerCacheLogger* logger)
       : channel_(std::move(channel)),
         stub_(channel_),
         cq_(completion_queue),
+        callback_threadpool_(callback_threadpool),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
         deleteworkersession_(Method(GrpcWorkerMethod::kDeleteWorkerSession)),
@@ -121,7 +125,44 @@ class GrpcRemoteWorker : public WorkerInterface {
 
   void RecvBufAsync(CallOptions* call_opts, const RecvBufRequest* request,
                     RecvBufResponse* response, StatusCallback done) override {
-    IssueRequest(request, response, recvbuf_, std::move(done), call_opts);
+    int64 start_usec = Env::Default()->NowMicros();
+    // Type-specialized logging for this method.
+    bool logging_active = logger_->LoggingActive() || VLOG_IS_ON(2);
+    StatusCallback wrapper_done;
+    const StatusCallback* cb_to_use;
+    if (!logging_active) {
+      cb_to_use = &done;  // No additional work to do, so just use done directly
+    } else {
+      wrapper_done = [this, request, response, done, start_usec](Status s) {
+        if (logger_->LoggingActive()) {
+          int64 end_usec = Env::Default()->NowMicros();
+          int64 step_id = request->step_id();
+          RecvBufRespExtra extra;
+          response->transport_options().UnpackTo(&extra);
+          int64 num_bytes = 0;
+          for (const auto& chunk : extra.tensor_content()) {
+            num_bytes += chunk.size();
+          }
+          int64 send_start_usec = start_usec;
+          // Prefer start time reported by the sender, if available.
+          if (response->send_start_micros()) {
+            send_start_usec = std::max(
+                start_usec, static_cast<int64>(response->send_start_micros()));
+            send_start_usec = std::min(send_start_usec, end_usec - 1);
+          }
+          const string& key = request->buf_rendezvous_key();
+          logger_->RecordDataTransfer(
+              step_id, send_start_usec, end_usec, key, request->src_device(),
+              request->dst_device(), num_bytes, "", "RecvBuf");
+        }
+        VLOG(2) << "done callback, req: " << request->DebugString()
+                << " response " << response->DebugString();
+        done(s);
+      };
+      cb_to_use = &wrapper_done;
+    }
+
+    IssueRequest(request, response, recvbuf_, *cb_to_use, call_opts);
   }
 
   void CompleteGroupAsync(CallOptions* call_opts,
@@ -220,13 +261,15 @@ class GrpcRemoteWorker : public WorkerInterface {
                     protobuf::Message* response, const ::grpc::string& method,
                     StatusCallback done, CallOptions* call_opts = nullptr) {
     new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
-                                    std::move(done), call_opts);
+                                    std::move(done), call_opts,
+                                    callback_threadpool_);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
                     CallOptions* call_opts = nullptr) {
     new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
-                                 std::move(done), call_opts);
+                                 std::move(done), call_opts,
+                                 callback_threadpool_);
   }
 
   // Helper function for initializing the RpcMethod objects below.
@@ -235,6 +278,7 @@ class GrpcRemoteWorker : public WorkerInterface {
   SharedGrpcChannelPtr channel_;
   ::grpc::GenericStub stub_;
   ::grpc::CompletionQueue* cq_;
+  thread::ThreadPool* callback_threadpool_;
 
   const ::grpc::string getstatus_;
   const ::grpc::string createworkersession_;
@@ -260,8 +304,10 @@ class GrpcRemoteWorker : public WorkerInterface {
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
+                                     thread::ThreadPool* callback_threadpool,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
+  return new GrpcRemoteWorker(std::move(channel), completion_queue,
+                              callback_threadpool, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index b85c1dc5b4e592e621ee96853dd724440ad9b4bd..d1f0e94ba52d81451a1085804cf01375f4d2fb57 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -19,18 +19,19 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace grpc {
 class CompletionQueue;
 }
 
 namespace tensorflow {
-
 class WorkerCacheLogger;
 class WorkerInterface;
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
+                                     thread::ThreadPool* callback_threadpool,
                                      WorkerCacheLogger* logger);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index cde6b785dc6e351ba0d51bef9b23d6bd05742320..4f5975bbc11a6217355c1fcf368996a0fca45969 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -206,11 +206,11 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
 
   int index = call->index();
   // This object will delete itself when done.
-  new RPCState<string>(get_stub(index), &completion_queue_,
-                       *get_method_ptr(index), call->request(),
-                       call->response(),
-                       /*done=*/[call](const Status& s) { call->Done(s); },
-                       call->call_opts(), fail_fast_, timeout_in_ms_);
+  new RPCState<string>(
+      get_stub(index), &completion_queue_, *get_method_ptr(index),
+      call->request(), call->response(),
+      /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(),
+      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 63d438c615567ed44157af0a548b923718ad2654..ae722fdfe9559f1be6727f2e08c4d0aa5728a654 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -194,7 +194,7 @@ Status GrpcServer::Init(
   MaybeMutateBuilder(&builder);
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
-  worker_impl_ = worker_func ? worker_func(&worker_env_)
+  worker_impl_ = worker_func ? worker_func(&worker_env_, config)
                              : NewGrpcWorker(&worker_env_, config);
   worker_service_ =
       NewGrpcWorkerService(worker_impl_.get(), &builder).release();
@@ -451,7 +451,11 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr, nullptr));
+  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  if (!s.ok()) {
+    LOG(ERROR) << s;
+    return s;
+  }
   *out_server = std::move(ret);
   return Status::OK();
 }
@@ -462,7 +466,11 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr, nullptr));
+  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  if (!s.ok()) {
+    LOG(ERROR) << s;
+    return s;
+  }
   *out_server = std::move(ret);
   return Status::OK();
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 7979e96d3edbf955eb93eb27b30e435b875bcfc7..c1395abddebd1af780ade4884b3f5af239c5fb0e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -53,7 +53,8 @@ typedef std::function<void(const WorkerEnv*, ::grpc::ServerBuilder*)>
     ServiceInitFunction;
 
 // function that creates a grpc based worker implementation.
-typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*)>
+typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*,
+                                                  const ConfigProto& config)>
     WorkerCreationFunction;
 
 class GrpcServer : public ServerInterface {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 61c5bc285f2f2e38a39737408a446a84b8442690..b67f3c4563107882a556e83c07ee20ca69b3f3b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/notification.h"
 
 namespace tensorflow {
@@ -36,16 +37,18 @@ class RPCState : public GrpcClientCQTag {
   // Default behavior is to set fail_fast = False and handle timeouts manually.
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const protobuf::Message& request,
-           Response* response, StatusCallback done, CallOptions* call_opts)
+           Response* response, StatusCallback done, CallOptions* call_opts,
+           thread::ThreadPool* threadpool)
       : RPCState(stub, cq, method, request, response, std::move(done),
-                 call_opts, /*fail_fast=*/false, /*timeout_in_ms=*/0) {}
+                 call_opts, threadpool, /*fail_fast=*/false,
+                 /*timeout_in_ms=*/0) {}
 
   template <typename Request>
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const Request& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           bool fail_fast, int64 timeout_in_ms)
-      : call_opts_(call_opts), done_(std::move(done)) {
+           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms)
+      : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)) {
     context_.set_fail_fast(fail_fast);
     if (timeout_in_ms > 0) {
       context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
@@ -77,11 +80,27 @@ class RPCState : public GrpcClientCQTag {
       // to Finish for client-side unary calls, ok should never be false
       s.Update(errors::Internal("unexpected ok value at rpc completion"));
     }
-    if (s.ok() && !GrpcMaybeParseProto(&response_buf_, response_)) {
-      s.Update(errors::Internal("could not parse rpc response"));
-    }
-    if (!s.ok()) {
+
+    if (s.ok()) {
+      if (threadpool_) {
+        // Run parse and callback in another thread, returning this
+        // one to service more RPCs.
+        threadpool_->Schedule([this]() { ParseAndCallDone(); });
+      } else {
+        ParseAndCallDone();
+        return;
+      }
+    } else {
       VLOG(2) << "Call returned with non-ok status: " << s;
+      done_(s);
+      delete this;
+    }
+  }
+
+  void ParseAndCallDone() {
+    Status s;
+    if (!GrpcMaybeParseProto(&response_buf_, response_)) {
+      s.Update(errors::Internal("could not parse rpc response"));
     }
     done_(s);
     delete this;
@@ -90,6 +109,7 @@ class RPCState : public GrpcClientCQTag {
  private:
   CallOptions* call_opts_;
   ::grpc::ClientContext context_;
+  thread::ThreadPool* threadpool_;
   std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
   Response* response_;
   ::grpc::ByteBuffer request_buf_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index e1541db69bfc2471ff1241a0154f442c1fd5511c..60d5881d4ca75a7ea201d592d8668bce7438592e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -43,7 +43,17 @@ class GrpcWorkerCache : public WorkerCachePartial {
         local_worker_(local_worker),
         channel_cache_(channel_cache),
         threads_(kGrpcWorkerCacheThreadCount),
-        next_round_robin_assignment_(0) {}
+        next_round_robin_assignment_(0) {
+    // NOTE: We don't yet have any reason to assign NUMA affinity to this
+    // ThreadPool.  If there's only a single NIC it shouldn't make any
+    // difference since presumably it is handling memory from all nodes.
+    ThreadOptions options;
+    options.numa_node = port::kNUMANoAffinity;
+    const int kNumCallbackThreads = 10;
+    callback_threadpool_.reset(new thread::ThreadPool(
+        Env::Default(), options, "grpc_wcache_callback", kNumCallbackThreads,
+        false /*low_latency_hint*/, nullptr /*allocator*/));
+  }
 
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
@@ -67,7 +77,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
       if (!channel) return nullptr;
       return NewGrpcRemoteWorker(
           channel, threads_[AssignWorkerToThread(target)].completion_queue(),
-          &logger_);
+          callback_threadpool_.get(), &logger_);
     }
   }
 
@@ -138,6 +148,8 @@ class GrpcWorkerCache : public WorkerCachePartial {
   WorkerCacheLogger logger_;
   std::vector<GrpcWorkerCacheThread> threads_;
 
+  std::unique_ptr<thread::ThreadPool> callback_threadpool_;
+
   mutex assignment_mu_;
   std::unordered_map<std::string, size_t> target_assignments_
       GUARDED_BY(assignment_mu_);
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index b8cb5385038ed2c01d15cb5a571cd2d5ec6505c8..9fb920404f987d6b5b324cce4155da40c7e753b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -244,6 +244,15 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
   // Record "call" in active_ so that it can be aborted cleanly.
   RegisterCall(call);
 
+  // RendezvousMgr already aborted, shouldn't send RPC call any more
+  if (!call->status().ok()) {
+    call->done()(call->status(), Args(), Args(), Tensor(), false);
+    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
+    call->wi_ = nullptr;
+    get_call_freelist()->Release(call, session()->worker_cache.get());
+    return;
+  }
+
   // Start "call".
   Ref();
   call->Start([this, call]() {
diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc
index 7d308bb723a71e23482b6f52fa6d8fa53f89dda8..fe9369e884b8e24b31622b82487712ae6f96a6dd 100644
--- a/tensorflow/core/distributed_runtime/server_lib.cc
+++ b/tensorflow/core/distributed_runtime/server_lib.cc
@@ -49,16 +49,22 @@ void ServerFactory::Register(const string& server_type,
 Status ServerFactory::GetFactory(const ServerDef& server_def,
                                  ServerFactory** out_factory) {
   mutex_lock l(*get_server_factory_lock());
-  // TODO(mrry): Improve the error reporting here.
   for (const auto& server_factory : *server_factories()) {
     if (server_factory.second->AcceptsOptions(server_def)) {
       *out_factory = server_factory.second;
       return Status::OK();
     }
   }
+
+  std::vector<string> server_names;
+  for (const auto& server_factory : *server_factories()) {
+    server_names.push_back(server_factory.first);
+  }
+
   return errors::NotFound(
       "No server factory registered for the given ServerDef: ",
-      server_def.DebugString());
+      server_def.DebugString(), "\nThe available server factories are: [ ",
+      str_util::Join(server_names, ", "), " ]");
 }
 
 // Creates a server based on the given `server_def`, and stores it in
diff --git a/tensorflow/core/distributed_runtime/server_lib_test.cc b/tensorflow/core/distributed_runtime/server_lib_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..460372523c98c4e5a1e83be7a025e5911e9b4a8c
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/server_lib_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class TestServerFactory : public ServerFactory {
+ public:
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "test_protocol";
+  }
+
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return Status::OK();
+  }
+};
+
+TEST(ServerLibTest, NewServerFactoryAccepts) {
+  ServerFactory::Register("TEST_SERVER", new TestServerFactory());
+  ServerDef server_def;
+  server_def.set_protocol("test_protocol");
+  std::unique_ptr<ServerInterface> server;
+  TF_EXPECT_OK(NewServer(server_def, &server));
+}
+
+TEST(ServerLibTest, NewServerNoFactoriesAccept) {
+  ServerDef server_def;
+  server_def.set_protocol("fake_protocol");
+  std::unique_ptr<ServerInterface> server;
+  Status s = NewServer(server_def, &server);
+  ASSERT_NE(s, Status::OK());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No server factory registered for the given ServerDef"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "The available server factories are: ["));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 95b31c6991f6344c1b15b1fd28225aef37359818..38833bd20247b62d6f9e56f351c6eb8c8259f96e 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -122,7 +122,9 @@ Status SessionMgr::WorkerSessionForSessionLocked(
     auto it = sessions_.find(session_handle);
     if (it == sessions_.end()) {
       return errors::Aborted("Session handle is not found: ", session_handle,
-                             ". Possibly this worker just restarted.");
+                             ". Possibly this worker (\"",
+                             legacy_session_->worker_name,
+                             "\") just restarted.");
     } else {
       *out_session = it->second;
     }
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 1ea19c48f09170e6044eb9c72b5090dfc2feb703..f42143e5824827e35a97ac25cb80b0e2c82e716e 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
+#include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/tracing.h"
 
 namespace tensorflow {
@@ -179,7 +180,28 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
       request->exec_opts().record_timeline() ||
       request->exec_opts().record_costs()) {
     collector = new StepStatsCollector(response->mutable_step_stats());
-    // TODO(mrry,pbar): GPU tracing for distributed steps.
+  }
+  DeviceTracer* tracer = nullptr;
+  if (collector && request->exec_opts().record_timeline()) {
+    // If timeline was requested, assume we want hardware level tracing.
+    std::unique_ptr<DeviceTracer> trptr = CreateDeviceTracer();
+    if (trptr) {
+      tracer = trptr.release();
+      Status s = tracer->Start();
+      if (!s.ok()) {
+        delete tracer;
+        if (errors::IsUnavailable(s)) {
+          LOG(WARNING)
+              << "Hardware tracing unavailable, continuing without it. " << s;
+          tracer = nullptr;
+        } else {
+          delete collector;
+          delete out;
+          done(s);
+          return;
+        }
+      }
+    }
   }
   CancellationManager* cm = new CancellationManager;
   opts->SetCancelCallback([this, cm, step_id]() {
@@ -194,6 +216,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
     opts->ClearCancelCallback();
     delete cm;
     delete collector;
+    delete tracer;
     delete out;
     done(errors::Aborted("Call was aborted"));
     return;
@@ -201,8 +224,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
   session->graph_mgr->ExecuteAsync(
       request->graph_handle(), step_id, session.get(), request->exec_opts(),
       collector, response, cm, in,
-      [this, step_id, response, session, cm, out, token, collector, opts,
-       done](Status s) {
+      [this, step_id, response, session, cm, out, token, collector, tracer,
+       opts, done](Status s) {
         if (s.ok()) {
           s = session->graph_mgr->RecvOutputs(step_id, out);
         }
@@ -210,6 +233,15 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
         cancellation_manager_.DeregisterCallback(token);
         delete cm;
 
+        if (tracer) {
+          Status tracer_status = tracer->Stop();
+          if (tracer_status.ok()) {
+            tracer_status = tracer->Collect(collector);
+          }
+          if (!tracer_status.ok()) {
+            LOG(ERROR) << "Bad status from tracer: " << tracer_status;
+          }
+        }
         if (s.ok()) {
           for (const auto& p : *out) {
             const string& key = p.first;
@@ -219,6 +251,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
         }
         if (collector) collector->Finalize();
         delete collector;
+        delete tracer;
         delete out;
         done(s);
       });
@@ -405,7 +438,9 @@ Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
     return errors::Aborted(
         "RecvTensor expects a different device incarnation: ",
         parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
-        ". Your worker job was probably restarted. Check your "
+        ". Your worker job (\"",
+        env_->session_mgr->LegacySession()->worker_name,
+        "\") was probably restarted. Check your "
         "worker job for the reason why it was restarted.");
   }
 
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 95ca3c3b4d11fac0d103eb52f19d5b0b2f4ad3ea..e0a1734087061c4c736ff93918fd82945b3742c1 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -101,13 +101,18 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
                                            const string& transfer_method_name) {
   NodeExecStats* ns = new NodeExecStats;
   ns->set_node_name(transfer_method_name);
+  int64 elapsed_usecs = end_usecs - start_usecs;
   if (details.empty()) {
     auto byte_string = strings::StrCat("[", bytes, "B] ");
     if (bytes >= 0.1 * 1048576.0) {
       byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
     }
-    auto label = strings::StrCat(byte_string, tensor_name, " from ", src_device,
-                                 " to ", dst_device);
+    float mbs_rate = (8.0 * static_cast<float>(bytes)) / elapsed_usecs;
+    auto rate_string = (mbs_rate >= 1000.0)
+                           ? strings::Printf("[%.1fGb/s] ", mbs_rate / 1000.0)
+                           : strings::Printf("[%fMb/s] ", mbs_rate);
+    auto label = strings::StrCat(byte_string, rate_string, tensor_name,
+                                 " from ", src_device, " to ", dst_device);
     ns->set_timeline_label(label);
   } else {
     ns->set_timeline_label(details);
@@ -115,13 +120,10 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
 
   ns->set_all_start_micros(start_usecs);
   ns->set_op_start_rel_micros(0);
-  int64 elapsed = end_usecs - start_usecs;
-  ns->set_op_end_rel_micros(elapsed);
-  ns->set_all_end_rel_micros(elapsed);
+  ns->set_op_end_rel_micros(elapsed_usecs);
+  ns->set_all_end_rel_micros(elapsed_usecs);
   NodeOutput* no = ns->add_output();
   no->set_slot(0);
-  // TODO(tucker): Maybe set the dimensions too, but then they'll
-  // need to be passed in.
   no->mutable_tensor_description()
       ->mutable_allocation_description()
       ->set_requested_bytes(bytes);
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 84cee5569c4ac2c0083e4d4970b48460d9bd95ca..89c49a2ad050bfe067e9557aabd2916fba812fb0 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -96,9 +96,11 @@ static int64_t TotalAllocationWarningBytes() {
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
+bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; }
 void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
+bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; }
 
 namespace {
 // A default Allocator for CPU devices.  ProcessState::GetCPUAllocator() will
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 8c23604625ba77a4ca4fa42f96059735ed525f5d..531ea73e89277c83cfede50fce0de08b65c5e5a5 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -383,10 +383,12 @@ Allocator* cpu_allocator();
 // If 'enable' is true, the default CPU allocator implementation will collect
 // AllocatorStats. By default, it's disabled.
 void EnableCPUAllocatorStats(bool enable);
+bool CPUAllocatorStatsEnabled();
 
 // If 'enable' is true, the default CPU allocator implementation will collect
 // full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
+bool CPUAllocatorFullStatsEnabled();
 
 // An object that does the underlying suballoc/free of memory for a higher-level
 // allocator.  The expectation is that the higher-level allocator is doing some
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index e934cbfb54742154467b6edc1d58c56215519c40..7f35390f90c4ffb22e5e8247096812896371b3ad 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1059,7 +1059,7 @@ Status UnknownShape(shape_inference::InferenceContext* c) {
 template <typename T>
 Status ReductionShapeHelper(const Tensor* reduction_indices_t,
                             const int32 input_rank,
-                            std::set<int64>& true_indices) {
+                            std::set<int64>* true_indices) {
   auto reduction_indices = reduction_indices_t->flat<T>();
   for (int i = 0; i < reduction_indices_t->NumElements(); ++i) {
     const T reduction_index = reduction_indices(i);
@@ -1074,7 +1074,7 @@ Status ReductionShapeHelper(const Tensor* reduction_indices_t,
       wrapped_index += input_rank;
     }
 
-    true_indices.insert(wrapped_index);
+    true_indices->insert(wrapped_index);
   }
   return Status::OK();
 }
@@ -1112,10 +1112,10 @@ Status ReductionShape(InferenceContext* c) {
   std::set<int64> true_indices;
   if (reduction_indices_t->dtype() == DataType::DT_INT32) {
     TF_RETURN_IF_ERROR(ReductionShapeHelper<int32>(reduction_indices_t,
-                                                   input_rank, true_indices));
+                                                   input_rank, &true_indices));
   } else if (reduction_indices_t->dtype() == DataType::DT_INT64) {
     TF_RETURN_IF_ERROR(ReductionShapeHelper<int64>(reduction_indices_t,
-                                                   input_rank, true_indices));
+                                                   input_rank, &true_indices));
   } else {
     return errors::InvalidArgument(
         "reduction_indices can only be int32 or int64");
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 284dafb886e6dcdb55da7496b048718cfb190862..fc6b5dde0cbe0b6ef1ae3c65171c16001f383b64 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -140,7 +140,7 @@ Status GraphDefBuilderWrapper::AddFunction(SerializationContext* ctx,
             << " the graph. It will not be added again.";
     return Status::OK();
   }
-  if (!ctx->allow_stateful_functions()) {
+  if (!ctx->optimization_only()) {
     TF_RETURN_IF_ERROR(
         EnsureFunctionIsStateless(ctx->flib_def(), function_name));
   }
@@ -203,28 +203,9 @@ bool GraphDefBuilderWrapper::HasAttr(const string& name,
   return HasAttr(op_def, attr_name);
 }
 
-Status DatasetBase::Save(SerializationContext* ctx,
-                         IteratorStateWriter* writer) const {
-  string serialized_graph_def;
-  string output_node;
-  GraphDefBuilder b;
-  DatasetGraphDefBuilder db(&b);
-  Node* node = nullptr;
-  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
-  output_node = node->name();
-  GraphDef graph_def;
-  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-  graph_def.SerializeToString(&serialized_graph_def);
-  TF_RETURN_IF_ERROR(
-      writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
-  TF_RETURN_IF_ERROR(
-      writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
-  return Status::OK();
-}
-
 Status GetDatasetFromVariantTensor(const Tensor& tensor,
                                    DatasetBase** out_dataset) {
-  if (!(tensor.dtype() == DT_VARIANT ||
+  if (!(tensor.dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(tensor.shape()))) {
     return errors::InvalidArgument(
         "Dataset tensor must be a scalar of dtype DT_VARIANT.");
@@ -251,6 +232,47 @@ Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
   return Status::OK();
 }
 
+Status DatasetBase::Save(SerializationContext* ctx,
+                         IteratorStateWriter* writer) const {
+  string serialized_graph_def;
+  string output_node;
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* node = nullptr;
+  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
+  output_node = node->name();
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  graph_def.SerializeToString(&serialized_graph_def);
+  TF_RETURN_IF_ERROR(
+      writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
+  TF_RETURN_IF_ERROR(
+      writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
+  return Status::OK();
+}
+
+Status DatasetBase::DatasetGraphDefBuilder::AddInputDataset(
+    SerializationContext* ctx, const DatasetBase* dataset, Node** output) {
+  Status status = dataset->AsGraphDefInternal(ctx, this, output);
+  if (ctx->optimization_only() && errors::IsUnimplemented(status)) {
+    Tensor t(DT_VARIANT, TensorShape({}));
+    // `StoreDatasetInVariantTensor` will transfer ownership of `dataset`. We
+    // increment the refcount of `dataset` here to retain ownership.
+    dataset->Ref();
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(const_cast<DatasetBase*>(dataset), &t));
+    TF_RETURN_IF_ERROR(AddPlaceholder(t, output));
+    DCHECK_NE(ctx->input_list(), nullptr);
+    ctx->input_list()->emplace_back((*output)->name(), std::move(t));
+    LOG(WARNING)
+        << "Input of " << dataset->DebugString()
+        << " will not be optimized because the dataset does not implement the "
+           "AsGraphDefInternal() method needed to apply optimizations.";
+    return Status::OK();
+  }
+  return status;
+}
+
 void DatasetOpKernel::Compute(OpKernelContext* ctx) {
   DatasetBase* dataset = nullptr;
   MakeDataset(ctx, &dataset);
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 55f0a3456da7af4ae0355fd0bb78038f2fad27fa..9b11449b300ca9ca1949fde4cc7154bc08f664d3 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -30,8 +30,10 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
 
 // Polymorphic datasets should support all primitive TensorFlow
@@ -272,47 +274,75 @@ class StatsAggregator;
 class IteratorContext {
  public:
   struct Params {
-    // Interface to operating system functionality.
-    Env* env;
+    explicit Params(IteratorContext* ctx)
+        : allocator_getter(ctx->allocator_getter()),
+          env(ctx->env()),
+          function_library(ctx->function_library()),
+          lib(ctx->lib()),
+          model(ctx->model()),
+          runner(*(ctx->runner())),
+          runner_threadpool_size(ctx->runner_threadpool_size()),
+          stats_aggregator(ctx->stats_aggregator()) {}
+
+    explicit Params(OpKernelContext* ctx)
+        : env(ctx->env()),
+          lib(ctx->function_library()),
+          runner(*(ctx->runner())) {
+      // NOTE: need reinterpret_cast because function.h forward-declares Device.
+      DeviceBase* device =
+          reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
+      allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
+      thread::ThreadPool* thread_pool =
+          ctx->device()->tensorflow_device_thread_pool();
+      if (thread_pool) {
+        runner_threadpool_size = thread_pool->NumThreads();
+      } else {
+        runner_threadpool_size = port::NumSchedulableCPUs();
+      }
+    }
 
-    // Function call support.
-    std::function<void(std::function<void()>)> runner = nullptr;
+    // The Allocator to be used to allocate the output of an iterator.
+    std::function<Allocator*(AllocatorAttributes)> allocator_getter = nullptr;
 
-    // The `StatsAggregator` object to record statistics about the iterator.
-    std::shared_ptr<StatsAggregator> stats_aggregator = nullptr;
+    // Interface to operating system functionality.
+    Env* env = nullptr;
 
-    // The FunctionLibraryRuntime object to be used to make function calls.
-    FunctionLibraryRuntime* lib = nullptr;
+    // The FunctionLibraryDefinition used to look up user-defined functions.
     std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
 
-    // The Allocator to be used to allocate the output of an iterator.
-    std::function<Allocator*(AllocatorAttributes)> allocator_getter = nullptr;
+    // The FunctionLibraryRuntime object to be used to make function calls.
+    FunctionLibraryRuntime* lib = nullptr;
 
     // If non-null, identifies the object used for performance modeling.
     std::shared_ptr<model::Model> model = nullptr;
+
+    // Function call support.
+    std::function<void(std::function<void()>)> runner = nullptr;
+
+    // Number of threads used for executing user-defined functions.
+    int32 runner_threadpool_size = 0;
+
+    // The `StatsAggregator` object to record statistics about the iterator.
+    std::shared_ptr<StatsAggregator> stats_aggregator = nullptr;
   };
 
+  explicit IteratorContext(IteratorContext* ctx) : params_(Params{ctx}) {}
+
+  explicit IteratorContext(OpKernelContext* ctx) : params_(Params{ctx}) {}
+
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
 
-  explicit IteratorContext(OpKernelContext* ctx) {
-    params_.env = ctx->env();
-    params_.runner = *(ctx->runner());
-    params_.lib = ctx->function_library();
-    // NOTE: must use reinterpret_cast because function.h forward-declares
-    // Device.
-    DeviceBase* device =
-        reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
-    params_.allocator_getter = [device](AllocatorAttributes attrs) {
-      return device->GetAllocator(attrs);
-    };
+  Allocator* allocator(AllocatorAttributes attrs) {
+    return params_.allocator_getter(attrs);
   }
 
-  Env* env() const { return params_.env; }
-
-  std::function<void(std::function<void()>)>* runner() {
-    return &params_.runner;
+  std::function<Allocator*(AllocatorAttributes)> allocator_getter() {
+    return params_.allocator_getter;
   }
 
+  Env* env() const { return params_.env; }
 
   std::shared_ptr<const FunctionLibraryDefinition> function_library() {
     return params_.function_library;
@@ -320,22 +350,18 @@ class IteratorContext {
 
   FunctionLibraryRuntime* lib() { return params_.lib; }
 
-  void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
+  const std::shared_ptr<model::Model>& model() { return params_.model; }
 
-  Allocator* allocator(AllocatorAttributes attrs) {
-    return params_.allocator_getter(attrs);
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
   }
 
-  std::function<Allocator*(AllocatorAttributes)> allocator_getter() {
-    return params_.allocator_getter;
-  }
+  int32 runner_threadpool_size() { return params_.runner_threadpool_size; }
 
   std::shared_ptr<StatsAggregator> stats_aggregator() {
     return params_.stats_aggregator;
   }
 
-  std::shared_ptr<model::Model> model() { return params_.model; }
-
   Params params() { return params_; }
 
  private:
@@ -346,21 +372,21 @@ class IteratorContext {
 class SerializationContext {
  public:
   struct Params {
-    bool allow_stateful_functions = false;
     const FunctionLibraryDefinition* flib_def = nullptr;           // Not owned.
     std::vector<std::pair<string, Tensor>>* input_list = nullptr;  // Not owned.
+    bool optimization_only = false;
   };
 
   explicit SerializationContext(Params params) : params_(std::move(params)) {}
 
-  bool allow_stateful_functions() { return params_.allow_stateful_functions; }
-
   const FunctionLibraryDefinition& flib_def() { return *params_.flib_def; }
 
   std::vector<std::pair<string, Tensor>>* input_list() {
     return params_.input_list;
   }
 
+  bool optimization_only() { return params_.optimization_only; }
+
  private:
   Params params_;
 
@@ -460,6 +486,7 @@ class IteratorBase {
 
  private:
   friend class DatasetBase;  // for access to `AddCleanupFunction`
+  friend class DatasetBaseIterator;  // for access to `node_`
 
   // Registers a cleanup function to be called upon object destruction.
   //
@@ -468,7 +495,11 @@ class IteratorBase {
     cleanup_fns_.push_back(std::move(cleanup_fn));
   }
 
+  // Associates the given performance modeling `Node` with this iterator.
+  void SetNode(std::shared_ptr<model::Node> node) { node_ = node.get(); }
+
   std::vector<std::function<void()>> cleanup_fns_;
+  model::Node* node_ = nullptr;  // Not owned.
 };
 
 // Represents runtime information needed to construct a dataset.
@@ -518,11 +549,10 @@ class DatasetBase : public core::RefCounted {
   Status MakeIterator(IteratorContext* ctx, const string& output_prefix,
                       std::unique_ptr<IteratorBase>* iterator) const {
     *iterator = MakeIteratorInternal(output_prefix);
-    std::shared_ptr<model::Model> model = ctx->model();
-    if (model) {
+    if (const auto& model = ctx->model()) {
       const string& prefix = (*iterator)->prefix();
-      model->AddNode(MakeNodeFactory(ctx, iterator->get()), prefix,
-                     output_prefix);
+      (*iterator)->SetNode(model->AddNode(MakeNodeFactory(ctx, iterator->get()),
+                                          prefix, output_prefix));
       (*iterator)->AddCleanupFunction(
           [model, prefix]() { model->RemoveNode(prefix); });
     }
@@ -558,9 +588,7 @@ class DatasetBase : public core::RefCounted {
    public:
     DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
     Status AddInputDataset(SerializationContext* ctx,
-                           const DatasetBase* dataset, Node** output) {
-      return dataset->AsGraphDefInternal(ctx, this, output);
-    }
+                           const DatasetBase* dataset, Node** output);
   };
 
   // TODO(jsimsa): Consolidate overloading into a single method.
@@ -654,24 +682,32 @@ class DatasetBaseIterator : public IteratorBase {
   // When performance modeling is enabled, this method records the fact that
   // this iterator has produced an element.
   void RecordElement(IteratorContext* ctx) {
-    if (ctx->model()) {
-      ctx->model()->RecordElement(prefix());
+    if (node_) {
+      node_->record_element();
     }
   }
 
   // When performance modeling is enabled, this method records the fact that
   // a thread of this iterator has started work.
   void RecordStart(IteratorContext* ctx, bool stop_output = false) {
-    if (ctx->model()) {
-      ctx->model()->RecordStart(prefix(), stop_output);
+    if (node_) {
+      int64 now_nanos = Env::Default()->NowNanos();
+      if (stop_output && node_->output()) {
+        node_->output()->record_stop(now_nanos);
+      }
+      node_->record_start(now_nanos);
     }
   }
 
   // When performance modeling is enabled, this method records the fact that
   // a thread of this iterator has stopped work.
   void RecordStop(IteratorContext* ctx, bool start_output = false) {
-    if (ctx->model()) {
-      ctx->model()->RecordStop(prefix(), start_output);
+    if (node_) {
+      int64 now_nanos = Env::Default()->NowNanos();
+      node_->record_stop(now_nanos);
+      if (start_output && node_->output()) {
+        node_->output()->record_start(now_nanos);
+      }
     }
   }
 
diff --git a/tensorflow/core/framework/device_base.cc b/tensorflow/core/framework/device_base.cc
index 9108c32942ad65616b246227f2ad84a56ea9eb93..78ace480c4bad66b06f27ca90a1bc5c482c3f00c 100644
--- a/tensorflow/core/framework/device_base.cc
+++ b/tensorflow/core/framework/device_base.cc
@@ -34,14 +34,14 @@ const string& DeviceBase::name() const {
 }
 
 void DeviceBase::set_eigen_cpu_device(Eigen::ThreadPoolDevice* d) {
-  // Eigen::ThreadPoolDevice is a very cheap struct (one pointer and
+  // Eigen::ThreadPoolDevice is a very cheap struct (two pointers and
   // an int).  Therefore, we can afford a pre-allocated array of
   // Eigen::ThreadPoolDevice.  Here, we ensure that
   // Eigen::ThreadPoolDevices in eigen_cpu_devices_ has increasingly
   // larger numThreads.
   for (int i = 1; i <= d->numThreads(); ++i) {
-    eigen_cpu_devices_.push_back(
-        new Eigen::ThreadPoolDevice(d->getPool(), i /* numThreads() */));
+    eigen_cpu_devices_.push_back(new Eigen::ThreadPoolDevice(
+        d->getPool(), i /* numThreads() */, d->allocator()));
   }
 }
 
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index abd0930ca9e8d0d881498093f98762e8ab1d3e5c..6809c27197a3a1e0a6eb075b4a0ee0124468fdb9 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb_text.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -149,8 +150,8 @@ class FunctionInstantiationHelper {
   }
 
   // Builds index for nodes that can be used as node's input arguments.
-  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def,
-                            AttrSlice attr_values) {
+  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def, AttrSlice attr_values,
+                            bool ints_on_device) {
     bool is_type_list;
     DataTypeVector dtypes;
     TF_RETURN_IF_ERROR(
@@ -169,7 +170,11 @@ class FunctionInstantiationHelper {
         strings::StrAppend(&name, "_", i);
       }
       NodeDef* gnode = AddNode(name);
-      gnode->set_op(FunctionLibraryDefinition::kArgOp);
+      if (ints_on_device && dtypes[i] == DataType::DT_INT32) {
+        gnode->set_op(FunctionLibraryDefinition::kDeviceArgOp);
+      } else {
+        gnode->set_op(FunctionLibraryDefinition::kArgOp);
+      }
       AddAttr("T", dtypes[i], gnode);
       AddAttr("index", arg_index, gnode);
       result_.arg_types.push_back(dtypes[i]);
@@ -564,9 +569,11 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
   std::vector<const NodeDef*> ret;
   std::vector<const NodeDef*> body;
   for (const NodeDef* n : nodes) {
-    if (n->op() == FunctionLibraryDefinition::kArgOp) {
+    if (n->op() == FunctionLibraryDefinition::kArgOp ||
+        n->op() == FunctionLibraryDefinition::kDeviceArgOp) {
       arg.push_back(n);
-    } else if (n->op() == FunctionLibraryDefinition::kRetOp) {
+    } else if (n->op() == FunctionLibraryDefinition::kRetOp ||
+               n->op() == FunctionLibraryDefinition::kDeviceRetOp) {
       ret.push_back(n);
     } else {
       body.push_back(n);
@@ -638,10 +645,13 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
+  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
+                        fdef.attr().at("experimental_ints_on_device").b();
+
   FunctionInstantiationHelper helper(get_function, result);
   Status s;
   for (const OpDef::ArgDef& arg_def : sig.input_arg()) {
-    s = helper.BuildInputArgIndex(arg_def, attr_values);
+    s = helper.BuildInputArgIndex(arg_def, attr_values, ints_on_device);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", Print(arg_def));
       return s;
@@ -693,9 +703,6 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
     }
   }
 
-  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
-                        fdef.attr().at("experimental_ints_on_device").b();
-
   // Emits nodes for the function's return values.
   int ret_index = 0;
   for (const OpDef::ArgDef& ret_def : sig.output_arg()) {
@@ -1234,6 +1241,16 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl(
   }
 }
 
+std::vector<string> FunctionLibraryDefinition::ListFunctionNames() const {
+  std::vector<string> function_names;
+  tf_shared_lock l(mu_);
+  function_names.reserve(function_defs_.size());
+  for (const auto& it : function_defs_) {
+    function_names.emplace_back(it.first);
+  }
+  return function_names;
+}
+
 FunctionDefLibrary FunctionLibraryDefinition::ToProto() const {
   FunctionDefLibrary lib;
   tf_shared_lock l(mu_);
@@ -1273,6 +1290,138 @@ GET_ATTR(string)
 GET_ATTR(bool)
 #undef GET_ATTR
 
+namespace {
+
+constexpr char kExperimentalApiImplements[] = "experimental_api_implements";
+
+absl::flat_hash_set<string> ReachableFunctions(
+    const FunctionLibraryDefinition& flib,
+    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+  // Functions that are reachable from the graph.
+  absl::flat_hash_set<string> reachable_funcs;
+
+  // For any functions, if it has attribute "experimental_api_implements" =
+  // "some_interface" and it is reachable, then it means any other
+  // function with same attribute name and value could also be potentially
+  // reachable, eg via experimental_implementation_selector swapping the
+  // nodedef.
+  absl::flat_hash_set<string> reachable_api_interface;
+
+  // Functions might be reachable from the nested function calls, so we keep a
+  // queue of functions that we have to check.
+  gtl::InlinedVector<const FunctionDef*, 4> func_queue;
+
+  // Add reachable and not already processed functions to the functions queue.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && reachable_funcs.find(func_name) == reachable_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Add all the functions that are reachable from the given node to the queue.
+  const auto process_node = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  std::for_each(nodes.begin(), nodes.end(), process_node);
+
+  // Process all reachable functions.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    reachable_funcs.insert(func_name);
+
+    const auto attr_it = func->attr().find(kExperimentalApiImplements);
+    if (attr_it != func->attr().end()) {
+      reachable_api_interface.insert(attr_it->second.s());
+    }
+
+    // Find all the functions called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), process_node);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  for (const auto& func_name : flib.ListFunctionNames()) {
+    const auto& func_def = flib.Find(func_name);
+    const auto attr_it = func_def->attr().find(kExperimentalApiImplements);
+    if (attr_it != func_def->attr().end()) {
+      if (reachable_api_interface.contains(attr_it->second.s())) {
+        reachable_funcs.insert(func_name);
+      }
+    }
+  }
+
+  return reachable_funcs;
+}
+
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib,
+    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, nodes);
+
+  FunctionLibraryDefinition reachable_flib(flib.default_registry(),
+                                           FunctionDefLibrary());
+
+  for (const string& func_name : reachable_funcs) {
+    const FunctionDef* func = flib.Find(func_name);
+    DCHECK_NE(func, nullptr);
+    // That should never fail, because we copy functions from valid flib and use
+    // the same default registry.
+    const Status added = reachable_flib.AddFunctionDef(*func);
+    DCHECK(added.ok());
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef grad;
+      grad.set_function_name(func_name);
+      grad.set_gradient_func(grad_func_name);
+      // It can only fail if function already has a gradient function.
+      const Status added_grad = reachable_flib.AddGradientDef(grad);
+      DCHECK(added_grad.ok());
+    }
+  }
+
+  return reachable_flib;
+}
+
+}  // namespace
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const GraphDef& graph) const {
+  return ReachableFunctionLibraryDefinition(*this, graph.node());
+}
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const FunctionDef& func) const {
+  return ReachableFunctionLibraryDefinition(*this, func.node_def());
+}
+
 void FunctionDefHelper::AttrValueWrapper::InitFromString(StringPiece val) {
   if (val.size() >= 2 && val[0] == '$') {
     proto.set_placeholder(val.data() + 1, val.size() - 1);
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 40ace6ef81589885b2eb025edd337220a8cce545..9cf4b0f4cdf1d4c3604eebcf33bb51274578d73c 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -379,6 +379,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Ops created for function arguments bear the name given by `kArgOp`; those
   // created for return values bear the name given by `kRetOp`.
   static constexpr const char* const kArgOp = "_Arg";
+  static constexpr const char* const kDeviceArgOp = "_DeviceArg";
   static constexpr const char* const kRetOp = "_Retval";
   static constexpr const char* const kDeviceRetOp = "_DeviceRetval";
 
@@ -406,10 +407,18 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
     return function_defs_.size();
   }
 
+  // Returns all the function names in the FunctionLibraryDefinition.
+  std::vector<string> ListFunctionNames() const LOCKS_EXCLUDED(mu_);
+
   const OpRegistryInterface* default_registry() const {
     return default_registry_;
   }
 
+  // Returns a copy of `*this` with only the subset of functions that are
+  // reachable from the nodes of `graph` or `func`.
+  FunctionLibraryDefinition ReachableDefinitions(const GraphDef& graph) const;
+  FunctionLibraryDefinition ReachableDefinitions(const FunctionDef& func) const;
+
  private:
   // Shape inference for functions is handled separately by ShapeRefiner.
 
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 10392a9f32850a32c1a4a8ca273693987f102244..75d45fa2c84ebc340dfb79b76f7b406d7a099c1f 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -1213,6 +1213,17 @@ TEST(FunctionLibraryDefinitionTest, ToProto) {
   EXPECT_EQ(f3->DebugString(), f4->DebugString());
 }
 
+TEST(FunctionLibraryDefinitionTest, FunctionNames) {
+  FunctionDefLibrary proto;
+  *proto.add_function() = test::function::XTimesTwo();
+  *proto.add_function() = test::function::WXPlusB();
+  const FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
+
+  const std::vector<string> function_names = lib_def.ListFunctionNames();
+  const std::vector<string> expected = {"XTimesTwo", "WXPlusB"};
+  EXPECT_EQ(function_names, expected);
+}
+
 TEST(FunctionLibraryDefinitionTest, GetAttr_FuncNoAttr) {
   FunctionDefLibrary proto;
   *proto.add_function() = test::function::XTimesTwo();
@@ -1293,6 +1304,79 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_Gradient) {
   EXPECT_EQ(annotation, false);  // WXPlusB has no custom gradient.
 }
 
+TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) {
+  using ::tensorflow::test::function::GDef;
+  using ::tensorflow::test::function::NDef;
+  using FDH = ::tensorflow::FunctionDefHelper;
+
+  const auto make_simple_fdef = [](const string& name,
+                                   const string& interface_name) {
+    auto func_def = FDH::Create(
+        name, {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+        {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+        /* Mapping between function returns and function node outputs. */
+        {{"z", "output:z:0"}});
+
+    if (!interface_name.empty()) {
+      auto* attr = func_def.mutable_attr();
+      (*attr)["experimental_api_implements"].set_s(interface_name);
+    }
+    return func_def;
+  };
+
+  FunctionDef func_1 = make_simple_fdef("Func1", "");
+  FunctionDef func_2 = make_simple_fdef("Func2", "");
+  FunctionDef func_3 = make_simple_fdef("Func3", "");
+  FunctionDef func_4 = make_simple_fdef("Func4", "api_1");
+  FunctionDef func_5 = make_simple_fdef("Func5", "api_1");
+  FunctionDef func_6 = make_simple_fdef("Func6", "api_2");
+
+  FunctionDef func_2_grad = make_simple_fdef("Func2_grad", "");
+
+  constexpr char kDevice[] = "/device:CPU:0";
+
+  GraphDef graph = GDef(
+      {
+          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+          NDef("x", "Func1", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+                {"Tout", DataTypeSlice{DT_FLOAT}},
+                {"f", FDH::FunctionRef("Func2", {{"T", DT_FLOAT}})}},
+               kDevice),
+          NDef("z", "Func4", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+      },
+      // FunctionLib
+      {func_1, func_2, func_3, func_2_grad, func_4, func_5, func_6});
+
+  // Register custom function gradient after the graph was constructed.
+  GradientDef* func3_grad_def = graph.mutable_library()->add_gradient();
+  func3_grad_def->set_function_name("Func2");
+  func3_grad_def->set_gradient_func("Func2_grad");
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), graph.library());
+
+  // - 'Func1' is called directly from the graph.
+  // - 'Func2' is called indirectly via a PartitionedCall attribute, and it also
+  //   has a custom gradient ('Func2_grad') that must remain in the library.
+  // - 'Func3' is unreachable and has to be removed from the library
+  // - 'Func4' is called directly from the graph
+  // - 'Func5' is not called directly, but it implements same interface as Func4
+  //   which is directly called.
+  // - 'Func6' is not called directly, and the interface it implements has not
+  //   not been called by another nodes in the graph.
+  FunctionLibraryDefinition reachable_flib = flib.ReachableDefinitions(graph);
+  EXPECT_EQ(reachable_flib.num_functions(), 5);
+  EXPECT_TRUE(reachable_flib.Contains("Func1"));
+  EXPECT_TRUE(reachable_flib.Contains("Func2"));
+  EXPECT_TRUE(reachable_flib.Contains("Func2_grad"));
+  EXPECT_FALSE(reachable_flib.Contains("Func3"));
+  EXPECT_TRUE(reachable_flib.Contains("Func4"));
+  EXPECT_TRUE(reachable_flib.Contains("Func5"));
+  EXPECT_FALSE(reachable_flib.Contains("Func6"));
+}
+
 // TODO(skyewm): this could be more thorough
 TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
   // Equal functions
diff --git a/tensorflow/core/framework/kernel_def.proto b/tensorflow/core/framework/kernel_def.proto
index e16c2ae73bd5fb559daa0f1b8ec141479ce3d67a..358621dc0f5cc19d4687d75e97a76b9fafe3325f 100644
--- a/tensorflow/core/framework/kernel_def.proto
+++ b/tensorflow/core/framework/kernel_def.proto
@@ -33,6 +33,11 @@ message KernelDef {
   // won't be used unless the user specifies a "_kernel" attr with
   // value matching this.
   string label = 5;
+
+  // Prioritization of kernel amongst different devices. By default we assume
+  // priority is 0. The higher the priority the better. By default (i.e. if
+  // this is not set), we prefer GPU kernels over CPU.
+  int32 priority = 6;
 }
 
 // A collection of KernelDefs
diff --git a/tensorflow/core/framework/kernel_def_builder.cc b/tensorflow/core/framework/kernel_def_builder.cc
index eb86f18ff06c38860e0c24e60b42326317ddecfb..fcacc3bebbab66449f81e5fa4f3aba2565f3f18e 100644
--- a/tensorflow/core/framework/kernel_def_builder.cc
+++ b/tensorflow/core/framework/kernel_def_builder.cc
@@ -66,6 +66,11 @@ KernelDefBuilder& KernelDefBuilder::Label(const char* label) {
   return *this;
 }
 
+KernelDefBuilder& KernelDefBuilder::Priority(int32 priority) {
+  kernel_def_->set_priority(priority);
+  return *this;
+}
+
 const KernelDef* KernelDefBuilder::Build() {
   KernelDef* r = kernel_def_;
   kernel_def_ = nullptr;
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index 32dd21f94e0edf8b48cd2f710d1cd99038cba122..d74453cf60678d0f07e53190adba4903c120c69a 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -64,6 +64,9 @@ class KernelDefBuilder {
   // "_kernel" attr.  May only be specified once.  Returns *this.
   KernelDefBuilder& Label(const char* label);
 
+  // Specify a priority number for this kernel.
+  KernelDefBuilder& Priority(int32 priority);
+
   // Returns a pointer to a KernelDef with fields set based on the
   // above calls to this instance.
   // Caller takes ownership of the result.
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 5650b4861b930606e12560e16fdd514c21aae5f5..3bd5b725b860ff522dba5be86ef7ab64b387b03e 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -330,8 +330,8 @@ std::shared_ptr<Node> MakeUnknownNode(Node::Args args) {
   return std::make_shared<Unknown>(std::move(args));
 }
 
-void Model::AddNode(Node::Factory factory, const string& name,
-                    const string& output_name) {
+std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
+                                     const string& output_name) {
   // The name captures the sequence of iterators joined by `::`. We use the full
   // sequence as the key in the lookup table, but only the last element of the
   // sequence as the name node.
@@ -357,6 +357,7 @@ void Model::AddNode(Node::Factory factory, const string& name,
     output->add_input(node);
   }
   lookup_table_.insert(std::make_pair(name, node));
+  return node;
 }
 
 void Model::AddProcessingTime(const string& name, int64 delta) {
@@ -441,10 +442,11 @@ void Model::RecordStart(const string& name, bool stop_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
   if (node) {
+    int64 now_nanos = Env::Default()->NowNanos();
     if (stop_output && (*node)->output()) {
-      (*node)->output()->record_stop();
+      (*node)->output()->record_stop(now_nanos);
     }
-    (*node)->record_start();
+    (*node)->record_start(now_nanos);
   }
 }
 
@@ -452,9 +454,10 @@ void Model::RecordStop(const string& name, bool start_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
   if (node) {
-    (*node)->record_stop();
+    int64 now_nanos = Env::Default()->NowNanos();
+    (*node)->record_stop(now_nanos);
     if (start_output && (*node)->output()) {
-      (*node)->output()->record_start();
+      (*node)->output()->record_start(now_nanos);
     }
   }
 }
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 10ecdef586808ad74f70a64cd43167b768ba51f6..24aa5630cc38550789d6184500cff6b0394ecbee 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -18,7 +18,8 @@ limitations under the License.
 #include <list>
 #include <memory>
 #include <string>
-#include <thread>  // (b/114492873): move this include into core/platform
+// TODO(b/114492873): Move this include into core/platform.
+#include <thread>  // NOLINT
 #include <utility>
 #include <vector>
 
@@ -108,8 +109,8 @@ class Node {
 
   using Factory = std::function<std::shared_ptr<Node>(Args)>;
 
-  Node(Args args)
-      : id_(args.id), name_(args.name), output_(std::move(args.output)) {}
+  explicit Node(Args args)
+      : id_(args.id), name_(args.name), output_(args.output.get()) {}
 
   // Adds an input.
   void add_input(std::shared_ptr<Node> node) LOCKS_EXCLUDED(mu_) {
@@ -142,10 +143,7 @@ class Node {
   }
 
   // Returns the node output.
-  std::shared_ptr<Node> output() const LOCKS_EXCLUDED(mu_) {
-    tf_shared_lock l(mu_);
-    return output_;
-  }
+  Node* output() const { return output_; }
 
   // Returns the aggregate processing time.
   int64 processing_time() const LOCKS_EXCLUDED(mu_) {
@@ -160,19 +158,19 @@ class Node {
   }
 
   // Records that a node thread has started executing.
-  void record_start() LOCKS_EXCLUDED(mu_) {
+  void record_start(int64 time_nanos) LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
-    work_start_[std::this_thread::get_id()] = Env::Default()->NowNanos();
+    work_start_[std::this_thread::get_id()] = time_nanos;
   }
 
   // Records that a node thread has stopped executing.
-  void record_stop() LOCKS_EXCLUDED(mu_) {
+  void record_stop(int64 time_nanos) LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
     std::thread::id tid = std::this_thread::get_id();
-    auto start_time = gtl::FindOrNull(work_start_, tid);
-    if (start_time) {
-      processing_time_ += Env::Default()->NowNanos() - *start_time;
-      work_start_.erase(tid);
+    auto iter = work_start_.find(tid);
+    if (iter != work_start_.end()) {
+      processing_time_ += time_nanos - iter->second;
+      work_start_.erase(iter);
     } else {
       LOG(WARNING)
           << "Encountered a stop event that was not preceded by a start event.";
@@ -185,12 +183,6 @@ class Node {
     inputs_.remove(input);
   }
 
-  // Set the node output.
-  void set_output(std::shared_ptr<Node> output) LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    output_ = output;
-  }
-
   // Collects tunable parameters in the subtree rooted in this node.
   void CollectTunableParameters(
       std::vector<std::shared_ptr<Parameter>>* parameters) LOCKS_EXCLUDED(mu_) {
@@ -287,7 +279,10 @@ class Node {
   std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
   std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
   std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
-  std::shared_ptr<Node> output_ GUARDED_BY(mu_);
+
+  // The reference to the output node is not owned so that that deletion of a
+  // node results in recursive deletion of the subtree rooted in the node.
+  Node* const output_;
 };
 
 // InterleaveMany is used to model datasets whose inputs are used to create
@@ -337,8 +332,8 @@ class Model {
   Model() = default;
 
   // Adds a node with the given name and given output.
-  void AddNode(Node::Factory factory, const string& name,
-               const string& output_name) LOCKS_EXCLUDED(mu_);
+  std::shared_ptr<Node> AddNode(Node::Factory factory, const string& name,
+                                const string& output_name) LOCKS_EXCLUDED(mu_);
 
   // Increments the processing time for the given node..
   void AddProcessingTime(const string& name, int64 delta) LOCKS_EXCLUDED(mu_);
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 02e27107fb0c24cc5c472f4f7f678b4e004ec433..53e35f25b28cb3770b52e8f7de54eb0ff4e65d83 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -101,15 +101,9 @@ TEST_P(AsyncKnownRatioTest, Model) {
   std::shared_ptr<Node> source1 =
       model::MakeSourceNode({1, "source1", async_known_many});
   async_known_many->add_input(source1);
-  auto cleanup1 = gtl::MakeCleanup([async_known_many, source1]() {
-    async_known_many->remove_input(source1);
-  });
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", async_known_many});
   async_known_many->add_input(source2);
-  auto cleanup2 = gtl::MakeCleanup([async_known_many, source2]() {
-    async_known_many->remove_input(source2);
-  });
   std::vector<int64> input_times(1, input_time);
   source1->add_processing_time(100);
   EXPECT_EQ(0, async_known_many->ProcessingTime());
@@ -166,19 +160,12 @@ TEST(InterleaveManyTest, Model) {
   std::shared_ptr<Node> meta_source =
       model::MakeSourceNode({1, "meta_source", interleave_many});
   interleave_many->add_input(meta_source);
-  auto cleanup_meta = gtl::MakeCleanup([interleave_many, meta_source]() {
-    interleave_many->remove_input(meta_source);
-  });
   std::shared_ptr<Node> source1 =
       model::MakeSourceNode({1, "source1", interleave_many});
   interleave_many->add_input(source1);
-  auto cleanup1 = gtl::MakeCleanup(
-      [interleave_many, source1]() { interleave_many->remove_input(source1); });
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", interleave_many});
   interleave_many->add_input(source2);
-  auto cleanup2 = gtl::MakeCleanup(
-      [interleave_many, source2]() { interleave_many->remove_input(source2); });
   std::vector<int64> input_times(1, 0);
   interleave_many->add_processing_time(100);
   EXPECT_EQ(100, interleave_many->processing_time());
@@ -210,13 +197,9 @@ TEST_P(KnownRatioTest, Model) {
   std::shared_ptr<Node> source1 =
       model::MakeSourceNode({1, "source1", known_many});
   known_many->add_input(source1);
-  auto cleanup1 = gtl::MakeCleanup(
-      [known_many, source1]() { known_many->remove_input(source1); });
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", known_many});
   known_many->add_input(source2);
-  auto cleanup2 = gtl::MakeCleanup(
-      [known_many, source2]() { known_many->remove_input(source2); });
   std::vector<int64> input_times(1, 0);
   source1->add_processing_time(100);
   EXPECT_EQ(0, known_many->ProcessingTime());
@@ -280,13 +263,9 @@ TEST(UnknownRatioTest, Model) {
   std::shared_ptr<Node> source1 =
       model::MakeSourceNode({1, "source1", unknown_many});
   unknown_many->add_input(source1);
-  auto cleanup1 = gtl::MakeCleanup(
-      [unknown_many, source1]() { unknown_many->remove_input(source1); });
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", unknown_many});
   unknown_many->add_input(source2);
-  auto cleanup2 = gtl::MakeCleanup(
-      [unknown_many, source2]() { unknown_many->remove_input(source2); });
   std::vector<int64> input_times(1, 0);
   unknown_many->add_processing_time(100);
   EXPECT_EQ(100, unknown_many->processing_time());
@@ -315,13 +294,9 @@ TEST(UnknownTest, Model) {
   std::shared_ptr<Node> source1 =
       model::MakeSourceNode({1, "source1", unknown});
   unknown->add_input(source1);
-  auto cleanup1 = gtl::MakeCleanup(
-      [unknown, source1]() { unknown->remove_input(source1); });
   std::shared_ptr<Node> source2 =
       model::MakeSourceNode({2, "source2", unknown});
   unknown->add_input(source2);
-  auto cleanup2 = gtl::MakeCleanup(
-      [unknown, source2]() { unknown->remove_input(source2); });
   std::vector<int64> input_times(1, 0);
   source1->add_processing_time(100);
   EXPECT_EQ(0, unknown->ProcessingTime());
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 578ec7f2e4dbebc8f0f5b3d80b551346523f8d10..95a787b2df02d48f316653ee5059b4f7e80f73e1 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -102,6 +102,10 @@ string SummarizeNodeDef(const NodeDef& node_def) {
   return ret;
 }
 
+string SummarizeAttrs(const NodeDef& node_def) {
+  return SummarizeAttrsHelper(node_def, node_def.device());
+}
+
 string FormatNodeForError(const Node& node) {
   return FormatNodeDefForError(node.def());
 }
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 0ff67554eb3d2b4713c6c329dec2dc814ce28395..f682bb15355550622e8bbe384df790f1022bd630 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -48,6 +48,7 @@ extern const char* const kColocationGroupPrefix;
 // than a text-format proto.
 string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
+string SummarizeAttrs(const NodeDef& node_def);
 
 // Produces a formatted string pattern from the node which can uniquely identify
 // this node upstream to produce an informative error message. The pattern
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 1eb12d3f9539caab98bd967645b56b136a0a945c..e3cb4a40ec5503307813d292f4f538fb8577a25b 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_kernel.h"
 
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -39,9 +39,11 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/platform_strings.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/ptr_util.h"
 
@@ -255,6 +257,9 @@ Status OpKernelConstruction::allocate_persistent(
 
 // OpKernelContext -----------------------------------------------------------
 
+const int OpKernelContext::Params::kNeverForward;
+const int OpKernelContext::Params::kNoReservation;
+
 OpKernelContext::OpKernelContext(Params* params)
     : OpKernelContext(
           params, static_cast<int>(params->op_kernel->output_types().size())) {}
@@ -286,6 +291,13 @@ OpKernelContext::~OpKernelContext() {
     }
   }
   if (params_->record_tensor_accesses) referenced_tensors_.Destroy();
+  if (params_->track_allocations && !wrapped_allocators_.empty()) {
+    LOG(WARNING) << "OpKernelContext is tracking allocations but they are not "
+                 << "being consumed by the StepStatsCollector.";
+    for (auto& wrapped_alloator : wrapped_allocators_) {
+      wrapped_alloator.second->GetRecordsAndUnRef();
+    }
+  }
 }
 
 Allocator* OpKernelContext::get_allocator(AllocatorAttributes attr) {
@@ -914,11 +926,12 @@ void OpKernelContext::clear_recorded_memory() {
 
 struct KernelRegistration {
   KernelRegistration(const KernelDef& d, StringPiece c,
-                     kernel_factory::OpKernelRegistrar::Factory f)
-      : def(d), kernel_class_name(c), factory(f) {}
+                     std::unique_ptr<kernel_factory::OpKernelFactory> f)
+      : def(d), kernel_class_name(c), factory(std::move(f)) {}
+
   const KernelDef def;
   const string kernel_class_name;
-  const kernel_factory::OpKernelRegistrar::Factory factory;
+  std::unique_ptr<kernel_factory::OpKernelFactory> factory;
 };
 
 // This maps from 'op_type' + DeviceType to the set of KernelDefs and
@@ -934,6 +947,44 @@ static const char kKernelLibPattern[] = "libtfkernel*.dylib";
 static const char kKernelLibPattern[] = "libtfkernel*.so";
 #endif
 
+#define FEATURE(x) \
+  { x, #x }
+
+// Returns Status::OK if the dynamic library at the given path is safe to
+// load with some level of confidence.
+static Status IsProbablySafeToLoad(const string& path) {
+  // A map of platform string to required CPU feature.
+  using port::CPUFeature;
+  static const auto* feature_map =
+      new std::map<string, std::pair<CPUFeature, string>>{
+          {"__AVX512VL__=1", FEATURE(CPUFeature::AVX512VL)},
+      };
+
+  std::vector<std::string> platform_strings;
+  int result = GetPlatformStrings(path, &platform_strings);
+  if (result) {
+    return Status(error::Code::UNKNOWN, strerror(result));
+  }
+  if (platform_strings.empty()) {
+    return Status(error::Code::FAILED_PRECONDITION,
+                  "Didn't find any platform strings");
+  }
+  std::vector<std::string> missing_features;
+  for (const auto& platform_string : platform_strings) {
+    const auto& entry = feature_map->find(platform_string);
+    if (entry != feature_map->end() &&
+        !port::TestCPUFeature(entry->second.first)) {
+      missing_features.emplace_back(entry->second.second);
+    }
+  }
+  if (!missing_features.empty()) {
+    string errmsg = "Missing CPU features: ";
+    errmsg.append(str_util::Join(missing_features, ", "));
+    return Status(errors::Code::FAILED_PRECONDITION, errmsg);
+  }
+  return Status::OK();
+}
+
 void LoadDynamicKernelsInternal() {
   Env* env = Env::Default();
   string bazel_kernel_dir = io::JoinPath(env->GetRunfilesDir(),
@@ -944,12 +995,18 @@ void LoadDynamicKernelsInternal() {
   Status s_kernel_dir = env->GetChildren(bazel_kernel_dir, &files);
   if (s_kernel_dir.ok()) {
     string dll_spec = io::JoinPath(bazel_kernel_dir, kKernelLibPattern);
-    for (const auto&  file : files) {
-      string fullpath =  io::JoinPath(bazel_kernel_dir, file);
+    for (const auto& file : files) {
+      string fullpath = io::JoinPath(bazel_kernel_dir, file);
       if (env->MatchPath(fullpath, dll_spec)) {
-        // TODO(gunan): Store the handles to the opened files.
-        void* unused_filehandle;
-        TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
+        Status s = IsProbablySafeToLoad(fullpath);
+        if (s.ok()) {
+          // TODO(gunan): Store the handles to the opened files.
+          void* unused_filehandle;
+          TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
+        } else {
+          LOG(WARNING) << "Not loading plugin library " << fullpath << ": "
+                       << s.error_message();
+        }
       }
     }
   }
@@ -985,7 +1042,7 @@ namespace kernel_factory {
 
 void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def,
                                      StringPiece kernel_class_name,
-                                     Factory factory) {
+                                     std::unique_ptr<OpKernelFactory> factory) {
   // See comments in register_kernel::Name in header for info on _no_register.
   if (kernel_def->op() != "_no_register") {
     const string key =
@@ -1000,8 +1057,8 @@ void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def,
     // program flakily. Until we get rid of static initializers in kernel
     // registration mechanism, we have this workaround here.
     reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry())
-        ->insert(std::make_pair(
-            key, KernelRegistration(*kernel_def, kernel_class_name, factory)));
+        ->emplace(key, KernelRegistration(*kernel_def, kernel_class_name,
+                                          std::move(factory)));
   }
   delete kernel_def;
 }
@@ -1070,7 +1127,8 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
         FormatNodeDefForError(node_def));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
-          &s, " (OpKernel was found, but attributes didn't match)");
+          &s, " (OpKernel was found, but attributes didn't match) ",
+          "Requested Attributes: ", SummarizeAttrs(node_def));
     }
     errors::AppendToMessage(
         &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
@@ -1083,7 +1141,7 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
 
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
-    DeviceTypeVector* device_types) {
+    PrioritizedDeviceTypeVector* prioritized_device_types) {
   // TODO(zhifengc): Changes the callers (SimplePlacer and
   // DynamicPlacer) to consider the possibility that 'def' is call to
   // a user-defined function and only calls this
@@ -1096,12 +1154,21 @@ Status SupportedDeviceTypesForNode(
       bool was_attr_mismatch;
       TF_RETURN_IF_ERROR(
           FindKernelRegistration(device_type, def, &reg, &was_attr_mismatch));
-      if (reg != nullptr) device_types->push_back(device_type);
+      if (reg != nullptr) {
+        int32 priority = reg->def.priority();
+        prioritized_device_types->emplace_back(device_type, priority);
+      }
     }
+    std::sort(prioritized_device_types->begin(),
+              prioritized_device_types->end(),
+              [](const std::pair<DeviceType, int32>& a,
+                 const std::pair<DeviceType, int32>& b) {
+                return a.second > b.second;
+              });
   } else {
     // Assumes that all device types support this node.
     for (const DeviceType& device_type : prioritized_types) {
-      device_types->push_back(device_type);
+      prioritized_device_types->push_back(std::make_pair(device_type, 0));
     }
   }
   return Status::OK();
@@ -1196,7 +1263,8 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
                               FormatNodeDefForError(node_def)));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
-          &s, " (OpKernel was found, but attributes didn't match)");
+          &s, " (OpKernel was found, but attributes didn't match) ",
+          "Requested Attributes: ", SummarizeAttrs(node_def));
     }
     errors::AppendToMessage(
         &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
@@ -1225,7 +1293,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
   OpKernelConstruction context(
       device_type, device, allocator, &node_def, op_def, flib, inputs,
       input_memory_types, outputs, output_memory_types, graph_def_version, &s);
-  *kernel = (*registration->factory)(&context);
+  *kernel = registration->factory->Create(&context);
   if (!s.ok()) {
     delete *kernel;
     *kernel = nullptr;
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 6c71e118c0244e9d9a4159d55d0d499dfa604426..9f4c57e880ad32afac8bfadaf2edd7ba9597f02b 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -982,9 +982,10 @@ class OpKernelContext {
     return params_->output_attr_array[index];
   }
 
-  gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators() const {
+  gtl::InlinedVector<WrappedAllocator, 4> ConsumeWrappedAllocators() {
     mutex_lock lock(mu_);
-    gtl::InlinedVector<WrappedAllocator, 4> retrieved = wrapped_allocators_;
+    gtl::InlinedVector<WrappedAllocator, 4> retrieved;
+    retrieved.swap(wrapped_allocators_);
     return retrieved;
   }
 
@@ -1236,7 +1237,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
 //           * def has all attrs specified (e.g. using AddDefaultsToNodeDef()).
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
-    DeviceTypeVector* device_types);
+    PrioritizedDeviceTypeVector* device_types);
 
 // Returns a message with a description of the kernels registered for op
 // `op_name`.
@@ -1345,22 +1346,55 @@ KernelList GetRegisteredKernelsForOp(StringPiece op_name);
 
 namespace kernel_factory {
 
+// OpKernelFactory is responsible for creating OpKernels when TensorFlow needs
+// them. You register factories with the TensorFlow core by constructing an
+// OpKernelRegistrar and passing the factory as a constructor parameter.
+class OpKernelFactory {
+ public:
+  virtual OpKernel* Create(OpKernelConstruction* context) = 0;
+  virtual ~OpKernelFactory() = default;
+};
+
 class OpKernelRegistrar {
  public:
-  typedef OpKernel* (*Factory)(OpKernelConstruction*);
+  // Registers the given kernel factory with TensorFlow. TF will call the
+  // factory Create() method when it determines that a kernel matching the given
+  // KernelDef is required.
+  OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name,
+                    std::unique_ptr<OpKernelFactory> factory) {
+    // Perform the check in the header to allow compile-time optimization
+    // to a no-op, allowing the linker to remove the kernel symbols.
+    if (kernel_def != nullptr) {
+      InitInternal(kernel_def, kernel_class_name, std::move(factory));
+    }
+  }
 
+  // Registers the given factory function with TensorFlow. This is equivalent
+  // to registering a factory whose Create function invokes `create_fn`.
   OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name,
-                    Factory factory) {
+                    OpKernel* (*create_fn)(OpKernelConstruction*)) {
     // Perform the check in the header to allow compile-time optimization
     // to a no-op, allowing the linker to remove the kernel symbols.
     if (kernel_def != nullptr) {
-      InitInternal(kernel_def, kernel_class_name, factory);
+      struct PtrOpKernelFactory : public OpKernelFactory {
+        explicit PtrOpKernelFactory(
+            OpKernel* (*create_func)(OpKernelConstruction*))
+            : create_func_(create_func) {}
+
+        OpKernel* Create(OpKernelConstruction* context) override {
+          return (*create_func_)(context);
+        }
+
+        OpKernel* (*create_func_)(OpKernelConstruction*);
+      };
+      InitInternal(kernel_def, kernel_class_name,
+                   absl::make_unique<PtrOpKernelFactory>(create_fn));
     }
   }
 
  private:
   void InitInternal(const KernelDef* kernel_def, StringPiece kernel_class_name,
-                    Factory factory);
+                    std::unique_ptr<OpKernelFactory> factory);
 };
 
 }  // namespace kernel_factory
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 83dda6579b784be538f45d9c95be57d412f49668..d8001cd07103f01c57480b62f3d40ff40514af88 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -102,6 +102,27 @@ REGISTER_OP("Test4").Input("i: float").Output("o: float");
 REGISTER_KERNEL_BUILDER(Name("Test4").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("Test4").Device(DEVICE_GPU), DummyKernel);
 
+// Kernels with different priorities.
+REGISTER_OP("Test5").Input("a: T").Input("b: T").Attr("T: type");
+
+class TestOp5Cpu : public tensorflow::OpKernel {
+ public:
+  explicit TestOp5Cpu(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+};
+
+REGISTER_KERNEL_BUILDER(Name("Test5").Device(DEVICE_CPU).Priority(2),
+                        TestOp5Cpu);
+
+class TestOp5Gpu : public tensorflow::OpKernel {
+ public:
+  explicit TestOp5Gpu(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+};
+
+REGISTER_KERNEL_BUILDER(Name("Test5").Device(DEVICE_GPU).Priority(1),
+                        TestOp5Gpu);
+
 static std::vector<DeviceType> DeviceTypes() {
   return {DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)};
 }
@@ -185,10 +206,10 @@ TEST_F(OpKernelTest, SuccessBothCpuAndGpu) {
 
 TEST_F(OpKernelTest, CpuTypeRegistered) {
   NodeDef ndef = CreateNodeDef("Test1", {DT_FLOAT, DT_INT32});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
   EXPECT_EQ(1, devs.size());
-  EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0]);
+  EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
 }
 
 TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
@@ -196,24 +217,24 @@ TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
     // Try a node def of an op that is registered for a specific type
     // only on CPU.
     NodeDef ndef = CreateNodeDef("Test3", {DT_INT8, DT_INT8});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(1, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0]);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
   }
   {
     // Try a node def of an op that is registered for a specific type
     // only on GPU.
     NodeDef ndef = CreateNodeDef("Test3", {DT_FLOAT, DT_FLOAT});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(1, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0]);
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0].first);
   }
   {
     // Try a node def of an op that is only registered for other types.
     NodeDef ndef = CreateNodeDef("Test3", {DT_STRING, DT_STRING});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(0, devs.size());
   }
@@ -221,11 +242,23 @@ TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
   {
     // Try a node def of an op that is registered for both.
     NodeDef ndef = CreateNodeDef("Test4", {DT_FLOAT});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
+    TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
+    EXPECT_EQ(2, devs.size());
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0].first);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[1].first);
+  }
+
+  {
+    // Try a node def of an op where kernels have priorities.
+    NodeDef ndef = CreateNodeDef("Test5", {DT_STRING, DT_STRING});
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(2, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0]);
-    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[1]);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
+    EXPECT_EQ(2, devs[0].second);
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[1].first);
+    EXPECT_EQ(1, devs[1].second);
   }
 }
 
@@ -412,11 +445,11 @@ class OpKernelBuilderTest : public ::testing::Test {
     }
 
     // Test SupportedDeviceTypesForNode()
-    DeviceTypeVector devices;
+    PrioritizedDeviceTypeVector devices;
     TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices));
     bool found = false;
-    for (const DeviceType& dt : devices) {
-      if (dt == device_type) {
+    for (const auto& dt : devices) {
+      if (dt.first == device_type) {
         found = true;
       }
     }
@@ -445,11 +478,11 @@ class OpKernelBuilderTest : public ::testing::Test {
       EXPECT_EQ(code, status.code());
 
       // Test SupportedDeviceTypesForNode().
-      DeviceTypeVector devices;
+      PrioritizedDeviceTypeVector devices;
       if (errors::IsNotFound(status)) {
         TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices));
-        for (const DeviceType& dt : devices) {
-          EXPECT_NE(dt, device_type);
+        for (const auto& dt : devices) {
+          EXPECT_NE(dt.first, device_type);
         }
       } else {
         Status status2 =
@@ -562,7 +595,7 @@ REGISTER_KERNEL_BUILDER(Name("DuplicateKernel").Device(DEVICE_CPU),
 
 TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   const NodeDef ndef = CreateNodeDef("DuplicateKernel", {});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(
@@ -582,7 +615,7 @@ REGISTER_KERNEL_BUILDER(
 TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   const NodeDef ndef =
       CreateNodeDef("DuplicateKernelForT", {"T|type|DT_FLOAT"});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(
@@ -603,7 +636,7 @@ REGISTER_KERNEL_BUILDER(Name("BadConstraint")
 
 TEST_F(OpKernelBuilderTest, BadConstraint) {
   const NodeDef ndef = CreateNodeDef("BadConstraint", {});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 508a8d3149b9f614afc900b528ae5777d0d2f5fc..9f3204ab96050a1cc06ab3052741f0044369b83e 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -204,12 +204,19 @@ Status ResourceMgr::Delete(const ResourceHandle& handle) {
 }
 
 Status ResourceMgr::Cleanup(const string& container) {
+  {
+    tf_shared_lock l(mu_);
+    if (!gtl::FindOrNull(containers_, container)) {
+      // Nothing to cleanup.
+      return Status::OK();
+    }
+  }
   Container* b = nullptr;
   {
     mutex_lock l(mu_);
     auto iter = containers_.find(container);
     if (iter == containers_.end()) {
-      // Nothing to cleanup, it's OK.
+      // Nothing to cleanup, it's OK (concurrent cleanup).
       return Status::OK();
     }
     b = iter->second;
diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc
index 0c4007eafcee22d280747cf9b21630a12a63e961..55790b6e526ea38c45c79bd7e8345bafed90c5d0 100644
--- a/tensorflow/core/framework/run_handler.cc
+++ b/tensorflow/core/framework/run_handler.cc
@@ -92,6 +92,27 @@ class RunHandlerPool::Impl {
       handlers_.emplace_back(new RunHandler::Impl(this));
       free_handlers_.push_back(handlers_.back().get());
     }
+    // Set steal partitions to a fixed size steal domain of size 6 = 2 *
+    // kMinThreadsPerRequest.
+    std::vector<std::pair<unsigned, unsigned>> steal_partitions(
+        num_inter_op_threads);
+    int kStealDomainSize = std::min(6, num_inter_op_threads);
+    unsigned steal_start = 0, steal_end = kStealDomainSize;
+    for (int i = 0; i < num_inter_op_threads; ++i) {
+      if (i > steal_start) {
+        if (steal_end + kStealDomainSize < num_inter_op_threads) {
+          steal_start = steal_end;
+          steal_end += kStealDomainSize;
+        } else {
+          steal_end = num_inter_op_threads;
+          steal_start = steal_end - kStealDomainSize;
+        }
+      }
+      steal_partitions[i] = std::make_pair(steal_start, steal_end);
+      VLOG(1) << "Steal partition i: " << i << " steal_start: " << steal_start
+              << " steal_end: " << steal_end;
+    }
+    inter_op_thread_pool_->SetStealPartitions(steal_partitions);
   }
 
   ~Impl() {
@@ -223,7 +244,9 @@ void RunHandlerPool::Impl::RecomputePoolStatsLocked() {
 void RunHandler::Impl::ScheduleInterOpClosure(std::function<void()> fn) {
   std::uint_fast32_t start = 0, limit = 0;
   DecodePartition(inter_op_scheduling_range(), &start, &limit);
-  pool_impl_->inter_op_thread_pool()->Schedule(std::move(fn));
+  DCHECK_LT(start, limit);
+  pool_impl_->inter_op_thread_pool()->ScheduleWithHint(std::move(fn), start,
+                                                       limit);
 }
 
 void RunHandler::Impl::Reset() {
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 1dea6da9113bab15848eb6be9004bd9f180e518d..c7ddc6c21eda7af94379b07ab3dff8a25021665e 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -752,6 +752,13 @@ Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape,
 Tensor::Tensor(DataType type, const TensorShape& shape)
     : Tensor(cpu_allocator(), type, shape) {}
 
+void Tensor::HostScalarTensorBufferBase::FillAllocationDescription(
+    AllocationDescription* proto) const {
+  proto->set_requested_bytes(size());
+  proto->set_allocator_name("HostScalarTensorBuffer");
+  proto->set_ptr(reinterpret_cast<uintptr_t>(data()));
+}
+
 template <typename T>
 class SubBuffer : public TensorBuffer {
  public:
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index d0f9eb56e236aac2b04174330450e4b4d87b24b3..3177bbe7e93268444bc10f7a2de0bcc447109e39 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_
 
+#include <cstdint>
+#include <type_traits>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -28,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -110,6 +113,76 @@ class Tensor {
   /// for details.
   explicit Tensor(DataType type);
 
+ private:
+  // A tag type for selecting the `Tensor` constructor overload that creates a
+  // scalar tensor in host memory.
+  struct host_scalar_tag {};
+
+  class HostScalarTensorBufferBase;
+  template <typename T>
+  struct ValueAndTensorBuffer;
+
+  // Creates a tensor with the given scalar `value` in CPU memory.
+  template <typename T>
+  Tensor(T value, host_scalar_tag tag);
+
+ public:
+  // A series of specialized constructors for scalar tensors in host memory.
+  //
+  // NOTE: The `Variant` host-scalar constructor is not defined, because Variant
+  // is implicitly constructible from many different types, and this causes
+  // ambiguities with some compilers.
+  explicit Tensor(float scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(double scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int32 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint32 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(string scalar_value)
+      : Tensor(std::move(scalar_value), host_scalar_tag{}) {}
+  explicit Tensor(complex64 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(complex128 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int64 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint64 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(bool scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(qint8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(quint8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(qint16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(quint16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(qint32 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(bfloat16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(Eigen::half scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(ResourceHandle scalar_value)
+      : Tensor(std::move(scalar_value), host_scalar_tag{}) {}
+
+  // NOTE: The `const char*` host-scalar constructor is provided as a
+  // convenience because otherwise passing a string literal would surprisingly
+  // construct a DT_BOOL tensor.
+  explicit Tensor(const char* scalar_value)
+      : Tensor(string(scalar_value), host_scalar_tag{}) {}
+
   /// Copy constructor.
   Tensor(const Tensor& other);
 
@@ -799,6 +872,81 @@ inline Tensor::Tensor(Tensor&& other)
   other.buf_ = nullptr;
 }
 
+class Tensor::HostScalarTensorBufferBase : public TensorBuffer {
+ public:
+  void FillAllocationDescription(AllocationDescription* proto) const final;
+};
+
+// A packed representation for a single scalar value of type `T`, and a
+// `TensorBuffer` implementation that describes (and manages the lifetime of)
+// that value.
+template <typename T>
+struct Tensor::ValueAndTensorBuffer {
+  class HostScalarTensorBuffer : public Tensor::HostScalarTensorBufferBase {
+   public:
+    HostScalarTensorBuffer(void* data) : data_(data) {}
+    void* data() const final { return const_cast<void*>(data_); }
+    size_t size() const final { return sizeof(T); }
+    TensorBuffer* root_buffer() final { return this; }
+
+    // Override `operator delete` so that calling `delete this` in
+    // `core::Refcounted::Unref()` for an object of this type will free
+    // the enclosing `ValueAndTensorBuffer` for the tensor buffer.
+    //
+    // NOTE(mrry): The definition of this method must be outside the class
+    // definition in order to satisfy some compilers.
+    static void operator delete(void* ptr);
+
+    static void operator delete(void*, void*) {
+      // Some compilers require an overridden class-specific deallocation
+      // function, which will be called if placement `new` throws an
+      // exception.
+    }
+
+   private:
+    ~HostScalarTensorBuffer() override { static_cast<T*>(data_)->~T(); }
+    void* const data_;
+  };
+
+  T value;
+  HostScalarTensorBuffer tensor_buffer;
+};
+
+/* static */
+template <typename T>
+void Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer::operator delete(
+    void* ptr) {
+  // Use a dummy object to compute to offset of
+  // `ValueAndTensorBuffer::tensor_buffer`, because `offsetof()` is not
+  // necessarily defined on this non-POD type (until C++17).
+  //
+  // NOTE(mrry): Using `sizeof(Tensor::ValueAndTensorBuffer<T>)` here requires
+  // us to define this method outside the class definition, so that it is not
+  // considered an incomplete type.
+  typename std::aligned_storage<sizeof(Tensor::ValueAndTensorBuffer<T>),
+                                alignof(Tensor::ValueAndTensorBuffer<T>)>::type
+      dummy_storage_;
+  Tensor::ValueAndTensorBuffer<T>* dummy_object =
+      reinterpret_cast<Tensor::ValueAndTensorBuffer<T>*>(&dummy_storage_);
+  intptr_t offset = reinterpret_cast<intptr_t>(&dummy_object->tensor_buffer) -
+                    reinterpret_cast<intptr_t>(dummy_object);
+
+  port::AlignedFree(static_cast<char*>(ptr) - offset);
+}
+
+template <typename T>
+Tensor::Tensor(T value, host_scalar_tag tag) {
+  auto* value_and_buf = static_cast<Tensor::ValueAndTensorBuffer<T>*>(
+      port::AlignedMalloc(sizeof(typename Tensor::ValueAndTensorBuffer<T>),
+                          EIGEN_MAX_ALIGN_BYTES));
+  new (&value_and_buf->value) T(std::move(value));
+  new (&value_and_buf->tensor_buffer)
+      typename Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer(
+          value_and_buf);
+  buf_ = &value_and_buf->tensor_buffer;
+  set_dtype(DataTypeToEnum<T>::value);
+}
+
 inline Tensor& Tensor::operator=(Tensor&& other) {
   // Avoid self-assignment, since we might destroy our underlying buffer.
   if (&other != this) {
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index c5966041435b240a821fc510fa3479a06ca457e9..4fa9d1df6757768c8c6b00b6932ee8e3550ee2f8 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -830,6 +830,45 @@ TEST(Tensor_Scalar, Basics) {
   }
 }
 
+TEST(Tensor_HostScalar, Basics) {
+  {
+    Tensor t(true);
+    EXPECT_EQ(DT_BOOL, t.dtype());
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<bool>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    EXPECT_TRUE(Tt());
+    Tt() = false;
+    EXPECT_FALSE(Tt());
+  }
+  {
+    Tensor t(123.45f);
+    EXPECT_EQ(DT_FLOAT, t.dtype());
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<float>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    EXPECT_FLOAT_EQ(123.45f, Tt());
+    Tt() = 42.0f;
+    EXPECT_FLOAT_EQ(42.0f, Tt());
+  }
+  {
+    // NOTE(mrry): Use long enough strings so that the contents are dynamically
+    // allocated, and the absence of a call to the string destructor would
+    // cause a memory leak.
+    Tensor t("fooooooooooooooooooooooooooooooooooooo");
+    EXPECT_EQ(DT_STRING, t.dtype());
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<string>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    EXPECT_EQ("fooooooooooooooooooooooooooooooooooooo", Tt());
+    Tt() = "baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaar";
+    EXPECT_EQ("baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaar", Tt());
+  }
+}
+
 TEST(Tensor_Float, Reshape_And_Slice_Assignment) {
   // A test to experiment with a way to assign to a subset of a tensor
   Tensor t(DT_FLOAT, TensorShape({10, 4, 3, 2}));
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index a05dea19ec425de6d15df6bcb08ae62d4ab2017b..c0df19334210bb0830371d3d5c2fc4edd0d297bc 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -104,6 +104,8 @@ typedef gtl::InlinedVector<DataType, 4> DataTypeVector;
 typedef gtl::ArraySlice<DataType> DataTypeSlice;
 
 typedef gtl::InlinedVector<DeviceType, 4> DeviceTypeVector;
+typedef gtl::InlinedVector<std::pair<DeviceType, int32>, 4>
+    PrioritizedDeviceTypeVector;
 
 // Convert the enums to strings for errors:
 string DataTypeString(DataType dtype);
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index b9fceb6a31b31237145f78582d459095a6560b19..466310d874279c1a2b3d293021f0cb0cf578c6c5 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -285,6 +285,14 @@ Status Node::input_node(int idx, const Node** const_n) const {
   return Status::OK();
 }
 
+Status Node::input_tensor(int idx, OutputTensor* t) const {
+  const Edge* e;
+  TF_RETURN_IF_ERROR(input_edge(idx, &e));
+  DCHECK(e != nullptr);
+  *t = OutputTensor(e->src(), e->src_output());
+  return Status::OK();
+}
+
 // InputTensor
 
 bool InputTensor::operator==(const InputTensor& other) const {
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 585afa2d0086d884000ffe880bb101eb525fd1f7..6c6d98b5aa284d54cd4f992439f89d3edcd74eb6 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -59,6 +59,7 @@ class EdgeSetTest;
 class Graph;
 class GraphDef;
 class Node;
+struct OutputTensor;
 class VersionDef;
 class WhileContext;
 
@@ -189,6 +190,10 @@ class Node {
   Status input_node(int idx, const Node** n) const;
   Status input_node(int idx, Node** n) const;
 
+  // Returns into '*t' the idx-th input tensor of this node, represented as the
+  // output tensor of input_node(idx).
+  Status input_tensor(int idx, OutputTensor* t) const;
+
   WhileContext* while_ctx() const { return while_ctx_; }
   void set_while_ctx(WhileContext* while_ctx) {
     DCHECK(IsExit());
@@ -287,10 +292,10 @@ class Node {
 
 // Represents an input of a node, i.e., the `index`-th input to `node`.
 struct InputTensor {
-  const Node* node;
+  Node* node;
   int index;
 
-  InputTensor(const Node* n, int i) : node(n), index(i) {}
+  InputTensor(Node* n, int i) : node(n), index(i) {}
   InputTensor() : node(nullptr), index(0) {}
 
   // Returns true if this InputTensor is identical to 'other'. Nodes are
@@ -308,10 +313,10 @@ struct InputTensor {
 // that a single `OutputTensor` can correspond to multiple `Edge`s if the output
 // is consumed by multiple destination nodes.
 struct OutputTensor {
-  const Node* node;
+  Node* node;
   int index;
 
-  OutputTensor(const Node* n, int i) : node(n), index(i) {}
+  OutputTensor(Node* n, int i) : node(n), index(i) {}
   OutputTensor() : node(nullptr), index(0) {}
 
   // Returns true if this OutputTensor is identical to 'other'. Nodes are
@@ -425,9 +430,9 @@ class Graph {
   // Constructs a graph with a single SOURCE (always id kSourceId) and a
   // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK.
   //
-  // The graph can hold ops found in registry. `registry`s lifetime must be at
+  // The graph can hold ops found in the registry. `ops`s lifetime must be at
   // least that of the constructed graph's.
-  explicit Graph(const OpRegistryInterface* registry);
+  explicit Graph(const OpRegistryInterface* ops);
 
   // Constructs a graph with a single SOURCE (always id kSourceId) and a
   // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK.
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1dbcebab598c7230008ab61e1094229bde76b757..9c640c42a5891b632e18517c848cc9a0c76a0f45 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -1186,7 +1187,8 @@ Status Partition(const PartitionOptions& opts, Graph* g,
   for (auto& it : *partitions) {
     GraphDef* gdef = &it.second;
     *gdef->mutable_versions() = g->versions();
-    *gdef->mutable_library() = flib_def->ToProto();
+    // Prune unreachable functions from `flib_def` before adding them to `gdef`.
+    *gdef->mutable_library() = flib_def->ReachableDefinitions(*gdef).ToProto();
 
     // Traverse the graph to fill every send/recv op's incarnation
     // information.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index f44ed47a6e94acdce66c36902cbcf2fdfb041447..29d8034d2a14b6fa2c49b5fa65cb409209b29944 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -470,13 +470,19 @@ TEST_F(GraphPartitionTest, Functions) {
   ConstructOp(in_.WithOpName("A2"), "XTimesTwo", {a1});
   ConstructOp(in_.WithOpName("B2"), "XTimesFour", {b1});
 
+  // The `Partition()` helper function uses the first letter of the op name ('A'
+  // or 'B') to choose a device for each node.
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  // Test that partition graphs inherit function library from original graph
+  // Test that partition graphs inherit function library from original graph.
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  ExpectFunctions(partitions_[a].library(), {"XTimesTwo", "XTimesFour"});
+
+  // Node "A2" is placed in part `a`, and uses only "XTimesTwo".
+  ExpectFunctions(partitions_[a].library(), {"XTimesTwo"});
+  // Node "B2" is placed in part `b`, and uses both "XTimesFour" directly,
+  // and "XTimesTwo" in the body of "XTimesFour".
   ExpectFunctions(partitions_[b].library(), {"XTimesTwo", "XTimesFour"});
 }
 
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index bab1df87a4d3c62b8377363e1ea7a0af33434dc3..990b2fe9b04770dc875b949ec3e17c321fe018be 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -75,6 +75,8 @@ int inline GetTensorMetaDataIndex(int n, int total_tensors) {
 namespace mkl_op_registry {
 static const char* kMklOpLabel = "MklOp";
 static const char* kMklOpLabelPattern = "label='MklOp'";
+static const char* kMklQuantizedOpLabel = "QuantizedMklOp";
+static const char* kMklQuantizedOpLabelPattern = "label='QuantizedMklOp'";
 // Prefix that we add to Tensorflow op name to construct Mkl op name.
 static const char* const kMklOpPrefix = "_Mkl";
 
@@ -91,9 +93,30 @@ inline string GetMklOpName(const string& name) {
 // @return: true if opname is registered as Mkl op; false otherwise
 static inline bool IsMklOp(const string& op_name, DataType T) {
   string kernel = KernelsRegisteredForOp(op_name);
-  bool result =
-      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-  return result;
+
+  // Restrict quantized ops to QUINT8 and QINT8 for now
+  if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
+    return (T == DT_QUINT8 || T == DT_QINT8);
+  }
+  // Restrict regular ops to FLOAT
+  if (kernel.find(kMklOpLabelPattern) != string::npos) {
+    return (T == DT_FLOAT);
+  }
+  return false;
+}
+
+// TODO(mdfaijul): QuantizedConv2D is registered with input: QUINT8
+// filter:QINT8 for mkldnn integration. First a dummy kernel is created
+// and then it is replaced by an actual kernel.
+static inline bool IsMklOp(const string& op_name, DataType Tinput,
+                           DataType Tfilter) {
+  string kernel = KernelsRegisteredForOp(op_name);
+
+  // Restrict quantized ops to QUINT8 and QINT8 for now
+  if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
+    return (Tinput == DT_QUINT8 && Tfilter == DT_QINT8);
+  }
+  return false;
 }
 
 // Check whether opname with type T is registered as MKL-compliant and
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index adaee479359c22f08bbec5af0245719fa161912e..a91e6dd05738ae8242c812970e8bbc4a10c7675a 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -29,6 +29,8 @@ NodeBuilder::NodeOut::NodeOut(Node* n, int32 i)  // NOLINT(runtime/explicit)
       index(i),
       dt(SafeGetOutput(node, i, &error)) {}
 
+NodeBuilder::NodeOut::NodeOut(OutputTensor t) : NodeOut(t.node, t.index) {}
+
 NodeBuilder::NodeOut::NodeOut(StringPiece n, int32 i, DataType t)
     : node(nullptr), error(false), name(n), index(i), dt(t) {}
 
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index 31fb5909393058585a6fa994b144dae9218c3bba..b1dc2ae92f14ba4519d98a4c556c1d06e14b6b5d 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -50,6 +50,7 @@ class NodeBuilder {
   struct NodeOut {
     // For referencing an existing Node.
     NodeOut(Node* n, int32 i = 0);
+    NodeOut(OutputTensor t);
 
     // For referencing Nodes not in the graph being built. It is
     // useful when preparing a graph for ExtendSession or creating a
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 0a38aa1c9192a6f2628c1ca916bd75a8cb51d2e8..0e74a30c7a92ebd46a933f1056ccb093fa095128 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -123,6 +123,17 @@ Node* Assign(Graph* g, Node* var, Node* val) {
   return ret;
 }
 
+Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive, bool reverse) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Cumsum")
+                  .Input(data)
+                  .Input(axes)
+                  .Attr("exclusive", exclusive)
+                  .Attr("reverse", reverse)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
              bool keep_dims) {
   Node* ret;
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index b00196f58735f938f562b5cabcd2985274b34f56..0c7233161f4128c1da0d8761b0b49fc2f4cf2524 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -68,6 +68,10 @@ Node* Recv(Graph* g, const string& tensor, const string& type,
            const string& sender, const uint64 sender_incarnation,
            const string& receiver);
 
+// Adds a cumsum "node" in "g" doing cumsum(data, axes).
+Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false,
+             bool reverse = false);
+
 // Adds a reduction "node" in "g" doing sum(data, axes).  "reduce" is
 // a reduction, e.g., Sum, Max, Min, Mean, etc.
 Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index c2a9a28a1ca2abdb09d72763d268729d8a0a95f1..7b03ec38bf5bb13f4fc20cccef241839eaacc426 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -23,6 +23,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -67,11 +68,14 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/hash",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -170,8 +174,10 @@ cc_library(
         ":graph_view",
         ":grappler_item",
         ":utils",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -184,6 +190,7 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index db814ecab2b38ec0621623a145b3de9e351f4ed0..270b75269c794249975d0316c628e40f0ec95a5b 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -525,27 +525,26 @@ class SymbolicShapeRefiner {
             "supported.");
       }
       NodeDef* fun_node = gv.GetNode(fun_input.input_name);
-      const string& input = function_node->input(i);
-      const string& node_name = NodeName(input);
+      const TensorId input_tensor = ParseTensorName(function_node->input(i));
 
-      if (IsControlInput(input)) {
+      if (IsControlInput(input_tensor)) {
         return errors::FailedPrecondition(
             "Function inputs should not contain control nodes.");
       }
 
-      const NodeDef* input_node = graph_.GetNode(node_name);
+      const NodeDef* input_node = graph_.GetNode(input_tensor.node());
       if (input_node == nullptr) {
-        return errors::FailedPrecondition(node_name,
+        return errors::FailedPrecondition(input_tensor.node(),
                                           " was not found in the graph.");
       }
 
       InferenceContext* input_inference_context = GetContext(input_node);
       if (input_inference_context == nullptr) {
         return errors::FailedPrecondition(
-            "Inference context has not been created for ", node_name);
+            "Inference context has not been created for ", input_tensor.node());
       }
 
-      int output_port_num = NodePosition(input);
+      int output_port_num = input_tensor.index();
       AttrValue attr_output_shape;
       TensorShapeProto proto;
       const auto& handle = input_inference_context->output(output_port_num);
@@ -1921,12 +1920,12 @@ Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
   return Status::OK();
 }
 
-bool GraphProperties::HasInputProperties(const string& name) const {
-  return input_properties_.find(name) != input_properties_.end();
+bool GraphProperties::HasInputProperties(const string& node_name) const {
+  return input_properties_.find(node_name) != input_properties_.end();
 }
 
-bool GraphProperties::HasOutputProperties(const string& name) const {
-  return output_properties_.find(name) != output_properties_.end();
+bool GraphProperties::HasOutputProperties(const string& node_name) const {
+  return output_properties_.find(node_name) != output_properties_.end();
 }
 
 const std::vector<OpInfo::TensorProperties>&
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index e5036d3cdf1ae95f9478c9b88be5f7523f044f91..fbae1ca5b437c1d73c38da3ef580a9e49e8c84c5 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -63,8 +63,8 @@ class GraphProperties {
   // values strictly less than -1 to encode symbolic dimensions: although we
   // don't know the actual value of the symbolic dimension, we know that all the
   // dimensions denoted by the same negative value are the equal.
-  bool HasInputProperties(const string& name) const;
-  bool HasOutputProperties(const string& name) const;
+  bool HasInputProperties(const string& node_name) const;
+  bool HasOutputProperties(const string& node_name) const;
   const std::vector<OpInfo::TensorProperties>& GetInputProperties(
       const string& node_name) const;
   const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 76e5c989fca81c8b26be3f70ffa330f686f86085..0e55209238555deb88d69ba97fc4df8cb11d3677 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -311,8 +311,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {"Square", EIGEN_COST(scalar_square_op<float>)},
       {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
       {"Relu", EIGEN_COST(scalar_max_op<float>)},
-      {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
-      {"QuantizedSigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+      {"Sigmoid", EIGEN_COST(scalar_logistic_op<float>)},
+      {"QuantizedSigmoid", EIGEN_COST(scalar_logistic_op<float>)},
       {"Sign", EIGEN_COST(scalar_sign_op<float>)},
       {"Sin", EIGEN_COST(scalar_sin_op<float>)},
       {"Tan", EIGEN_COST(scalar_tan_op<float>)},
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index ba50e5553852d2b722b1f0fffe0507c2a77d9d9b..ae5200b359232153f96c9ffa21a505d2a056d55d 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -469,8 +469,8 @@ Status VirtualScheduler::Init() {
         } else {
           // Different device, no cached copy; transfer input_node to the
           // curr_node's device.
-          auto send_and_recv =
-              CreateSendRecv(input_node, curr_node, input_node_name);
+          auto send_and_recv = CreateSendRecv(input_node, curr_node, input_node,
+                                              input_node_name);
           // Note that CreateSendRecv() already connected input/output between
           // _Send and _Recv ops.
           const auto* send = send_and_recv.first;
@@ -608,7 +608,8 @@ string VirtualScheduler::ChannelDeviceName(const NodeDef* from,
 }
 
 std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
-    const NodeDef* from, const NodeDef* to, const string& input_name) {
+    const NodeDef* from, const NodeDef* to, const NodeDef* input_node,
+    const string& input_name) {
   CHECK(!initialized_) << "CreateSendRecv is called after Init().";
 
   // Connect "from" node to "to" node with _Send and _Recv such that
@@ -641,6 +642,12 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   send_attr[kAttrInputSrc].set_s(input_name);
   send_attr[kAttrSrcDevice].set_s(DeviceName(from));
   send_attr[kAttrDstDevice].set_s(DeviceName(to));
+  // GraphDef generated by AutoGrappler has tensor_name field when removing
+  // _Send/_Recv nodes.
+  if (input_node->attr().count(kAttrTensorName)) {
+    send_attr[kAttrTensorName].set_s(
+        input_node->attr().at(kAttrTensorName).s());
+  }
 
   // _Recv op.
   auto* recv = new NodeDef();
@@ -650,6 +657,10 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   recv->set_device(DeviceName(to));
   auto& recv_attr = *(recv->mutable_attr());
   recv_attr[kAttrInputSrc].set_s(input_name);
+  if (input_node->attr().count(kAttrTensorName)) {
+    recv_attr[kAttrTensorName].set_s(
+        input_node->attr().at(kAttrTensorName).s());
+  }
 
   // NodeState for _Send op.
   auto& send_node_state = GetNodeStateOrCreateIt(send);
@@ -1022,7 +1033,8 @@ Costs VirtualScheduler::Summary() const {
       bool is_cost_accurate;
       std::tie(cost, is_cost_accurate) = op_costs_.at(item.first);
       VLOG(2) << "Node: " << item.first << ", Count: " << item.second
-              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost;
+              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost
+              << " us";
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 89dff9686d3983330e0261c1074b8b791a98b459..6a835f32d16d0850c06891f656b2bec910e26b78 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -308,15 +308,17 @@ class VirtualScheduler {
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
-  const string kAttrSrcDevice = "src_device_";
-  const string kAttrDstDevice = "dst_device_";
+  const string kAttrSrcDevice = "send_device";
+  const string kAttrDstDevice = "recv_device";
+  const string kAttrTensorName = "tensor_name";
   const string kChannelDevice = "Channel";
 
   // Methods called from Init(). Fails if initialize_ is set.
   void MaybeUpdateInputOutput(const NodeDef* node);
   NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
   std::pair<const NodeDef*, const NodeDef*> CreateSendRecv(
-      const NodeDef* from, const NodeDef* to, const string& input_name);
+      const NodeDef* from, const NodeDef* to, const NodeDef* input_node,
+      const string& input_name);
   string DeviceName(const NodeDef* node) const;
   string SanitizedDeviceName(const NodeDef* node) const;
   string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const;
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 9b3958b6c175d8abdedef4c0eed7973b5292262e..ba9d2eb32181940bc430771db281c6cea8cb48c4 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -63,5 +63,32 @@ int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
   return OpPortIdToArgId(node, op.input_arg(), port_id);
 }
 
+bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
+                         int port) {
+  const auto output = GraphView::OutputPort(node, port);
+  const auto fanout = graph_view.GetFanout(output);
+  return fanout.size() <= 1;
+}
+
+bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port) {
+  const auto output = GraphView::OutputPort(node, port);
+  const auto fanout = graph_view.GetFanout(output);
+  return !fanout.empty();
+}
+
+bool NoControlFanin(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::InputPort(node, -1);
+  return graph_view.GetFanin(control_port).empty();
+}
+
+bool NoControlFanout(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::OutputPort(node, -1);
+  return graph_view.GetFanout(control_port).empty();
+}
+
+bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
+  return NoControlFanin(graph_view, node) && NoControlFanout(graph_view, node);
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 495e01d2ebeda9d7921504279939cb503b6523a1..0a47b2256583f35e6ef413b50fdc8eea2bdc978d 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -134,7 +134,7 @@ class GraphViewInternal {
   // of an output (resp. input) port.
   const absl::flat_hash_set<InputPort>& GetFanout(
       const OutputPort& port) const {
-    return gtl::FindWithDefault(fanouts_, port, empty_set_);
+    return gtl::FindWithDefault(fanouts_, port, fanout_not_found_value_);
   }
 
   absl::flat_hash_set<OutputPort> GetFanin(const InputPort& port) const {
@@ -173,7 +173,7 @@ class GraphViewInternal {
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(num_regular_outputs_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -220,7 +220,7 @@ class GraphViewInternal {
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlling_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(num_regular_outputs_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -241,7 +241,7 @@ class GraphViewInternal {
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_edges ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(num_regular_outputs_, &node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -290,29 +290,42 @@ class GraphViewInternal {
       if (output.port_id < 0) {
         fanouts_[output].emplace(node, -1);
       } else {
-        num_regular_outputs_[output.node] =
-            std::max(num_regular_outputs_[output.node], output.port_id);
+        max_regular_output_port_[output.node] =
+            std::max(max_regular_output_port_[output.node], output.port_id);
         fanouts_[output].emplace(node, i);
       }
     }
   }
 
   // Access to the mutable internal state for MutableGraphView.
-  absl::flat_hash_map<absl::string_view, NodeDefT*>* mutable_nodes() {
-    return &nodes_;
+  absl::flat_hash_map<absl::string_view, NodeDefT*>& nodes() { return nodes_; }
+
+  absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>>& fanouts() {
+    return fanouts_;
   }
 
-  absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>>*
-  mutable_fanouts() {
-    return &fanouts_;
+  absl::flat_hash_map<const NodeDef*, int>& max_regular_output_port() {
+    return max_regular_output_port_;
   }
 
  private:
   GraphDefT* graph_;  // must outlive the graph view
+
+  // A mapping from the node name to the node itself.
   absl::flat_hash_map<absl::string_view, NodeDefT*> nodes_;
-  absl::flat_hash_set<InputPort> empty_set_;
+
+  // A mapping from the output port to all inputs that read from it.
   absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>> fanouts_;
-  std::unordered_map<NodeDefT*, int> num_regular_outputs_;
+
+  // Keep a maximum index of tensor fetched from the node. It doesn't guarantee
+  // that all tensors in the [0, max_regular_output_port] range are actually
+  // fetched by other nodes.
+  absl::flat_hash_map<const NodeDef*, int> max_regular_output_port_;
+
+  // If the node has no fanouts at given output port (output tensor consumers)
+  // we return a reference to this set from `GetFanout` (we can't construct new
+  // empty set every time, because we need a non-dangling reference).
+  absl::flat_hash_set<InputPort> fanout_not_found_value_;
 };
 
 }  // namespace internal
@@ -329,6 +342,17 @@ class GraphView
   }
 };
 
+// Returns true if node has one (or zero) fanout nodes at given output port.
+bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
+                         int port = 0);
+
+// Returns true if node has at least one fanout node at given output port.
+bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0);
+
+bool NoControlFanin(const GraphView& graph_view, const NodeDef* node);
+bool NoControlFanout(const GraphView& graph_view, const NodeDef* node);
+bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index 67e804cbfbd25cae947753e61ff02c9c358f6065..1a4754153bca9bb7ee019b9b9ea67e6ce3cb5f89 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/substitute.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
@@ -47,53 +50,137 @@ NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
   return node_in_graph;
 }
 
-NodeDef* MutableGraphView::InsertNode(const NodeDef& input_node, NodeDef&& node,
-                                      const int output_port_id) {
-  auto* node_in_graph = graph()->add_node();
-  *node_in_graph = std::move(node);
+void MutableGraphView::UpdateFanouts(absl::string_view from_node,
+                                     absl::string_view to_node) {
+  NodeDef* from_node_ptr = GetNode(from_node);
+  NodeDef* to_node_ptr = GetNode(to_node);
+  if (from_node_ptr && to_node_ptr) {
+    UpdateFanouts(from_node_ptr, to_node_ptr);
+  } else if (!from_node_ptr) {
+    LOG(WARNING) << absl::Substitute(
+        "Can't update fanouts from '$0' to '$1', from node was not found.",
+        from_node, to_node);
+  } else {
+    LOG(WARNING) << absl::Substitute(
+        "Can't update fanouts from '$0' to '$1', to node was not found.",
+        from_node, to_node);
+  }
+}
 
-  AddUniqueNodeOrDie(node_in_graph);
+void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
+  VLOG(0) << absl::Substitute("Update fanouts from '$0' to '$1'.",
+                              from_node->name(), to_node->name());
+
+  // Update internal state with the new output_port->input_port edge.
+  const auto add_edge = [this](const OutputPort& output_port,
+                               const InputPort& input_port) {
+    fanouts()[output_port].insert(input_port);
+  };
+
+  // Remove invalidated edge from the internal state.
+  const auto remove_edge = [this](const OutputPort& output_port,
+                                  const InputPort& input_port) {
+    fanouts()[output_port].erase(input_port);
+  };
+
+  // First we update regular fanouts. For the regular fanouts
+  // `input_port:port_id` is the input index in NodeDef.
+
+  auto regular_edges =
+      GetFanoutEdges(*from_node, /*include_controlled_edges=*/false);
+
+  // Maximum index of the `from_node` output tensor that is still used as an
+  // input to some other node.
+  int keep_max_regular_output_port = -1;
+
+  for (const Edge& edge : regular_edges) {
+    const OutputPort output_port = edge.src;
+    const InputPort input_port = edge.dst;
+
+    // If the `to_node` reads from the `from_node`, skip this edge (see
+    // AddAndUpdateFanoutsWithoutSelfLoops test for an example).
+    if (input_port.node == to_node) {
+      keep_max_regular_output_port =
+          std::max(keep_max_regular_output_port, input_port.port_id);
+      continue;
+    }
+
+    // Update input at destination node.
+    input_port.node->set_input(
+        input_port.port_id,
+        output_port.port_id == 0
+            ? to_node->name()
+            : absl::StrCat(to_node->name(), ":", output_port.port_id));
+
+    // Remove old edge between the `from_node` and the fanout node.
+    remove_edge(output_port, input_port);
+    // Add an edge between the `to_node` and new fanout node.
+    add_edge(OutputPort(to_node, output_port.port_id), input_port);
+  }
 
-  // replace input for the output nodes of `input_node` with `node`
-  ReplaceInput(input_node, *node_in_graph, output_port_id);
+  // For the control fanouts we do not know the input index in a NodeDef,
+  // so we have to traverse all control inputs.
+
+  auto control_fanouts =
+      GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
+  if (control_fanouts.empty()) return;
+
+  const string from_control_input = absl::StrCat("^", from_node->name());
+  const string to_control_input = absl::StrCat("^", to_node->name());
+
+  for (const InputPort& control_port : control_fanouts) {
+    // Node can't be control dependency of itself.
+    if (control_port.node == to_node) continue;
+
+    // Find and update input corresponding to control dependency.
+    NodeDef* node = control_port.node;
+    for (int i = node->input_size() - 1; i >= 0; --i) {
+      const string& input = node->input(i);
+      if (!IsControlInput(input)) break;  // we reached regular inputs
+      if (input == from_control_input) {
+        node->set_input(i, to_control_input);
+      }
+    }
+
+    // Remove old edge between the `from_node` and the fanout node.
+    remove_edge(OutputPort(from_node, Graph::kControlSlot), control_port);
+    // Add an edge between the `to_node` and new fanout node.
+    add_edge(OutputPort(to_node, Graph::kControlSlot), control_port);
+  }
 
-  AddFanouts(node_in_graph);
-  return node_in_graph;
-}
+  // Because we update all regular fanouts of `from_node`, we can just copy
+  // the value `num_regular_outputs`.
+  max_regular_output_port()[to_node] = max_regular_output_port()[from_node];
 
-void MutableGraphView::ReplaceInput(const NodeDef& old_input,
-                                    const NodeDef& new_input,
-                                    const int output_port_id) {
-  OutputPort output_port = GetOutputPort(old_input.name(), output_port_id);
-  auto fanout = GetFanout(output_port);
-  for (auto& input_port : fanout) {
-    input_port.node->set_input(input_port.port_id, new_input.name());
-    AddFanouts(input_port.node);
+  // Check if all fanouts were updated to read from the `to_node`.
+  if (keep_max_regular_output_port >= 0) {
+    max_regular_output_port()[from_node] = keep_max_regular_output_port;
+  } else {
+    max_regular_output_port().erase(from_node);
   }
 }
 
 void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
   for (const string& node_name_to_delete : nodes_to_delete)
-    RemoveFanouts(mutable_nodes()->at(node_name_to_delete));
+    RemoveFanouts(nodes().at(node_name_to_delete));
   for (const string& node_name_to_delete : nodes_to_delete)
-    mutable_nodes()->erase(node_name_to_delete);
+    nodes().erase(node_name_to_delete);
   EraseNodesFromGraph(nodes_to_delete, graph());
 }
 
-void MutableGraphView::RemoveFanouts(NodeDef* node) {
-  for (int i = 0; i < node->input_size(); ++i) {
-    OutputPort fanin;
-    string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
-    fanin.node = (*mutable_nodes())[fanin_name];
+void MutableGraphView::RemoveFanouts(NodeDef* deleted_node) {
+  for (int i = 0; i < deleted_node->input_size(); ++i) {
+    TensorId tensor_id = ParseTensorName(deleted_node->input(i));
+    OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
-    input.node = node;
-    if (fanin.port_id < 0)
-      input.port_id = -1;
+    input.node = deleted_node;
+    if (tensor_id.index() < 0)
+      input.port_id = Graph::kControlSlot;
     else
       input.port_id = i;
 
-    (*mutable_fanouts())[fanin].erase(input);
+    fanouts()[fanin].erase(input);
   }
 }
 
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index 702751a57fda5bf940f343545dc973bed1d79239..355dd6c491763e96b509ce42977e2cf0f5db2eb5 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -44,31 +44,44 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   using GraphViewInternal::GetRegularFanin;
   const OutputPort GetRegularFanin(const GraphView::InputPort& port) const;
 
-  // Adds a new node to graph and updates the view.
+  // Adds a new node to graph and updates the view. Returns a pointer to the
+  // node in graph.
   NodeDef* AddNode(NodeDef&& node);
 
-  // Inserts a new node to the graph after `input` node and updates the view.
-  // This adds `node` to the graph and replaces the input for the output
-  // nodes of `input` with a port `output_port_id` with the new node.
-  NodeDef* InsertNode(const NodeDef& input, NodeDef&& node,
-                      int output_port_id = 0);
-
-  // Replaces the input for the output nodes of 'old_input' with a port
-  // `output_port_id` with 'new_input'.
+  // Updates all fanouts (input ports fetching output tensors) from `from_node`
+  // to the `to_node`, including control dependencies.
+  //
+  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  //   2. foo2(bar:1, other:1)
   //
-  // E.g: We have 2 nodes that use 'bar' node outputs as inputs:
-  // foo(bar:0, bar:1),  foo2(other:0, bar:0)
-  // Calling ReplaceInput(bar, new, 0) changes every occurrence of bar:0 for
-  // new:0.  Result:
-  // foo(new:0, bar:1),  foo2(other:0, new:0)
-  void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
-                    int output_port_id = 0);
+  // After calling ForwardOutputs(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   2. foo2(new_bar:1, other:1)
+  void UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
 
   // Deletes nodes from the graph.
   void DeleteNodes(const std::set<string>& nodes_to_delete);
 
  private:
-  void RemoveFanouts(NodeDef* node);
+  // Updates all fanouts (input ports fetching output tensors) from `from_node`
+  // to the `to_node`, including control dependencies.
+  //
+  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  //   2. foo2(bar:1, other:1)
+  //
+  // After calling ForwardOutputs(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   2. foo2(new_bar:1, other:1)
+  //
+  // IMPORTANT: If `from_node` or `to_node` is not in the underlying graph, the
+  // behavior is undefined.
+  void UpdateFanouts(NodeDef* from_node, NodeDef* to_node);
+
+  // Remove fanouts of the deleted node from internal state (including control
+  // dependencies).
+  void RemoveFanouts(NodeDef* deleted_node);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index 7d9025e031ef4e8a81115610229136c9945717c5..c1b3f8c01cf3dbb570d64845fb7097d1b309fc30 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/platform/test.h"
@@ -23,104 +24,122 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-bool FindChildWithName(const MutableGraphView& graph,
-                       const string& output_port_name,
-                       const string& input_name) {
-  MutableGraphView::OutputPort output_port =
-      graph.GetOutputPort(output_port_name, 0);
-  auto fanout = graph.GetFanout(output_port);
-  for (auto& input_port : fanout) {
-    if (input_port.node->name() == input_name) return true;
-  }
-  return false;
+using ::tensorflow::test::function::NDef;
+
+TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar", "NotImportant", {}, {}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+      /* empty function library */ {});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NotImportant", {}, {}));
+  NodeDef* bar = graph.GetNode("bar");
+
+  graph.UpdateFanouts(bar->name(), new_bar->name());
+
+  // Fanout nodes must have their inputs updated.
+  NodeDef* foo_1 = graph.GetNode("foo_1");
+  ASSERT_NE(foo_1, nullptr);
+  ASSERT_EQ(foo_1->input_size(), 4);
+  EXPECT_EQ(foo_1->input(0), "new_bar");
+  EXPECT_EQ(foo_1->input(1), "other");
+  EXPECT_EQ(foo_1->input(2), "new_bar:1");
+  EXPECT_EQ(foo_1->input(3), "^new_bar");
+
+  NodeDef* foo_2 = graph.GetNode("foo_2");
+  ASSERT_NE(foo_2, nullptr);
+  ASSERT_EQ(foo_2->input_size(), 3);
+  EXPECT_EQ(foo_2->input(0), "other:1");
+  EXPECT_EQ(foo_2->input(1), "new_bar:2");
+  EXPECT_EQ(foo_2->input(2), "^new_bar");
+
+  // And fanouts mapping must be also updated for both nodes.
+  bool include_control_fanouts = true;
+  auto old_node_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto new_node_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
+
+  EXPECT_TRUE(old_node_fanouts.empty());
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 0)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 2)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, -1)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
 }
 
-TrivialTestGraphInputYielder SimpleGraph() {
-  // This outputs simple graph like:
-  //        x
-  //       / \
-  // Square   Square_1
-  //   |   \  /    |
-  //   |    \/     |
-  //   |    /\     |
-  //   |   /  \    |
-  //  AddN     AddN_1
-  //      \   /
-  //        y
-  TrivialTestGraphInputYielder simple_graph(2, 2, 2, false,
-                                            {"/CPU:0", "/GPU:0"});
-  return simple_graph;
-}
-
-TEST(MutableGraphViewTest, AddAndReplaceInput) {
-  TrivialTestGraphInputYielder fake_input = SimpleGraph();
-  GrapplerItem item;
-  CHECK(fake_input.NextItem(&item));
+TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def =
+      test::function::GDef({NDef("bar", "NotImportant", {}, {}),
+                            NDef("foo", "NotImportant", {"bar", "^bar"})},
+                           /* empty function library */ {});
 
-  GraphDef new_graph = item.graph;
-  MutableGraphView graph(&new_graph);
+  MutableGraphView graph(&graph_def);
 
-  MutableGraphView::InputPort input = graph.GetInputPort("AddN", 0);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(0, input.port_id);
-  MutableGraphView::OutputPort fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  // `new_bar` reads the output of an original `bar` node.
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NewBar", {"bar"}, {}));
+  NodeDef* bar = graph.GetNode("bar");
 
-  EXPECT_FALSE(FindChildWithName(graph, "Square", "new_node"));
+  graph.UpdateFanouts("bar", new_bar->name());
 
-  NodeDef new_node = *input.node;
-  new_node.set_name("new_node");
+  // Foo node must read from `new_bar`.
+  NodeDef* foo = graph.GetNode("foo");
+  ASSERT_NE(foo, nullptr);
+  ASSERT_EQ(foo->input_size(), 2);
+  EXPECT_EQ(foo->input(0), "new_bar");
+  EXPECT_EQ(foo->input(1), "^new_bar");
 
-  EXPECT_EQ(graph.GetNode("new_node"), nullptr);
-  NodeDef* node_in_graph = graph.AddNode(std::move(new_node));
-  EXPECT_NE(graph.GetNode("new_node"), nullptr);
+  // And the `new_bar` should read from the original `bar`.
+  ASSERT_EQ(new_bar->input_size(), 1);
+  ASSERT_EQ(new_bar->input(0), "bar");
 
-  graph.ReplaceInput(*input.node, *node_in_graph);
-  EXPECT_TRUE(FindChildWithName(graph, "Square", "new_node"));
-  EXPECT_TRUE(FindChildWithName(graph, "new_node", "y"));
-}
+  // And fanouts mapping must be also updated for both nodes.
+  bool include_control_fanouts = true;
+  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto new_bar_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
 
-TEST(MutableGraphViewTest, InsertNodes) {
-  TrivialTestGraphInputYielder fake_input = SimpleGraph();
+  EXPECT_EQ(bar_fanouts.size(), 1);
+  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(new_bar, 0)), 1);
 
-  GrapplerItem item;
-  CHECK(fake_input.NextItem(&item));
+  EXPECT_EQ(new_bar_fanouts.size(), 2);
+  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, 0)), 1);
+  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, -1)), 1);
+}
 
-  GraphDef new_graph = item.graph;
-  MutableGraphView graph(&new_graph);
+TEST(MutableGraphViewTest, DeleteNodes) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar", "NotImportant", {}, {}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+      /* empty function library */ {});
 
-  MutableGraphView::InputPort input = graph.GetInputPort("AddN", 0);
+  MutableGraphView graph(&graph_def);
 
-  NodeDef new_node = *input.node;
-  new_node.set_name("new_node");
-  new_node.set_input(0, input.node->name());
+  EXPECT_NE(graph.GetNode("foo_1"), nullptr);
+  graph.DeleteNodes({"foo_1"});
 
-  EXPECT_EQ(graph.GetNode("new_node"), nullptr);
-  graph.InsertNode(*input.node, std::move(new_node));
-  EXPECT_NE(graph.GetNode("new_node"), nullptr);
-  EXPECT_TRUE(FindChildWithName(graph, "Square", "AddN"));
-  EXPECT_TRUE(FindChildWithName(graph, "Square", "AddN_1"));
-  EXPECT_TRUE(FindChildWithName(graph, "Square_1", "AddN"));
-  EXPECT_TRUE(FindChildWithName(graph, "Square_1", "AddN_1"));
-  EXPECT_TRUE(FindChildWithName(graph, "AddN", "new_node"));
-  EXPECT_TRUE(FindChildWithName(graph, "AddN_1", "y"));
-  EXPECT_TRUE(FindChildWithName(graph, "new_node", "y"));
-}
+  EXPECT_EQ(graph.GetNode("foo_1"), nullptr);
 
-TEST(MutableGraphViewTest, DeleteNodes) {
-  // Outputs simple graph as described in first test.
-  TrivialTestGraphInputYielder fake_input = SimpleGraph();
-  GrapplerItem item;
-  CHECK(fake_input.NextItem(&item));
+  NodeDef* bar = graph.GetNode("bar");
+  NodeDef* other = graph.GetNode("other");
+  NodeDef* foo_2 = graph.GetNode("foo_2");
 
-  GraphDef new_graph = item.graph;
-  MutableGraphView graph(&new_graph);
+  bool include_control_fanouts = true;
+  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto other_fanouts = graph.GetFanouts(*other, include_control_fanouts);
 
-  EXPECT_NE(graph.GetNode("AddN"), nullptr);
-  graph.DeleteNodes({"AddN"});
+  EXPECT_EQ(bar_fanouts.size(), 2);
+  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
+  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
 
-  EXPECT_EQ(graph.GetNode("AddN"), nullptr);
+  EXPECT_EQ(other_fanouts.size(), 1);
+  EXPECT_EQ(other_fanouts.count(MutableGraphView::InputPort(foo_2, 0)), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index edcf6adb93800190668b51dc8009a134bffd8114..06248393ba8cb6d7c98d05f5b4500b7ca07fa900 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -214,6 +214,11 @@ bool IsFloorDiv(const NodeDef& node) { return node.op() == "FloorDiv"; }
 
 bool IsFloorMod(const NodeDef& node) { return node.op() == "FloorMod"; }
 
+bool IsFusedBatchNorm(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "FusedBatchNorm" || op == "FusedBatchNormV2";
+}
+
 bool IsFusedBatchNormGrad(const NodeDef& node) {
   const auto& op = node.op();
   return op == "FusedBatchNormGrad" || op == "FusedBatchNormGradV2";
@@ -248,6 +253,10 @@ bool IsIgammac(const NodeDef& node) { return node.op() == "Igammac"; }
 
 bool IsImag(const NodeDef& node) { return node.op() == "Imag"; }
 
+bool IsImmutableConst(const NodeDef& node) {
+  return node.op() == "ImmutableConst";
+}
+
 bool IsInvGrad(const NodeDef& node) { return node.op() == "InvGrad"; }
 
 bool IsLess(const NodeDef& node) { return node.op() == "Less"; }
@@ -359,6 +368,8 @@ bool IsReduction(const NodeDef& node) {
          op == "Mean" || op == "Any" || op == "All";
 }
 
+bool IsRelu(const NodeDef& node) { return node.op() == "Relu"; }
+
 bool IsReluGrad(const NodeDef& node) { return node.op() == "ReluGrad"; }
 
 bool IsRelu6Grad(const NodeDef& node) { return node.op() == "Relu6Grad"; }
@@ -564,6 +575,10 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
   if (node.op().find("Queue") != string::npos) {
     return false;
   }
+  // Sending a tensor via a network is a side effect.
+  if (IsSend(node)) {
+    return false;
+  }
   return !ModifiesInputsInPlace(node);
 }
 
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 29f989b5bed371911854725b58d5a60d331b0ac9..bd286f2c7210687baf7d0c1286d7d0973b24d38b 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -66,6 +66,7 @@ bool IsFakeParam(const NodeDef& node);
 bool IsFill(const NodeDef& node);
 bool IsFloorDiv(const NodeDef& node);
 bool IsFloorMod(const NodeDef& node);
+bool IsFusedBatchNorm(const NodeDef& node);
 bool IsFusedBatchNormGrad(const NodeDef& node);
 bool IsGreater(const NodeDef& node);
 bool IsGreaterEqual(const NodeDef& node);
@@ -76,6 +77,7 @@ bool IsIdentityNSingleInput(const NodeDef& node);
 bool IsIgamma(const NodeDef& node);
 bool IsIgammac(const NodeDef& node);
 bool IsImag(const NodeDef& node);
+bool IsImmutableConst(const NodeDef& node);
 bool IsInvGrad(const NodeDef& node);
 bool IsLess(const NodeDef& node);
 bool IsLessEqual(const NodeDef& node);
@@ -113,6 +115,7 @@ bool IsRandomShuffle(const NodeDef& node);
 bool IsRank(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
+bool IsRelu(const NodeDef& node);
 bool IsRelu6Grad(const NodeDef& node);
 bool IsReluGrad(const NodeDef& node);
 bool IsReciprocalGrad(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 0637e3b2e154a1f2c98ccac71b601fb1a368a464..b6f989f2c9cf057971a21acd798fe5c239b6d624 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -212,6 +212,8 @@ cc_library(
     hdrs = ["graph_optimizer_stage.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
@@ -706,6 +708,8 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -845,6 +849,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -871,11 +876,10 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index d24691eb4f5b98be5780bdfb6116f794ba850159..566701ec2a008611bf1cb6e33e6d939804230862 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -235,18 +235,17 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 
   // TODO(ezhulenev): move to GraphOptimizerStage?
   bool IsDrivenByControlDependency(const NodeDef& node) const {
-    return std::any_of(node.input().begin(), node.input().end(),
-                       IsControlInput);
+    return std::any_of(
+        node.input().begin(), node.input().end(),
+        [](const string& input) { return IsControlInput(input); });
   }
 
   // TODO(ezhulenev): move to GraphOptimizerStage?
   bool DrivesControlDependency(const NodeDef& node) const {
-    int position;
     for (const NodeDef* output : ctx().node_map->GetOutputs(node.name())) {
       for (int i = 0; i < output->input_size(); ++i) {
-        auto input = output->input(i);
-        StringPiece name = ParseNodeNameAsStringPiece(input, &position);
-        if (name == node.name() && /*control input*/ position < 0) {
+        const TensorId tensor = ParseTensorName(output->input(i));
+        if (tensor.node() == node.name() && tensor.index() < 0) {
           return true;
         }
       }
@@ -1551,11 +1550,9 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage {
       const auto& outputs = ctx().node_map->GetOutputs(node.name());
       for (NodeDef* output : outputs) {
         if (IsControlInput(output->input(0))) continue;
-        int port;
-        const StringPiece node_name =
-            ParseNodeNameAsStringPiece(output->input(0), &port);
-        if (node_name == node.name()) {
-          tails->insert(ChainLink(output, port));
+        TensorId tensor_id = ParseTensorName(output->input(0));
+        if (tensor_id.node() == node.name()) {
+          tails->insert(ChainLink(output, tensor_id.index()));
         } else {
           // This output node has a non-control input other than the split node,
           // abort.
@@ -1602,14 +1599,12 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage {
         new_tails->insert(ChainLink(new_tail, link.port_origin));
       } else {
         for (NodeDef* new_tail : ctx().node_map->GetOutputs(tail->name())) {
-          int port;
-          const StringPiece node_name =
-              ParseNodeNameAsStringPiece(new_tail->input(0), &port);
-          if (node_name != tail->name()) {
+          const TensorId tensor = ParseTensorName(new_tail->input(0));
+          if (tensor.node() != tail->name()) {
             return Status::OK();
           }
           // Skip control outputs.
-          if (port >= 0) {
+          if (tensor.index() >= 0) {
             // Remember original port.
             new_tails->insert(ChainLink(new_tail, link.port_origin));
           }
@@ -2314,7 +2309,9 @@ class SimplifyAggregation : public ArithmeticOptimizerStage {
   ~SimplifyAggregation() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsAggregate(*node) && NumNonControlInputs(*node) > 0;
+    return IsAggregate(*node) && NumNonControlInputs(*node) > 0 &&
+           GetDataTypeFromAttr(*node, "T") !=
+               DT_VARIANT;  // TODO(b/119787146): Enable for variants.
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
@@ -3248,10 +3245,10 @@ uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
   h = Hash64Combine(Hash64(node.device()), h);
 
   for (const auto& input : node.input()) {
-    int pos;
-    const StringPiece node_name = ParseNodeNameAsStringPiece(input, &pos);
-    h = Hash64CombineUnordered(Hash64(node_name.data(), node_name.size()), h);
-    h = Hash64CombineUnordered(std::hash<int>()(pos), h);
+    const TensorId input_tensor = ParseTensorName(input);
+    h = Hash64CombineUnordered(
+        Hash64(input_tensor.node().data(), input_tensor.node().size()), h);
+    h = Hash64CombineUnordered(std::hash<int>()(input_tensor.index()), h);
   }
   for (const auto& attr : node.attr()) {
     h = Hash64CombineUnordered(Hash64(attr.first), h);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 8107d383f6b8393e8b75898a02b72917ea3185b0..5e3e5d6af9a7dd435a15f83e94434de0c25ed7aa 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -157,6 +157,16 @@ bool GetConcatAxis(const GraphProperties& properties, NodeDef* node,
   return true;
 }
 
+bool HasTPUAttributes(const NodeDef& node) {
+  AttrSlice attrs(node);
+  for (auto attr : attrs) {
+    if (attr.first.find("_tpu_") != attr.first.npos) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace
 
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
@@ -764,6 +774,13 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     return false;
   }
 
+  // Don't fold nodes that contain TPU attributes.
+  // TODO(rmlarsen): We should be able to fold many of these nodes as long as we
+  // properly forward custom attributes, b/119051778.
+  if (HasTPUAttributes(node)) {
+    return false;
+  }
+
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
   if (!status.ok()) {
@@ -988,9 +1005,8 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   });
 
   for (const auto& input : node.input()) {
-    int port = 0;
-    ParseNodeNameAsStringPiece(input, &port);
-    if (port < 0) {
+    const TensorId input_tensor = ParseTensorName(input);
+    if (input_tensor.index() < 0) {
       // Control dependency
       break;
     }
@@ -1129,9 +1145,12 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
   std::vector<NodeDef> const_nodes;
   TF_RETURN_IF_ERROR(
       EvaluateOneFoldable(*node, &const_nodes, result_too_large));
+  VLOG(1) << "Folded node:\n" << node->DebugString();
+
   NodeDef* constant_output = nullptr;
   for (int i = 0; i < const_nodes.size(); i++) {
     NodeDef* const_node = &const_nodes[i];
+    VLOG(1) << "Generated constant node:\n" << const_node->DebugString();
     if (const_node->name().empty()) {
       // Dead output: we can't create a constant to encode its value, so we'll
       // just skip it. We'll preserve the edges that originate from that
@@ -1297,64 +1316,6 @@ Status ConstantFolding::FoldGraph(
   return Status::OK();
 }
 
-// Returns true iff this reduction can be reduced to an identity (i.e if the set
-// of dimensions to reduce along is empty). This happens often in the gradient
-// graphs.
-bool ConstantFolding::IsSimplifiableReduction(
-    const NodeDef& node, const GraphProperties& properties) const {
-  if (IsReduction(node)) {
-    CHECK_LE(2, node.input_size());
-    const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
-    if (IsReallyConstant(*reductions_indices)) {
-      TensorVector output;
-      auto outputs_cleanup = gtl::MakeCleanup([&output] {
-        for (const auto& out : output) {
-          delete out.tensor;
-        }
-      });
-      Status s = EvaluateNode(*reductions_indices, TensorVector(), &output);
-      if (!s.ok()) {
-        return false;
-      }
-      CHECK_EQ(1, output.size());
-      int output_size = output[0]->NumElements();
-      if (output_size == 0) {
-        return true;
-      }
-      if (node.attr().count("keep_dims") > 0 &&
-          node.attr().at("keep_dims").b()) {
-        const auto& props = properties.GetInputProperties(node.name());
-        if (!props.empty()) {
-          const TensorShapeProto& input_shape = props[0].shape();
-          if (!input_shape.unknown_rank()) {
-            bool simplifiable = true;
-            for (int i = 0; i < output[0]->NumElements(); ++i) {
-              int64 dim;
-              if (output[0]->dtype() == DT_INT32) {
-                dim = output[0]->flat<int32>()(i);
-              } else {
-                dim = output[0]->flat<int64>()(i);
-              }
-              if (dim < 0) {
-                dim += input_shape.dim_size();
-              }
-              if (dim < 0 || dim >= input_shape.dim_size() ||
-                  input_shape.dim(dim).size() != 1) {
-                simplifiable = false;
-                break;
-              }
-            }
-            if (simplifiable) {
-              return true;
-            }
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
 bool ConstantFolding::IsSimplifiableReshape(
     const NodeDef& node, const GraphProperties& properties) const {
   if (!IsReshape(node)) {
@@ -1596,15 +1557,19 @@ Status ConstantFolding::ReplaceOperationWithConstant(
 
 Status ConstantFolding::SimplifyGraph(
     bool use_shape_info, GraphDef* optimized_graph, GraphProperties* properties,
-    const absl::flat_hash_set<string>& nodes_to_not_simplify) {
+    absl::flat_hash_set<string>* nodes_to_not_simplify) {
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
+    NodeDef* node = optimized_graph->mutable_node(i);
     // TODO(lyandy): Move nodes to not simplify check into SimplifyNode and
     // generalize to only restrict certain simplifications.
-    if (nodes_to_not_simplify.find(optimized_graph->node(i).name()) ==
-        nodes_to_not_simplify.end()) {
-      TF_RETURN_IF_ERROR(SimplifyNode(use_shape_info,
-                                      optimized_graph->mutable_node(i),
-                                      optimized_graph, properties));
+    if (nodes_to_not_simplify->find(node->name()) ==
+        nodes_to_not_simplify->end()) {
+      if (HasTPUAttributes(optimized_graph->node(i))) {
+        nodes_to_not_simplify->insert(node->name());
+        continue;
+      }
+      TF_RETURN_IF_ERROR(
+          SimplifyNode(use_shape_info, node, optimized_graph, properties));
     }
   }
   return Status::OK();
@@ -1700,7 +1665,7 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
     return Status::OK();
   }
 
-  if (SimplifyReduction(*properties, node)) {
+  if (SimplifyReduction(optimized_graph, *properties, node)) {
     graph_modified_ = true;
     return Status::OK();
   }
@@ -2303,9 +2268,148 @@ bool ConstantFolding::SimplifySwitch(GraphDef* optimized_graph, NodeDef* node) {
   return false;
 }
 
-bool ConstantFolding::SimplifyReduction(const GraphProperties& properties,
+bool ConstantFolding::IsReductionCandidateForSimplification(
+    const NodeDef& node, const GraphProperties& properties,
+    TensorShapeProto* input_tensor_shape, TensorShapeProto* output_tensor_shape,
+    bool* is_single_element_op) const {
+  // Ensure its an appropriate Reduce node.
+  if (!IsReduction(node) || node.input_size() < 2) {
+    return false;
+  }
+  // Ensure that the axes to reduce by are constant.
+  NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
+  if (!IsReallyConstant(*reductions_indices)) {
+    return false;
+  }
+
+  // Get the properties of the input & output tensors and check if they both
+  // contain a single element.
+  if (!properties.HasInputProperties(node.name()) ||
+      !properties.HasOutputProperties(node.name())) {
+    return false;
+  }
+  const auto& input_props = properties.GetInputProperties(node.name())[0];
+  const auto& output_props = properties.GetOutputProperties(node.name())[0];
+  if (!input_props.has_shape() || input_props.shape().unknown_rank() ||
+      !output_props.has_shape() || output_props.shape().unknown_rank()) {
+    return false;
+  }
+  *input_tensor_shape = input_props.shape();
+  *output_tensor_shape = output_props.shape();
+  for (int i = 0; i < input_tensor_shape->dim_size(); ++i) {
+    if (input_tensor_shape->dim(i).size() < 0) {
+      return false;
+    }
+  }
+  for (int i = 0; i < output_tensor_shape->dim_size(); ++i) {
+    if (output_tensor_shape->dim(i).size() < 0) {
+      return false;
+    }
+  }
+  const int input_num_elements =
+      TensorShape(*input_tensor_shape).num_elements();
+  const int output_num_elements =
+      TensorShape(*output_tensor_shape).num_elements();
+  *is_single_element_op = input_num_elements == 1 && output_num_elements == 1;
+
+  return true;
+}
+
+bool ConstantFolding::IsReductionSimplifiableToIdentity(
+    const NodeDef& node, const TensorShapeProto& input_shape, bool keep_dims,
+    const TensorVector& reduction_indices_vector) const {
+  int output_size = reduction_indices_vector[0]->NumElements();
+  if (output_size == 0) {
+    return true;
+  }
+
+  if (!keep_dims) {
+    return false;
+  }
+  bool simplifiable = true;
+  for (int i = 0; i < output_size; ++i) {
+    int64 dim;
+    if (reduction_indices_vector[0]->dtype() == DT_INT32) {
+      dim = reduction_indices_vector[0]->flat<int32>()(i);
+    } else {
+      dim = reduction_indices_vector[0]->flat<int64>()(i);
+    }
+    if (dim < 0) {
+      dim += input_shape.dim_size();
+    }
+    if (dim < 0 || dim >= input_shape.dim_size() ||
+        input_shape.dim(dim).size() != 1) {
+      simplifiable = false;
+      break;
+    }
+  }
+  return simplifiable;
+}
+
+bool ConstantFolding::SimplifyReduction(GraphDef* optimized_graph,
+                                        const GraphProperties& properties,
                                         NodeDef* node) {
-  if (IsSimplifiableReduction(*node, properties)) {
+  bool is_single_element_op = false;
+  TensorShapeProto input_tensor_shape, output_tensor_shape;
+  if (!IsReductionCandidateForSimplification(
+          *node, properties, &input_tensor_shape, &output_tensor_shape,
+          &is_single_element_op)) {
+    return false;
+  }
+
+  // Get the reduction indices.
+  string reduction_indices_input = node->input(1);
+  NodeDef* reduction_indices = node_map_->GetNode(reduction_indices_input);
+  TensorVector reduction_indices_vector;
+  auto outputs_cleanup = gtl::MakeCleanup([&reduction_indices_vector] {
+    for (const auto& out : reduction_indices_vector) {
+      delete out.tensor;
+    }
+  });
+  if (!EvaluateNode(*reduction_indices, TensorVector(),
+                    &reduction_indices_vector)
+           .ok() ||
+      reduction_indices_vector.size() != 1) {
+    return false;
+  }
+
+  bool keep_dims =
+      node->attr().count("keep_dims") > 0 && node->attr().at("keep_dims").b();
+  bool simplifiable_to_reshape =
+      is_single_element_op && !keep_dims && (node->attr().count("T") > 0);
+  bool simplifiable_to_identity = IsReductionSimplifiableToIdentity(
+      *node, input_tensor_shape, keep_dims, reduction_indices_vector);
+
+  if (simplifiable_to_reshape) {
+    // Const node to output shape.
+    const int new_num_dimensions = output_tensor_shape.dim_size();
+    Tensor tensor(DT_INT32, TensorShape({new_num_dimensions}));
+    for (int i = 0; i < new_num_dimensions; i++) {
+      tensor.flat<int>()(i) = 1;
+    }
+    TensorValue shape_value(&tensor);
+    NodeDef* shape_node = optimized_graph->add_node();
+    if (!CreateNodeDef(OptimizedNodeName(*node, "_shape_const"), shape_value,
+                       shape_node)
+             .ok()) {
+      return false;
+    }
+    shape_node->set_device(node->device());
+    node_map_->AddNode(shape_node->name(), shape_node);
+    // Control dependency to ensure shape_node is in the correct frame.
+    shape_node->add_input(AsControlDependency(reduction_indices_input));
+    node_map_->AddOutput(NodeName(reduction_indices_input), shape_node->name());
+    // Optimize node to Reshape.
+    node->set_op("Reshape");
+    node_map_->UpdateInput(node->name(), node->input(1), shape_node->name());
+    node->set_input(1, shape_node->name());
+    node->mutable_attr()->erase("keep_dims");
+    node->mutable_attr()->erase("Tidx");
+    AttrValue attr_type_indices;
+    attr_type_indices.set_type(DT_INT32);
+    (*node->mutable_attr())["Tshape"] = attr_type_indices;
+    return true;
+  } else if (simplifiable_to_identity) {
     // Replace the reduction node with an identity node, that can be further
     // optimized by the model pruner.
     DataType output_type;
@@ -3043,7 +3147,7 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
   TF_RETURN_IF_ERROR(FoldGraph(optimized_graph, &nodes_to_not_simplify));
   node_map_.reset(new NodeMap(optimized_graph));
   TF_RETURN_IF_ERROR(SimplifyGraph(can_use_shape_info, optimized_graph,
-                                   &properties, nodes_to_not_simplify));
+                                   &properties, &nodes_to_not_simplify));
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index c81d3067d50d1c83232308b09de436f77920dfa9..0b778882d7d4d89d83de5d6bd5a6f9c827cf5bf8 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -96,14 +97,11 @@ class ConstantFolding : public GraphOptimizer {
   Status FoldGraph(GraphDef* output,
                    absl::flat_hash_set<string>* nodes_to_not_simplify);
 
-  bool IsSimplifiableReduction(const NodeDef& node,
-                               const GraphProperties& properties) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
-  Status SimplifyGraph(
-      bool use_shape_info, GraphDef* optimized_graph,
-      GraphProperties* properties,
-      const absl::flat_hash_set<string>& nodes_to_not_simplify);
+  Status SimplifyGraph(bool use_shape_info, GraphDef* optimized_graph,
+                       GraphProperties* properties,
+                       absl::flat_hash_set<string>* nodes_to_not_simplify);
   Status SimplifyNode(bool use_shape_info, NodeDef* node,
                       GraphDef* optimized_graph, GraphProperties* properties);
 
@@ -148,8 +146,22 @@ class ConstantFolding : public GraphOptimizer {
   bool SimplifyReshape(const GraphProperties& properties, bool use_shape_info,
                        NodeDef* node);
 
-  // Simplifies a Reduction operation to an Identity operation if applicable.
-  bool SimplifyReduction(const GraphProperties& properties, NodeDef* node);
+  // Returns true if theres a possibility that a Reduce node could be simplified
+  // to an Identity/Reshape.
+  bool IsReductionCandidateForSimplification(
+      const NodeDef& node, const GraphProperties& properties,
+      TensorShapeProto* input_tensor_shape,
+      TensorShapeProto* output_tensor_shape, bool* is_single_element_op) const;
+  // Returns true iff this reduction can be reduced to an identity (i.e if the
+  // set of dimensions to reduce along is empty). This happens often in the
+  // gradient graphs.
+  bool IsReductionSimplifiableToIdentity(
+      const NodeDef& node, const TensorShapeProto& input_shape, bool keep_dims,
+      const gtl::InlinedVector<TensorValue, 4>& reduction_indices_vector) const;
+  // Simplifies a Reduction operation to an Identity/Reshape operation if
+  // applicable.
+  bool SimplifyReduction(GraphDef* optimized_graph,
+                         const GraphProperties& properties, NodeDef* node);
 
   // Switch(x, x) will always feed false to its false branch and true to
   // its true branch. By rewriting the graph a bit, we can propagate these
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 02c45e80c5ff4a198714b981a170f6b19e5498c4..f6fdb32e989219039a44e636a20573cb707dd1ba 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2303,6 +2303,95 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
+TEST_F(ConstantFoldingTest, SingleElementEmptyAxisReduction) {
+  // Build a simple graph with reductions that involve single-element input and
+  // no axes to reduce along.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output input_var_three_dim = ops::Variable(
+      scope.WithOpName("input_var_three_dim"), {1, 1, 1}, DT_FLOAT);
+  Output input_var_one_dim =
+      ops::Variable(scope.WithOpName("input_var_one_dim"), {1}, DT_FLOAT);
+  Output one_axis = ops::Const(scope.WithOpName("one_axis"), {0}, {1});
+  Output multiple_axes =
+      ops::Const(scope.WithOpName("multiple_axes"), {1, 0}, {2});
+  Output variable_axis =
+      ops::Variable(scope.WithOpName("input_var_axis"), {1}, DT_INT32);
+  ops::Mean::Attrs attr;
+  attr = attr.KeepDims(false);
+  // Should be optimized to Reshape.
+  Output mean_1 = ops::Mean(scope.WithOpName("mean_1"), input_var_three_dim,
+                            one_axis, attr.KeepDims(false));
+  Output mean_2 = ops::Mean(scope.WithOpName("mean_2"), input_var_three_dim,
+                            multiple_axes, attr.KeepDims(false));
+  // Should remain as-is, since OutputProperties will not be known this node.
+  Output mean_3 = ops::Mean(scope.WithOpName("mean_3"), input_var_one_dim,
+                            one_axis, attr.KeepDims(false));
+  // Should remain as-is.
+  Output mean_4 = ops::Mean(scope.WithOpName("mean_4"), input_var_three_dim,
+                            variable_axis, attr.KeepDims(false));
+  // Should be optimized to Identity, since KeepDims=true.
+  Output mean_5 = ops::Mean(scope.WithOpName("mean_5"), input_var_three_dim,
+                            multiple_axes, attr.KeepDims(true));
+
+  GrapplerItem item;
+  item.fetch = {"mean_1", "mean_2", "mean_3", "mean_4", "mean_5"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Ensure Mean node is optimized to Reshape.
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "mean_1" || node.name() == "mean_2") {
+      found++;
+      EXPECT_EQ("Reshape", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("input_var_three_dim", node.input(0));
+    } else if (node.name() == "mean_3") {
+      found++;
+      EXPECT_EQ("Mean", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("input_var_one_dim", node.input(0));
+    } else if (node.name() == "mean_4") {
+      found++;
+      EXPECT_EQ("Mean", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("input_var_three_dim", node.input(0));
+    } else if (node.name() == "mean_5") {
+      found++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^multiple_axes", node.input(1));
+    }
+  }
+  EXPECT_EQ(5, found);
+
+  // Ensure resultant values from Mean and Reshape are the same.
+  auto input_var_three_dim_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 1, 1}));
+  auto input_var_one_dim_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  Tensor input_var_axis_t(DT_INT32, TensorShape({1}));
+  input_var_axis_t.flat<int32>()(0) = 0;
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch,
+                    {{"input_var_three_dim", input_var_three_dim_t},
+                     {"input_var_one_dim", input_var_one_dim_t},
+                     {"input_var_axis", input_var_axis_t}});
+  EXPECT_EQ(5, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"input_var_three_dim", input_var_three_dim_t},
+                                {"input_var_one_dim", input_var_one_dim_t},
+                                {"input_var_axis", input_var_axis_t}});
+  EXPECT_EQ(5, tensors.size());
+  for (int i = 0; i < 5; ++i) {
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+  }
+}
+
 TEST_F(ConstantFoldingTest, NoOpReshape) {
   // Build a simple graph with a reshape that can be reduced to the identity.
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 89e95067b83d70204de29d785666cf4e46fc939c..7593023ff4d649c623db9be98ac52ef6b799219f 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -628,6 +628,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:logging_ops",
         "//tensorflow/core/kernels:math",
+        "//tensorflow/core/kernels:nn",
         "//tensorflow/core/kernels:parsing",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
index 3ffbfba95ee8f93e8643ab6dda2b3c9580695b1f..89b568ecf161cda08f1b71b369c3edb1d43f2a7f 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
@@ -109,7 +109,7 @@ Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_filter_node = graph.AddNode(MakeFusedFilterNode(
         *first_filter_node, *second_filter_node, *fused_predicate, &graph));
 
-    graph.ReplaceInput(*second_filter_node, *fused_filter_node);
+    graph.UpdateFanouts(second_filter_node->name(), fused_filter_node->name());
 
     // TODO(prazek): we should run some optimizations on the fused filter
     // functions, or make sure that optimization passes run after filter
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index 91b3f71e9e3de7ff05c56e635410744876ccc3a7..5af9fbadf76bfde5b031df0978ff9447ea3afb57 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -266,7 +266,7 @@ Status HoistRandomUniform::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* stateless_map = graph.AddNode(
         MakeStatelessMap(*map_node, *zip_node, *stateless_func, &graph));
 
-    graph.ReplaceInput(*map_node, *stateless_map);
+    graph.UpdateFanouts(map_node->name(), stateless_map->name());
 
     // TODO(b/116285210): we could also remove map functions from library if
     // they are not used anymore.
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 6a5a70e084e14ba13911a4790cdb82b40c695545..16b2efb3ed3c25c4fa5b8b42205037c212140289 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -96,7 +96,8 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
       }
     }
 
-    graph.InsertNode(node, MakeLatencyNode(node, &graph));
+    NodeDef* latency_node = graph.AddNode(MakeLatencyNode(node, &graph));
+    graph.UpdateFanouts(node.name(), latency_node->name());
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
index bab2c361494da6bda98cb3e70f80b677f80dd3df..e5de981822376d2e4d1d78ac628f527d242f133a 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
@@ -47,7 +47,7 @@ Status MakeNumaAware::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (node.op() != "MapAndBatchDatasetV2") continue;
 
     auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
-    graph.ReplaceInput(node, *numa_node);
+    graph.UpdateFanouts(node.name(), numa_node->name());
     nodes_to_delete.insert(node.name());
   }
   graph.DeleteNodes(nodes_to_delete);
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 2807e0886bb84dcb1acc03f573dcf70d309ffc60..800050b840326d826328763a52c5447c8df70a99 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -113,7 +113,7 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* new_node =
         graph.AddNode(MakeMapAndBatchNode(*map_node, batch_node, &graph));
-    graph.ReplaceInput(batch_node, *new_node);
+    graph.UpdateFanouts(batch_node.name(), new_node->name());
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index 7cb52c36b2d5e90acc8b71cf511bc989db86e9d6..2b0a347ce625140be16d258964af06ef418e9f58 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -38,21 +38,28 @@ NodeDef MakeFusedNode(const NodeDef& map_node,
                       MutableGraphView* graph) {
   NodeDef fused_node;
   graph_utils::SetUniqueGraphNodeName("fused_map", graph->graph(), &fused_node);
-  fused_node.set_op("MapDataset");
-  fused_node.add_input(map_node.input(0));
+  fused_node.set_op(map_node.op());
+
+  // Copy over inputs.
+  for (int i = 0; i < map_node.input_size(); ++i) {
+    fused_node.add_input(map_node.input(i));
+  }
 
   auto attr = map_node.attr().at("f");
   attr.mutable_func()->set_name(fused_function.signature().name());
   (*fused_node.mutable_attr())["f"] = std::move(attr);
 
-  graph_utils::CopyAttribute("Targuments", map_node, &fused_node);
-
-  for (auto key : {"output_shapes", "output_types"})
+  // Required attrs.
+  for (auto key : {"Targuments", "output_shapes", "output_types"}) {
     graph_utils::CopyAttribute(key, map_node, &fused_node);
+  }
 
-  if (const auto* attr =
-          gtl::FindOrNull(map_node.attr(), "use_inter_op_parallelism"))
-    (*fused_node.mutable_attr())["use_inter_op_parallelism"] = *attr;
+  // Optional attrs.
+  for (auto key : {"use_inter_op_parallelism", "sloppy"}) {
+    if (const auto* attr = gtl::FindOrNull(map_node.attr(), key)) {
+      graph_utils::CopyAttribute(key, map_node, &fused_node);
+    }
+  }
 
   // Add the predicate output attributes.
   (*fused_node.mutable_attr())["output_types"]
@@ -97,7 +104,9 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
   auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
-    if (node.op() == "MapDataset") return &node;
+    if (node.op() == "MapDataset" || node.op() == "ParallelMapDataset") {
+      return &node;
+    }
     return nullptr;
   };
 
@@ -145,7 +154,7 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* filter_by_component = graph.AddNode(
         MakeFilterByLastComponentNode(*fused_maps, *filter_node, &graph));
 
-    graph.ReplaceInput(*filter_node, *filter_by_component);
+    graph.UpdateFanouts(filter_node->name(), filter_by_component->name());
     TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_function));
 
     // TODO(prazek): we could also remove functions from library if they are not
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
index 6e6da37d7c20dee92bfe3676fa838ce82dd9222f..c5a5e22aba6cd2af4b2de9fa516e49b00e6e0c12 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
@@ -30,6 +30,7 @@ namespace grappler {
 namespace {
 using graph_tests_utils::MakeFilterNode;
 using graph_tests_utils::MakeMapNode;
+using graph_tests_utils::MakeParallelMapNode;
 
 TEST(MapAndFilterFusionTest, FuseMapAndFilter) {
   using test::function::NDef;
@@ -58,6 +59,41 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilter) {
       graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output));
 }
 
+TEST(MapAndFilterFusionTest, FuseParallelMapAndFilter) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 3}, {"dtype", "DT_INT32"}}),
+       MakeParallelMapNode("map", "range", "num_parallel_calls", "XTimesTwo",
+                           /*sloppy=*/false),
+       MakeFilterNode("filter", "map")},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+          test::function::IsZero(),
+      });
+
+  MapAndFilterFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ParallelMapDataset", output))
+      << output.DebugString();
+  auto& map_node = output.node(
+      graph_utils::FindGraphNodeWithOp("ParallelMapDataset", output));
+  EXPECT_FALSE(map_node.attr().at("sloppy").b()) << map_node.DebugString();
+  EXPECT_TRUE(
+      graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output))
+      << output.DebugString();
+}
+
 TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) {
   using test::function::NDef;
   GrapplerItem item;
@@ -103,6 +139,56 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) {
   EXPECT_EQ(cache_node.input(0), filter_by_component.name());
 }
 
+TEST(MapAndFilterFusionTest, FuseParallelMapAndFilterWithExtraChild) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 3}, {"dtype", "DT_INT32"}}),
+       MakeParallelMapNode("map", "range", "num_parallel_calls", "XTimesTwo",
+                           /*sloppy=*/true),
+       MakeFilterNode("filter", "map"),
+       NDef("cache", "CacheDataset", {"filter", "filename"}, {})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+          test::function::IsZero(),
+      });
+
+  MapAndFilterFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter", output));
+  ASSERT_TRUE(graph_utils::ContainsNodeWithOp("ParallelMapDataset", output));
+  ASSERT_TRUE(
+      graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output));
+  ASSERT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
+
+  int map_id = graph_utils::FindGraphNodeWithOp("ParallelMapDataset", output);
+  auto& map_node = output.node(map_id);
+  ASSERT_EQ(map_node.input_size(), 2);
+  EXPECT_EQ(map_node.input(0), "range");
+  EXPECT_EQ(map_node.input(1), "num_parallel_calls");
+
+  int filter_by_component_id =
+      graph_utils::FindGraphNodeWithOp("FilterByLastComponentDataset", output);
+  auto& filter_by_component = output.node(filter_by_component_id);
+  ASSERT_EQ(filter_by_component.input_size(), 1);
+  EXPECT_EQ(filter_by_component.input(0), map_node.name());
+
+  int cache_id = graph_utils::FindGraphNodeWithOp("CacheDataset", output);
+  auto& cache_node = output.node(cache_id);
+  ASSERT_EQ(cache_node.input_size(), 2);
+  EXPECT_EQ(cache_node.input(0), filter_by_component.name());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index 23bb49db62bff2dbdbecc5c4847b86637ef2755d..6ca0da27551bc78a9167d308eb229c662821c582 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -123,7 +123,7 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_maps_node = graph.AddNode(
         MakeFusedNode(*parent_map_node, *map_node, *fused_function, &graph));
 
-    graph.ReplaceInput(*map_node, *fused_maps_node);
+    graph.UpdateFanouts(map_node->name(), fused_maps_node->name());
 
     // TODO(prazek): we should run some optimizations on the fused map
     // functions, or make sure that optimization passes run after map
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
index f4c861745717cf9cd0ae45ee74f6f59894282899..8e49f908a77288c8e99b62706578d86a272ab682 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
@@ -83,7 +83,7 @@ Status MapParallelization::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (!CanParallelize(*function, function_library)) continue;
 
     auto* parallel_map = graph.AddNode(MakeParallelMap(*map_node, &graph));
-    graph.ReplaceInput(*map_node, *parallel_map);
+    graph.UpdateFanouts(map_node->name(), parallel_map->name());
     nodes_to_delete.insert(map_node->name());
   }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index 04ab46885cf0499defb318c30aa2cf3448ea2b7b..3401dcc6f23bae1b2e77d5ea18a94f382fee4fb8 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -264,7 +264,7 @@ Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* new_map_node = graph.AddNode(MakeNewMapNode(
         *map_node, batch_node, *new_batch_node, *vectorized_func, &graph));
-    graph.ReplaceInput(batch_node, *new_map_node);
+    graph.UpdateFanouts(batch_node.name(), new_map_node->name());
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index 763434b6136b866b725247fdeeb04effb643e32e..bd405c8329464793ee42757bc7ee1a3f34826bd9 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -79,7 +79,7 @@ Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (!IsNoOp(node, graph)) continue;
 
     NodeDef* const parent = graph_utils::GetInputNode(node, graph);
-    graph.ReplaceInput(node, *parent);
+    graph.UpdateFanouts(node.name(), parent->name());
 
     nodes_to_delete.insert(node.name());
   }
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index 99c4afa6340094991ffa9646710a7febc66f7d64..d9af78d38cd590f5eecefe4d70c7e45dd94985c0 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -86,7 +86,7 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
 
     NodeDef* shuffle_and_repeat_node =
         graph.AddNode(make_shuffle_and_repeat_node(shuffle_node, repeat_node));
-    graph.ReplaceInput(repeat_node, *shuffle_and_repeat_node);
+    graph.UpdateFanouts(repeat_node.name(), shuffle_and_repeat_node->name());
 
     // Mark the `Shuffle` and `Repeat` nodes for removal.
     nodes_to_delete.insert(shuffle_node.name());
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
index 49ba6c2ba9f0cd43e88d365e698f2aabe1ad0410..541302361fb07066127196166750e6f5324b7d98 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
@@ -33,6 +33,7 @@ cc_library(
     deps = [
         ":wrapped_tensor",
         "//tensorflow/core:core_cpu",
+        "//tensorflow/cc:ops",
         "//tensorflow/core:lib",
     ] + tf_protos_all(),
 )
@@ -120,5 +121,6 @@ cc_library(
         ":unpack_vectorizer",
         ":vectorizer",
         ":vectorizer_registry",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc
index 709882e45aec2b365f14f28b104acbcc93d90461..9d853f84a8a7bad557452f3cbd14db05bef58bf1 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc
@@ -43,87 +43,78 @@ const char* const kExpandDimsPrefix = "vectorized/expanddims/";
 // with shape [n, 12, 7, 5]: we need to manually expand the dimensions of A
 // *after* the leading dimension, i.e. expand A to the shape [n, 1, 1, 5] before
 // broadcasting.
-Status ExpandDimsForBroadcast(std::vector<WrappedTensor>* inputs, Graph* g) {
+Status ExpandDimsForBroadcast(VectorizerInput* inputs, Graph* g) {
   Status status;
   Scope parent = NewInternalScope(g, &status, nullptr);
-  Scope s = parent.NewSubScope(kExpandDimsPrefix);
+  Scope scope = parent.NewSubScope(kExpandDimsPrefix);
 
   // TODO(rachelim): We can potentially get rid of all these ops if shapes are
   // known statically
 
-  Output const_0 = ops::Const(s, 0);
-  Output const_1 = ops::Const(s, 1);
-
-  std::vector<Output> ranks;
-  ranks.reserve(inputs->size());
-
   // Get the stacked rank of each input
-  for (const auto& input : *inputs) {
-    Output rank = ops::Rank(s, Output(input.node, input.output_index));
+  auto get_stacked_rank = [&scope](const WrappedTensor& input) {
+    Output rank = ops::Rank(scope, Output(input.node, input.output_index));
 
     if (!input.stacked) {
       // If the input is unstacked, add 1
-      rank = ops::Add(s, rank, const_1);
+      rank = ops::Add(scope, rank, ops::Const(scope, 1));
     }
 
-    ranks.push_back(rank);
-  }
-
-  // Pack the ranks into one tensor to get the max
-  Output packed_ranks = ops::Stack(s, ranks);
+    return rank;
+  };
 
-  Output max_rank =
-      ops::Max(s, packed_ranks, const_0, ops::Max::Attrs().KeepDims(true));
+  Output rank_0 = get_stacked_rank(inputs->at(0));
+  Output rank_1 = get_stacked_rank(inputs->at(1));
 
-  std::vector<WrappedTensor> expanded_inputs;
-  expanded_inputs.reserve(inputs->size());
+  Output max_rank = ops::Maximum(scope, rank_0, rank_1);
 
   // For all inputs that are stacked, expand dimensions after dim 0.
-  for (size_t i = 0; i < inputs->size(); ++i) {
-    if (!inputs->at(i).stacked) {
-      expanded_inputs.push_back(inputs->at(i));
-      continue;
-    }
-
-    Output input(inputs->at(i).node, inputs->at(i).output_index);
+  auto expand_dims_if_unstacked =
+      [&scope, &max_rank](const WrappedTensor& tensor, const Output& rank) {
+        if (!tensor.stacked)
+          return WrappedTensor(tensor.node, tensor.output_index, false);
 
-    // Number of dimensions to expand
-    Output rank_diff = ops::Sub(s, max_rank, ranks[i]);
+        Output input(tensor.node, tensor.output_index);
 
-    // [1] * rank_diff
-    Output ones = ops::Tile(s, ops::Const(s, {1}), rank_diff);
+        Output rank_diff = ops::Sub(scope, max_rank, rank);
 
-    Output const_vec_1 = ops::Const(s, {1});
+        // [1] * rank_diff
+        Output ones = ops::Fill(
+            scope, ops::ExpandDims(scope, rank_diff, ops::Const(scope, 0)),
+            ops::Const(scope, 1));
 
-    Output shape = ops::Shape(s, input);
+        Output shape = ops::Shape(scope, input);
 
-    // shape[:1]
-    Output concat_pre =
-        ops::StridedSlice(s, shape, const_vec_1, const_vec_1, const_vec_1,
-                          ops::StridedSlice::Attrs().BeginMask(1));
+        Output const_vec_1 = ops::Const(scope, {1});
+        // shape[:1]
+        Output concat_pre = ops::StridedSlice(
+            scope, shape, const_vec_1, const_vec_1, const_vec_1,
+            ops::StridedSlice::Attrs().BeginMask(1));
 
-    // shape[1:]
-    Output concat_post =
-        ops::StridedSlice(s, shape, const_vec_1, const_vec_1, const_vec_1,
-                          ops::StridedSlice::Attrs().EndMask(1));
+        // shape[1:]
+        Output concat_post = ops::StridedSlice(
+            scope, shape, const_vec_1, const_vec_1, const_vec_1,
+            ops::StridedSlice::Attrs().EndMask(1));
 
-    // tf.concat([shape[:1], ones, shape[1:]], 0)
-    Output new_shape = ops::Concat(s, {concat_pre, ones, concat_post}, const_0);
+        // tf.concat([shape[:1], ones, shape[1:]], 0)
+        Output new_shape = ops::Concat(scope, {concat_pre, ones, concat_post},
+                                       ops::Const(scope, 0));
 
-    Output result = ops::Reshape(s, input, new_shape);
+        Output reshaped = ops::Reshape(scope, input, new_shape);
 
-    expanded_inputs.push_back({result.node(), 0, true});
-  }
+        return WrappedTensor(reshaped.node(), 0, true);
+      };
 
-  inputs->swap(expanded_inputs);
-  return status;
+  *inputs = VectorizerInput({expand_dims_if_unstacked(inputs->at(0), rank_0),
+                             expand_dims_if_unstacked(inputs->at(1), rank_1)});
+  return Status::OK();
 }
 
 // Vectorization helper for component-wise ops. Since these operations act
 // component-wise, the vectorized op is the same as the original.
 Status CwiseVectorizeHelper(const Node& node, Graph* outer_scope,
-                            std::vector<WrappedTensor>&& inputs,
-                            std::vector<WrappedTensor>* outputs) {
+                            VectorizerInput&& inputs,
+                            VectorizerOutput* outputs) {
   // Add new node with the same op type and attrs as the original node
   Node* new_node;
   auto node_builder = NodeBuilder(strings::StrCat("vectorized/", node.name()),
@@ -144,8 +135,8 @@ Status CwiseVectorizeHelper(const Node& node, Graph* outer_scope,
 class UnaryCwiseOpVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
     if (inputs.size() != 1) {
       return errors::Internal("Failed to vectorize ", node.type_string(),
                               ". The op should have 1 input, but has ",
@@ -159,8 +150,8 @@ class UnaryCwiseOpVectorizer : public Vectorizer {
 class BinaryCwiseOpVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
     if (inputs.size() != 2) {
       return errors::Internal("Failed to vectorize ", node.type_string(),
                               ". The op should have 2 input, but has ",
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc
index c4460387bbfd41c1ad85fa2d8e2e0bf4be9b9dfe..76c0047747645915456eac7eef887d8eb302ba15 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc
@@ -25,30 +25,21 @@ namespace {
 class DecodeCSVVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
-    if (!inputs[0].stacked) {
-      return errors::InvalidArgument("Expecting input 0 to be stacked.");
-    }
-    for (size_t i = 1; i < inputs.size(); ++i) {
-      if (inputs[i].stacked) {
-        // Record defaults should not be stacked
-        return errors::InvalidArgument("Expecting input ", i,
-                                       "to be unstacked.");
-      }
-    }
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    NodeBuilder::NodeOut records;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &records));
 
     std::vector<NodeBuilder::NodeOut> defaults;
-    defaults.reserve(inputs.size() - 1);
+    defaults.resize(inputs.size() - 1);
     for (size_t i = 1; i < inputs.size(); ++i) {
-      defaults.emplace_back(inputs[i].node, inputs[i].output_index);
+      TF_RETURN_IF_ERROR(inputs.unstacked(i, &defaults[i - 1]));
     }
 
     Node* new_node;
-    auto node_builder =
-        NodeBuilder(node.type_string(), node.type_string())
-            .Input(inputs[0].node, inputs[0].output_index)  // records;
-            .Input(defaults);                               // defaults
+    auto node_builder = NodeBuilder(node.type_string(), node.type_string())
+                            .Input(records)
+                            .Input(defaults);
 
     for (const auto& attr : node.attrs()) {
       node_builder = node_builder.Attr(attr.first, attr.second);
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc
index 7d0edfb386df8dffc5f34a1b363be363f6cb6423..f81b2d01d99452adfb970d1c81b3dd2e6ea3ae1d 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc
@@ -27,23 +27,15 @@ namespace {
 class ParseSingleExampleVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
-    if (!inputs[0].stacked) {
-      return errors::InvalidArgument("Expecting input 0 to be stacked.");
-    }
-    for (size_t i = 1; i < inputs.size(); ++i) {
-      if (inputs[i].stacked) {
-        // Dense defaults should not be stacked
-        return errors::InvalidArgument("Expecting input ", i,
-                                       "to be unstacked.");
-      }
-    }
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    NodeBuilder::NodeOut serialized;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &serialized));
 
     std::vector<NodeBuilder::NodeOut> dense_defaults;
-    dense_defaults.reserve(inputs.size() - 1);
+    dense_defaults.resize(inputs.size() - 1);
     for (size_t i = 1; i < inputs.size(); ++i) {
-      dense_defaults.emplace_back(inputs[i].node, inputs[i].output_index);
+      TF_RETURN_IF_ERROR(inputs.unstacked(i, &dense_defaults[i - 1]));
     }
 
     Status scope_status;
@@ -79,11 +71,11 @@ class ParseSingleExampleVectorizer : public Vectorizer {
     Node* new_node;
     auto node_builder =
         NodeBuilder(strings::StrCat("vectorized/", node.name()), "ParseExample")
-            .Input(inputs[0].node, inputs[0].output_index)  // serialized
-            .Input(names)                                   // names
-            .Input(sparse_keys)                             // sparse_keys
-            .Input(dense_keys)                              // dense_keys
-            .Input(dense_defaults);                         // dense_defaults
+            .Input(serialized)
+            .Input(names)
+            .Input(sparse_keys)
+            .Input(dense_keys)
+            .Input(dense_defaults);
 
     for (const auto& attr : {"sparse_types", "dense_shapes"}) {
       // Copy attrs if they exist
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc
index dfb855ffa5113871cf2ff6d96a6a1d03cc7ee6ca..a094bfd1de4fe48811584e2dcf93fc67b6bb94da 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc
@@ -47,23 +47,18 @@ Output GetVectorizedShape(Scope* s, Output tensor, Output original_shape) {
 class ReshapeVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
-    if (!inputs[0].stacked || inputs[1].stacked) {
-      return errors::InvalidArgument(
-          "Expecting input 0 (`tensor`) to be stacked and input 1 (`shape`) to "
-          "be unstacked.");
-    }
-
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
     Status status;
     Scope parent = NewInternalScope(outer_scope, &status, nullptr);
     Scope s = parent.NewSubScope(kReshapePrefix);
 
-    Output tensor = {inputs[0].node, inputs[0].output_index};
+    Output tensor, shape;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &tensor));
+    TF_RETURN_IF_ERROR(inputs.unstacked(1, &shape));
+
     Output vectorized_reshape =
-        ops::Reshape(s, tensor,
-                     GetVectorizedShape(
-                         &s, tensor, {inputs[1].node, inputs[1].output_index}));
+        ops::Reshape(s, tensor, GetVectorizedShape(&s, tensor, shape));
 
     TF_RETURN_IF_ERROR(status);
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc
index 4c286d9c4a925b0ffc7dda352adc84cf44865840..45ad72bb7af4c3b4c73ff7a3ee93fc8e15eb7af4 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc
@@ -41,20 +41,18 @@ constexpr char kTransposePrefix[] = "vectorized/transpose";
 class TransposeVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
-    if (!inputs[0].stacked || inputs[1].stacked) {
-      return errors::InvalidArgument(
-          "Expecting input 0 (`x`) to be stacked and input 1 (`perm`) to "
-          "be unstacked.");
-    }
-
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
     Status status;
     Scope parent = NewInternalScope(outer_scope, &status, /*refiner=*/nullptr);
     Scope scope = parent.NewSubScope(kTransposePrefix);
 
-    Output tensor = {inputs[0].node, inputs[0].output_index};
-    Output original_perm = {inputs[1].node, inputs[1].output_index};
+    Output tensor, original_perm;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &tensor));
+    TF_RETURN_IF_ERROR(inputs.unstacked(1, &original_perm));
+    if (original_perm.type() != DT_INT32) {
+      original_perm = ops::Cast(scope, original_perm, DT_INT32);
+    }
 
     // The vectorized permutation is the original permutation with an additional
     // leading 0 and all other values incremented by 1.
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
index 13b8500eda6a2f1864261dc0ec746c6b28895d62..6e00c0cb05128b2efe2a55b10e1a96060f94266d 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
@@ -24,16 +24,10 @@ namespace {
 class UnpackVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
-    Status s;
-    if (node.num_inputs() != 1 || inputs.size() != 1) {
-      return errors::Internal("Unpack op should only have one input.");
-    }
-
-    // Add new Unpack node with the same op and attrs as the original node
-    auto new_unpack_node = outer_scope->AddNode(node.def(), &s);
-    TF_RETURN_IF_ERROR(s);
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    NodeBuilder::NodeOut value;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &value));
 
     int axis = 0;
     if (HasNodeAttr(node.def(), "axis")) {
@@ -46,17 +40,21 @@ class UnpackVectorizer : public Vectorizer {
       // Note: negative axis values wrap around.
       axis += 1;
     }
-    new_unpack_node->AddAttr("axis", axis);
-
-    outer_scope->AddEdge(inputs[0].node, inputs[0].output_index,
-                         new_unpack_node, 0);
 
     int num;
     TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), "num", &num));
 
+    Node* new_node;
+    TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("vectorized/", node.name()),
+                                   node.type_string())
+                           .Input(value)
+                           .Attr("axis", axis)
+                           .Attr("num", num)
+                           .Finalize(outer_scope, &new_node));
+
     // Add the output mappings
     for (int i = 0; i < num; ++i) {
-      outputs->push_back({new_unpack_node, i, true});
+      outputs->push_back({new_node, i, true});
     }
 
     return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
index 8d4676aae0700250a274ee02c7a83d5815463936..7c9905f89ad1b6969b95ed708b9dd2dd7da6bb35 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -25,6 +26,72 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+// Represents the outputs of a vectorized op. Currently, a simple type alias
+// provided for symmetry with `VectorizerInput`.
+using VectorizerOutput = std::vector<WrappedTensor>;
+
+// Represents the inputs of a vectorized op. Supports iteration, random access,
+// and retrieval of stacked and unstacked tensor inputs.
+class VectorizerInput {
+ public:
+  VectorizerInput(std::vector<WrappedTensor>&& inputs)
+      : inputs_(std::move(inputs)) {}
+
+  // Gets the stacked tensor input at position index. Returns an error if
+  // the tensor at index is unstacked. The type T must have a (Node*, int)
+  // constructor.
+  template <class T>
+  Status stacked(int index, T* result) const {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, size());
+
+    if (!inputs_[index].stacked) {
+      return errors::InvalidArgument("Expecting input ", index,
+                                     " to be stacked.");
+    }
+    *result = {inputs_[index].node, inputs_[index].output_index};
+    return Status::OK();
+  }
+
+  // Gets the unstacked tensor input at position index. Returns an error if
+  // the tensor at index is stacked. The type T must have a (Node*, int)
+  // constructor.
+  template <class T>
+  Status unstacked(int index, T* result) const {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, size());
+
+    if (inputs_[index].stacked) {
+      return errors::InvalidArgument("Expecting input ", index,
+                                     " to be unstacked.");
+    }
+    *result = {inputs_[index].node, inputs_[index].output_index};
+    return Status::OK();
+  }
+
+  // Returns a const reference to the element at specified location index.
+  const WrappedTensor& at(int index) const {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, size());
+    return inputs_.at(index);
+  }
+
+  // Returns a const iterator pointing to the first wrapped tensor input.
+  std::vector<WrappedTensor>::const_iterator begin() const {
+    return inputs_.begin();
+  }
+  // Returns a const iterator pointing to the past-the-end wrapped tensor input.
+  std::vector<WrappedTensor>::const_iterator end() const {
+    return inputs_.end();
+  }
+
+  // Returns the number of input tensors.
+  size_t size() const { return inputs_.size(); }
+
+ private:
+  std::vector<WrappedTensor> inputs_;
+};
+
 // Interface for vectorization of TensorFlow operations. See `CastVectorizer`
 // for an example.
 class Vectorizer {
@@ -40,8 +107,8 @@ class Vectorizer {
   // value in `outputs` corresponds to the i'th output port of the node
   // to be converted.
   virtual Status Vectorize(const Node& node, Graph* outer_scope,
-                           std::vector<WrappedTensor>&& inputs,
-                           std::vector<WrappedTensor>* outputs) = 0;
+                           VectorizerInput&& inputs,
+                           VectorizerOutput* outputs) = 0;
 };
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 054aeb9a8ff0077d9b5ac4d6bfd0737faaa979dc..0eee91f241a8e3c09b93a159c93addb43e749b02 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -24,8 +24,8 @@ namespace grappler {
 class TestVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<WrappedTensor>&& inputs,
-                   std::vector<WrappedTensor>* outputs) override {
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
     return Status::OK();
   }
 };
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index ddfcc92d293f893f4d84c3bd0016d24521ef8619..7fee3ae9d51bcdb234945a6000985fb5531000a0 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -154,11 +154,8 @@ int DependencyOptimizer::NumEdgesIfBypassed(
 
     for (auto consumer : output_nodes) {
       for (int j = 0; j < consumer->input_size(); ++j) {
-        const string& consumer_input = consumer->input(j);
-        int consumer_input_pos;
-        StringPiece consumer_input_node_name =
-            ParseNodeNameAsStringPiece(consumer_input, &consumer_input_pos);
-        if (consumer_input_node_name == node.name()) {
+        const TensorId consumer_input = ParseTensorName(consumer->input(j));
+        if (consumer_input.node() == node.name()) {
           if (IsControlInput(consumer_input)) {
             num_edges_if_bypassed += num_inputs;
           } else {
@@ -248,11 +245,9 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       bool optimize_fanout = false;
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
-        int pos;
-        StringPiece input_name =
-            ParseNodeNameAsStringPiece(fanout->input(i), &pos);
-        if (input_name == node_name) {
-          if (pos < 0) {
+        const TensorId input_tensor = ParseTensorName(fanout->input(i));
+        if (input_tensor.node() == node_name) {
+          if (input_tensor.index() < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
             fanout->mutable_input()->RemoveLast();
             optimize_fanout = true;
@@ -396,20 +391,19 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
           const string& input_to_forward = node->input(i);
           CHECK(!IsControlInput(input_to_forward));
           for (int j = 0; j < consumer->input_size(); ++j) {
-            const string& old_input = consumer->input(j);
-            int old_input_pos;
-            StringPiece old_input_node_name =
-                ParseNodeNameAsStringPiece(old_input, &old_input_pos);
-            if (old_input_node_name == node_name) {
-              if (old_input_pos == i) {
+            const TensorId old_input = ParseTensorName(consumer->input(j));
+            if (old_input.node() == node_name) {
+              if (old_input.index() == i) {
                 // Regular input
                 new_input = input_to_forward;
-                node_map_->UpdateInput(consumer->name(), old_input, new_input);
+                node_map_->UpdateInput(consumer->name(), old_input.ToString(),
+                                       new_input);
                 consumer->set_input(j, new_input);
-              } else if (old_input_pos == -1) {
+              } else if (old_input.index() == -1) {
                 // Control dependency
                 new_input = AsControlDependency(NodeName(input_to_forward));
-                node_map_->UpdateInput(consumer->name(), old_input, new_input);
+                node_map_->UpdateInput(consumer->name(), old_input.ToString(),
+                                       new_input);
                 consumer->set_input(j, new_input);
               }
             }
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
index 2c36c9b7b314669402108c5f5a864eb731002fcf..75ad8bffefd8aa00bb1ba88c10ed9b1170a0d25f 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -32,6 +34,73 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+Status UpdateNodeDef(NodeDef* node_def, const string& funcName,
+                     const FunctionApiInfo& apiInfo) {
+  VLOG(3) << "Node def before swap is: " << node_def->DebugString();
+  auto tin = node_def->mutable_attr()->find("Tin");
+  tin->second.mutable_list()->clear_type();
+  for (const auto& tin_dtype : apiInfo.input_arg_dtypes()) {
+    tin->second.mutable_list()->add_type(tin_dtype);
+  }
+
+  auto tout = node_def->mutable_attr()->find("Tout");
+  tout->second.mutable_list()->clear_type();
+  for (const auto& tout_dtype : apiInfo.output_arg_dtypes()) {
+    tout->second.mutable_list()->add_type(tout_dtype);
+  }
+
+  if (apiInfo.function_type() == FunctionApiInfo::BACKWARD) {
+    // Update the inputs since for backward function, it might have different
+    // number of inputs due the different number output from forward function.
+    // The output of forward function are composed by two parts:
+    //   1. Real output tensors from defun.
+    //   2. Internal states that will be used for gradient calculation.
+    // Part 1 will be static, and part 2 could be different based on the
+    // different implementation.
+
+    const int prev_input_size = node_def->input_size();
+    const int diff = prev_input_size - apiInfo.input_arg_dtypes().size();
+    if (diff >= 0) {
+      for (int i = 0; i < diff; ++i) node_def->mutable_input()->RemoveLast();
+    } else {
+      // Adding new inputs for internal states, the name of the internal states
+      // should be in format "{forward_node_name}:{index}", where the newly
+      // added index should start from last index of the state.
+      // Eg:
+      // {
+      //   input: "gradients/unified_lstm/strided_slice_1_grad/StridedSliceGrad"
+      //   input: "gradients/zeros_like_1"
+      //   input: "gradients/zeros_like_2"
+      //   input: "unified_lstm/StatefulPartitionedCall:3"
+      //   input: "unified_lstm/StatefulPartitionedCall:4"
+      //   # New input should be "unified_lstm/StatefulPartitionedCall:5"
+      // }
+      const string last_input = node_def->input(prev_input_size - 1);
+      const std::vector<string> name_index = ::absl::StrSplit(last_input, ':');
+      if (name_index.size() != 2) {
+        return errors::InvalidArgument(
+            "Invalid format of input node name: ", last_input,
+            " Expected: {forward_node_name}:{index}");
+      }
+      const absl::string_view node_name = name_index[0];
+      int last_index;
+      if (!::absl::SimpleAtoi(name_index[1], &last_index)) {
+        return errors::InvalidArgument(
+            "The index of input node is expected to be number, got: ",
+            name_index[1]);
+      }
+      for (int i = 1; i <= -diff; ++i)
+        node_def->add_input(strings::StrCat(node_name, ":", i + last_index));
+    }
+  }
+
+  node_def->mutable_attr()->find("f")->second.mutable_func()->set_name(
+      funcName);
+
+  VLOG(3) << "Node def after swap is: " << node_def->DebugString();
+  return Status::OK();
+}
+
 Status ExperimentalImplementationSelector::LoadFunctions(
     const GraphDef& graph) {
   lib_info_.reset(new FunctionLibraryApiInfo);
@@ -43,8 +112,11 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
     NodeDef* node_def) const {
   // There are two ways of calling functions:
   //  1. By specifying an op name as a function name, or
-  //  2. Via the @defun functional interface, where the real function name
-  //     appear as the attribute with type func.
+  //  2. Via the @defun functional interface, where the real function call
+  //     happens with partitionedcall op, and the function name appear as the
+  //     attribute with name "f" and type func. In this use case, there are more
+  //     attributes need to be taken care, like Tin and Tout which take care of
+  //     the DTYPE of input/output.
   std::vector<string> function_attribute_names;
   for (const auto& attr : node_def->attr()) {
     if (attr.second.has_func() &&
@@ -70,22 +142,29 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
 
   for (const auto& attr_name : function_attribute_names) {
     string function_name = node_def->attr().at(attr_name).func().name();
-    string best_function_name;
-    lib_info_->GetBestImplementation(function_name, parsed_name.type,
-                                     &best_function_name);
-    if (function_name != best_function_name) {
-      node_def->mutable_attr()
-          ->find(attr_name)
-          ->second.mutable_func()
-          ->set_name(best_function_name);
+    std::vector<string> equiv_func_names;
+    TF_RETURN_IF_ERROR(lib_info_->GetEquivalentImplementations(
+        function_name, &equiv_func_names));
+    for (const auto& func_name : equiv_func_names) {
+      const auto& func_api_info = lib_info_->GetApiInfo(func_name);
+      if (func_api_info->preferred_device() == parsed_name.type) {
+        VLOG(2) << "Swapping: " << function_name << " TO: " << func_name;
+        TF_RETURN_IF_ERROR(UpdateNodeDef(node_def, func_name, *func_api_info));
+        break;
+      }
     }
   }
+
   if (lib_info_->GetApiInfo(node_def->op()) != nullptr) {
-    string best_function_name;
-    lib_info_->GetBestImplementation(node_def->op(), parsed_name.type,
-                                     &best_function_name);
-    if (node_def->op() != best_function_name) {
-      node_def->set_op(best_function_name);
+    std::vector<string> equiv_func_names;
+    TF_RETURN_IF_ERROR(lib_info_->GetEquivalentImplementations(
+        node_def->op(), &equiv_func_names));
+    for (const string& func_name : equiv_func_names) {
+      const auto func_api_info = lib_info_->GetApiInfo(func_name);
+      if (func_api_info->preferred_device() == parsed_name.type) {
+        node_def->set_op(func_name);
+        break;
+      }
     }
   }
   return Status::OK();
@@ -93,6 +172,11 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
 
 Status ExperimentalImplementationSelector::SelectImplementation(
     GraphDef* graph) const {
+  if (!graph->has_library()) {
+    VLOG(2) << "Skipping graph since it does not have function def";
+    return Status::OK();
+  }
+
   for (int k = 0; k < graph->node_size(); ++k)
     TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k)));
 
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
index 3f1ebefac68a1e9b86acea0ddb9dd1c6a638ac6e..e1ac7766d34af69668a57e20acc945a1c975fd1b 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
@@ -133,6 +133,101 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(2.0f));
 }
 
+TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+  // boost_1 returns the doubled input and a const as the internal state, the
+  // state will be feed to gradient function to mimic the behavior of backward
+  // function of defun that use internal states as extra inputs.
+  FunctionDef boost_1 = FDH::Create(
+      "Boost1", {"x:float"}, {"z:float", "s:float"}, {},
+      {{{"boost"}, "Add", {"x", "x"}, {{"T", DT_FLOAT}}},
+       FDH::Const("one", 1.0f)},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "boost:z:0"}, {"s", "one:output:0"}});
+  auto* boost_1_attr = boost_1.mutable_attr();
+  (*boost_1_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_1_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_attr)["backward_function_name"].set_s("BoostCpuGradient");
+
+  FunctionDef boost_1_gradient = FDH::Create(
+      "Boost1Gradient", {"x:float", "s:float"}, {"dx:float"}, {},
+      {FDH::Const("two", 2.0f),
+       {{"grad"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"dx", "grad:z:0"}});
+  auto* boost_1_grad_attr = boost_1_gradient.mutable_attr();
+  (*boost_1_grad_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_1_grad_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_grad_attr)["forward_function_name"].set_s("BoostCpu");
+
+  // boost_2 return the input * 4, and with two extra internal states.
+  FunctionDef boost_2_func = FDH::Create(
+      "Boost2", {"x:float"}, {"z:float", "s1:float", "s2:float"}, {},
+      {FDH::Const("four", 4.0f),
+       {{"boost"}, "Mul", {"x", "four:output:0"}, {{"T", DT_FLOAT}}},
+       FDH::Const("one", 1.0f),
+       FDH::Const("two", 2.0f)},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "boost:z:0"}, {"s1", "one:output:0"}, {"s2", "two:output:0"}});
+  auto* boost_2_attr = boost_2_func.mutable_attr();
+  (*boost_2_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_attr)["backward_function_name"].set_s("BoostGpuGradient");
+
+  FunctionDef boost_2_gradient = FDH::Create(
+      "Boost2Gradient", {"x:float", "s1:float", "s2:float"}, {"dx:float"}, {},
+      {FDH::Const("four", 4.0f),
+       {{"grad"}, "Mul", {"x", "four:output:0"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"dx", "grad:z:0"}});
+  auto* boost_2_grad_attr = boost_2_gradient.mutable_attr();
+  (*boost_2_grad_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_2_grad_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_grad_attr)["forward_function_name"].set_s("BoostGpu");
+
+  // Define the forward function with f = boost2 function but with CPU device.
+  // Expect the grappler plugin to swap f and attributes to use the boost1.
+  const auto forward =
+      NDef("lstm/StatefulPartitionedCall", "StatefulPartitionedCall", {"input"},
+           {{"Tin", DataTypeSlice{DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+            {"f", FDH::FunctionRef("Boost2")}},
+           CpuDevice);
+  const auto backward =
+      NDef("gradient/lstm/StatefulPartitionedCall", "StatefulPartitionedCall",
+           {"input", "lstm/StatefulPartitionedCall:1",
+            "lstm/StatefulPartitionedCall:2"},
+           {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_FLOAT}},
+            {"f", FDH::FunctionRef("Boost2Gradient")}},
+           CpuDevice);
+
+  ExperimentalImplementationSelector optimizer;
+  GraphDef output;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("input", "Placeholder", {}, {{"dtype", DT_FLOAT}}, CpuDevice),
+       forward, backward,
+       NDef("output", "Identity", {"lstm/StatefulPartitionedCall:0"},
+            {{"T", DT_FLOAT}}, CpuDevice)},
+      // FunctionLib
+      {boost_1, boost_1_gradient, boost_2_func, boost_2_gradient});
+
+  const Tensor input = test::AsScalar<float>(1.0f);
+  item.fetch = {"output"};
+  item.feed.emplace_back("input", input);
+
+  const auto four_times_boosted_tensor = EvaluateFetchNodes(item);
+  test::ExpectTensorEqual<float>(four_times_boosted_tensor[0],
+                                 test::AsScalar<float>(4.0f));
+
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  GrapplerItem optimized(item, std::move(output));
+  const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
+                                 test::AsScalar<float>(2.0f));
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.cc b/tensorflow/core/grappler/optimizers/function_api_info.cc
index 798e0f6fd55930f437d7a95d1886eb14e07946b5..497ad6032ea80b22e5b5e2b23b2860b7c99fc57b 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info.cc
@@ -27,6 +27,7 @@ FunctionApiInfo::FunctionApiInfo() {}
 FunctionApiInfo::~FunctionApiInfo() {}
 
 Status FunctionApiInfo::Init(const FunctionDef& function_def) {
+  function_type_ = FunctionApiInfo::FunctionType::INFERENCE;
   for (const auto& attr : function_def.attr()) {
     if (attr.first == "experimental_api_preferred_device") {
       preferred_device_ = attr.second.s();
@@ -34,7 +35,25 @@ Status FunctionApiInfo::Init(const FunctionDef& function_def) {
     if (attr.first == "experimental_api_implements") {
       interface_name_ = attr.second.s();
     }
+    if (attr.first == "forward_function_name") {
+      function_type_ = FunctionApiInfo::FunctionType::BACKWARD;
+      pairing_function_name_ = attr.second.s();
+    }
+    if (attr.first == "backward_function_name") {
+      function_type_ = FunctionApiInfo::FunctionType::FORWARD;
+      pairing_function_name_ = attr.second.s();
+    }
+  }
+
+  input_arg_dtypes_.reserve(function_def.signature().input_arg_size());
+  for (const auto& input_arg : function_def.signature().input_arg()) {
+    input_arg_dtypes_.emplace_back(input_arg.type());
   }
+  output_arg_dtypes_.reserve(function_def.signature().output_arg_size());
+  for (const auto& output_arg : function_def.signature().output_arg()) {
+    output_arg_dtypes_.emplace_back(output_arg.type());
+  }
+
   if (interface_name_.empty() && !preferred_device_.empty()) {
     return errors::InvalidArgument(
         "Function '", function_def.signature().name(),
@@ -51,53 +70,94 @@ const string& FunctionApiInfo::interface_name() const {
   return interface_name_;
 }
 
+const FunctionApiInfo::FunctionType FunctionApiInfo::function_type() const {
+  return function_type_;
+}
+
+const string& FunctionApiInfo::pairing_function_name() const {
+  return pairing_function_name_;
+}
+
+const DataTypeVector& FunctionApiInfo::input_arg_dtypes() const {
+  return input_arg_dtypes_;
+}
+
+const DataTypeVector& FunctionApiInfo::output_arg_dtypes() const {
+  return output_arg_dtypes_;
+}
+
 FunctionLibraryApiInfo::FunctionLibraryApiInfo() {}
 FunctionLibraryApiInfo::~FunctionLibraryApiInfo() {}
 
 namespace {
-bool IsSameSignature(const FunctionDef& f1, const FunctionDef& f2) {
-  if (f1.ret().size() != f2.ret().size()) return false;
+bool IsSameArgDef(const OpDef::ArgDef& arg1, const OpDef::ArgDef& arg2) {
+  if (arg1.type() != arg2.type()) return false;
+  if (arg1.type_attr() != arg2.type_attr()) return false;
+  if (arg1.number_attr() != arg2.number_attr()) return false;
+  if (arg1.type_list_attr() != arg2.type_list_attr()) return false;
+  if (arg1.is_ref() != arg2.is_ref()) return false;
+  return true;
+}
+
+bool IsSameSignature(const FunctionDef& f1, const FunctionDef& f2,
+                     const bool check_inputs, const bool check_outputs) {
   const auto& sig1 = f1.signature();
   const auto& sig2 = f2.signature();
   // Functions have positional semantics, so we don't check for names.
-  if (sig1.input_arg_size() != sig2.input_arg_size()) return false;
-  for (int k = 0; k < sig1.input_arg_size(); ++k) {
-    const OpDef::ArgDef& arg1 = sig1.input_arg(k);
-    const OpDef::ArgDef& arg2 = sig2.input_arg(k);
-    if (arg1.type() != arg2.type()) return false;
-    if (arg1.type_attr() != arg2.type_attr()) return false;
-    if (arg1.number_attr() != arg2.number_attr()) return false;
-    if (arg1.type_list_attr() != arg2.type_list_attr()) return false;
-    if (arg1.is_ref() != arg2.is_ref()) return false;
+  if (check_inputs) {
+    if (sig1.input_arg_size() != sig2.input_arg_size()) return false;
+    for (int k = 0; k < sig1.input_arg_size(); ++k) {
+      if (!IsSameArgDef(sig1.input_arg(k), sig2.input_arg(k))) return false;
+    }
+  }
+  if (check_outputs) {
+    if (f1.ret().size() != f2.ret().size()) return false;
+    if (sig1.output_arg_size() != sig2.output_arg_size()) return false;
+    for (int k = 0; k < sig1.output_arg_size(); ++k) {
+      if (!IsSameArgDef(sig1.output_arg(k), sig2.output_arg(k))) return false;
+    }
   }
   return true;
 }
 
 Status ValidateSignature(const string& interface_name,
-                         const std::vector<const FunctionDef*>& equiv_funcs) {
+                         const std::vector<const FunctionDef*>& equiv_funcs,
+                         const FunctionApiInfo::FunctionType function_type) {
   if (equiv_funcs.size() < 2) return Status::OK();
   for (size_t k = 1; k < equiv_funcs.size(); ++k) {
-    if (!IsSameSignature(*equiv_funcs[0], *equiv_funcs[k]))
+    const bool check_input =
+        (function_type == FunctionApiInfo::FunctionType::INFERENCE ||
+         function_type == FunctionApiInfo::FunctionType::FORWARD);
+    const bool check_output =
+        (function_type == FunctionApiInfo::FunctionType::INFERENCE ||
+         function_type == FunctionApiInfo::FunctionType::BACKWARD);
+    if (!IsSameSignature(*equiv_funcs[0], *equiv_funcs[k], check_input,
+                         check_output)) {
       return errors::InvalidArgument(
           "Functions '", equiv_funcs[0]->signature().name(), "' and '",
           equiv_funcs[k]->signature().name(), "' both implement '",
           interface_name, "' but their signatures do not match.");
+    }
   }
   return Status::OK();
 }
 
 Status ValidateSignatures(
     const std::unordered_map<string, std::vector<const FunctionDef*>>&
-        intf_to_func) {
+        intf_to_func,
+    const FunctionApiInfo::FunctionType function_type) {
   for (const auto& item : intf_to_func)
-    TF_RETURN_IF_ERROR(ValidateSignature(item.first, item.second));
+    TF_RETURN_IF_ERROR(
+        ValidateSignature(item.first, item.second, function_type));
   return Status::OK();
 }
 }  // namespace
 
 Status FunctionLibraryApiInfo::Init(
     const FunctionDefLibrary& function_library) {
-  std::unordered_map<string, std::vector<const FunctionDef*>> intf_to_func;
+  std::unordered_map<string, std::vector<const FunctionDef*>> infer_funcs;
+  std::unordered_map<string, std::vector<const FunctionDef*>> fwd_funcs;
+  std::unordered_map<string, std::vector<const FunctionDef*>> bwd_funcs;
   for (const auto& function : function_library.function()) {
     std::unique_ptr<FunctionApiInfo> func_info(new FunctionApiInfo);
     TF_RETURN_IF_ERROR(func_info->Init(function));
@@ -106,54 +166,64 @@ Status FunctionLibraryApiInfo::Init(
 
     const string& function_name = function.signature().name();
     const string& interface_name = func_info->interface_name();
-    func_to_intf_[function_name] = interface_name;
-    intf_to_funcs_[interface_name].emplace_back(function_name);
-    intf_to_func[interface_name].emplace_back(&function);
+    VLOG(3) << "Got " << func_info->function_type()
+            << " function: " << function_name
+            << " with interface: " << interface_name;
+    switch (func_info->function_type()) {
+      case FunctionApiInfo::FunctionType::INFERENCE:
+        intf_to_inference_funcs_[interface_name].emplace_back(function_name);
+        infer_funcs[interface_name].emplace_back(&function);
+        break;
+      case FunctionApiInfo::FunctionType::FORWARD:
+        intf_to_forward_funcs_[interface_name].emplace_back(function_name);
+        fwd_funcs[interface_name].emplace_back(&function);
+        break;
+      case FunctionApiInfo::FunctionType::BACKWARD:
+        intf_to_backward_funcs_[interface_name].emplace_back(function_name);
+        bwd_funcs[interface_name].emplace_back(&function);
+        break;
+      default:
+        return errors::InvalidArgument("Unrecognized function type: ",
+                                       func_info->function_type());
+    }
     func_info_[function_name] = std::move(func_info);
   }
-  TF_RETURN_IF_ERROR(ValidateSignatures(intf_to_func));
+  TF_RETURN_IF_ERROR(ValidateSignatures(
+      infer_funcs, FunctionApiInfo::FunctionType::INFERENCE));
+  TF_RETURN_IF_ERROR(
+      ValidateSignatures(fwd_funcs, FunctionApiInfo::FunctionType::FORWARD));
+  TF_RETURN_IF_ERROR(
+      ValidateSignatures(bwd_funcs, FunctionApiInfo::FunctionType::BACKWARD));
   return Status::OK();
 }
 
-void FunctionLibraryApiInfo::GetEquivalentImplementations(
-    const string& function_name, std::vector<string>* other_names) const {
-  const auto intf_it = func_to_intf_.find(function_name);
-  // The function does not implement any interface.
-  if (intf_it == func_to_intf_.end()) return;
-  CHECK(!intf_it->second.empty()) << "Function " << function_name
-                                  << "should at least implement 1 interface.";
-  const auto it = intf_to_funcs_.find(intf_it->second);
-  CHECK(it != intf_to_funcs_.end())
-      << "Function " << function_name << " maps to " << intf_it->second
-      << " but no reverse mapping was found";
-  CHECK_GE(it->second.size(), 1) << "Class " << it->first << " is empty";
-  other_names->reserve(it->second.size() - 1);
-  for (const auto& other_name : it->second) {
-    if (other_name == function_name) continue;
-    other_names->emplace_back(other_name);
+Status FunctionLibraryApiInfo::GetEquivalentImplementations(
+    const string& function_name, std::vector<string>* other_functions) const {
+  const auto func_it = func_info_.find(function_name);
+  if (func_it == func_info_.end()) return Status::OK();
+  const FunctionApiInfo* func_info = func_it->second.get();
+
+  absl::flat_hash_map<string, std::vector<string>>::const_iterator it;
+  switch (func_info->function_type()) {
+    case FunctionApiInfo::FunctionType::INFERENCE:
+      it = intf_to_inference_funcs_.find(func_info->interface_name());
+      break;
+    case FunctionApiInfo::FunctionType::FORWARD:
+      it = intf_to_forward_funcs_.find(func_info->interface_name());
+      break;
+    case FunctionApiInfo::FunctionType::BACKWARD:
+      it = intf_to_backward_funcs_.find(func_info->interface_name());
+      break;
+    default:
+      return errors::InvalidArgument("Unrecognized function type: ",
+                                     func_info->function_type());
   }
-}
 
-void FunctionLibraryApiInfo::GetBestImplementation(
-    const string& function_name, const string& device,
-    string* best_func_name) const {
-  CHECK(best_func_name != nullptr);
-  const auto func_it = func_to_intf_.find(function_name);
-  if (func_it == func_to_intf_.end()) return;
-
-  const auto it = intf_to_funcs_.find(func_it->second);
-  // No function found for the given interface.
-  if (it == intf_to_funcs_.end()) return;
   for (const auto& func_name : it->second) {
-    const auto func_api_info = func_info_.find(func_name)->second.get();
-    if (func_api_info->preferred_device() == device) {
-      best_func_name->assign(func_name);
-      return;
-    }
+    if (func_name == function_name) continue;
+    other_functions->emplace_back(func_name);
   }
-  // Didn't find a function with the match device name, choose the first one
-  // among all the available functions.
-  best_func_name->assign(it->second.front());
+  return Status::OK();
 }
 
 const FunctionApiInfo* FunctionLibraryApiInfo::GetApiInfo(
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index 412687c58c15460a05b2e697afb1f84454462da8..9a5f548951f0931e98fbe4074f7bbd9aacab0c6e 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -20,7 +20,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -30,14 +33,32 @@ class FunctionApiInfo {
   FunctionApiInfo();
   virtual ~FunctionApiInfo();
 
+  enum FunctionType {
+    INFERENCE,  // Default type.
+    FORWARD,
+    BACKWARD,
+  };
+
   Status Init(const FunctionDef& function_def);
 
   const string& interface_name() const;
   const string& preferred_device() const;
+  const FunctionType function_type() const;
+  const string& pairing_function_name() const;
+  const DataTypeVector& input_arg_dtypes() const;
+  const DataTypeVector& output_arg_dtypes() const;
 
  private:
   string interface_name_;
   string preferred_device_;
+  FunctionType function_type_;
+  // The pairing function is used to pair between forward and backward function,
+  // which will be useful during function swapping. Inference function won't
+  // have pairing function.
+  string pairing_function_name_;
+  // The following two attributes are useful for forward and backward functions.
+  DataTypeVector input_arg_dtypes_;
+  DataTypeVector output_arg_dtypes_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionApiInfo);
 };
@@ -55,21 +76,22 @@ class FunctionLibraryApiInfo {
   // Populate the internal field for the functions within the function_library.
   Status Init(const FunctionDefLibrary& function_library);
 
-  void GetEquivalentImplementations(const string& function_name,
-                                    std::vector<string>* other_names) const;
-
-  void GetBestImplementation(const string& function_name, const string& device,
-                             string* best_func_name) const;
+  Status GetEquivalentImplementations(
+      const string& function_name, std::vector<string>* other_functions) const;
 
   const FunctionApiInfo* GetApiInfo(const string& function_name) const;
 
  private:
   // Map between function name to function details.
   std::unordered_map<string, std::unique_ptr<FunctionApiInfo>> func_info_;
-  // Map between function name to interface name.
-  std::unordered_map<string, string> func_to_intf_;
+
   // Map between interface name to function names.
-  std::unordered_map<string, std::vector<string>> intf_to_funcs_;
+  // Forward/backward function pair usually have different signatures between
+  // each other since forward function could produce extra internal state as
+  // output, and backward will take those extra state as inputs.
+  absl::flat_hash_map<string, std::vector<string>> intf_to_inference_funcs_;
+  absl::flat_hash_map<string, std::vector<string>> intf_to_forward_funcs_;
+  absl::flat_hash_map<string, std::vector<string>> intf_to_backward_funcs_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryApiInfo);
 };
diff --git a/tensorflow/core/grappler/optimizers/function_api_info_test.cc b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
index 582890d3e3bb807552039de4a3ff5e8c6e393ca5..b683d26b32f04759b658e9e0704f1b6b661fe178 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
@@ -36,28 +36,35 @@ void SetArg(const string& name, const string& type_name,
 
 typedef std::pair<string, string> ArgSpec;  // name, type.
 
-void SetArgs(const std::vector<ArgSpec>& args_spec, OpDef* sig) {
-  for (const auto& arg_spec : args_spec)
+void SetArgs(const std::vector<ArgSpec>& input_args_spec,
+             const std::vector<ArgSpec>& output_args_spec, OpDef* sig) {
+  for (const auto& arg_spec : input_args_spec)
     SetArg(arg_spec.first, arg_spec.second, sig->add_input_arg());
-  SetArg("output", "float32", sig->add_output_arg());
+  for (const auto& arg_spec : output_args_spec)
+    SetArg(arg_spec.first, arg_spec.second, sig->add_output_arg());
 }
 
 void PopulateFunction(const string& name, const string& api_interface_name,
                       const string& preferred_device,
                       const std::vector<ArgSpec>& input_args,
+                      const std::vector<ArgSpec>& output_args,
+                      const string& forward_function_name,
+                      const string& backward_function_name,
                       FunctionDef* func_def) {
   OpDef* sig = func_def->mutable_signature();
   sig->set_name(name);
 
-  SetArgs(input_args, sig);
-
-  if (!api_interface_name.empty() || !preferred_device.empty()) {
-    auto* func_attr = func_def->mutable_attr();
-    if (!api_interface_name.empty())
-      (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
-    if (!preferred_device.empty())
-      (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
-  }
+  SetArgs(input_args, output_args, sig);
+
+  auto* func_attr = func_def->mutable_attr();
+  if (!api_interface_name.empty())
+    (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
+  if (!preferred_device.empty())
+    (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
+  if (!forward_function_name.empty())
+    (*func_attr)["forward_function_name"].set_s(forward_function_name);
+  if (!backward_function_name.empty())
+    (*func_attr)["backward_function_name"].set_s(backward_function_name);
 }
 
 void PopulateSampleLibrary(const bool mismatch_args,
@@ -65,39 +72,50 @@ void PopulateSampleLibrary(const bool mismatch_args,
   const std::vector<ArgSpec> func_args{{"in1", "float32"}, {"in2", "int32"}};
   const std::vector<ArgSpec> func_wrong_args{{"in1", "int32"},
                                              {"in2", "int32"}};
-  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", func_args,
-                   func_lib->add_function());
+  const std::vector<ArgSpec> output_args{{"out", "float32"}};
+  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", func_args, output_args, "",
+                   "", func_lib->add_function());
   PopulateFunction("DoStuffGpu", "DoStuff", "GPU",
-                   mismatch_args ? func_wrong_args : func_args,
+                   mismatch_args ? func_wrong_args : func_args, output_args, "",
+                   "", func_lib->add_function());
+  PopulateFunction("DoThings", "DoThings", "", func_args, output_args, "", "",
                    func_lib->add_function());
-  PopulateFunction("DoThings", "DoThings", "", func_args,
+  PopulateFunction("OneOff", "", "", func_args, output_args, "", "",
                    func_lib->add_function());
-  PopulateFunction("OneOff", "", "", func_args, func_lib->add_function());
-  PopulateFunction("AnotherOneOff", "", "", func_args,
+  PopulateFunction("AnotherOneOff", "", "", func_args, output_args, "", "",
                    func_lib->add_function());
 }
 
+void PopulateComplexLibrary(FunctionDefLibrary* func_lib) {
+  const std::vector<ArgSpec> input_args{{"in1", "float32"}, {"in2", "int32"}};
+  const std::vector<ArgSpec> output_args{{"out", "float32"}};
+  const std::vector<ArgSpec> output_with_state{
+      {"out", "float32"}, {"state1", "int32"}, {"state2", "int32"}};
+
+  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", input_args, output_args, "",
+                   "DoStuffCpu_gradient", func_lib->add_function());
+  PopulateFunction("DoStuffCpu_gradient", "DoStuff", "CPU", output_args,
+                   input_args, "DoStuffCpu", "", func_lib->add_function());
+  PopulateFunction("DoStuffGpu", "DoStuff", "GPU", input_args,
+                   output_with_state, "", "DoStuffGpu_gradient",
+                   func_lib->add_function());
+  PopulateFunction("DoStuffGpu_gradient", "DoStuff", "GPU", output_with_state,
+                   input_args, "DoStuffGpu", "", func_lib->add_function());
+}
+
 bool CheckEquivImpl(const FunctionLibraryApiInfo& lib_api_info,
                     const string& func_name,
                     const std::vector<string>& expected_other) {
   std::vector<string> other_impl;
-  lib_api_info.GetEquivalentImplementations(func_name, &other_impl);
+  Status status =
+      lib_api_info.GetEquivalentImplementations(func_name, &other_impl);
+  EXPECT_EQ(status, Status::OK());
   const std::unordered_set<string> actual(other_impl.begin(), other_impl.end());
   const std::unordered_set<string> expected(expected_other.begin(),
                                             expected_other.end());
   return actual == expected;
 }
 
-bool CheckGetBestImpl(const FunctionLibraryApiInfo& lib_api_info,
-                      const string& function_name, const string& device,
-                      const string& expected_function_name) {
-  string best_function_name;
-  lib_api_info.GetBestImplementation(function_name, device,
-                                     &best_function_name);
-
-  return best_function_name == expected_function_name;
-}
-
 string GetInterfaceName(const FunctionLibraryApiInfo& lib_api_info,
                         const string& func_name) {
   auto* info = lib_api_info.GetApiInfo(func_name);
@@ -117,34 +135,46 @@ TEST(FunctionApiInfoTest, ParseTags) {
   PopulateSampleLibrary(/* mismatch_args */ false, &func_lib);
   FunctionLibraryApiInfo lib_api_info;
   TF_ASSERT_OK(lib_api_info.Init(func_lib));
+
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu"));
+  EXPECT_EQ("DoThings", GetInterfaceName(lib_api_info, "DoThings"));
+
+  EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu"));
+  EXPECT_EQ("", GetPreferredDevice(lib_api_info, "DoThings"));
+
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu", {"DoStuffGpu"}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu", {"DoStuffCpu"}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "Undefined", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "OneOff", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "AnotherOneOff", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoThings", {}));
+}
+
+TEST(FunctionApiInfoTest, ComplexFunctionLib) {
+  FunctionDefLibrary func_lib;
+  PopulateComplexLibrary(&func_lib);
+  FunctionLibraryApiInfo lib_api_info;
+  TF_ASSERT_OK(lib_api_info.Init(func_lib));
 
   EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu_gradient"));
   EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu"));
-  EXPECT_EQ("DoThings", GetInterfaceName(lib_api_info, "DoThings"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu_gradient"));
 
   EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu_gradient"));
   EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu"));
-  EXPECT_EQ("", GetPreferredDevice(lib_api_info, "DoThings"));
+  EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu_gradient"));
 
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffCpu", "CPU", "DoStuffCpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffCpu", "GPU", "DoStuffGpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "CPU", "DoStuffCpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "GPU", "DoStuffGpu"));
-
-  EXPECT_TRUE(CheckGetBestImpl(lib_api_info, "DoThings", "GPU", "DoThings"));
-  // TPU impl is not available, choose the first one available which is the CPU.
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "TPU", "DoStuffCpu"));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu", {"DoStuffGpu"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu", {"DoStuffCpu"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu_gradient",
+                             {"DoStuffGpu_gradient"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu_gradient",
+                             {"DoStuffCpu_gradient"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "Undefined", {}));
 }
 
 TEST(FunctionApiInfoTest, MismatchedArguments) {
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
index 1ea57f7b4f003e8a98fe187f6325e39ebe30e9e7..82c408b521f58bcde685474ba13146d2f56379ba 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+#include "tensorflow/core/graph/tensor_id.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -46,25 +47,27 @@ Status GetTensorProperties(const GraphOptimizerContext& ctx,
     return errors::InvalidArgument("Graph properties are unknown.");
   }
 
-  int port;
-  string tensor_node_name = ParseNodeName(tensor, &port);
-  if (port < 0) {
+  // TODO(ezhulenev): Make it TensorId when graph properties will support
+  // absl::string_view lookup.
+  SafeTensorId tensor_id = ParseTensorName(tensor);
+
+  if (tensor_id.index() < 0) {
     return errors::InvalidArgument(
         "Can't get tensor properties of control dependency ", tensor);
   }
 
   const auto& output_properties =
-      ctx.graph_properties->GetOutputProperties(tensor_node_name);
+      ctx.graph_properties->GetOutputProperties(tensor_id.node());
   auto num_outputs = output_properties.size();
 
-  if (num_outputs == 0 || port > num_outputs - 1) {
+  if (num_outputs == 0 || tensor_id.index() > num_outputs - 1) {
     return errors::InvalidArgument(
-        "Node ", tensor_node_name,
-        " is missing output properties at position :", port,
+        "Node ", tensor_id.node(),
+        " is missing output properties at position :", tensor_id.index(),
         " (num_outputs=", num_outputs, ")");
   }
 
-  properties->CopyFrom(output_properties[port]);
+  properties->CopyFrom(output_properties[tensor_id.index()]);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index f31a30ec0edf5022004e9489994dc6875f60bfd0..99fcb31523800c76b8c413da92576fc16092f588 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -239,7 +239,8 @@ class GraphOptimizerStagePipeline {
         // case of any error it must leave optimized graph unmodified.
         if (!stage_status.ok()) {
           LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
-                       << ", stage " << stage->stage_name()
+                       << ", stage " << stage->stage_name() << " node "
+                       << node->name()
                        << ". Error: " << stage_status.error_message();
         }
         if (break_predicate_(*result)) return true;
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index b40438e98ffc491bfb3fe4233d5a2bd86a153b81..790b6955a599554e99178bc626b25752aff7464f 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -166,7 +166,6 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "Polygamma",
                                           "QuantizeAndDequantizeV2",
                                           "QuantizeAndDequantizeV3",
-                                          "QuantizeV2",
                                           "Pow",
                                           "Real",
                                           "RealDiv",
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 1d787d2b7c264683d4d63895ac397fffb30a9744..6975fa715bc77ec36da7ccaf6c3ff939081a610f 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -127,8 +127,10 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
 
 #undef MK_OPT
 
-MetaOptimizer::MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
-    : cpu_device_(cpu_device), cfg_(cfg) {
+MetaOptimizer::MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg)
+    : cpu_device_(cpu_device),
+      config_proto_(cfg),
+      cfg_(*config_proto_.mutable_graph_options()->mutable_rewrite_options()) {
   DCHECK(cpu_device_ == nullptr ||
          cpu_device_->attributes().device_type() == "CPU");
 }
@@ -279,6 +281,18 @@ MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const {
   return nullptr;
 }
 
+#define RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer)                            \
+  {                                                                            \
+    const Status status = RunOptimizer(optimizer, cluster, &optimized_item,    \
+                                       optimized_graph, &optimization_result); \
+    if (status.ok()) {                                                         \
+      is_optimized = true;                                                     \
+    } else if (cfg_.fail_on_optimizer_errors()) {                              \
+      VLOG(2) << "Optimizer '" << optimizer->name() << "' failed: " << status; \
+      TF_RETURN_IF_ERROR(status);                                              \
+    }                                                                          \
+  }
+
 Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
                                     GraphDef* optimized_graph) {
   int min_graph_nodes = cfg_.min_graph_nodes() == 0 ? kDefaultMinGraphNodes
@@ -340,9 +354,7 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
         if (fusion_optimizer == nullptr) fusion_optimizer = optimizer.get();
         continue;
       }
-      Status status = RunOptimizer(optimizer.get(), cluster, &optimized_item,
-                                   optimized_graph, &optimization_result);
-      if (status.ok()) is_optimized = true;
+      RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer.get());
     }
   }
 
@@ -353,16 +365,12 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   // optimizations from taking place since we don't have shape inference for
   // functions, and we can't optimize across function boundaries.
   if (fusion_optimizer != nullptr) {
-    Status status = RunOptimizer(fusion_optimizer, cluster, &optimized_item,
-                                 optimized_graph, &optimization_result);
-    if (status.ok()) is_optimized = true;
+    RUN_OPTIMIZER_OR_RETURN_IF_ERROR(fusion_optimizer);
   }
 
   // ScopedAllocatorOptimizer must run last.
   if (sa_optimizer != nullptr) {
-    Status status = RunOptimizer(sa_optimizer, cluster, &optimized_item,
-                                 optimized_graph, &optimization_result);
-    if (status.ok()) is_optimized = true;
+    RUN_OPTIMIZER_OR_RETURN_IF_ERROR(sa_optimizer);
   }
 
   // Record graph optimization result.
@@ -379,6 +387,8 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   return Status::OK();
 }
 
+#undef RUN_OPTIMIZER_OR_RETURN_IF_ERROR
+
 Status MetaOptimizer::RunOptimizer(
     GraphOptimizer* optimizer, Cluster* cluster, GrapplerItem* optimized_item,
     GraphDef* optimized_graph, GraphOptimizationResult* optimization_result) {
@@ -562,32 +572,35 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
   // Nothing to do for MetaOptimizer.
 }
 
-bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
-  if (cfg.disable_meta_optimizer()) {
+bool MetaOptimizerEnabled(const ConfigProto& cfg) {
+  const auto& rewrite_cfg = cfg.graph_options().rewrite_options();
+  if (rewrite_cfg.disable_meta_optimizer()) {
     return false;
   }
-  return !cfg.disable_model_pruning() ||
-         cfg.layout_optimizer() != RewriterConfig::OFF ||
-         cfg.function_optimization() != RewriterConfig::OFF ||
-         cfg.constant_folding() != RewriterConfig::OFF ||
-         cfg.shape_optimization() != RewriterConfig::OFF ||
-         cfg.remapping() != RewriterConfig::OFF ||
-         cfg.arithmetic_optimization() != RewriterConfig::OFF ||
-         cfg.loop_optimization() != RewriterConfig::OFF ||
-         cfg.dependency_optimization() != RewriterConfig::OFF ||
-         cfg.auto_parallel().enable() ||
-         cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
-         cfg.debug_stripper() == RewriterConfig::ON ||
-         cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         cfg.pin_to_host_optimization() == RewriterConfig::ON ||
-         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
+  return !rewrite_cfg.disable_model_pruning() ||
+         rewrite_cfg.layout_optimizer() != RewriterConfig::OFF ||
+         rewrite_cfg.function_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.constant_folding() != RewriterConfig::OFF ||
+         rewrite_cfg.shape_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.remapping() != RewriterConfig::OFF ||
+         rewrite_cfg.arithmetic_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.loop_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.dependency_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.auto_parallel().enable() ||
+         rewrite_cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
+         rewrite_cfg.debug_stripper() == RewriterConfig::ON ||
+         rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
+         rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON ||
+         !rewrite_cfg.optimizers().empty() ||
+         !rewrite_cfg.custom_optimizers().empty();
 }
 
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
+Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph) {
   MetaOptimizer optimizer(cpu_device, cfg);
-  optimizer.set_deadline_usec(DeadlineMicroSeconds(cfg));
+  optimizer.set_deadline_usec(
+      DeadlineMicroSeconds(cfg.graph_options().rewrite_options()));
   Status status = optimizer.Optimize(cluster, item, optimized_graph);
   if (!status.ok()) {
     *optimized_graph = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index e599a9201bc2755d1424a7495a0b86667ed0d828..a06da4394e4b8a4d8e75855a0a432114f7d7fcb3 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -28,7 +29,7 @@ namespace grappler {
 // Run the other grappler optimizers based on the specified rewriter config.
 class MetaOptimizer : public GraphOptimizer {
  public:
-  MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg);
+  MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg);
   ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
@@ -65,7 +66,8 @@ class MetaOptimizer : public GraphOptimizer {
                        GraphDef* optimized_graph);
 
   DeviceBase* const cpu_device_;  // may be NULL
-  RewriterConfig cfg_;
+  ConfigProto config_proto_;
+  RewriterConfig& cfg_;
 
   struct OptimizerResult {
     string optimizer_name;
@@ -85,7 +87,7 @@ class MetaOptimizer : public GraphOptimizer {
   std::vector<GraphOptimizationResult> optimization_results_;
 };
 
-bool MetaOptimizerEnabled(const RewriterConfig& cfg);
+bool MetaOptimizerEnabled(const ConfigProto& cfg);
 
 // Run the meta optimizer.
 //
@@ -93,7 +95,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg);
 // during constant folding; if NULL, a new device is created for doing constant
 // folding. For performance, it is recommended to pass in an existing cpu_device
 // when possible.
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
+Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph);
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 6105bf27bab8020facf259fc426edd44a6547ff2..b60aa256676416ba4c7045b2b127d49b99a14f1f 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -134,11 +134,13 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   CHECK(fake_input.NextItem(&item));
 
   TestOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -151,13 +153,15 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerWithParams) {
   CHECK(fake_input.NextItem(&item));
 
   TestOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizerWithParams");
   auto* custom_config = rewriter_config.add_custom_optimizers();
   custom_config->set_name("TestOptimizerWithParams");
   (*custom_config->mutable_parameter_map())["foo"] = AttrValue();
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -171,13 +175,15 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerAndCustomGraphOptimizer) {
 
   TestOptimizer::SetOptimized(false);
   TestGraphOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizer");
   auto customGraphOptimizer = rewriter_config.add_custom_optimizers();
   customGraphOptimizer->set_name("TestGraphOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -190,11 +196,13 @@ TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -205,13 +213,15 @@ TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   auto customGraphOptimizer = rewriter_config.add_custom_optimizers();
   customGraphOptimizer->set_name("TestGraphOptimizer");
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -222,13 +232,16 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   using test::function::NDef;
 
   // Enable ony function optimization.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_function_optimization(RewriterConfig::ON);
   rewriter_config.add_optimizers("function");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define function library:
   //
@@ -394,14 +407,17 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   using test::function::NDef;
 
   // Enable function optimization and pruning.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_function_optimization(RewriterConfig::ON);
   rewriter_config.add_optimizers("function");
   rewriter_config.add_optimizers("pruning");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // MyFunc defines two Mul nodes inside function body and two corresponding
   // function outputs.
@@ -505,12 +521,15 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
       &allowed_optimizations);
 
   // Just record properties of optimized Grappler items.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define simple function library with two identical mul functions.
   FunctionDef mul_func_1 = FunctionDefHelper::Create(
@@ -605,7 +624,9 @@ TEST_F(MetaOptimizerTest, OptimizerTimesOut) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config;
+  RewriterConfig& rewriter_config =
+      *config.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("SleepingOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
   rewriter_config.set_meta_optimizer_timeout_ms(1500);
@@ -613,7 +634,7 @@ TEST_F(MetaOptimizerTest, OptimizerTimesOut) {
 
   GraphDef output;
   const Status status =
-      RunMetaOptimizer(item, rewriter_config, nullptr, nullptr, &output);
+      RunMetaOptimizer(item, config, nullptr, nullptr, &output);
   EXPECT_EQ(status.error_message(), "meta_optimizer exceeded deadline.");
   // Make sure the graph was reverted to the original regardless of when the
   // optimizer timed out.
@@ -625,14 +646,16 @@ TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config;
+  RewriterConfig& rewriter_config =
+      *config.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("SleepingOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
   rewriter_config.set_meta_optimizer_timeout_ms(1500);
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE);
   GraphDef output;
   const Status status =
-      RunMetaOptimizer(item, rewriter_config, nullptr, nullptr, &output);
+      RunMetaOptimizer(item, config, nullptr, nullptr, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
 }
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index b94573bef0f193648d04ac7f669a7e36f59c6876..1be87a9d0d516a49e6b50e2dada3a2cdeea71ef6 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -183,19 +183,18 @@ Status RewriteIdentityNAndInputsOutputs(
       if (IsControlInput(input)) {
         continue;
       }
-      int pos;
-      const StringPiece name = ParseNodeNameAsStringPiece(input, &pos);
-      if (name == node->name()) {
-        if (terminal_ports.find(pos) == terminal_ports.end()) {
+      TensorId input_tensor = ParseTensorName(input);
+      if (input_tensor.node() == node->name()) {
+        if (terminal_ports.find(input_tensor.index()) == terminal_ports.end()) {
           // Replace input that does not lead to a terminal node with newly
           // created identity.
-          string new_identity = new_identities[pos];
+          string new_identity = new_identities[input_tensor.index()];
           output->set_input(i, new_identity);
           updates.push_back({new_identity, output->name()});
         } else {
           // Update input ports that lead to a terminal node from splitting
           // inputs.
-          int new_pos = terminal_input_pos[pos];
+          int new_pos = terminal_input_pos[input_tensor.index()];
           string updated_input_name =
               new_pos > 0 ? strings::StrCat(node->name(), ":", new_pos)
                           : node->name();
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 9ada8b7ff9546b097a1bd347c31dcfb8470d36c7..d8e62e0b24e19033090ea19e1c5698dbc7e3bbe9 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/remapper.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
@@ -22,19 +23,498 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace grappler {
 
-void AddBatchNormNodes(GraphDef* optimized_graph, const NodeDef& fused_node) {
+namespace {
+
+constexpr char kFusedConv2D[] = "_FusedConv2D";
+
+constexpr char kDataFormat[] = "data_format";
+constexpr char kIsTraining[] = "is_training";
+
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+bool EigenSupportsContractionOutputKernel() {
+#if defined(EIGEN_USE_LIBXSMM)
+  return false;
+#endif
+  return true;
+}
+
+struct RemapperContext {
+  explicit RemapperContext(const GrapplerItem& item)
+      : nodes_to_preserve(item.NodesToPreserve()),
+        graph_view(&item.graph),
+        graph_properties(item),
+        inferred_graph_properties(false) {}
+
+  std::unordered_set<string> nodes_to_preserve;
+  GraphView graph_view;
+  GraphProperties graph_properties;
+  bool inferred_graph_properties;
+};
+
+// FusedBatchNorm that can be replaced with a cheaper set of primitives.
+struct FusedBatchNorm {
+  const NodeDef* fused_batch_norm = nullptr;
+};
+
+// Conv2D node followed by a BiasAdd.
+struct Conv2DWithBiasAdd {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* bias_add = nullptr;
+};
+
+// Conv2D node followed by a BiasAdd and Relu.
+struct Conv2DWithBiasAddAndRelu {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* bias_add = nullptr;
+  const NodeDef* relu = nullptr;
+};
+
+// Conv2D node followed by a Squeeze and BiasAdd.
+struct Conv2DWithSqueezeAndBiasAdd {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* squeeze = nullptr;
+  const NodeDef* bias_add = nullptr;
+};
+
+// Conv2D node followed by a FusedBatchNorm.
+struct Conv2DWithBatchNorm {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* fused_batch_norm = nullptr;
+  float epsilon = 0.0;
+};
+
+// Conv2D node followed by a FusedBatchNorm and Relu.
+struct Conv2DWithBatchNormAndRelu {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* fused_batch_norm = nullptr;
+  const NodeDef* relu = nullptr;
+  float epsilon = 0.0;
+};
+
+bool IsFloatOrDoubleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+}
+
+bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
+                      const string& type_attr = "T") {
+  DataType lhs_attr = GetDataTypeFromAttr(*lhs, type_attr);
+  DataType rhs_attr = GetDataTypeFromAttr(*rhs, type_attr);
+
+  return lhs_attr != DT_INVALID && rhs_attr != DT_INVALID &&
+         lhs_attr == rhs_attr;
+}
+
+bool HasDataType(const NodeDef* node, const DataType& expected,
+                 const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == expected;
+}
+
+bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
+  return ctx.nodes_to_preserve.count(node->name()) > 0;
+}
+
+bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* node,
+                        Conv2DWithBiasAdd* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a BiasAdd.
+  if (!node) return false;
+  if (!IsBiasAdd(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // Input to the BiasAdd must be a Conv2D in NHWC format.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
+  if (!conv2d.node) return false;
+  if (!IsConv2D(*conv2d.node)) return false;
+  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+BiasAdd pattern.
+  matched->conv2d = conv2d.node;
+  matched->bias_add = node;
+
+  return true;
+}
+
+bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* node,
+                               Conv2DWithBiasAddAndRelu* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a Relu.
+  if (!node) return false;
+  if (!IsRelu(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // And input to Relu must match Conv2DWithBiasAdd pattern.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto bias_add = ctx.graph_view.GetRegularFanin(input_port);
+
+  Conv2DWithBiasAdd base;
+  if (!FindConv2DWithBias(ctx, bias_add.node, &base)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, base.bias_add)) return false;
+  if (!HaveSameDataType(node, base.bias_add)) return false;
+  if (IsInPreserveSet(ctx, base.bias_add)) return false;
+
+  // We successfully found a Conv2D+BiasAdd+Relu pattern.
+  matched->conv2d = base.conv2d;
+  matched->bias_add = base.bias_add;
+  matched->relu = node;
+
+  return true;
+}
+
+bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
+                                  const NodeDef* node,
+                                  Conv2DWithSqueezeAndBiasAdd* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a BiasAdd.
+  if (node == nullptr) return false;
+  if (node->op() != "BiasAdd") return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // Input to the BiasAdd must be a Squeeze.
+  const auto bias_input_port = GraphView::InputPort(node, 0);
+  const auto squeeze = ctx.graph_view.GetRegularFanin(bias_input_port);
+  if (squeeze.node == nullptr) return false;
+  if (squeeze.node->op() != "Squeeze") return false;
+  if (!NodeIsOnCpu(squeeze.node)) return false;
+  if (!HaveSameDataType(node, squeeze.node, "T")) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, squeeze.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, squeeze.node)) return false;
+  if (IsInPreserveSet(ctx, squeeze.node)) return false;
+
+  // Squeeze must not squeeze output channel dimension.
+  std::vector<int32> dims;
+  if (!GetNodeAttr(*squeeze.node, "squeeze_dims", &dims).ok()) return false;
+  for (auto dim : dims) {
+    if (dim == 3) return false;
+  }
+
+  // Input to the Squeeze must be a Conv2D in NHWC format.
+  const auto squeeze_input_port = GraphView::InputPort(squeeze.node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(squeeze_input_port);
+  if (conv2d.node == nullptr) return false;
+  if (conv2d.node->op() != "Conv2D") return false;
+  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+Squeeze+BiasAdd pattern.
+  matched->conv2d = conv2d.node;
+  matched->squeeze = squeeze.node;
+  matched->bias_add = node;
+
+  return true;
+}
+
+bool FindConv2DWithBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+                             Conv2DWithBatchNorm* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a FusedBatchNorm or a FusedBatchNormV2.
+  if (node == nullptr) return false;
+  if (!IsFusedBatchNorm(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!HasDataType(node, DT_FLOAT)) return false;
+
+  // V2 has a separate data type for the scale/offset/mean/variance inputs.
+  if (node->op() == "FusedBatchNormV2" && !HasDataType(node, DT_FLOAT, "U"))
+    return false;
+
+  // Check that batch normalization is in inference mode.
+  const auto& attr = node->attr();
+  if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
+
+  // Check that only 0th output is consumed by other nodes.
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (HasFanouts(ctx.graph_view, node, 1)) return false;  // batch_mean
+  if (HasFanouts(ctx.graph_view, node, 2)) return false;  // batch_variance
+  if (HasFanouts(ctx.graph_view, node, 3)) return false;  // reserve_space_1
+  if (HasFanouts(ctx.graph_view, node, 4)) return false;  // reserve_space_2
+
+  // Input to the FusedBatchNorm must be a Conv2D in NHWC format.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
+  if (conv2d.node == nullptr) return false;
+  if (!IsConv2D(*conv2d.node)) return false;
+  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+FusedBatchNorm pattern.
+  matched->conv2d = conv2d.node;
+  matched->fused_batch_norm = node;
+  if (!GetNodeAttr(*node, "epsilon", &matched->epsilon).ok()) return false;
+
+  return true;
+}
+
+bool FindConv2DWithBatchNormAndRelu(const RemapperContext& ctx,
+                                    const NodeDef* node,
+                                    Conv2DWithBatchNormAndRelu* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a Relu.
+  if (node == nullptr) return false;
+  if (!IsRelu(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // And input to Relu must match Conv2DWithBatchNorm pattern.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto batch_norm = ctx.graph_view.GetRegularFanin(input_port);
+
+  Conv2DWithBatchNorm base;
+  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm)) return false;
+  if (!HaveSameDataType(node, base.fused_batch_norm)) return false;
+  if (IsInPreserveSet(ctx, base.fused_batch_norm)) return false;
+
+  // We successfully found a Conv2D+FusedBatchNorm+Relu pattern.
+  matched->conv2d = base.conv2d;
+  matched->fused_batch_norm = base.fused_batch_norm;
+  matched->relu = node;
+  matched->epsilon = base.epsilon;
+
+  return true;
+}
+
+// Check that given node meets some basic FusedBatchNorm optimization
+// preconditions. We use this check to lazily infer graph properties which is
+// rather expensive.
+bool IsFusedBatchNormCandidate(const NodeDef& node) {
+  if (!IsFusedBatchNorm(node)) return false;
+  if (GetDataTypeFromAttr(node, "T") != DT_FLOAT) return false;
+
+  // Check that the node is in inference mode.
+  const auto& attr = node.attr();
+  if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
+
+  return true;
+}
+
+bool FindFusedBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+                        FusedBatchNorm* matched) {
+  if (!IsFusedBatchNormCandidate(*node)) return false;
+
+  const auto& props = ctx.graph_properties.GetInputProperties(node->name());
+
+  // a. Scaling factor can be const folded:
+  //      scaling_factor = (variance + epsilon).rsqrt() * scale
+  bool const_scaling_factor =
+      props.size() == 5 &&     // [x, scale, offset, mean, variance]
+      props[1].has_value() &&  // scale
+      props[4].has_value();    // variance aka estimated variance
+
+  // b. Or input can be const folded into some other expression.
+  auto const_inputs = std::count_if(
+      props.begin(), props.end(),
+      [](const OpInfo::TensorProperties& props) { return props.has_value(); });
+
+  // TODO(bsteiner): use the cost model to compare the cost of fused batch
+  // norm against that of the optimized form.
+  bool can_remap = const_scaling_factor || const_inputs >= 4;
+  if (!can_remap) return false;
+
+  // The optimized version only generates the first output.
+  for (GraphView::Edge edge : ctx.graph_view.GetFanoutEdges(*node, false)) {
+    if (edge.src.port_id != 0) return false;
+  }
+
+  // We found a fused batch norm node that can be replaced with primitive ops.
+  matched->fused_batch_norm = node;
+  return true;
+}
+
+void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
+                          const std::vector<string>& fused_ops = {},
+                          int num_args = 1, float epsilon = 0.0) {
+  auto* attr = fused_conv2d->mutable_attr();
+  auto src_attr = conv2d->attr();
+
+  (*attr)["T"] = src_attr.at("T");
+  (*attr)["strides"] = src_attr.at("strides");
+  (*attr)["padding"] = src_attr.at("padding");
+  (*attr)["dilations"] = src_attr.at("dilations");
+  (*attr)["data_format"] = src_attr.at("data_format");
+
+  auto* fused_ops_attr = (*attr)["fused_ops"].mutable_list();
+  for (const string& fused_op : fused_ops) {
+    fused_ops_attr->add_s(fused_op);
+  }
+
+  SetAttrValue(num_args, &(*attr)["num_args"]);
+  // Required only for FusedBatchNorm.
+  SetAttrValue(epsilon, &(*attr)["epsilon"]);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBiasAdd& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BiasAdd: bias_add=" << matched.bias_add->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.bias_add->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.bias_add->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBiasAddAndRelu& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: relu=" << matched.relu->name()
+          << " bias_add=" << matched.bias_add->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.relu->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.relu->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd", "Relu"});
+
+  invalidated_nodes->insert(matched.relu);
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithSqueezeAndBiasAdd& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with Squeeze and BiasAdd: "
+          << " bias_add=" << matched.bias_add->name()
+          << " squeeze=" << matched.squeeze->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  // Replace Conv2D node with a fused Conv2D. Matched pattern guarantees that it
+  // has single consumer (only the squeeze node).
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.conv2d->name());
+  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_device(matched.conv2d->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+
+  // Replace BiasAdd node with a Squeeze.
+  NodeDef* remapped_squeeze = optimized_graph->add_node();
+  *remapped_squeeze = *matched.squeeze;
+  remapped_squeeze->set_name(matched.bias_add->name());
+  remapped_squeeze->set_input(0, fused_conv2d->name());
+
+  invalidated_nodes->insert(matched.squeeze);
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBatchNorm& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BatchNorm: batch_norm="
+          << matched.fused_batch_norm->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.fused_batch_norm->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
+  fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
+  fused_conv2d->add_input(matched.fused_batch_norm->input(2));  // 3: offset
+  fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
+  fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm"},
+                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+
+  invalidated_nodes->insert(matched.fused_batch_norm);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBatchNormAndRelu& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BatchNorm and Relu: relu="
+          << matched.relu->name()
+          << " batch_norm=" << matched.fused_batch_norm->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.relu->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
+  fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
+  fused_conv2d->add_input(matched.fused_batch_norm->input(2));  // 3: offset
+  fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
+  fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm", "Relu"},
+                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+
+  invalidated_nodes->insert(matched.relu);
+  invalidated_nodes->insert(matched.fused_batch_norm);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddBatchNormNodes(const FusedBatchNorm& matched,
+                       GraphDef* optimized_graph) {
+  const NodeDef& fused_node = *matched.fused_batch_norm;
+  VLOG(2) << "Optimizing fused batch norm node "
+          << SummarizeNodeDef(fused_node);
+
   const string& x = fused_node.input(0);
   string scale = fused_node.input(1);
   string offset = fused_node.input(2);
   string mean = fused_node.input(3);
   string variance = fused_node.input(4);
 
-  if (fused_node.attr().at("data_format").s() == "NCHW") {
+  if (fused_node.attr().at(kDataFormat).s() == "NCHW") {
     // Need to reshape the last 4 inputs
     NodeDef* new_shape = optimized_graph->add_node();
     new_shape->set_name(AddPrefixToNodeName("NCHWShape", fused_node.name()));
@@ -164,59 +644,94 @@ void AddBatchNormNodes(GraphDef* optimized_graph, const NodeDef& fused_node) {
   *r->add_input() = a->name();
   *r->add_input() = c->name();
 }
+}  // namespace
 
 Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
                           GraphDef* optimized_graph) {
-  GraphProperties properties(item);
-  bool inferred_properties = false;
-  GraphView graph(const_cast<GraphDef*>(&item.graph));
-
-  // During inference, most of the inputs to FusedBatchNorm are constant, and we
-  // can therefore replace the op with a much cheaper set of primitives.
-  optimized_graph->mutable_node()->Reserve(item.graph.node_size());
-  for (const NodeDef& node : item.graph.node()) {
-    if (node.op() == "FusedBatchNorm" || node.op() == "FusedBatchNormV2") {
-      bool optimizable = (node.attr().count("T") == 0 ||
-                          node.attr().at("T").type() == DT_FLOAT);
-      optimizable &= (node.attr().count("is_training") == 0 ||
-                      !node.attr().at("is_training").b());
-      if (optimizable) {
-        int const_inputs = 0;
-        if (!inferred_properties) {
-          // Infer properties lazily in case they are not needed.
-          TF_RETURN_IF_ERROR(properties.InferStatically(false));
-          inferred_properties = true;
-        }
-        const auto& props = properties.GetInputProperties(node.name());
-        for (const auto& prop : props) {
-          if (prop.has_value()) {
-            const_inputs += 1;
-          }
-        }
-        // TODO(bsteiner): use the cost model to compare the cost of fused batch
-        // norm against that of the optimized form.
-        optimizable = (const_inputs >= 4);
-      }
-      if (optimizable) {
-        for (GraphView::Edge edge : graph.GetFanoutEdges(node, false)) {
-          if (edge.src.port_id != 0) {
-            // The optimized version only generates the first output.
-            optimizable = false;
-            break;
-          }
-        }
-      }
-      if (optimizable) {
-        VLOG(1) << "Optimizing fused batch norm node " << node.DebugString();
-        AddBatchNormNodes(optimized_graph, node);
-        continue;
-      }
+  // Supported graph patterns.
+  // clang-format off
+  FusedBatchNorm              fused_batch_norm;
+  Conv2DWithBiasAdd           conv2d_with_bias;
+  Conv2DWithBiasAddAndRelu    conv2d_with_bias_and_relu;
+  Conv2DWithBatchNorm         conv2d_with_batch_norm;
+  Conv2DWithBatchNormAndRelu  conv2d_with_batch_norm_and_relu;
+  Conv2DWithSqueezeAndBiasAdd conv2d_with_squeeze_and_bias;
+  // clang-format on
+
+  // Processing graph in reverse-topological sorted order allows to remap
+  // longer chains of dependent ops in one pass.
+  GraphDef topo_sorted_graph = item.graph;
+  TF_RETURN_IF_ERROR(TopologicalSort(&topo_sorted_graph));
+  std::reverse(topo_sorted_graph.mutable_node()->begin(),
+               topo_sorted_graph.mutable_node()->end());
+
+  GrapplerItem topo_sorted_item(item, std::move(topo_sorted_graph));
+  RemapperContext ctx(topo_sorted_item);
+
+  // Skip nodes that were invalidated by a remapper, e.g. do not process BiasAdd
+  // and Relu nodes that were fused into a Conv2D node.
+  absl::flat_hash_set<const NodeDef*> invalidated_nodes;
+
+  optimized_graph->mutable_node()->Reserve(topo_sorted_item.graph.node_size());
+  for (const NodeDef& node : topo_sorted_item.graph.node()) {
+    // Check if node was invalidated by one of the previous remaps.
+    if (invalidated_nodes.count(&node) > 0) continue;
+
+    // Remap Conv2D+BiasAdd into the _FusedConv2D.
+    if (FindConv2DWithBias(ctx, &node, &conv2d_with_bias)) {
+      AddFusedConv2DNode(conv2d_with_bias, optimized_graph, &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+BiasAdd+Relu into the _FusedConv2D.
+    if (FindConv2DWithBiasAndRelu(ctx, &node, &conv2d_with_bias_and_relu)) {
+      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze.
+    if (FindConv2DWithSqueezeAndBias(ctx, &node,
+                                     &conv2d_with_squeeze_and_bias)) {
+      AddFusedConv2DNode(conv2d_with_squeeze_and_bias, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+FusedBatchNorm into the _FusedConv2D;
+    if (FindConv2DWithBatchNorm(ctx, &node, &conv2d_with_batch_norm)) {
+      AddFusedConv2DNode(conv2d_with_batch_norm, optimized_graph,
+                         &invalidated_nodes);
+      continue;
     }
+
+    // Remap Conv2D+FusedBatchNorm+Relu into the _FusedConv2D;
+    if (FindConv2DWithBatchNormAndRelu(ctx, &node,
+                                       &conv2d_with_batch_norm_and_relu)) {
+      AddFusedConv2DNode(conv2d_with_batch_norm_and_relu, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Infer properties lazily in case they are not needed.
+    if (!ctx.inferred_graph_properties && IsFusedBatchNormCandidate(node)) {
+      TF_RETURN_IF_ERROR(ctx.graph_properties.InferStatically(false));
+      ctx.inferred_graph_properties = true;
+    }
+
+    // During inference, most of the inputs to FusedBatchNorm are constant, and
+    // we can therefore replace the op with a much cheaper set of primitives.
+    if (FindFusedBatchNorm(ctx, &node, &fused_batch_norm)) {
+      AddBatchNormNodes(fused_batch_norm, optimized_graph);
+      continue;
+    }
+
+    // If we didn't match a node to any pattern copy it to the optimized graph.
     *optimized_graph->add_node() = node;
   }
 
-  *optimized_graph->mutable_library() = item.graph.library();
-  *optimized_graph->mutable_versions() = item.graph.versions();
+  *optimized_graph->mutable_library() = topo_sorted_item.graph.library();
+  *optimized_graph->mutable_versions() = topo_sorted_item.graph.versions();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index 4cbf0d8d6f11ea77cfd3973e3ff0c109d48c0273..ffc242decc70e8947547fbe9ca25909625381887 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -24,7 +24,17 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-class RemapperTest : public GrapplerTest {};
+class RemapperTest : public GrapplerTest {
+ protected:
+  // TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+  // contractions with non-default contraction output kernels.
+  bool EigenSupportsContractionOutputKernel() {
+#if defined(EIGEN_USE_LIBXSMM)
+    return false;
+#endif
+    return true;
+  }
+};
 
 TEST_F(RemapperTest, FusedBatchNorm) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -91,5 +101,361 @@ TEST_F(RemapperTest, FusedBatchNormNCHW) {
   }
 }
 
+TEST_F(RemapperTest, FuseConv2DWithBias) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ::tensorflow::ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), bias_add);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "bias_add") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(1, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBiasAndRelu) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ::tensorflow::ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);
+  auto relu = ops::Relu(s.WithOpName("relu"), bias_add);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), relu);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "relu") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      ASSERT_EQ(2, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      EXPECT_EQ("Relu", fused_ops[1]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBatchNorm) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto scale_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT, scale_shape);
+  auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT, scale_shape);
+  auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT, scale_shape);
+  auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  ops::FusedBatchNorm::Attrs attrs;
+  attrs = attrs.IsTraining(false);
+  auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
+                                        offset, mean, variance, attrs);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), batch_norm.y);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto scale_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto offset_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto mean_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto variance_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t},
+               {"scale", scale_t}, {"offset", offset_t},
+               {"mean", mean_t},   {"variance", variance_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "batch_norm") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(4, node.attr().at("num_args").i());
+      EXPECT_EQ("scale", node.input(2));
+      EXPECT_EQ("offset", node.input(3));
+      EXPECT_EQ("mean", node.input(4));
+      EXPECT_EQ("variance", node.input(5));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(1, fused_ops.size());
+      EXPECT_EQ("FusedBatchNorm", fused_ops[0]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBatchNormAndRelu) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto scale_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT, scale_shape);
+  auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT, scale_shape);
+  auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT, scale_shape);
+  auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  ops::FusedBatchNorm::Attrs attrs;
+  attrs = attrs.IsTraining(false);
+  auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
+                                        offset, mean, variance, attrs);
+  auto relu = ops::Relu(s.WithOpName("relu"), batch_norm.y);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), relu);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto scale_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto offset_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto mean_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto variance_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t},
+               {"scale", scale_t}, {"offset", offset_t},
+               {"mean", mean_t},   {"variance", variance_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "relu") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(4, node.attr().at("num_args").i());
+      EXPECT_EQ("scale", node.input(2));
+      EXPECT_EQ("offset", node.input(3));
+      EXPECT_EQ("mean", node.input(4));
+      EXPECT_EQ("variance", node.input(5));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(2, fused_ops.size());
+      EXPECT_EQ("FusedBatchNorm", fused_ops[0]);
+      EXPECT_EQ("Relu", fused_ops[1]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 1, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+
+  ops::Squeeze::Attrs attrs;
+  attrs = attrs.Axis({2});
+  auto squeeze = ops::Squeeze(s.WithOpName("squeeze"), conv, attrs);
+
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), squeeze, bias);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), bias_add);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 1, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "conv") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      ASSERT_EQ(1, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      found++;
+    } else if (node.name() == "bias_add") {
+      EXPECT_EQ("Squeeze", node.op());
+      EXPECT_EQ("conv", node.input(0));
+      found++;
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 2ac11a024c2025f524232652f858a9302b9117b9..9336c4df8b05408d9f8ce622bf488a7b3d07bc3e 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -145,20 +145,18 @@ void NodeMap::UpdateOutput(const string& node_name,
 }
 
 bool IsSameInput(const string& name1, const string& name2) {
-  if (name1 == name2) {
-    return true;
-  }
-  int position1;
-  StringPiece node1 = ParseNodeNameAsStringPiece(name1, &position1);
-  int position2;
-  StringPiece node2 = ParseNodeNameAsStringPiece(name2, &position2);
-  return (position1 == position2) && (node1 == node2);
+  if (name1 == name2) return true;
+  TensorId tensor1 = ParseTensorName(name1);
+  TensorId tensor2 = ParseTensorName(name2);
+  return tensor1.node() == tensor2.node() && tensor1.index() == tensor2.index();
 }
 
 bool IsControlInput(const string& name) {
   return !name.empty() && name[0] == '^';
 }
 
+bool IsControlInput(const TensorId& tensor_id) { return tensor_id.index() < 0; }
+
 string AddPrefixToNodeName(const string& name, const string& prefix,
                            const string& delimiter) {
   if (!name.empty()) {
@@ -200,6 +198,12 @@ string AsControlDependency(const string& node_name) {
              : strings::StrCat("^", node_name);
 }
 
+bool NodeIsOnCpu(const NodeDef* node) {
+  string task, device;
+  return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+         str_util::StartsWith(device, DEVICE_CPU);
+}
+
 int NumOutputs(const NodeDef& node, GraphDef* graph) {
   int num_outputs = 0;
   const OpDef* op_def = nullptr;
@@ -245,7 +249,6 @@ int NumNonControlInputs(const NodeDef& node) {
 
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
   int num_outputs = 0;
-  int pos;
   for (const NodeDef* output : node_map.GetOutputs(node.name())) {
     for (const string& node_as_input : output->input()) {
       if (IsControlInput(node_as_input)) {
@@ -254,9 +257,8 @@ int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
       if (node_as_input == node.name()) {
         ++num_outputs;
       } else {
-        const StringPiece name =
-            ParseNodeNameAsStringPiece(node_as_input, &pos);
-        if (name == node.name()) {
+        const TensorId tensor = ParseTensorName(node_as_input);
+        if (tensor.node() == node.name()) {
           ++num_outputs;
         }
       }
@@ -283,11 +285,11 @@ int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map) {
 
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
-DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
-  if (!node.attr().count(attr_name)) {
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& type_attr) {
+  if (!node.attr().count(type_attr)) {
     return DT_INVALID;
   }
-  const auto& attr = node.attr().at(attr_name);
+  const auto& attr = node.attr().at(type_attr);
   if (attr.value_case() != AttrValue::kType) {
     return DT_INVALID;
   }
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index e40b4748cf44d7e61b4473bd55e5085835f2042d..b1e2d4e9cb5bbe15508695595de4e00f7313c401 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -103,6 +104,9 @@ class SetVector {
 // the ^ character.
 bool IsControlInput(const string& name);
 
+// True iff tensor index refers to a control input.
+bool IsControlInput(const TensorId& tensor_id);
+
 // True iff 'name1' and 'name2' refer to the same input.
 bool IsSameInput(const string& name1, const string& name2);
 
@@ -165,6 +169,7 @@ inline string NodeName(const string& name) {
 }
 
 // Returns the node name and position in a single call.
+// DEPRECATED(ezhulenev): Use TensorId and ParseTensorName.
 inline StringPiece ParseNodeNameAsStringPiece(const string& name,
                                               int* position) {
   static const string empty;
@@ -195,6 +200,7 @@ inline StringPiece ParseNodeNameAsStringPiece(const string& name,
 }
 
 // Returns the node name and position in a single call.
+// DEPRECATED(ezhulenev): Use SafeTensorId and ParseTensorName.
 inline string ParseNodeName(const string& name, int* position) {
   return string(ParseNodeNameAsStringPiece(name, position));
 }
@@ -229,6 +235,9 @@ string AsControlDependency(const NodeDef& node);
 // for control dependency, given a node name
 string AsControlDependency(const string& node);
 
+// Returns true if the node is assigned to run on CPU device.
+bool NodeIsOnCpu(const NodeDef* node);
+
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
@@ -257,7 +266,7 @@ Status CheckAttrsExist(const NodeDef& node, absl::Span<const string> keys);
 
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
-DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name);
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& type_attr);
 
 // Returns the last node in the simple chain starting at source and traversing
 // through the input(0) edge from each node as long as the next node satisfies
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index dbe425b75fd1bb3632690d2ba16073e9ba9340a3..2b9448e40344c16d7b4bf636d6252569674a9c85 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -172,7 +172,6 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index c806f3874ddbfa7493914e69c08dbacb8c5db763..7756c73967b1b169deae59e1647d19f17f89f8f5 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -74,120 +74,16 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
   return Status::OK();
 }
 
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib,
-    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
-  // Functions that are reachable from the graph.
-  absl::flat_hash_set<string> reachable_funcs;
-
-  // Functions might be reachable from the nested function calls, so we keep a
-  // queue of functions that we have to check.
-  gtl::InlinedVector<const FunctionDef*, 4> func_queue;
-
-  // Add reachable and not already processed functions to the functions queue.
-  const auto add_to_func_queue = [&](const string& func_name) {
-    const FunctionDef* func = flib.Find(func_name);
-    if (func && reachable_funcs.find(func_name) == reachable_funcs.end()) {
-      func_queue.push_back(func);
-    }
-  };
-
-  // Add all the functions that are reachable from the given node to the queue.
-  const auto process_node = [&](const NodeDef& node) {
-    // Node itself can be a call to the function.
-    add_to_func_queue(node.op());
-
-    // Or node can have an attribute referencing a function.
-    for (const auto& attr : node.attr()) {
-      const auto& attr_value = attr.second;
-
-      // 1. AttrValue.func
-      if (attr_value.has_func()) {
-        add_to_func_queue(attr_value.func().name());
-      }
-
-      // 2. AttrValue.ListValue.func
-      if (attr_value.has_list()) {
-        for (const auto& func : attr_value.list().func()) {
-          add_to_func_queue(func.name());
-        }
-      }
-    }
-  };
-
-  // Add all functions that are directly called from the optimized graph.
-  std::for_each(nodes.begin(), nodes.end(), process_node);
-
-  // Process all reachable functions.
-  while (!func_queue.empty()) {
-    const FunctionDef* func = func_queue.back();
-    func_queue.pop_back();
-
-    const string& func_name = func->signature().name();
-    reachable_funcs.insert(func_name);
-
-    // Find all the functions called from the function body.
-    const auto& func_body = func->node_def();
-    std::for_each(func_body.begin(), func_body.end(), process_node);
-
-    // Check if the function has a registered gradient.
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
-  }
-
-  return reachable_funcs;
-}
-
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib,
-    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
-  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, nodes);
-
-  FunctionLibraryDefinition reachable_flib(flib.default_registry(),
-                                           FunctionDefLibrary());
-
-  for (const string& func_name : reachable_funcs) {
-    const FunctionDef* func = flib.Find(func_name);
-    DCHECK_NE(func, nullptr);
-    // That should never fail, because we copy functions from valid flib and use
-    // the same default registry.
-    const Status added = reachable_flib.AddFunctionDef(*func);
-    DCHECK(added.ok());
-
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) {
-      GradientDef grad;
-      grad.set_function_name(func_name);
-      grad.set_gradient_func(grad_func_name);
-      // It can only fail if function already has a gradient function.
-      const Status added_grad = reachable_flib.AddGradientDef(grad);
-      DCHECK(added_grad.ok());
-    }
-  }
-
-  return reachable_flib;
-}
-
 }  // namespace
 
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return ReachableFunctions(flib, graph.node());
-}
-
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return ReachableFunctions(flib, func.node_def());
-}
-
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
     const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return ReachableFunctionLibraryDefinition(flib, graph.node());
+  return flib.ReachableDefinitions(graph);
 }
 
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
     const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return ReachableFunctionLibraryDefinition(flib, func.node_def());
+  return flib.ReachableDefinitions(func);
 }
 
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
@@ -451,12 +347,6 @@ GrapplerFunctionItem::GrapplerFunctionItem(
       fetch.push_back(output_tensor);
     }
   }
-  // Stateful and Send (it's not stateful) nodes must be preserved in the graph.
-  for (const NodeDef& node : graph.node()) {
-    if (IsSend(node)) {
-      keep_ops.push_back(node.name());
-    }
-  }
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -688,8 +578,8 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
                                                    &connectivity));
 
-    // Stateful and Send nodes must be preserved in a function body
-    if (registration->op_def.is_stateful() || IsSend(func_def_node)) {
+    // Ops with side effects must be preserved in a function body.
+    if (!IsFreeOfSideEffect(func_def_node)) {
       keep_nodes.push_back(func_def_node.name());
     }
   }
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 72b3c0f31aaa919d57567c158e90648513379fb5..ba9950e4843bf40ae524ef11abcfeacd7b079827 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_map>
-#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -31,13 +30,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// Returns a set of functions from the function library, that are reachable from
-// the nodes of the graph.
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph);
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func);
-
 // Returns a copy of FunctionLibraryDefinition with subset of functions that are
 // reachable from the nodes of the graph.
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
@@ -150,12 +142,6 @@ class GrapplerFunctionItemInstantiation {
 class GrapplerFunctionItem : public GrapplerItem {
  public:
   GrapplerFunctionItem() = default;
-  GrapplerFunctionItem(string func_name, string description,
-                       AttrSlice func_attr,
-                       std::vector<InputArgExpansion> input_arg_expansions,
-                       std::vector<OutputArgExpansion> output_arg_expansions,
-                       std::vector<string> keep_nodes, int graph_def_version,
-                       bool is_stateful, GraphDef&& function_body);
 
   const string& description() const;
 
@@ -178,12 +164,22 @@ class GrapplerFunctionItem : public GrapplerItem {
   GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
 
  private:
+  friend Status MakeGrapplerFunctionItem(const FunctionDef&, const AttrSlice&,
+                                         const FunctionLibraryDefinition&, int,
+                                         GrapplerFunctionItem*);
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
   friend Status RemoveUnusedOutputs(
       const gtl::FlatSet<int>& active_outputs, GrapplerFunctionItem* item,
       std::vector<std::pair<int, int>>* output_mapping);
 
+  GrapplerFunctionItem(string func_name, string description,
+                       AttrSlice func_attr,
+                       std::vector<InputArgExpansion> input_arg_expansions,
+                       std::vector<OutputArgExpansion> output_arg_expansions,
+                       std::vector<string> keep_nodes, int graph_def_version,
+                       bool is_stateful, GraphDef&& function_body);
+
   string description_;
   AttrSlice func_attr_;  // Attributes specific to function definition that
                          // produced this item (FuncDef.attr field).
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 16834acecf08d3e066cc52b52ac86bf543c499d4..8639dec05a1eb8aa7afcadc20ee9f8949bfeae14 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -32,65 +32,6 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionsTest : public ::testing::Test {};
 
-TEST_F(FunctionsTest, ReachableFunctions) {
-  using ::tensorflow::test::function::GDef;
-  using ::tensorflow::test::function::NDef;
-  using FDH = ::tensorflow::FunctionDefHelper;
-
-  const auto make_simple_fdef = [](const string &name) {
-    return FDH::Create(
-        name, {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
-        {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-        /* Mapping between function returns and function node outputs. */
-        {{"z", "output:z:0"}});
-  };
-
-  FunctionDef func_1 = make_simple_fdef("Func1");
-  FunctionDef func_2 = make_simple_fdef("Func2");
-  FunctionDef func_3 = make_simple_fdef("Func3");
-
-  FunctionDef func_2_grad = make_simple_fdef("Func2_grad");
-
-  GraphDef graph = GDef(
-      {
-          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
-          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
-          NDef("x", "Func1", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
-          NDef("y", "PartitionedCall", {"a", "b"},
-               {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
-                {"Tout", DataTypeSlice{DT_FLOAT}},
-                {"f", FDH::FunctionRef("Func2", {{"T", DT_FLOAT}})}},
-               kDevice),
-      },
-      // FunctionLib
-      {func_1, func_2, func_3, func_2_grad});
-
-  // Register custom function gradient after the graph was constructed.
-  GradientDef *func3_grad_def = graph.mutable_library()->add_gradient();
-  func3_grad_def->set_function_name("Func2");
-  func3_grad_def->set_gradient_func("Func2_grad");
-
-  FunctionLibraryDefinition flib(OpRegistry::Global(), graph.library());
-
-  // - 'Func1' called directly from the graph
-  // - 'Func2' called indirectly via PartitionedCall attribute, and it also
-  //   has a custom gradient ('Func2_grad') that must remain in the library
-  // - 'Func3' in unreachable and has to be removed from the library
-
-  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, graph);
-  ASSERT_EQ(reachable_funcs.size(), 3);
-  EXPECT_NE(reachable_funcs.find("Func1"), reachable_funcs.end());
-  EXPECT_NE(reachable_funcs.find("Func2"), reachable_funcs.end());
-  EXPECT_NE(reachable_funcs.find("Func2_grad"), reachable_funcs.end());
-
-  FunctionLibraryDefinition reachable_flib =
-      ReachableFunctionLibraryDefinition(flib, graph);
-  ASSERT_EQ(reachable_flib.num_functions(), 3);
-  EXPECT_TRUE(reachable_flib.Contains("Func1"));
-  EXPECT_TRUE(reachable_flib.Contains("Func2"));
-  EXPECT_TRUE(reachable_flib.Contains("Func2_grad"));
-}
-
 TEST_F(FunctionsTest, IsParametrized) {
   // Function is defined for multiple input types.
   FunctionDef parametrized_func = FunctionDefHelper::Create(
@@ -635,6 +576,33 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
+TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  FunctionDef func = FunctionDefHelper::Define(
+      /* Name */ "SideEffects",
+      /* Args */ {"x: Ref(float)"},
+      /* Return values */ {},
+      /* Attr def */ {},
+      /* Nodes */
+      {{{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+       {{"update"}, "AssignAdd", {"x", "one"}, {{"T", DT_FLOAT}}}});
+
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
+
+  EXPECT_EQ("SideEffects", item.id);
+  EXPECT_EQ(3, item.function_body().node_size());
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ(0, item.output_size());
+  ASSERT_EQ(1, item.keep_ops.size());
+  EXPECT_EQ("update", item.keep_ops[0]);
+}
+
 TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 32825d09891be82c613c858de70795e3e216120b..4fe0d2e87e826f4c4f4bf2784a365693c373f197 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -22,6 +22,7 @@ package_group(
         "//learning/brain/research/sparse_matrix/...",
         "//learning/faster_training/...",
         "//tensorflow/...",
+        "//tensorflow_text/...",
         "//third_party/car/...",
     ],
 )
@@ -40,7 +41,6 @@ load(
     "tf_mkl_kernel_library",
     "cc_header_only_library",
     "if_not_windows",
-    "if_override_eigen_strong_inline",
 )
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
@@ -94,13 +94,13 @@ config_setting(
 )
 
 config_setting(
-    # Add "--define tensorflow_eigen_mkldnn=1" to your build command to use mkldnn
-    # sgemm in Eigen tensor contractions (matrix multiplications and convolutions).
-    # The mkldnn kernels are generated at runtime and use avx/avx2/fma/avx512
-    # based on cpu status registers (https://en.wikipedia.org/wiki/CPUID).
-    name = "eigen_mkldnn",
+    # Add "--define tensorflow_mkldnn_contraction_kernel=1" to your build command to use mkldnn
+    # sgemm in Eigen tensor contractions (matrix multiplications and convolutions). The mkldnn
+    # kernels are generated at runtime and use avx/avx2/fma/avx512 based on cpu status registers
+    # (https://en.wikipedia.org/wiki/CPUID).
+    name = "mkldnn_contraction_kernel",
     values = {
-        "define": "tensorflow_eigen_mkldnn=1",
+        "define": "tensorflow_mkldnn_contraction_kernel=1",
     },
 )
 
@@ -195,15 +195,27 @@ cc_library(
     deps = ["//third_party/eigen3"],
 )
 
-cc_library(
+tf_kernel_library(
     name = "conv_2d",
     hdrs = ["conv_2d.h"],
+    gpu_srcs = [
+        "conv_2d_gpu.cu.cc",
+        "conv_2d.h",
+    ],
     deps = [
         ":eigen_helpers",
+        ":fill_functor",
         ":gpu_util_hdrs",
+        ":image_resizer_state",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
+    alwayslink = 1,
 )
 
 cc_library(
@@ -554,6 +566,40 @@ cc_library(
     ],
 )
 
+# Depending on a build configuration this target provides custom kernel for Eigen
+# tensor contractions (small matrix multiplication kernel used to multiple together
+# blocks of the original tensors).
+#
+# 0) Default contraction kernel is Eigen::internal::gebp_kernel.
+#
+# 1) --define tensorflow_mkldnn_contraction_kernel=1
+#    Use Mkldnn single threaded sgemm. The mkldnn kernels are generated at runtime and
+#    use avx/avx2/fma/avx512 based on cpu status registers (https://en.wikipedia.org/wiki/CPUID).
+#
+# If you use `tensor.contract(other_tensor)` in your code, you must include additional header
+# to get the benefit of custom contraction kernel:
+#
+#   #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#   #include "third_party/tensorflow/core/kernels/eigen_contraction_kernel.h"
+#   #endif
+cc_library(
+    name = "eigen_contraction_kernel",
+    hdrs = ["eigen_contraction_kernel.h"],
+    defines = select({
+        ":mkldnn_contraction_kernel": [
+            "TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL",
+            "TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL",
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        "//third_party/eigen3",
+    ] + select({
+        ":mkldnn_contraction_kernel": ["//third_party/intel_mkl_dnn:mkldnn_single_threaded"],
+        "//conditions:default": [],
+    }),
+)
+
 cc_library(
     name = "eigen_helpers",
     hdrs = [
@@ -566,20 +612,11 @@ cc_library(
         "eigen_softmax.h",
         "eigen_spatial_convolutions.h",
         "eigen_volume_patch.h",
-    ] + select({
-        ":eigen_mkldnn": ["eigen_mkldnn.h"],
-        "//conditions:default": [],
-    }),
-    defines = select({
-        ":eigen_mkldnn": ["EIGEN_USE_MKLDNN"],
-        "//conditions:default": [],
-    }),
+    ],
     deps = [
+        ":eigen_contraction_kernel",
         "//third_party/eigen3",
-    ] + select({
-        ":eigen_mkldnn": ["//third_party/intel_mkl_dnn:mkldnn_single_threaded"],
-        "//conditions:default": [],
-    }),
+    ],
 )
 
 cc_library(
@@ -1220,6 +1257,9 @@ tf_cc_test(
     name = "conv_ops_test",
     size = "medium",
     srcs = ["conv_ops_test.cc"],
+    tags = [
+        "nomsan",
+    ],
     deps = [
         ":conv_ops",
         ":image",
@@ -1648,14 +1688,14 @@ tf_kernel_library(
     ],
     visibility = [":friends"],
     deps = [
-        ":conv_ops",
+        ":conv_2d",
         ":ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//third_party/eigen3",
     ],
-    alwayslink = 0,
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -1908,10 +1948,22 @@ tf_kernel_library(
     deps = DATA_FLOW_DEPS,
 )
 
+cc_library(
+    name = "stack",
+    srcs = ["stack.cc"],
+    hdrs = ["stack.h"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "stack_ops",
     prefix = "stack_ops",
-    deps = DATA_FLOW_DEPS,
+    deps = DATA_FLOW_DEPS + [":stack"],
 )
 
 tf_kernel_library(
@@ -2422,15 +2474,15 @@ tf_cc_tests(
 # Conditional test target generation is not supported by the "tf_cc_tests" macro
 # (can't add 'select' to the srcs field, type 'select' is not iterable).
 tf_cc_test(
-    name = "eigen_mkldnn_test",
+    name = "eigen_mkldnn_contraction_kernel_test",
     size = "small",
     srcs = select({
-        ":eigen_mkldnn": ["eigen_mkldnn_test.cc"],
+        ":mkldnn_contraction_kernel": ["eigen_mkldnn_contraction_kernel_test.cc"],
         "//conditions:default": [],
     }),
-    tags = ["eigen_mkldnn"],
+    tags = ["mkldnn_contraction_kernel"],
     deps = [
-        ":eigen_helpers",
+        ":eigen_contraction_kernel",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -3045,11 +3097,8 @@ tf_kernel_library(
     ]),
     # <prefix>*impl.h are excluded by default from the CPU build, add explicitly.
     hdrs = ["batch_matmul_op_impl.h"],
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "batch_matmul_op",
-    deps = MATH_DEPS + if_mkl_ml([
+    deps = MATH_DEPS + [":eigen_contraction_kernel"] + if_mkl_ml([
         "//third_party/mkl:intel_binary_blob",
     ]),
 )
@@ -3115,9 +3164,6 @@ tf_kernel_library(
         "mkl_matmul_op.cc",
     ]),
     hdrs = ["matmul_op.h"],
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm": [
             "TENSORFLOW_USE_LIBXSMM",
@@ -3126,11 +3172,10 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     deps = MATH_DEPS + [
+        ":eigen_contraction_kernel",
         ":gpu_util_hdrs",
     ] + select({
-        ":xsmm": [
-            "@libxsmm_archive//:xsmm_avx",
-        ],
+        ":xsmm": ["@libxsmm_archive//:xsmm_avx"],
         "//conditions:default": [],
     }) + mkl_deps() + if_cuda([
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
@@ -3155,7 +3200,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "scan_ops",
     prefix = "scan_ops",
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
 tf_kernel_library(
@@ -3333,6 +3378,29 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "scan_ops_test",
+    size = "small",
+    srcs = ["scan_ops_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":scan_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "reduction_ops_test",
     size = "small",
@@ -3516,9 +3584,6 @@ tf_kernel_library(
         ":xsmm_convolutions": ["xsmm_conv2d.h"],
         "//conditions:default": [],
     }),
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm_convolutions": [
             "TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS",
@@ -3536,6 +3601,7 @@ tf_kernel_library(
         ":bounds_check",
         ":conv_2d",
         ":conv_3d",
+        ":eigen_contraction_kernel",
         ":image_resizer_state",
         ":fill_functor",
         ":ops_util",
@@ -3626,6 +3692,7 @@ cc_library(
 NN_DEPS = [
     ":bounds_check",
     ":conv_2d",
+    ":eigen_contraction_kernel",
     ":fused_batch_norm_util_gpu",
     ":ops_util",
     ":pooling_ops",
@@ -3635,7 +3702,7 @@ NN_DEPS = [
     "//tensorflow/core:nn_grad",
     "//tensorflow/core:nn_ops_op_lib",
     "//third_party/eigen3",
-]
+] + if_mkl(["//tensorflow/core:mkl_nn_ops_op_lib"])
 
 tf_kernel_library(
     name = "batch_norm_op",
@@ -3674,9 +3741,6 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "lrn_op",
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "lrn_op",
     deps = NN_DEPS,
 )
@@ -4762,6 +4826,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:string_ops_op_lib",
+        "//third_party/icu/data:conversion_data",
         "@icu//:common",
     ],
 )
@@ -5314,7 +5379,9 @@ filegroup(
         "batch_norm_op.h",
         "control_flow_ops.h",
         "conv_2d.h",
+        "conv_3d.h",
         "conv_ops.h",
+        "conv_ops_gpu.h",
         "data_format_ops.h",
         "depthtospace_op.h",
         "depthwise_conv_op.h",
@@ -5332,7 +5399,9 @@ filegroup(
         "mfcc_mel_filterbank.h",
         "mirror_pad_op.h",
         "mirror_pad_op_cpu_impl.h",
+        "multinomial_op.h",
         "pad_op.h",
+        "pooling_ops_3d.h",
         "random_op.h",
         "reduction_ops.h",
         "reduction_ops_common.h",
@@ -5349,6 +5418,7 @@ filegroup(
         "spacetobatch_functor.h",
         "spacetodepth_op.h",
         "spectrogram.h",
+        "stateless_random_ops.h",
         "string_util.h",
         "tensor_array.h",
         "tile_functor.h",
@@ -5380,6 +5450,7 @@ filegroup(
         "conv_grad_ops.cc",
         "conv_grad_ops.h",
         "conv_ops.cc",
+        "conv_ops_3d.cc",
         "conv_ops_fused.cc",
         "conv_ops_using_gemm.cc",
         "crop_and_resize_op.cc",
@@ -5487,9 +5558,11 @@ filegroup(
         "mirror_pad_op_cpu_impl_3.cc",
         "mirror_pad_op_cpu_impl_4.cc",
         "mirror_pad_op_cpu_impl_5.cc",
+        "multinomial_op.cc",
         "pad_op.cc",
         "padding_fifo_queue.cc",
         "padding_fifo_queue_op.cc",
+        "pooling_ops_3d.cc",
         "queue_base.cc",
         "queue_op.cc",
         "queue_ops.cc",
@@ -5523,7 +5596,10 @@ filegroup(
         "sparse_to_dense_op.cc",
         "spectrogram.cc",
         "spectrogram_op.cc",
+        "stack.cc",
+        "stack.h",
         "stack_ops.cc",
+        "stateless_random_ops.cc",
         "string_join_op.cc",
         "string_util.cc",
         "summary_op.cc",
@@ -6461,6 +6537,10 @@ tf_cc_test(
 
 tf_mkl_kernel_library(
     name = "mkl_conv_op",
+    hdrs = [
+        "mkl_quantized_conv_ops.h",
+        "no_op.h",
+    ],
     prefix = "mkl_conv",
     deps = [
         ":bounds_check",
@@ -6470,6 +6550,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6504,6 +6585,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6519,6 +6601,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6538,6 +6621,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6552,6 +6636,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
@@ -6567,6 +6652,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
@@ -6587,7 +6673,7 @@ tf_mkl_kernel_library(
 tf_mkl_kernel_library(
     name = "mkl_concat_op",
     prefix = "mkl_concat_op",
-    deps = ARRAY_DEPS + mkl_deps(),
+    deps = [":quantization_utils"] + ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6652,6 +6738,13 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "tensor_forest_ops",
+    deps = [
+        "//tensorflow/core/kernels/tensor_forest:tensor_forest_ops",
+    ],
+)
+
 tf_kernel_library(
     name = "dataset_ops",
     deps = [
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 6079aa749d52c5a3483ac21cd44feef5a3978fb3..52dec94305d3c8558013861a44524609ad6eed7a 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -216,8 +216,8 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, delta_h](
-              int64 start_channel, int64 end_channel) {
+          [&input_data, &output_data, delta_h](int64 start_channel,
+                                               int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 944564dfba62f257ae45b3c5c25d0de64fa0b773..aa9123582210bdf31993e9d8c58ba90cc02acc5e 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -180,7 +180,7 @@ class Barrier : public ResourceBase {
         // SQSS is closed, nothing is left in the incomplete set,
         // the queue is not already marked as closed, and (most
         // importantly), the queue has entries in it.
-        [this, ctx, callback, component_index]() {
+        [this, ctx, callback]() {
           if (!ctx->status().ok()) {
             callback();
             return;
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 766713a338caf3f9aa317179902c596de3a25cfd..43539ac908ffdcb49d6f35ad3dc8cdc6ce28bc61 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -34,6 +34,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 4e8bfa02fc3a21329e6495fc4ebccf365d3a02a8..8f2c2dbe8a778353dff5e0b8823ac99de68282df 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -2,7 +2,10 @@
 #   OpKernels for boosted trees ops.
 
 package(
-    default_visibility = ["//tensorflow:internal"],
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+        "//tensorflow:internal",
+    ],
 )
 
 licenses(["notice"])  # Apache 2.0
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 1ab72af05914bc15148fc4caff7a07493c1ff1e5..4e9bab3e21f9f240d32e78a1a489033a693caa73 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -12,6 +12,7 @@ message Node {
     Leaf leaf = 1;
     BucketizedSplit bucketized_split = 2;
     CategoricalSplit categorical_split = 3;
+    DenseSplit dense_split = 4;
   }
   NodeMetadata metadata = 777;
 }
@@ -70,6 +71,19 @@ message CategoricalSplit {
   int32 right_id = 4;
 }
 
+// TODO(nponomareva): move out of boosted_trees and rename to trees.proto
+message DenseSplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_id = 1;
+  float threshold = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
 // Tree describes a list of connected nodes.
 // Node 0 must be the root and can carry any payload including a leaf
 // in the case of representing the bias.
diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
index d1840941c1d38f7f299788cd739ab055f036c039..81f04732d331a7eccb825642283cd27d63e35a79 100644
--- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
@@ -29,6 +29,7 @@
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
@@ -151,8 +152,14 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
     const Tensor* example_weights_t;
     OP_REQUIRES_OK(context,
                    context->input(kExampleWeightsName, &example_weights_t));
+    DCHECK(float_features_list.size() > 0) << "Got empty feature list";
     auto example_weights = example_weights_t->flat<float>();
-    const int64 batch_size = example_weights.size();
+    const int64 weight_size = example_weights.size();
+    const int64 batch_size = float_features_list[0].flat<float>().size();
+    OP_REQUIRES(
+        context, weight_size == 1 || weight_size == batch_size,
+        errors::InvalidArgument(strings::Printf(
+            "Weights should be a single value or same size as features.")));
     const Tensor* epsilon_t;
     OP_REQUIRES_OK(context, context->input(kEpsilonName, &epsilon_t));
     float epsilon = epsilon_t->scalar<float>()();
@@ -168,7 +175,9 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
         QuantileStream stream(epsilon, batch_size + 1);
         // Run quantile summary generation.
         for (int64 j = 0; j < batch_size; j++) {
-          stream.PushEntry(feature_values(j), example_weights(j));
+          stream.PushEntry(feature_values(j), (weight_size > 1)
+                                                  ? example_weights(j)
+                                                  : example_weights(0));
         }
         stream.Finalize();
         const auto summary_entry_list = stream.GetFinalSummary().GetEntryList();
@@ -263,6 +272,57 @@ REGISTER_KERNEL_BUILDER(
     Name("BoostedTreesQuantileStreamResourceAddSummaries").Device(DEVICE_CPU),
     BoostedTreesQuantileStreamResourceAddSummariesOp);
 
+class BoostedTreesQuantileStreamResourceDeserializeOp : public OpKernel {
+ public:
+  explicit BoostedTreesQuantileStreamResourceDeserializeOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr(kNumStreamsName, &num_features_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    QuantileStreamResource* streams_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &streams_resource));
+    // Remove the reference at the end of this scope.
+    mutex_lock l(*streams_resource->mutex());
+    core::ScopedUnref unref_me(streams_resource);
+
+    OpInputList bucket_boundaries_list;
+    OP_REQUIRES_OK(context, context->input_list(kBucketBoundariesName,
+                                                &bucket_boundaries_list));
+
+    auto do_quantile_deserialize = [&](const int64 begin, const int64 end) {
+      // Iterating over all streams.
+      for (int64 stream_idx = begin; stream_idx < end; stream_idx++) {
+        const Tensor& bucket_boundaries_t = bucket_boundaries_list[stream_idx];
+        const auto& bucket_boundaries = bucket_boundaries_t.vec<float>();
+        std::vector<float> result;
+        result.reserve(bucket_boundaries.size());
+        for (size_t i = 0; i < bucket_boundaries.size(); ++i) {
+          result.push_back(bucket_boundaries(i));
+        }
+        streams_resource->set_boundaries(result, stream_idx);
+      }
+    };
+
+    // TODO(tanzheny): comment on the magic number.
+    const int64 kCostPerUnit = 500 * num_features_;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *context->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, num_features_,
+          kCostPerUnit, do_quantile_deserialize);
+  }
+
+ private:
+  int64 num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesQuantileStreamResourceDeserialize").Device(DEVICE_CPU),
+    BoostedTreesQuantileStreamResourceDeserializeOp);
+
 class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel {
  public:
   explicit BoostedTreesQuantileStreamResourceFlushOp(
@@ -409,28 +469,29 @@ class BoostedTreesBucketizeOp : public OpKernel {
         const int64 num_values = values_tensor.dim_size(0);
 
         Tensor* output_t = nullptr;
-        OP_REQUIRES_OK(
-            context, buckets_list.allocate(
-                         feature_idx, TensorShape({num_values, 1}), &output_t));
-        auto output = output_t->matrix<int32>();
+        OP_REQUIRES_OK(context,
+                       buckets_list.allocate(
+                           feature_idx, TensorShape({num_values}), &output_t));
+        auto output = output_t->flat<int32>();
 
         const std::vector<float>& bucket_boundaries_vector =
             GetBuckets(feature_idx, bucket_boundaries_list);
-        CHECK(!bucket_boundaries_vector.empty())
-            << "Got empty buckets for feature " << feature_idx;
         auto flat_values = values_tensor.flat<float>();
+        const auto& iter_begin = bucket_boundaries_vector.begin();
+        const auto& iter_end = bucket_boundaries_vector.end();
         for (int64 instance = 0; instance < num_values; instance++) {
+          if (iter_begin == iter_end) {
+            output(instance) = 0;
+            continue;
+          }
           const float value = flat_values(instance);
-          auto bucket_iter =
-              std::lower_bound(bucket_boundaries_vector.begin(),
-                               bucket_boundaries_vector.end(), value);
-          if (bucket_iter == bucket_boundaries_vector.end()) {
+          auto bucket_iter = std::lower_bound(iter_begin, iter_end, value);
+          if (bucket_iter == iter_end) {
             --bucket_iter;
           }
-          const int32 bucket = static_cast<int32>(
-              bucket_iter - bucket_boundaries_vector.begin());
+          const int32 bucket = static_cast<int32>(bucket_iter - iter_begin);
           // Bucket id.
-          output(instance, 0) = bucket;
+          output(instance) = bucket;
         }
       }
     };
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
index 31d7fe25a477c3a2374d95749c5ff940ac2311d5..5690c3a601466cb525af66ce2e46e9ad7bec9443 100644
--- a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
@@ -39,7 +39,7 @@ class WeightedQuantilesSummary {
       // Explicitly initialize all of memory (including padding from memory
       // alignment) to allow the struct to be msan-resistant "plain old data".
       //
-      // POD = http://en.cppreference.com/w/cpp/concept/PODType
+      // POD = https://en.cppreference.com/w/cpp/named_req/PODType
       memset(this, 0, sizeof(*this));
 
       value = v;
diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
index 87bdba14550918b777363dd2077e4199d99d658f..f9f10c1b42f2ed6d2012798c8f720bbb9d211f5c 100644
--- a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -60,9 +60,9 @@ template <typename T>
 struct CheckNumericsLaunch {
   void Run(const GPUDevice &d, const T *data, int size,
            int abnormal_detected[2]) {
-    const int32 block_size = d.maxCudaThreadsPerBlock();
+    const int32 block_size = d.maxGpuThreadsPerBlock();
     const int32 num_blocks =
-        (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+        (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
         block_size;
 
     CheckNumericsKernel<T><<<num_blocks, block_size, 0, d.stream()>>>(
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index 0faad11e4721c9c575ef29591b30135b256bf41c..3988c190e701c8eb0d3163ec26ddefc5aba93541 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -79,7 +79,7 @@ void ConstantOpTest::PersistentMemoryTrackingTest(bool on_gpu) {
   }
 
   // Remove memory leak errors.
-  for (auto allocator_pair : ctx.wrapped_allocators()) {
+  for (auto allocator_pair : ctx.ConsumeWrappedAllocators()) {
     allocator_pair.second->GetRecordsAndUnRef();
   }
 }
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index fd3a0ad422372f84669d34b33b4931c88c0b6730..1587eb5114f0afed179b81cca1084b1fec7d8bff 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -70,9 +70,14 @@ void SwitchOp::Compute(OpKernelContext* context) {
 TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_REF_SWITCH);
+REGISTER_CPU_SWITCH(uint64);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_SWITCH);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_SWITCH);
+REGISTER_GPU_SWITCH(uint64);
 
 #undef REGISTER_CPU_SWITCH
 #undef REGISTER_CPU_REF_SWITCH
@@ -256,8 +261,11 @@ REGISTER_KERNEL_BUILDER(Name("RefMerge").Device(DEVICE_CPU), MergeOp);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+REGISTER_GPU_KERNEL(uint64);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index 639c3062cc689f21359914f1848c6dbb21d97c6d..a6964b1aacb445ffb3938817b241d2455a4c2fa3 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -51,42 +51,47 @@ struct InflatePadAndShuffle {
   }
 };
 
-template <typename Device, typename Input, typename Filter, typename Output>
+template <typename Device, typename Input, typename Filter, typename Output,
+          typename OutputKernel>
 void SpatialConvolutionFunc(const Device& d, Output output, Input input,
                             Filter filter, int row_stride, int col_stride,
                             int row_dilation, int col_dilation,
-                            const Eigen::PaddingType& padding) {
+                            const Eigen::PaddingType& padding,
+                            const OutputKernel& output_kernel) {
   // Need to swap row/col when calling Eigen.
   output.device(d) =
       Eigen::SpatialConvolution(input, filter, col_stride, row_stride, padding,
-                                col_dilation, row_dilation);
+                                col_dilation, row_dilation, output_kernel);
 }
 
-template <typename Device, typename T>
+template <typename Device, typename T,
+          typename OutputKernel = const Eigen::NoOpOutputKernel>
 struct SpatialConvolution {
   void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
                   typename TTypes<T, 4>::ConstTensor input,
                   typename TTypes<T, 4>::ConstTensor filter, int row_stride,
                   int col_stride, int row_dilation, int col_dilation,
-                  const Eigen::PaddingType& padding) {
+                  const Eigen::PaddingType& padding,
+                  const OutputKernel& output_kernel = OutputKernel()) {
     SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
-                           row_dilation, col_dilation, padding);
+                           row_dilation, col_dilation, padding, output_kernel);
   }
 };
 
-template <typename Device>
-struct SpatialConvolution<Device, Eigen::half> {
+template <typename Device, typename OutputKernel>
+struct SpatialConvolution<Device, Eigen::half, OutputKernel> {
   void operator()(const Device& d,
                   typename TTypes<Eigen::half, 4>::Tensor output,
                   typename TTypes<Eigen::half, 4>::ConstTensor input,
                   typename TTypes<Eigen::half, 4>::ConstTensor filter,
                   int row_stride, int col_stride, int row_dilation,
-                  int col_dilation, const Eigen::PaddingType& padding) {
+                  int col_dilation, const Eigen::PaddingType& padding,
+                  const OutputKernel& output_kernel = OutputKernel()) {
     output.device(d) =
         Eigen::SpatialConvolution(input.cast<float>(), filter.cast<float>(),
                                   col_stride, row_stride, padding, col_dilation,
-                                  row_dilation)
-            .cast<Eigen::half>();
+                                  row_dilation, output_kernel)
+            .template cast<Eigen::half>();
   }
 };
 
@@ -124,7 +129,8 @@ struct SpatialConvolutionBackwardFilter {
 // TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
 // My initial attempt to do this compiled but failed in the pytest
 // due to a swigdeps error.
-template <typename Device, typename T>
+template <typename Device, typename T,
+          typename OutputKernel = const Eigen::NoOpOutputKernel>
 struct MatMulConvFunctor {
   // Computes on device "d": out = in0 * in1, where * is matrix
   // multiplication.
@@ -132,8 +138,9 @@ struct MatMulConvFunctor {
       const Device& d, typename TTypes<T, 2>::Tensor out,
       typename TTypes<T, 2>::ConstTensor in0,
       typename TTypes<T, 2>::ConstTensor in1,
-      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
-    out.device(d) = in0.contract(in1, dim_pair);
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+      const OutputKernel& output_kernel = OutputKernel()) {
+    out.device(d) = in0.contract(in1, dim_pair, output_kernel);
   }
 };
 
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_2d_gpu.cu.cc
similarity index 91%
rename from tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
rename to tensorflow/core/kernels/conv_2d_gpu.cu.cc
index 46167db3a2b44da40a2dc60e90d6b0cd900503ec..c6adf9ebff7bf6363da7a8dfc20287f5b1720450 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_2d_gpu.cu.cc
@@ -433,7 +433,7 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> {
     combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
     CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
 
-    CHECK(dst_filter_format == FORMAT_OIHW)
+    DCHECK(dst_filter_format == FORMAT_OIHW)
         << "Unsupported output layout: " << ToString(dst_filter_format);
 
     ShuffleInTensor3Simple<T, 2, 1, 0>
@@ -998,78 +998,82 @@ struct NCHWToNHWC<GPUDevice, T, NDIMS> {
   }
 };
 
-}  // namespace functor
-
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4, int>;
-template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4, int>;
-
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4,
-                                           Eigen::DenseIndex>;
-template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4,
-                                           Eigen::DenseIndex>;
-
-template struct functor::TransformDepth<GPUDevice, float, int>;
-template struct functor::TransformDepth<GPUDevice, Eigen::half, int>;
-
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint8>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint16>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint32>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint64>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, float4>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, float2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, double2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, Eigen::half>;
-
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint8>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint16>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint32>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint64>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, float4>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, float2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
-                                                     /*conjugate=*/true>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, float, 4, int>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, Eigen::half, 4, int>;
+
+template struct ShuffleAndReverse<Eigen::GpuDevice, float, 4,
+                                  Eigen::DenseIndex>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, Eigen::half, 4,
+                                  Eigen::DenseIndex>;
+
+template struct TransformDepth<Eigen::GpuDevice, float, int>;
+template struct TransformDepth<Eigen::GpuDevice, Eigen::half, int>;
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint8>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint16>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint32>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint64>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, float4>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, float2,
+                                            /*conjugate=*/true>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, double2,
+                                            /*conjugate=*/true>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, Eigen::half>;
+
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint8>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint16>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint32>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint64>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, float4>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, float2,
+                                            /*conjugate=*/true>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, double2,
+                                            /*conjugate=*/true>;
 
 // For 2d ops.
-template struct functor::TransformFilter<GPUDevice, double, int, 4>;
-template struct functor::TransformFilter<GPUDevice, float, int, 4>;
-template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
+template struct TransformFilter<Eigen::GpuDevice, double, int, 4>;
+template struct TransformFilter<Eigen::GpuDevice, float, int, 4>;
+template struct TransformFilter<Eigen::GpuDevice, Eigen::half, int, 4>;
 
-template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
-template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
-template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, double, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, float, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, Eigen::half, 4>;
 
-template struct functor::NHWCToNCHW<GPUDevice, double, 4>;
-template struct functor::NHWCToNCHW<GPUDevice, float, 4>;
-template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, double, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, float, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, Eigen::half, 4>;
 
-template struct functor::NCHWToNHWC<GPUDevice, double, 4>;
-template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
-template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, double, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, float, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, Eigen::half, 4>;
 
-template struct functor::PadInput<GPUDevice, int, int, 4>;
-template struct functor::PadInput<GPUDevice, double, int, 4>;
-template struct functor::PadInput<GPUDevice, float, int, 4>;
-template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
+template struct PadInput<Eigen::GpuDevice, int, int, 4>;
+template struct PadInput<Eigen::GpuDevice, double, int, 4>;
+template struct PadInput<Eigen::GpuDevice, float, int, 4>;
+template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 4>;
 
 // For 3d ops.
-template struct functor::TransformFilter<GPUDevice, float, int, 5>;
-template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 5>;
+template struct TransformFilter<Eigen::GpuDevice, double, int, 5>;
+template struct TransformFilter<Eigen::GpuDevice, float, int, 5>;
+template struct TransformFilter<Eigen::GpuDevice, Eigen::half, int, 5>;
 
-template struct functor::ReverseTransformFilter<GPUDevice, float, 5>;
-template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, double, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, float, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, Eigen::half, 5>;
 
-template struct functor::NHWCToNCHW<GPUDevice, float, 5>;
-template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, double, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, float, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, Eigen::half, 5>;
 
-template struct functor::NCHWToNHWC<GPUDevice, float, 5>;
-template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, double, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, float, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, Eigen::half, 5>;
 
-template struct functor::PadInput<GPUDevice, float, int, 5>;
-template struct functor::PadInput<GPUDevice, Eigen::half, int, 5>;
+template struct PadInput<Eigen::GpuDevice, double, int, 5>;
+template struct PadInput<Eigen::GpuDevice, float, int, 5>;
+template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 5>;
 
+}  // namespace functor
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 9e86a16b66d5ba7614effe850f5901a4fb6e8091..bc30da40991b56adc136bbe6115db16c00a04666 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -44,6 +44,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 43bb5ea56c97af793cce78b7f6b9f0ae9e224414..e06af15f2fc5558e9810c3da525fbf3cb385e893 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -43,6 +43,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index bab91f5e861236f87f12d39e452a37d75467be0d..e4c49efea0bd87fdbaa3fbdad3d5612d6b4f8a82 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -35,6 +35,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 using stream_executor::dnn::DimIndex;
@@ -1070,6 +1074,7 @@ namespace functor {
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -1859,6 +1864,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
                           Conv3DBackpropFilterOp<GPUDevice, T>);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
+TF_CALL_double(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 2b273d6ff2b631b2fc8285e47f488175e730d137..74857fc2078dc3ee5e17959fc32febcdcb38a689 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -867,34 +867,36 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                                  \
-  template <>                                                                \
-  void SpatialConvolution<GPUDevice, T>::operator()(                         \
-      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,              \
-      typename TTypes<T, 4>::ConstTensor input,                              \
-      typename TTypes<T, 4>::ConstTensor filter, int row_stride,             \
-      int col_stride, int row_dilation, int col_dilation,                    \
-      const Eigen::PaddingType& padding);                                    \
-  extern template struct SpatialConvolution<GPUDevice, T>;                   \
-  template <>                                                                \
-  void MatMulConvFunctor<GPUDevice, T>::operator()(                          \
-      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                 \
-      typename TTypes<T, 2>::ConstTensor in0,                                \
-      typename TTypes<T, 2>::ConstTensor in1,                                \
-      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
-  extern template struct MatMulConvFunctor<GPUDevice, T>;                    \
-  template <>                                                                \
-  void TransformFilter<GPUDevice, T, int, 4>::operator()(                    \
-      const GPUDevice& d, FilterTensorFormat dst_filter_format,              \
-      typename TTypes<T, 4, int>::ConstTensor in,                            \
-      typename TTypes<T, 4, int>::Tensor out);                               \
-  extern template struct TransformFilter<GPUDevice, T, int, 4>;              \
-  template <>                                                                \
-  void PadInput<GPUDevice, T, int, 4>::operator()(                           \
-      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,        \
-      const std::array<int, 2>& padding_left,                                \
-      const std::array<int, 2>& padding_right,                               \
-      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);     \
+#define DECLARE_GPU_SPEC(T)                                                 \
+  template <>                                                               \
+  void SpatialConvolution<GPUDevice, T>::operator()(                        \
+      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,             \
+      typename TTypes<T, 4>::ConstTensor input,                             \
+      typename TTypes<T, 4>::ConstTensor filter, int row_stride,            \
+      int col_stride, int row_dilation, int col_dilation,                   \
+      const Eigen::PaddingType& padding,                                    \
+      const Eigen::NoOpOutputKernel& output_kernel);                        \
+  extern template struct SpatialConvolution<GPUDevice, T>;                  \
+  template <>                                                               \
+  void MatMulConvFunctor<GPUDevice, T>::operator()(                         \
+      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                \
+      typename TTypes<T, 2>::ConstTensor in0,                               \
+      typename TTypes<T, 2>::ConstTensor in1,                               \
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, \
+      const Eigen::NoOpOutputKernel& output_kernel);                        \
+  extern template struct MatMulConvFunctor<GPUDevice, T>;                   \
+  template <>                                                               \
+  void TransformFilter<GPUDevice, T, int, 4>::operator()(                   \
+      const GPUDevice& d, FilterTensorFormat dst_filter_format,             \
+      typename TTypes<T, 4, int>::ConstTensor in,                           \
+      typename TTypes<T, 4, int>::Tensor out);                              \
+  extern template struct TransformFilter<GPUDevice, T, int, 4>;             \
+  template <>                                                               \
+  void PadInput<GPUDevice, T, int, 4>::operator()(                          \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,       \
+      const std::array<int, 2>& padding_left,                               \
+      const std::array<int, 2>& padding_right,                              \
+      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);    \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 83df4dce38e09b09956104c411d3e36f6cfb7657..f20ac93b5a01cf2dbd1c53ce55c832727f49979f 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -533,10 +533,19 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T, 5, int>::ConstTensor in, \
       const std::array<int, 3>& padding_left,                         \
       const std::array<int, 3>& padding_right,                        \
-      typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
+      typename TTypes<T, 5, int>::Tensor out, TensorFormat format);   \
+  template <>                                                         \
+  void NHWCToNCHW<GPUDevice, T, 5>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
+      typename TTypes<T, 5>::Tensor out);                             \
+  template <>                                                         \
+  void NCHWToNHWC<GPUDevice, T, 5>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
+      typename TTypes<T, 5>::Tensor out);
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 
 }  // namespace functor
@@ -548,6 +557,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv3DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    Conv3DOp<GPUDevice, double>);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 972100ba77872eb54af75e6f62bda5ac0ecc1774..a0484e9235dd3235f8074bf956914772a0d8c84e 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -30,9 +30,11 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
 #include "tensorflow/core/kernels/image_resizer_state.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/util/mirror_pad_mode.h"
 #include "tensorflow/core/util/padding.h"
@@ -898,4 +900,477 @@ TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
 TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
 TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
 
+// Support for fusing computationally cheap, but memory bandwidth expensive
+// computations into the output of convolution to reduce the overall latency.
+//
+// Example: Fuse Conv2D+BiasAdd+Relu.
+
+namespace {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// Type aliases for the unaligned tensors (tensor maps) used in output kernels.
+template <typename T>
+struct OutputTypes {
+  // There is no guarantee that the output block passed to the output kernel
+  // will be aligned.
+
+  using Tensor =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>,
+                       Eigen::Unaligned>;
+
+  using ConstTensor = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>,
+      Eigen::Unaligned>;
+};
+
+// Type alias for the tensor contraction output mapper.
+template <typename Scalar, typename Index>
+using ContractionOutputMapper =
+    Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>;
+
+// Returns input expression without any transformations.
+struct Identity {
+  template <typename XprType>
+  static auto apply(XprType expr) -> XprType {
+    return expr;
+  };
+};
+
+// Applies `Relu` to the passed input expression.
+struct Relu {
+  template <typename XprType>
+  static auto apply(XprType expr)
+      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
+    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
+  };
+};
+
+// TensorContraction swaps lhs with rhs, and changes layout from RowMajor
+// (default in Tensorflow) to ColMajor (preferred in Eigen), and computes matmul
+// using these tensors.
+//
+// TensorContraction output matrix (before reshape) has a ColMajor layout, and
+// has dimensions:
+//  - rows: output_channels
+//  - cols: all other dimensions
+//
+// First element in every column is:
+//   [batch ??, height ??, width ??, out_channel = i]
+//
+// We do not know what are the values of the 'batch', 'height', and 'width' here
+// (if we know original dimensions, they can be computed from 'j').
+//
+// Each column of an output block is a continuous slice along the output channel
+// dimension, so we can use it to efficiently compute any transformation that
+// depends only on a channel value (e.g. add channel bias).
+
+// Output kernel that fuses BiasAdd operation into the output of tensor
+// contraction + any other transformation defined by Transform.
+template <typename T, typename Transform = Identity>
+struct BiasAddOutputKernel {
+  explicit BiasAddOutputKernel(const T* bias_data) : bias_data(bias_data) {}
+
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const ContractionOutputMapper<Scalar, Index>& output_mapper,
+      const Eigen::TensorContractionParams& params, Index i, Index j,
+      Index num_rows, Index num_cols) const {
+    DCHECK(params.swapped_arguments);
+
+    const T* bias_base = bias_data + i;
+    typename OutputTypes<T>::ConstTensor bias(bias_base, num_rows);
+
+    for (int col = 0; col < num_cols; ++col) {
+      T* output_base = &output_mapper(0, col);
+      typename OutputTypes<T>::Tensor output(output_base, num_rows);
+      const auto expr = output + bias;
+      output = Transform::template apply<decltype(expr)>(expr);
+    }
+  }
+
+ private:
+  const T* bias_data;
+};
+
+// Output kernel that fuses FusedBatchNorm operation into the output of tensor
+// contraction + any other transformation defined by Transform.
+template <typename T, typename Transform = Identity>
+struct FusedBatchNormOutputKernel {
+  FusedBatchNormOutputKernel(T epsilon, const T* scaling_factor_data,
+                             const T* offset_data, const T* estimated_mean_data)
+      : epsilon(epsilon),
+        scaling_factor_data(scaling_factor_data),
+        offset_data(offset_data),
+        estimated_mean_data(estimated_mean_data) {}
+
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const ContractionOutputMapper<Scalar, Index>& output_mapper,
+      const Eigen::TensorContractionParams& params, Index i, Index j,
+      Index num_rows, Index num_cols) const {
+    DCHECK(params.swapped_arguments);
+
+    const T* scaling_factor_base = scaling_factor_data + i;
+    const T* offset_base = offset_data + i;
+    const T* mean_base = estimated_mean_data + i;
+
+    typename OutputTypes<T>::ConstTensor scaling_factor(scaling_factor_base,
+                                                        num_rows);
+    typename OutputTypes<T>::ConstTensor offset(offset_base, num_rows);
+    typename OutputTypes<T>::ConstTensor mean(mean_base, num_rows);
+
+    for (int col = 0; col < num_cols; ++col) {
+      T* output_base = &output_mapper(0, col);
+      typename OutputTypes<T>::Tensor output(output_base, num_rows);
+
+      auto scaled = (output - mean) * scaling_factor;
+      auto shifted = scaled + offset;
+
+      output = Transform::template apply<decltype(shifted)>(shifted);
+    }
+  }
+
+ private:
+  T epsilon;
+  const T* scaling_factor_data;
+  const T* offset_data;
+  const T* estimated_mean_data;
+};
+
+// Type aliases for the output kernels, purely for the sake of better launch
+// dispatching code readability.
+template <typename T>
+using WithBiasAdd = BiasAddOutputKernel<T>;
+template <typename T>
+using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
+template <typename T>
+using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
+template <typename T>
+using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
+
+// Dispatch 2D convolution to the appropriate primitive operation:
+//   (1) MatMul for the case of 1x1 convolution.
+//   (2) MatMul for the case when filter size equals to the input size.
+//   (3) General spatial 2D convolution for all other cases.
+template <typename T>
+class LaunchConv2DWithOutputKernel {
+ public:
+  LaunchConv2DWithOutputKernel(int row_stride, int col_stride,      //
+                               int row_dilation, int col_dilation,  //
+                               Padding padding)
+      : row_stride_(row_stride),
+        col_stride_(col_stride),
+        row_dilation_(row_dilation),
+        col_dilation_(col_dilation),
+        padding_(padding) {}
+
+  template <typename OutputKernel>
+  void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx,
+                  const Tensor& input, const Tensor& filter, Tensor* output) {
+    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 &&
+        row_stride_ == 1 && col_stride_ == 1) {
+      int conv_width = 1;  // Width for the convolution step.
+      for (int i = 0; i < 3; ++i) {
+        conv_width *= output->dim_size(i);
+      }
+
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      functor::MatMulConvFunctor<CPUDevice, T, OutputKernel>()(
+          ctx->eigen_device<CPUDevice>(),
+          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
+          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
+          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
+          dim_pair, output_kernel);
+
+    } else if (filter.dim_size(0) == input.dim_size(1) &&
+               filter.dim_size(1) == input.dim_size(2) && row_dilation_ == 1 &&
+               col_dilation_ == 1 && padding_ == VALID) {
+      // If the input data and filter have the same height/width,
+      // reduce the 2D convolution to matrix multiplication.
+      const auto k =  // Length of reduction dimension.
+          filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2);
+
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      functor::MatMulConvFunctor<CPUDevice, T, OutputKernel>()(
+          ctx->eigen_device<CPUDevice>(),
+          output->shaped<T, 2>({input.dim_size(0), filter.dim_size(3)}),
+          input.shaped<T, 2>({input.dim_size(0), k}),
+          filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair,
+          output_kernel);
+
+    } else {
+      functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
+          ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
+          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_, col_stride_,
+          row_dilation_, col_dilation_, BrainPadding2EigenPadding(padding_),
+          output_kernel);
+    }
+  }
+
+ private:
+  int row_stride_;
+  int col_stride_;
+  int row_dilation_;
+  int col_dilation_;
+  const Padding padding_;
+};
+
+}  // namespace
+
+// Conv2D op with fused output kernels. Supports only CPUDevice.
+template <typename T>
+class FusedConv2DOp : public OpKernel {
+ public:
+  explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
+
+    // 'fused_ops' and 'num_args' attributes are specified by the Grappler
+    // Remapper optimizer.
+
+    std::vector<string> fused_ops;
+    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
+    OP_REQUIRES(context, !fused_ops.empty(),
+                errors::InvalidArgument(
+                    "Fused Conv2D must have at least one fused op."));
+
+    int num_args;
+    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
+
+    // TODO(ezhulenev): Add support for fusion element-wise op chains defined
+    // at runtime, e.g. Relu+Sqrt+Tanh+etc...
+
+    // Match combination of fused ops to one of the supported fusions.
+    if (FusedOpsMatches(fused_ops, {"BiasAdd"})) {
+      fused_computation_ = FusedComputationType::kBiasAdd;
+    } else if (FusedOpsMatches(fused_ops, {"BiasAdd", "Relu"})) {
+      fused_computation_ = FusedComputationType::kBiasAddWithRelu;
+    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm"})) {
+      fused_computation_ = FusedComputationType::kFusedBatchNorm;
+    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm", "Relu"})) {
+      fused_computation_ = FusedComputationType::kFusedBatchNormWithRelu;
+    } else {
+      OP_REQUIRES(context, false,
+                  errors::Unimplemented("Fusion is not implemented: [",
+                                        str_util::Join(fused_ops, ","), "]"));
+    }
+
+    // Depending on a picked fusion type validate fusion-specific arguments.
+
+    if (fused_computation_ == FusedComputationType::kBiasAdd ||
+        fused_computation_ == FusedComputationType::kBiasAddWithRelu) {
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    }
+
+    if (fused_computation_ == FusedComputationType::kFusedBatchNorm ||
+        fused_computation_ == FusedComputationType::kFusedBatchNormWithRelu) {
+      OP_REQUIRES(
+          context, num_args == 4,
+          errors::InvalidArgument("Fused FusedBatchNorm must have four extra "
+                                  "arguments: scale, offset, mean, variance."));
+      OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_));
+    }
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is of the following dimensions:
+    // [ batch, in_rows, in_cols, in_depth ]
+    const Tensor& input = context->input(0);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth]
+    const Tensor& filter = context->input(1);
+
+    Conv2DDimensions dimensions;
+    OP_REQUIRES_OK(context,
+                   ComputeConv2DDimension(params_, input, filter, &dimensions));
+
+    TensorShape out_shape = ShapeFromFormat(
+        params_.data_format, dimensions.batch, dimensions.out_rows,
+        dimensions.out_cols, dimensions.out_depth);
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "FusedConv2DWithBias: in_depth = " << dimensions.in_depth
+            << ", patch_depth = " << dimensions.patch_depth
+            << ", input_cols = " << dimensions.input_cols
+            << ", filter_cols = " << dimensions.filter_cols
+            << ", input_rows = " << dimensions.input_rows
+            << ", filter_rows = " << dimensions.filter_rows
+            << ", stride_rows = " << dimensions.stride_rows
+            << ", stride_cols = " << dimensions.stride_cols
+            << ", dilation_rows = " << dimensions.dilation_rows
+            << ", dilation_cols = " << dimensions.dilation_cols
+            << ", out_depth = " << dimensions.out_depth;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+
+    OP_REQUIRES(context, params_.data_format == FORMAT_NHWC,
+                errors::Unimplemented("Fused conv implementation only supports "
+                                      "NHWC tensor format for now."));
+    OP_REQUIRES(context, dimensions.in_depth == filter.dim_size(2),
+                errors::Unimplemented("Fused conv implementation does not "
+                                      "support grouped convolutions for now."));
+
+    BiasAddArgs bias_add;
+    FusedBatchNormArgs fused_batch_norm;
+
+    LaunchConv2DWithOutputKernel<T> conv2d(
+        dimensions.stride_rows, dimensions.stride_cols,
+        dimensions.dilation_rows, dimensions.dilation_cols, params_.padding);
+
+    switch (fused_computation_) {
+      case FusedComputationType::kBiasAdd:
+        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
+        conv2d(WithBiasAdd<T>(bias_add.bias_add_data), context, input, filter,
+               output);
+        break;
+
+      case FusedComputationType::kBiasAddWithRelu:
+        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
+        conv2d(WithBiasAddAndRelu<T>(bias_add.bias_add_data), context, input,
+               filter, output);
+        break;
+
+      case FusedComputationType::kFusedBatchNorm:
+        OP_REQUIRES_OK(context,
+                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+        conv2d(WithFusedBatchNorm<T>(epsilon_,
+                                     fused_batch_norm.scaling_factor.data(),
+                                     fused_batch_norm.offset_data,
+                                     fused_batch_norm.estimated_mean_data),
+               context, input, filter, output);
+        break;
+
+      case FusedComputationType::kFusedBatchNormWithRelu:
+        OP_REQUIRES_OK(context,
+                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+        conv2d(WithFusedBatchNormAndRelu<T>(
+                   epsilon_, fused_batch_norm.scaling_factor.data(),
+                   fused_batch_norm.offset_data,
+                   fused_batch_norm.estimated_mean_data),
+               context, input, filter, output);
+        break;
+    }
+  }
+
+ private:
+  bool FusedOpsMatches(const std::vector<string>& fused_ops,
+                       const std::vector<string>& expected) const {
+    return fused_ops == expected;
+  }
+
+  struct BiasAddArgs {
+    const T* bias_add_data = nullptr;
+  };
+
+  struct FusedBatchNormArgs {
+    const T* scale_data = nullptr;
+    const T* offset_data = nullptr;
+    const T* estimated_mean_data = nullptr;
+    const T* estimated_variance_data = nullptr;
+
+    // Precomputed expression:
+    //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
+    Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;
+  };
+
+#define TF_REQUIRES(EXP, STATUS) \
+  if (!TF_PREDICT_TRUE(EXP)) return (STATUS)
+
+  void InitDataPtr(const Tensor& tensor, const T** ptr) const {
+    *ptr = reinterpret_cast<const T*>(tensor.tensor_data().data());
+  }
+
+  Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs* args) const {
+    // Bias of the following dimensions: [ output_depth ]
+    const Tensor& bias = context->input(2);
+
+    TF_REQUIRES(bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional",
+                                        bias.shape().DebugString()));
+
+    InitDataPtr(bias, &args->bias_add_data);
+
+    return Status::OK();
+  }
+
+  Status InitFusedBatchNormArgs(OpKernelContext* context,
+                                FusedBatchNormArgs* args) const {
+    const Tensor& scale = context->input(2);
+    const Tensor& offset = context->input(3);
+    const Tensor& estimated_mean = context->input(4);
+    const Tensor& estimated_variance = context->input(5);
+
+    TF_REQUIRES(scale.dims() == 1,
+                errors::InvalidArgument("scale must be 1-dimensional",
+                                        scale.shape().DebugString()));
+    TF_REQUIRES(offset.dims() == 1,
+                errors::InvalidArgument("offset must be 1-dimensional",
+                                        offset.shape().DebugString()));
+    TF_REQUIRES(estimated_mean.dims() == 1,
+                errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                        estimated_mean.shape().DebugString()));
+    TF_REQUIRES(
+        estimated_variance.dims() == 1,
+        errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                estimated_variance.shape().DebugString()));
+
+    InitDataPtr(scale, &args->scale_data);
+    InitDataPtr(offset, &args->offset_data);
+    InitDataPtr(estimated_mean, &args->estimated_mean_data);
+    InitDataPtr(estimated_variance, &args->estimated_variance_data);
+
+    // Precompute scaling factor once for all output blocks (kernels).
+    args->scaling_factor =
+        (estimated_variance.flat<T>() + static_cast<T>(epsilon_)).rsqrt() *
+        scale.flat<T>();
+
+    return Status::OK();
+  }
+
+#undef TF_REQUIRES
+
+  // Element-wise ops applied to the result of Conv2D.
+  // TODO(ezhulenev): Add support for runtime-defined op chains.
+  enum class FusedComputationType {
+    kBiasAdd,
+    kBiasAddWithRelu,
+    kFusedBatchNorm,
+    kFusedBatchNormWithRelu
+  };
+
+  Conv2DParameters params_;
+  FusedComputationType fused_computation_;
+
+  // FusedBatchNorm attributes.
+  float epsilon_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
+};
+
+#define REGISTER_FUSED_CONV2D(T)                                      \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      FusedConv2DOp<T>);
+
+// If we're using the alternative GEMM-based implementation of Conv2D for the
+// CPU implementation, don't register this EigenTensor-based version.
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
+TF_CALL_float(REGISTER_FUSED_CONV2D);
+TF_CALL_double(REGISTER_FUSED_CONV2D);
+#endif  // !USE_GEMM_FOR_CONV
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 1236f27051898e88f580a139f1d6cbf95dd0411b..87bbc30573d0a91b7b49c585f24caf3e88e3a2bf 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+#include <vector>
+
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/ops/nn_ops.h"
@@ -522,4 +525,900 @@ TEST_F(ConvOpTest, HandwrittenConv) { HandwrittenConv(); }
 
 TEST_F(ConvOpTest, AnisotropicStride) { AnisotropicStrides(); }
 
+template <typename T>
+class FusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 32;
+  static constexpr int kImageHeight = 32;
+  static constexpr int kImageBatchCount = 8;
+
+  using BiasAddGraphRunner =
+      std::function<void(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* out)>;
+
+  using BatchNormGraphRunner = std::function<void(
+      const Tensor& input_data, const Tensor& filter_data,
+      const Tensor& scale_data, const Tensor& offset_data,
+      const Tensor& mean_data, const Tensor& variance_data, Tensor* out)>;
+
+  // Runs a Tensorflow graph defined by the root scope, and fetches the result
+  // of 'fetch' node into the output Tensor.
+  void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+                   Tensor* output) {
+    tensorflow::GraphDef graph;
+    TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+    std::unique_ptr<tensorflow::Session> session(
+        tensorflow::NewSession(tensorflow::SessionOptions()));
+    TF_ASSERT_OK(session->Create(graph));
+
+    std::vector<Tensor> unfused_tensors;
+    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
+
+    *output = unfused_tensors[0];
+  }
+
+  void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* output,
+                         int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunConv2DWithBiasAndRelu(const Tensor& input_data,
+                                const Tensor& filter_data,
+                                const Tensor& bias_data, Tensor* output,
+                                int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunConv2DWithBatchNorm(const Tensor& input_data,
+                              const Tensor& filter_data,
+                              const Tensor& scale_data,
+                              const Tensor& offset_data,
+                              const Tensor& mean_data,
+                              const Tensor& variance_data, Tensor* output,
+                              int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    ops::FusedBatchNorm::Attrs attr;
+    attr = attr.IsTraining(false);
+
+    auto with_fused_batch_norm = ops::FusedBatchNorm(
+        root.WithOpName("with_fused_batch_norm"), conv,
+        ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
+        ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
+        ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
+        ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
+        attr);
+
+    RunAndFetch(root, "with_fused_batch_norm", output);
+  }
+
+  void RunConv2DWithBatchNormAndRelu(const Tensor& input_data,
+                                     const Tensor& filter_data,
+                                     const Tensor& scale_data,
+                                     const Tensor& offset_data,
+                                     const Tensor& mean_data,
+                                     const Tensor& variance_data,
+                                     Tensor* output, int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    ops::FusedBatchNorm::Attrs attr;
+    attr = attr.IsTraining(false);
+
+    auto with_fused_batch_norm = ops::FusedBatchNorm(
+        root.WithOpName("with_fused_batch_norm"), conv,
+        ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
+        ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
+        ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
+        ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
+        attr);
+
+    auto with_relu =
+        ops::Relu(root.WithOpName("with_relu"), with_fused_batch_norm.y);
+
+    RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunFusedConv2DOp(const Tensor& image, const Tensor& filter,
+                        const std::vector<Tensor>& args,
+                        const std::vector<string>& fused_ops, Tensor* output,
+                        int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    int num_args = static_cast<int>(args.size());
+
+    TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_FusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Attr("num_args", num_args)
+                     .Input(FakeInput(num_args, dtype))
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "SAME")
+                     .Attr("fused_ops", fused_ops)
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    *output = *GetOutput(0);
+  }
+
+  void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
+                                int image_batch_count, int filter_size,
+                                int filter_count,
+                                const BiasAddGraphRunner& run_default,
+                                const BiasAddGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+
+    const int bias_size = filter_count;
+    Tensor bias(dtype, {bias_size});
+    bias.flat<T>() = bias.flat<T>().setRandom();
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, bias, &conv_2d);
+    run_fused(image, filter, bias, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-5);
+  }
+
+  void VerifyFusedBatchNormTensorsNear(int depth, int image_width,
+                                       int image_height, int image_batch_count,
+                                       int filter_size, int filter_count,
+                                       const BatchNormGraphRunner& run_default,
+                                       const BatchNormGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+
+    const int scale_size = filter_count;
+
+    Tensor scale(dtype, {scale_size});
+    scale.flat<T>() = scale.flat<T>().setRandom();
+
+    Tensor offset(dtype, {scale_size});
+    offset.flat<T>() = offset.flat<T>().setRandom();
+
+    Tensor mean(dtype, {scale_size});
+    mean.flat<T>() = mean.flat<T>().setRandom();
+
+    Tensor variance(dtype, {scale_size});
+    variance.flat<T>() = variance.flat<T>().setRandom();
+    variance.flat<T>() += variance.flat<T>().constant(static_cast<T>(0.5f));
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, scale, offset, mean, variance, &conv_2d);
+    run_fused(image, filter, scale, offset, mean, variance, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBias(int filter_size, int filter_count,
+                            int depth = kDepth, int image_width = kImageWidth,
+                            int image_height = kImageHeight,
+                            int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBias(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused = [this](const Tensor& input_data,
+                                                const Tensor& filter_data,
+                                                const Tensor& bias_data,
+                                                Tensor* out) {
+      RunFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"}, out);
+    };
+
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBiasAndRelu(int filter_size, int filter_count,
+                                   int depth = kDepth,
+                                   int image_width = kImageWidth,
+                                   int image_height = kImageHeight,
+                                   int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data, {bias_data},
+                           {"BiasAdd", "Relu"}, out);
+        };
+
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+FusedBatchNorm in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBatchNorm(int filter_size, int filter_count,
+                                 int depth = kDepth,
+                                 int image_width = kImageWidth,
+                                 int image_height = kImageHeight,
+                                 int image_batch_count = kImageBatchCount) {
+    const BatchNormGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunConv2DWithBatchNorm(input_data, filter_data, scale_data,
+                                 offset_data, mean_data, variance_data, out);
+        };
+
+    const BatchNormGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data,
+                           {scale_data, offset_data, mean_data, variance_data},
+                           {"FusedBatchNorm"}, out);
+        };
+
+    VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
+                                    image_batch_count, filter_size,
+                                    filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+FusedBatchNorm+Relu in a graph is identical
+  // to FusedConv2D.
+  void VerifyConv2DWithBatchNormAndRelu(
+      int filter_size, int filter_count, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
+      int image_batch_count = kImageBatchCount) {
+    const BatchNormGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunConv2DWithBatchNormAndRelu(input_data, filter_data, scale_data,
+                                        offset_data, mean_data, variance_data,
+                                        out);
+        };
+
+    const BatchNormGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data,
+                           {scale_data, offset_data, mean_data, variance_data},
+                           {"FusedBatchNorm", "Relu"}, out);
+        };
+
+    VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
+                                    image_batch_count, filter_size,
+                                    filter_count, run_default, run_fused);
+  }
+};
+
+// Conv2D with BatchNorm can be tested only with `T=float`, because default
+// `FusedBatchNorm` kernel supports only floats for scale, mean and variance.
+
+template <typename T>
+class FusedConv2DWithBiasOpTest : public FusedConv2DOpTest<T> {};
+template <typename T>
+class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};
+
+TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest);
+TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest);
+
+// -------------------------------------------------------------------------- //
+// Conv2D + BiasAdd + {Relu}                                                  //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolution) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolutionAndRelu) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+// -------------------------------------------------------------------------- //
+// Conv2D + FusedBatchNorm + {Relu}                                           //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolution) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolutionAndRelu) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest,    //
+                           OneByOneConvolution,          //
+                           ImageSizeConvolution,         //
+                           SpatialConvolution,           //
+                           OneByOneConvolutionAndRelu,   //
+                           ImageSizeConvolutionAndRelu,  //
+                           SpatialConvolutionAndRelu);
+
+REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest,  //
+                           OneByOneConvolution,             //
+                           ImageSizeConvolution,            //
+                           SpatialConvolution,              //
+                           OneByOneConvolutionAndRelu,      //
+                           ImageSizeConvolutionAndRelu,     //
+                           SpatialConvolutionAndRelu);
+
+using FusedBiasAddDataTypes = ::testing::Types<float, double>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBiasOpTest,
+                              FusedBiasAddDataTypes);
+
+using FusedBatchNormDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBatchNormOpTest,
+                              FusedBatchNormDataTypes);
+
+////////////////////////////////////////////////////////////////////////////////
+// Performance benchmarks for the FusedConv2DWithBiasOp.                      //
+////////////////////////////////////////////////////////////////////////////////
+
+struct Conv2DGraph {
+  Graph* graph;
+  Node* conv2d;
+};
+
+struct Conv2DWithBiasGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* bias;
+};
+
+struct Conv2DWithBiasAndReluGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* bias;
+  Node* relu;
+};
+
+struct Conv2DWithBatchNormGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* batch_norm;
+};
+
+struct Conv2DWithBatchNormAndReluGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* batch_norm;
+  Node* relu;
+};
+
+static Tensor MakeRandomTensor(const TensorShape& shape) {
+  Tensor tensor(DT_FLOAT, TensorShape(shape));
+  tensor.flat<float>() = tensor.flat<float>().setRandom();
+  return tensor;
+}
+
+// Creates a simple Tensorflow graph with single Conv2D node.
+static Conv2DGraph Conv2D(int batch, int height, int width, int in_depth,
+                          int filter_w, int filter_h, int out_depth) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+
+  Node* conv2d;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "Conv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Finalize(graph, &conv2d));
+
+  return {graph, conv2d};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by BiasAdd.
+static Conv2DWithBiasGraph Conv2DWithBias(int batch, int height, int width,
+                                          int in_depth, int filter_w,
+                                          int filter_h, int out_depth) {
+  Conv2DGraph conv_graph =
+      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+
+  Tensor bias_t = MakeRandomTensor({out_depth});
+  Node* bias = test::graph::Constant(graph, bias_t, "bias");
+
+  Node* out;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("bias"), "BiasAdd")
+                  .Input(conv2d)
+                  .Input(bias)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("data_format", "NHWC")
+                  .Finalize(graph, &out));
+
+  return {graph, conv2d, out};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by BiasAdd and Relu.
+static Conv2DWithBiasAndReluGraph Conv2DWithBiasAndRelu(int batch, int height,
+                                                        int width, int in_depth,
+                                                        int filter_w,
+                                                        int filter_h,
+                                                        int out_depth) {
+  Conv2DWithBiasGraph conv_graph = Conv2DWithBias(
+      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+  Node* bias = conv_graph.bias;
+
+  Node* relu;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("relu"), "Relu")
+                  .Input(bias)
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(graph, &relu));
+
+  return {graph, conv2d, bias, relu};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm.
+static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height,
+                                                    int width, int in_depth,
+                                                    int filter_w, int filter_h,
+                                                    int out_depth) {
+  Conv2DGraph conv_graph =
+      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+
+  Tensor scale_t = MakeRandomTensor({out_depth});
+  Tensor offset_t = MakeRandomTensor({out_depth});
+  Tensor mean_t = MakeRandomTensor({out_depth});
+  Tensor variance_t = MakeRandomTensor({out_depth});
+
+  Node* scale = test::graph::Constant(graph, scale_t, "scale");
+  Node* offset = test::graph::Constant(graph, offset_t, "offset");
+  Node* mean = test::graph::Constant(graph, mean_t, "mean");
+  Node* variance = test::graph::Constant(graph, variance_t, "variance");
+
+  Node* out;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("batch_norm"), "FusedBatchNorm")
+                  .Input(conv2d)
+                  .Input(scale)
+                  .Input(offset)
+                  .Input(mean)
+                  .Input(variance)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("is_training", false)
+                  .Finalize(graph, &out));
+
+  return {graph, conv2d, out};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm and
+// Relu.
+static Conv2DWithBatchNormAndReluGraph Conv2DWithBatchNormAndRelu(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth) {
+  Conv2DWithBatchNormGraph conv_graph = Conv2DWithBatchNorm(
+      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+  Node* batch_norm = conv_graph.batch_norm;
+
+  Node* relu;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("relu"), "Relu")
+                  .Input(batch_norm)
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(graph, &relu));
+
+  return {graph, conv2d, batch_norm, relu};
+}
+
+// Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and
+// fuses into it additional computations (e.g. Relu).
+static Graph* FusedConv2DWithBias(int batch, int height, int width,
+                                  int in_depth, int filter_w, int filter_h,
+                                  int out_depth,
+                                  const std::vector<string>& fused_ops = {}) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+  Tensor bias_t = MakeRandomTensor({out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+  Node* bias = test::graph::Constant(graph, bias_t, "bias");
+
+  std::vector<NodeBuilder::NodeOut> args = {bias};
+
+  Node* conv;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "_FusedConv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("num_args", 1)
+                  .Input(args)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Attr("fused_ops", fused_ops)
+                  .Finalize(graph, &conv));
+
+  return graph;
+}
+
+// Creates a tensorflow graph with a single FusedConv2D (with FusedBatchNorm)
+// node and fuses into it additional computations (e.g. Relu).
+static Graph* FusedConv2DWithBatchNorm(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth, const std::vector<string>& fused_ops = {}) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+  Tensor scale_t = MakeRandomTensor({out_depth});
+  Tensor offset_t = MakeRandomTensor({out_depth});
+  Tensor mean_t = MakeRandomTensor({out_depth});
+  Tensor variance_t = MakeRandomTensor({out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+  Node* scale = test::graph::Constant(graph, scale_t, "scale");
+  Node* offset = test::graph::Constant(graph, offset_t, "offset");
+  Node* mean = test::graph::Constant(graph, mean_t, "mean");
+  Node* variance = test::graph::Constant(graph, variance_t, "variance");
+
+  std::vector<NodeBuilder::NodeOut> args = {scale, offset, mean, variance};
+
+  Node* conv;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "_FusedConv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("num_args", 4)
+                  .Input(args)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Attr("fused_ops", fused_ops)
+                  .Finalize(graph, &conv));
+
+  return graph;
+}
+
+// Macro arguments names: --------------------------------------------------- //
+//    N: batch size
+//    H: height
+//    W: width
+//    C: channels
+//   FC: filter count
+//   FH: filter height
+//   FW: filter width
+
+#define BM_SETUP(N, H, W, C, type, LABEL, NAME)                               \
+  testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
+  testing::SetLabel(LABEL);
+
+#define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \
+  name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
+
+#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL)                       \
+  static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) {  \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
+    test::Benchmark(#type, Conv2D(N, H, W, C, FW, FH, FC).graph).Run(iters); \
+  }                                                                          \
+  BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                   \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                           \
+    test::Benchmark(#type, Conv2DWithBias(N, H, W, C, FW, FH, FC).graph) \
+        .Run(iters);                                                     \
+  }                                                                      \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                    \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                            \
+    test::Benchmark(#type,                                                \
+                    Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC).graph)  \
+        .Run(iters);                                                      \
+  }                                                                       \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type,                                                    \
+                    FusedConv2DWithBias(N, H, W, C, FW, FH, FC, {"BiasAdd"})) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                         \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
+    test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC,         \
+                                               {"BiasAdd", "Relu"}))           \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type, Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                         \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
+    test::Benchmark(#type,                                                     \
+                    Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC).graph)  \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                       \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
+    test::Benchmark(#type, FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,  \
+                                                    {"FusedBatchNorm"}))     \
+        .Run(iters);                                                         \
+  }                                                                          \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type,      \
+                                           LABEL)                             \
+  static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C,   \
+                      FW, FH, FC)(int iters) {                                \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type,                                                    \
+                    FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,          \
+                                             {"FusedBatchNorm", "Relu"}))     \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
+                    FH, FC));
+
+// -------------------------------------------------------------------------- //
+// Pixel CNN convolutions.
+// -------------------------------------------------------------------------- //
+
+// 1x1 Convolution: MatMulFunctor
+
+BM_Conv2D(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2D(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2D(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+// 1) BiasAdd {+ Relu}
+
+BM_Conv2DWithBias(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBias(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBias(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBias(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBias(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBias(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+// 2) FusedBatchNorm {+ Relu}
+
+BM_Conv2DWithBatchNorm(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNorm(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNorm(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBatchNormAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNormAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNormAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNorm(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNorm(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBatchNorm(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNormAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu,
+                                   "1x1 /b 16");
+BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu,
+                                   "1x1 /b 32");
+
+// -------------------------------------------------------------------------- //
+// 3x3 Convolution: SpatialConvolution
+// -------------------------------------------------------------------------- //
+
+BM_Conv2D(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2D(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2D(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+// 1) BiasAdd {+ Relu}
+
+BM_Conv2DWithBias(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBias(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBias(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBias(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBias(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_FusedConv2DWithBias(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+// 2) FusedBatchNorm {+ Relu}
+
+BM_Conv2DWithBatchNorm(8, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNorm(16, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNorm(32, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBatchNormAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBatchNorm(8, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNorm(16, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBatchNorm(32, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNormAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu,
+                                   "3x3 /b 16");
+BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu,
+                                   "3x3 /b 32");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 66ba827a9015fcf9875ed6c51cfc3c0e1cc7983d..3f7aa0dc39919e223e206b6a2328d379a0f828a5 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -656,7 +656,7 @@ template <typename T>
 struct erfc : base<T, Eigen::internal::scalar_erfc_op<T>> {};
 
 template <typename T>
-struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T>> {};
+struct sigmoid : base<T, Eigen::internal::scalar_logistic_op<T>> {};
 
 template <typename T>
 struct sin : base<T, Eigen::internal::scalar_sin_op<T>> {};
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 9993b4d180ca083091d1d505275e37df9ac61f3d..b7ccf5f70ec28f475b02e652987c3578048e9976 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -658,6 +658,7 @@ tf_kernel_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/optimizers/data",
+        "//tensorflow/core/grappler/optimizers/data:graph_utils",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index cbcae0588c6e28eea7ae6165be7704b1a6cf0754..41b04346ebdd20dedd00f0a9575e349dc6403e03 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -174,6 +174,11 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           batch_component_shape.AppendShape(first_element.shape());
           out_tensors->emplace_back(ctx->allocator({}), first_element.dtype(),
                                     batch_component_shape);
+          if (!out_tensors->back().IsInitialized()) {
+            return errors::ResourceExhausted(
+                "Failed to allocate memory for the batch of component ",
+                component_index);
+          }
           Tensor& batch_component = out_tensors->back();
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index d36eec0646b4e52e5655c367329678cd294ba435..64834e507f2d5bfb224693d8419b0c1070aace8a 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -79,12 +79,7 @@ class SimpleStepStatsCollector : public StepStatsCollectorInterface {
 
     bool TrackAllocations() const override { return false; }
 
-    void SetMemory(OpKernelContext* ctx) override {
-      // Returning `false` from `TrackAllocations()` should prevent
-      // `TrackingAllocator` objects from being constructed.
-      DCHECK_EQ(0, ctx->wrapped_allocators().size())
-          << "Allocations were tracked but should not have been requested.";
-    }
+    void SetMemory(OpKernelContext* ctx) override {}
 
     void SetOutput(int slot, const Tensor* tensor) override {}
 
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 5e54222c3ec2594416a5992c7bb936599dad09b7..1a18864ecf5619d6bb6c86bd9452202ee1db490f 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -147,6 +147,16 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "non_serializable_dataset_op",
+    srcs = ["non_serializable_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_kernel_library(
     name = "dataset_kernels",
     deps = [
@@ -156,6 +166,7 @@ tf_kernel_library(
         ":ignore_errors_dataset_op",
         ":indexed_dataset",
         ":lmdb_dataset_op",
+        ":non_serializable_dataset_op",
         ":numa_map_and_batch_dataset_op",
         ":prefetching_kernels",
         ":sleep_dataset_op",
diff --git a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..953e086de3786bcb101da9b8a15d5a19c0f8cc57
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <map>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+class NonSerializableDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit NonSerializableDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(ctx, input, output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::NonSerializable")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "NonSerializableDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(DebugString(), "::AsGraphDefInternal");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args), /*ratio=*/1);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* input_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalNonSerializableDataset").Device(DEVICE_CPU),
+    NonSerializableDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
index 677141a89d639ca98f224035f8840445acaf79a9..068f854023064a90720ca0e51a94fb994be2386c 100644
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -201,7 +201,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
         if (num_parallel_calls_->value == kAutoTune) {
-          num_parallel_calls_->value = port::NumSchedulableCPUs();
+          num_parallel_calls_->value = ctx->runner_threadpool_size();
           num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
@@ -244,7 +244,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return model::MakeAsyncKnownRatioNode(
             std::move(args), dataset()->batch_size_,
             {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                                  /*max=*/port::NumSchedulableCPUs())});
+                                  /*max=*/ctx->runner_threadpool_size())});
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 64715aee2e21bbf175ebcfcdfac02fc2dec77baf..7bd393f0f41917f31bad857364734cda60e3af71 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -47,6 +47,8 @@ class ThreadPoolResource : public ResourceBase {
     }
   }
 
+  int32 NumThreads() { return thread_pool_.NumThreads(); }
+
   string DebugString() override { return "ThreadPoolResource"; }
 
  private:
@@ -185,25 +187,15 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        return dataset()->input_->MakeIterator(
+            IteratorContext(CreateParams(ctx)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        ThreadPoolResource* pool = dataset()->threadpool_;
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = [pool](std::function<void()> c) {
-          pool->Schedule(std::move(c));
-        };
-        params.stats_aggregator = ctx->stats_aggregator();
-        params.lib = ctx->lib();
-        params.function_library = ctx->function_library();
-        params.allocator_getter = ctx->allocator_getter();
-        IteratorContext threadpool_ctx(params);
-        return input_impl_->GetNext(&threadpool_ctx, out_tensors,
-                                    end_of_sequence);
+        return input_impl_->GetNext(IteratorContext(CreateParams(ctx)),
+                                    out_tensors, end_of_sequence);
       }
 
      protected:
@@ -214,6 +206,16 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       }
 
      private:
+      IteratorContext::Params CreateParams(IteratorContext* ctx) {
+        ThreadPoolResource* pool = dataset()->threadpool_;
+        IteratorContext::Params params(ctx);
+        params.runner = [pool](std::function<void()> c) {
+          pool->Schedule(std::move(c));
+        };
+        params.runner_threadpool_size = pool->NumThreads();
+        return params;
+      }
+
       std::unique_ptr<IteratorBase> input_impl_;
     };
 
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index d858b4d698da2b59073a3fdba96e73578815c1dc..9b42981ed75aff0ac49f813343a23e6f22c101bd 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -247,16 +247,6 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
             &current_element_iterator_);
       }
 
-      Status BuildCurrentElementIteratorLocked(OpKernelContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.lib = ctx->function_library();
-        IteratorContext iter_ctx(std::move(params));
-        return BuildCurrentElementIteratorLocked(&iter_ctx);
-      }
-
       mutex mu_;
       size_t element_index_ GUARDED_BY(mu_) = 0;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 459dc28ee46fa5b2c660a9bb55139e7e40d97b3c..93999dc095b421bccb8df425f705ca48819d3ec6 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
@@ -33,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -65,11 +67,16 @@ class IteratorResource : public ResourceBase {
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
+    IteratorContext::Params params(ctx);
+    std::shared_ptr<IteratorBase> captured_iterator;
+    {
+      tf_shared_lock l(mu_);
+      captured_iterator = iterator_;
+      params.lib = lib_;
+    }
     if (captured_iterator) {
-      CHECK_NOTNULL(lib_);
-      ctx->set_lib(lib_);
-      return captured_iterator->GetNext(ctx, out_tensors, end_of_sequence);
+      return captured_iterator->GetNext(IteratorContext(std::move(params)),
+                                        out_tensors, end_of_sequence);
     } else {
       return errors::FailedPrecondition(
           "GetNext() failed because the iterator has not been initialized. "
@@ -78,8 +85,17 @@ class IteratorResource : public ResourceBase {
     }
   }
 
+  Status GetNext(IteratorContext&& ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) {
+    return GetNext(&ctx, out_tensors, end_of_sequence);
+  }
+
   Status Save(SerializationContext* ctx, IteratorStateWriter* writer) {
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
+    std::shared_ptr<IteratorBase> captured_iterator;
+    {
+      tf_shared_lock l(mu_);
+      captured_iterator = iterator_;
+    }
     if (captured_iterator) {
       return captured_iterator->Save(ctx, writer);
     } else {
@@ -124,54 +140,83 @@ class IteratorResource : public ResourceBase {
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
     std::unique_ptr<IteratorBase> iterator;
-    IteratorContext iter_ctx(ctx);
-    iter_ctx.set_lib(lib);
+    {
+      IteratorContext::Params params(ctx);
+      params.lib = lib;
+      TF_RETURN_IF_ERROR(dataset->MakeIterator(
+          IteratorContext(std::move(params)), "Iterator", &iterator));
+    }
     TF_RETURN_IF_ERROR(
-        dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
-    TF_RETURN_IF_ERROR(set_iterator(std::move(iterator)));
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
+        VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
 
-    if (captured_iterator) {
-      IteratorContext::Params params;
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
+    {
+      IteratorContext::Params params(ctx);
       params.lib = lib;
       DeviceBase* device = lib->device();
       params.allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
       };
       IteratorContext iter_ctx(std::move(params));
+      TF_RETURN_IF_ERROR(iterator->Restore(&iter_ctx, reader));
+    }
 
-      TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader));
+    std::shared_ptr<IteratorBase> old_iterator;
+    {
       mutex_lock l(mu_);
-      device_mgr_ = std::move(device_mgr);
-      lib_def_ = std::move(flib_def);
-      pflr_ = std::move(pflr);
+      std::swap(device_mgr_, device_mgr);
+      std::swap(flib_def_, flib_def);
+      std::swap(pflr_, pflr);
       lib_ = lib;
-      return Status::OK();
-    } else {
-      return errors::FailedPrecondition(
-          "Failed to restore iterator. Make sure the checkpoint ",
-          "is not corrupt. If the checkpoint does not contain the GraphDef, ",
-          "you will need to initialize your iterator before restoring.");
+      old_iterator = iterator_;
+      iterator_ = std::move(iterator);
     }
-  }
 
-  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
-    tf_shared_lock l(mu_);
-    return lib_def_;
+    return Status::OK();
   }
 
-  FunctionLibraryRuntime* function_library_runtime() { return lib_; }
+  Status AddLibrary(const FunctionLibraryDefinition& flib_def) {
+    mutex_lock l(mu_);
+    return flib_def_->AddLibrary(flib_def);
+  }
 
-  // Transfers ownership of iterator to this. This method is thread-safe.
-  Status set_iterator(std::unique_ptr<IteratorBase> iterator) {
-    if (iterator) {
-      TF_RETURN_IF_ERROR(
-          VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+  Status SetIteratorFromDataset(OpKernelContext* ctx, DatasetBase* dataset) {
+    mutex_lock l(mu_);
+    // Ensure that the iterator has access to all functions in the current
+    // subgraph, because some functions may have been defined after the resource
+    // was initially created.
+    Status s = flib_def_->AddLibrary(
+        *ctx->function_library()->GetFunctionLibraryDefinition());
+
+    if (!s.ok()) {
+      // Adding functions to `flib_def_` may fail, if there are clashes between
+      // the function names in (e.g.) a restored graph and the currently
+      // executing graph. In that case, we create a new function runtime for
+      // this iterator, based on the current `OpKernelContext`, which will have
+      // the functions we need.
+      iterator_.reset();
+      FunctionLibraryRuntime* lib;
+      std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+      std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
       TF_RETURN_IF_ERROR(
-          VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+          ctx->function_library()->Clone(&flib_def, &pflr, &lib));
+      std::swap(device_mgr_, device_mgr);
+      std::swap(flib_def_, flib_def);
+      std::swap(pflr_, pflr);
+      lib_ = lib;
     }
+
+    std::unique_ptr<IteratorBase> iterator;
+    IteratorContext::Params params(ctx);
+    params.lib = lib_;
+    TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
+                                             "Iterator", &iterator));
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
     iterator_.reset(iterator.release());
     return Status::OK();
   }
@@ -185,16 +230,12 @@ class IteratorResource : public ResourceBase {
   }
 
  private:
-  // The following (device_mgr_, flib_def_, pflr_) are only used when the
-  // IteratorResource is shared between sessions and in that case we create
-  // a new FLR. Otherwise these are set to null.
-  std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
-  std::shared_ptr<IteratorBase> iterator_;
   mutex mu_;
-  std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
+  std::unique_ptr<DeviceMgr> device_mgr_ GUARDED_BY(mu_);
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_ GUARDED_BY(mu_);
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ GUARDED_BY(mu_);
+  FunctionLibraryRuntime* lib_ GUARDED_BY(mu_) = nullptr;  // not owned.
+  std::shared_ptr<IteratorBase> iterator_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
 };
@@ -580,13 +621,7 @@ void MakeIteratorOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(
       ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
   core::ScopedUnref unref(iterator_resource);
-
-  std::unique_ptr<IteratorBase> iterator;
-  IteratorContext iter_ctx(ctx);
-  iter_ctx.set_lib(iterator_resource->function_library_runtime());
-  OP_REQUIRES_OK(
-      ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
-  OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
+  OP_REQUIRES_OK(ctx, iterator_resource->SetIteratorFromDataset(ctx, dataset));
 }
 
 namespace {
@@ -912,13 +947,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
     // factory function.
     DatasetBase* dataset;
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset));
-    std::unique_ptr<IteratorBase> iter;
-    IteratorContext iter_ctx(ctx);
-    iter_ctx.set_lib(lib);
-    TF_RETURN_IF_ERROR(
-        dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iter));
-    TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter)));
-
+    TF_RETURN_IF_ERROR((*iterator)->SetIteratorFromDataset(ctx, dataset));
     (*iterator)->Ref();
     return Status::OK();
   }
@@ -972,17 +1001,8 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         std::vector<Tensor> components;
         bool end_of_sequence = false;
 
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.function_library = iterator->function_library();
-        DeviceBase* device = ctx->function_library()->device();
-        params.allocator_getter = [device](AllocatorAttributes attrs) {
-          return device->GetAllocator(attrs);
-        };
-        IteratorContext iter_ctx(std::move(params));
-
-        Status s = iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+        Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                     &end_of_sequence);
         // NOTE(mrry): We must unref the iterator before calling `done()`, to
         // avoid destruction races.
         iterator->Unref();
@@ -1006,22 +1026,11 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
   IteratorResource* iterator;
   OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
   core::ScopedUnref unref_iterator(iterator);
-
   std::vector<Tensor> components;
   bool end_of_sequence = false;
 
-  IteratorContext::Params params;
-  params.env = ctx->env();
-  params.runner = *(ctx->runner());
-  params.function_library = iterator->function_library();
-  DeviceBase* device = ctx->function_library()->device();
-  params.allocator_getter = [device](AllocatorAttributes attrs) {
-    return device->GetAllocator(attrs);
-  };
-  IteratorContext iter_ctx(std::move(params));
-
-  OP_REQUIRES_OK(ctx,
-                 iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
+  OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(ctx), &components,
+                                        &end_of_sequence));
   OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
 
   for (int i = 0; i < components.size(); ++i) {
@@ -1054,18 +1063,8 @@ class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
           std::vector<Tensor> components;
           bool end_of_sequence = false;
 
-          IteratorContext::Params params;
-          params.env = ctx->env();
-          params.runner = *(ctx->runner());
-          params.function_library = iterator->function_library();
-          DeviceBase* device = ctx->function_library()->device();
-          params.allocator_getter = [device](AllocatorAttributes attrs) {
-            return device->GetAllocator(attrs);
-          };
-          IteratorContext iter_ctx(std::move(params));
-
-          Status s =
-              iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+          Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                       &end_of_sequence);
           // NOTE(mrry): We must unref the iterator before calling `done()`, to
           // avoid destruction races.
           iterator->Unref();
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 7790d133203207d2accb768ec01279aec50d973f..72a401e99b818d0357a3e52b153fb6c2d867197a 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -262,9 +263,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
         if (num_parallel_calls_->value == kAutoTune) {
-          // TODO(jsimsa): Surface the number of threads used by `ctx->runner()`
-          // and use it here for the default.
-          num_parallel_calls_->value = port::NumSchedulableCPUs();
+          num_parallel_calls_->value = ctx->runner_threadpool_size();
           num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
@@ -298,7 +297,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return model::MakeAsyncKnownRatioNode(
             std::move(args), dataset()->batch_size_,
             {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                                  /*max=*/port::NumSchedulableCPUs())});
+                                  /*max=*/ctx->runner_threadpool_size())});
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
@@ -414,32 +413,36 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         auto done = [this, ctx, result, return_values, offset](Status status) {
           result->UpdateStatus(status, offset);
           if (status.ok()) {
-            EnsureOutputAllocated(ctx, result, return_values);
-            for (size_t i = 0; i < return_values->size(); ++i) {
-              const Tensor& tensor = return_values->at(i);
-              Tensor* batch = &(result->output)[i];
-              if (tensor.NumElements() !=
-                  (batch->NumElements() / batch->dim_size(0))) {
-                TensorShape batch_shape = batch->shape();
-                batch_shape.RemoveDim(0);
-                result->UpdateStatus(
-                    errors::InvalidArgument(
-                        "Cannot add tensor to the batch: number of elements "
-                        "does "
-                        "not match. Shapes are: [tensor]: ",
-                        tensor.shape().DebugString(),
-                        ", [batch]: ", batch_shape.DebugString()),
-                    offset);
-                break;
-              }
-              // TODO(mrry): Add a version of DoParallelConcat that allows us to
-              // move `tensor` where possible, to speed up string tensor
-              // batching.
-              Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                  *dataset()->device_, tensor, offset, batch);
-              if (!copy_status.ok()) {
-                result->UpdateStatus(copy_status, offset);
-                break;
+            Status allocate_status =
+                EnsureOutputAllocated(ctx, result, return_values);
+            if (!allocate_status.ok()) {
+              result->UpdateStatus(allocate_status, offset);
+            } else {
+              for (size_t i = 0; i < return_values->size(); ++i) {
+                const Tensor& tensor = return_values->at(i);
+                Tensor* batch = &(result->output)[i];
+                if (tensor.NumElements() !=
+                    (batch->NumElements() / batch->dim_size(0))) {
+                  TensorShape batch_shape = batch->shape();
+                  batch_shape.RemoveDim(0);
+                  result->UpdateStatus(
+                      errors::InvalidArgument(
+                          "Cannot add tensor to the batch: number of elements "
+                          "does not match. Shapes are: [tensor]: ",
+                          tensor.shape().DebugString(),
+                          ", [batch]: ", batch_shape.DebugString()),
+                      offset);
+                  break;
+                }
+                // TODO(mrry): Add a version of DoParallelConcat that allows us
+                // to move `tensor` where possible, to speed up string tensor
+                // batching.
+                Status copy_status = ::tensorflow::functor::DoParallelConcat(
+                    *dataset()->device_, tensor, offset, batch);
+                if (!copy_status.ok()) {
+                  result->UpdateStatus(copy_status, offset);
+                  break;
+                }
               }
             }
             {
@@ -487,13 +490,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
-      void EnsureOutputAllocated(
+      Status EnsureOutputAllocated(
           const std::shared_ptr<IteratorContext>& ctx,
           const std::shared_ptr<BatchResult>& result,
           const std::shared_ptr<std::vector<Tensor>>& return_values) {
         mutex_lock l(result->mu);
         if (result->output_allocated) {
-          return;
+          return Status::OK();
         }
         const size_t num_components = return_values->size();
         for (size_t i = 0; i < num_components; ++i) {
@@ -504,8 +507,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           result->output.emplace_back(ctx->allocator(attr),
                                       return_values->at(i).dtype(),
                                       component_shape);
+          if (!result->output.back().IsInitialized()) {
+            return errors::ResourceExhausted(
+                "Failed to allocate memory for the batch of component ", i);
+          }
         }
         result->output_allocated = true;
+        return Status::OK();
       }
 
       Status ProcessResult(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index d64114e70e531c527e07ec1f38e3771ee171e8cd..ab20b832986874f8adba666e194af6a3470c1dbe 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -244,6 +244,11 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalMapDataset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("input_dataset")
+                            .HostMemory("handle"),
+                        MapDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 56a45b120492c0a19b37c336e1e3583d591395c4..dcd23095968493a9051fe918f6c79c527dad638e 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -86,19 +86,23 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status Initialize(IteratorContext* ctx) override {
-        IteratorContext ctx_with_model(CreateParams(ctx));
-        return dataset()->input_->MakeIterator(&ctx_with_model, prefix(),
-                                               &input_impl_);
+        IteratorContext::Params params(ctx);
+        params.model = model_;
+        return dataset()->input_->MakeIterator(
+            IteratorContext(std::move(params)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(EnsureOptimizeThreadStarted(ctx));
-        IteratorContext ctx_with_model(CreateParams(ctx));
-        return input_impl_->GetNext(&ctx_with_model, out_tensors,
-                                    end_of_sequence);
+        IteratorContext::Params params(ctx);
+        {
+          mutex_lock l(mu_);
+          TF_RETURN_IF_ERROR(EnsureOptimizeThreadStarted(ctx));
+          params.model = model_;
+        }
+        return input_impl_->GetNext(IteratorContext(std::move(params)),
+                                    out_tensors, end_of_sequence);
       }
 
      protected:
@@ -121,12 +125,6 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      IteratorContext::Params CreateParams(IteratorContext* ctx) {
-        IteratorContext::Params params = ctx->params();
-        params.model = model_;
-        return params;
-      }
-
      private:
       Status EnsureOptimizeThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
@@ -175,7 +173,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       std::shared_ptr<model::Model> model_;
       std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_;
     };
 
     const DatasetBase* input_;
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index 4d4f8c01640a53c4ce42b283826278d5edf06137..5268007e3d95286eaf3bdf19456c6b007e90f329 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -86,12 +86,18 @@ class MultiDeviceIterator : public ResourceBase {
   void GetNextFromShard(IteratorContext* ctx, int shard_num,
                         int64 incarnation_id,
                         MultiDeviceIteratorCallback callback) {
-    if (lib_ != nullptr) {
-      ctx->set_lib(lib_);
+    if (ctx->lib() == lib_) {
+      tf_shared_lock l(mu_);
+      multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id,
+                                             std::move(callback));
+    } else {
+      IteratorContext::Params params(ctx);
+      params.lib = lib_;
+      IteratorContext iter_ctx(std::move(params));
+      tf_shared_lock l(mu_);
+      multi_device_buffer_->GetNextFromShard(
+          &iter_ctx, shard_num, incarnation_id, std::move(callback));
     }
-    tf_shared_lock l(mu_);
-    multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id,
-                                           std::move(callback));
   }
 
   const DataTypeVector& output_types() const { return output_types_; }
@@ -455,8 +461,9 @@ class MultiDeviceIteratorInitOp : public OpKernel {
     core::ScopedUnref unref(resource);
 
     std::unique_ptr<IteratorBase> iterator;
-    IteratorContext iter_ctx(ctx);
-    iter_ctx.set_lib(resource->lib());
+    IteratorContext::Params params(ctx);
+    params.lib = resource->lib();
+    IteratorContext iter_ctx(std::move(params));
     OP_REQUIRES_OK(
         ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
     int64 incarnation_id;
@@ -496,16 +503,6 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
         ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
     background_worker_.Schedule(std::bind(
         [ctx, iterator, shard_num, incarnation_id](DoneCallback done) {
-          IteratorContext::Params params;
-          params.env = ctx->env();
-          params.runner = *(ctx->runner());
-          params.function_library = iterator->function_library();
-          DeviceBase* device = ctx->function_library()->device();
-          params.allocator_getter = [device](AllocatorAttributes attrs) {
-            return device->GetAllocator(attrs);
-          };
-          IteratorContext iter_ctx(std::move(params));
-
           MultiDeviceIteratorCallback callback = std::bind(
               [ctx](const HostBufferElement& elem, DoneCallback done) {
                 // iterator->Unref();
@@ -523,6 +520,9 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
               },
               std::placeholders::_1, std::move(done));
 
+          IteratorContext::Params params(ctx);
+          params.function_library = iterator->function_library();
+          IteratorContext iter_ctx(std::move(params));
           iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
                                      callback);
           iterator->Unref();
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index e0dd1b8929b48ead2e721cfbac2d577055536fc2..f90dcb95e3c3e20b0efc05e151270063b9a75ed3 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
@@ -94,9 +95,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       Node* input_node = nullptr;
       SerializationContext::Params params;
       std::vector<std::pair<string, Tensor>> input_list;
-      params.allow_stateful_functions = true;
       params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
       params.input_list = &input_list;
+      params.optimization_only = true;
       SerializationContext serialization_ctx(params);
       TF_RETURN_IF_ERROR(
           db.AddInputDataset(&serialization_ctx, input_, &input_node));
@@ -164,19 +165,19 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        IteratorContext::Params params = ctx->params();
+        IteratorContext::Params params(ctx);
         params.lib = dataset()->lib_;
         return dataset()->optimized_input_->MakeIterator(
-            IteratorContext(params), prefix(), &input_impl_);
+            IteratorContext(std::move(params)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        IteratorContext::Params params = ctx->params();
+        IteratorContext::Params params(ctx);
         params.lib = dataset()->lib_;
-        return input_impl_->GetNext(IteratorContext(params), out_tensors,
-                                    end_of_sequence);
+        return input_impl_->GetNext(IteratorContext(std::move(params)),
+                                    out_tensors, end_of_sequence);
       }
 
      protected:
@@ -203,11 +204,16 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
 
     Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
                               string* output_node) {
-      // Add a fake sink node to allow rewriting the actual sink node.
+      // Add an identity node as the fetch node, otherwise we might get
+      // 'placeholder is both fed and fetched' errors in some cases when using
+      // input list with placeholder dataset nodes.
       NodeDef* node = graph_def->mutable_node()->Add();
-      node->set_name("FakeSink");
-      node->set_op("SinkDataset");
+      tensorflow::grappler::graph_utils::SetUniqueGraphNodeName(
+          "Sink", graph_def, node);
+      node->set_op("Identity");
       node->add_input(*output_node);
+      (*node->mutable_attr())["T"].set_type(DT_VARIANT);
+      *output_node = node->name();
 
       // Create metagraph.
       MetaGraphDef meta_graph_def;
@@ -216,11 +222,13 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       // Grappler determines fetch ops from collection 'train_op'.
       CollectionDef collection_def;
       auto node_list = collection_def.mutable_node_list();
-      node_list->add_value("FakeSink");
+      node_list->add_value(*output_node);
       (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
 
       // Create Grappler item.
-      tensorflow::RewriterConfig rewriter_config;
+      tensorflow::ConfigProto config;
+      RewriterConfig& rewriter_config =
+          *config.mutable_graph_options()->mutable_rewrite_options();
       for (const string& optimization : optimizations_) {
         rewriter_config.add_optimizers(optimization);
       }
@@ -258,15 +266,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         }
       }
       TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-          *grappler_item, rewriter_config, ctx->device(), &cluster, graph_def));
-
-      // Set `output_node` to the input of the fake sink node.
-      {
-        grappler::GraphView graph(graph_def);
-        grappler::GraphView::InputPort input_port =
-            graph.GetInputPort("FakeSink", 0);
-        *output_node = graph.GetRegularFanin(input_port).node->name();
-      }
+          *grappler_item, config, ctx->device(), &cluster, graph_def));
 
       return Status::OK();
     }
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 1fa9a1fdc50361bd7384caef9932c16071768bbc..ec1c92384304d06332ba82f4315bd7286bcf99da 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -65,9 +65,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   Status Initialize(IteratorContext* ctx) override {
     mutex_lock l(*mu_);
     if (num_parallel_calls_->value == kAutoTune) {
-      // TODO(jsimsa): Surface the number of threads used by `ctx->runner()` and
-      // use it here for the default.
-      num_parallel_calls_->value = port::NumSchedulableCPUs();
+      num_parallel_calls_->value = ctx->runner_threadpool_size();
       num_parallel_calls_->tunable = true;
     }
     TF_RETURN_IF_ERROR(
@@ -103,7 +101,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
         std::move(args),
         /*ratio=*/1,
         {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                              /*max=*/port::NumSchedulableCPUs())});
+                              /*max=*/ctx->runner_threadpool_size())});
   }
 
   Status SaveInternal(IteratorStateWriter* writer) override {
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 1ad5b007751895f56e97da402dedeaf05fcec1a0..207e957e3747e4a03a7f91cc5502f92fb6953e1b 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -104,9 +104,8 @@ class RangeDatasetOp : public DatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        out_tensors->emplace_back(ctx->allocator({}), DT_INT64,
-                                  TensorShape({}));
-        out_tensors->back().scalar<int64>()() = next_;
+        out_tensors->reserve(1);
+        out_tensors->emplace_back(next_);
         *end_of_sequence = false;
         next_ += dataset()->step_;
 
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index ea97cf5ffdc0b2dba479982b49d825398bd91005..971fd2a43685197892ad0fb3cd37e3709cd144c1 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -271,6 +271,9 @@ REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU),
 class FixedLengthRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
+  explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx),
+        op_version_(ctx->def().op() == "FixedLengthRecordDataset" ? 1 : 2) {}
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
     const Tensor* filenames_tensor;
@@ -311,9 +314,17 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
     if (buffer_size == 0) {
       buffer_size = 256 << 10;  // 256 kB as default.
     }
-
+    string compression_type;
+    if (op_version_ > 1) {
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "compression_type",
+                                                      &compression_type));
+      OP_REQUIRES(ctx,
+                  compression_type.empty() || compression_type == "ZLIB" ||
+                      compression_type == "GZIP",
+                  errors::InvalidArgument("Unsupported compression_type."));
+    }
     *output = new Dataset(ctx, std::move(filenames), header_bytes, record_bytes,
-                          footer_bytes, buffer_size);
+                          footer_bytes, buffer_size, compression_type);
   }
 
  private:
@@ -321,18 +332,24 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
    public:
     explicit Dataset(OpKernelContext* ctx, std::vector<string> filenames,
                      int64 header_bytes, int64 record_bytes, int64 footer_bytes,
-                     int64 buffer_size)
+                     int64 buffer_size, const string& compression_type)
         : DatasetBase(DatasetContext(ctx)),
           filenames_(std::move(filenames)),
           header_bytes_(header_bytes),
           record_bytes_(record_bytes),
           footer_bytes_(footer_bytes),
-          buffer_size_(buffer_size) {}
+          buffer_size_(buffer_size),
+          compression_type_(compression_type) {}
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+      if (compression_type_.empty()) {
+        return std::unique_ptr<IteratorBase>(new UncompressedIterator(
+            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+      } else {
+        return std::unique_ptr<IteratorBase>(new CompressedIterator(
+            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+      }
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -359,22 +376,25 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       Node* record_bytes = nullptr;
       Node* footer_bytes = nullptr;
       Node* buffer_size = nullptr;
+      Node* compression_type = nullptr;
       TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
       TF_RETURN_IF_ERROR(b->AddScalar(header_bytes_, &header_bytes));
       TF_RETURN_IF_ERROR(b->AddScalar(record_bytes_, &record_bytes));
       TF_RETURN_IF_ERROR(b->AddScalar(footer_bytes_, &footer_bytes));
       TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {filenames, header_bytes, record_bytes, footer_bytes, buffer_size},
-          output));
+      TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this,
+                        {filenames, header_bytes, record_bytes, footer_bytes,
+                         buffer_size, compression_type},
+                        output));
       return Status::OK();
     }
 
    private:
-    class Iterator : public DatasetIterator<Dataset> {
+    class UncompressedIterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
+      explicit UncompressedIterator(const Params& params)
           : DatasetIterator<Dataset>(params) {}
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -391,9 +411,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
               // Produce the record as output.
-              out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
-                                        TensorShape({}));
-              out_tensors->back().scalar<string>()() = record;
+              Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+              record_tensor.scalar<string>()() = record;
+              out_tensors->emplace_back(std::move(record_tensor));
               *end_of_sequence = false;
               return Status::OK();
             }
@@ -440,11 +460,6 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       }
 
      protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeSourceNode(std::move(args));
-      }
-
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
@@ -497,16 +512,207 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       int64 file_pos_limit_ GUARDED_BY(mu_) = -1;
     };
 
+    class CompressedIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit CompressedIterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next record.
+          if (buffered_input_stream_) {
+            const int64 current_pos = buffered_input_stream_->Tell();
+            if (dataset()->compression_type_.empty()) {
+              DCHECK_GE(file_pos_limit_, 0);
+              if (current_pos < file_pos_limit_) {
+                string record;
+                TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
+                    dataset()->record_bytes_, &record));
+                // Produce the record as output.
+                Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+                record_tensor.scalar<string>()() = std::move(record);
+                out_tensors->emplace_back(std::move(record_tensor));
+                *end_of_sequence = false;
+                return Status::OK();
+              }
+            } else {
+              string record;
+              Status s = buffered_input_stream_->ReadNBytes(
+                  dataset()->record_bytes_, &record);
+              if (s.ok()) {
+                lookahead_cache_.append(record);
+                record = lookahead_cache_.substr(0, dataset()->record_bytes_);
+                lookahead_cache_ =
+                    lookahead_cache_.substr(dataset()->record_bytes_);
+                // Produce the record as output.
+                Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+                record_tensor.scalar<string>()() = std::move(record);
+                out_tensors->emplace_back(std::move(record_tensor));
+                *end_of_sequence = false;
+                return Status::OK();
+              }
+              if (errors::IsOutOfRange(s) && !record.empty()) {
+                uint64 body_size =
+                    current_pos + record.size() -
+                    (dataset()->header_bytes_ + dataset()->footer_bytes_);
+                return errors::DataLoss(
+                    "Excluding the header (", dataset()->header_bytes_,
+                    " bytes) and footer (", dataset()->footer_bytes_,
+                    " bytes), input file \"",
+                    dataset()->filenames_[current_file_index_],
+                    "\" has body length ", body_size,
+                    " bytes, which is not an exact multiple of the record "
+                    "length (",
+                    dataset()->record_bytes_, " bytes).");
+              }
+            }
+
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            buffered_input_stream_.reset();
+            file_.reset();
+            ++current_file_index_;
+          }
+
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          // Actually move on to next file.
+          if (dataset()->compression_type_.empty()) {
+            uint64 file_size;
+            TF_RETURN_IF_ERROR(ctx->env()->GetFileSize(
+                dataset()->filenames_[current_file_index_], &file_size));
+            file_pos_limit_ = file_size - dataset()->footer_bytes_;
+
+            uint64 body_size = file_size - (dataset()->header_bytes_ +
+                                            dataset()->footer_bytes_);
+
+            if (body_size % dataset()->record_bytes_ != 0) {
+              return errors::InvalidArgument(
+                  "Excluding the header (", dataset()->header_bytes_,
+                  " bytes) and footer (", dataset()->footer_bytes_,
+                  " bytes), input file \"",
+                  dataset()->filenames_[current_file_index_],
+                  "\" has body length ", body_size,
+                  " bytes, which is not an exact multiple of the record length "
+                  "(",
+                  dataset()->record_bytes_, " bytes).");
+            }
+          }
+          TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
+              dataset()->filenames_[current_file_index_], &file_));
+          if (!dataset()->compression_type_.empty()) {
+            const io::ZlibCompressionOptions zlib_options =
+                dataset()->compression_type_ == "ZLIB"
+                    ? io::ZlibCompressionOptions::DEFAULT()
+                    : io::ZlibCompressionOptions::GZIP();
+            file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
+            buffered_input_stream_.reset(new io::ZlibInputStream(
+                file_stream_.get(), dataset()->buffer_size_,
+                dataset()->buffer_size_, zlib_options));
+          } else {
+            buffered_input_stream_.reset(new io::BufferedInputStream(
+                file_.get(), dataset()->buffer_size_));
+          }
+          TF_RETURN_IF_ERROR(
+              buffered_input_stream_->SkipNBytes(dataset()->header_bytes_));
+          lookahead_cache_.clear();
+          if (!dataset()->compression_type_.empty()) {
+            TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
+                dataset()->footer_bytes_, &lookahead_cache_));
+          }
+        } while (true);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
+                                               current_file_index_));
+
+        // `buffered_input_stream_` is empty if
+        // 1. GetNext has not been called even once.
+        // 2. All files have been read and iterator has been exhausted.
+        int64 current_pos =
+            buffered_input_stream_ ? buffered_input_stream_->Tell() : -1;
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("current_pos"), current_pos));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        int64 current_file_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_file_index"),
+                                              &current_file_index));
+        current_file_index_ = size_t(current_file_index);
+        int64 current_pos;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_pos"), &current_pos));
+
+        // Seek to current_pos.
+        buffered_input_stream_.reset();
+        file_.reset();
+        if (current_pos >= 0) {  // There was an active buffered_input_stream_.
+          TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
+              dataset()->filenames_[current_file_index_], &file_));
+          const io::ZlibCompressionOptions zlib_options =
+              dataset()->compression_type_ == "ZLIB"
+                  ? io::ZlibCompressionOptions::DEFAULT()
+                  : io::ZlibCompressionOptions::GZIP();
+          file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
+          buffered_input_stream_.reset(new io::ZlibInputStream(
+              file_stream_.get(), dataset()->buffer_size_,
+              dataset()->buffer_size_, zlib_options));
+          lookahead_cache_.clear();
+          TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(
+              current_pos - dataset()->footer_bytes_));
+          TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
+              dataset()->footer_bytes_, &lookahead_cache_));
+        }
+
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<RandomAccessFile> file_
+          GUARDED_BY(mu_);  // must outlive buffered_input_stream_
+      std::unique_ptr<io::RandomAccessInputStream>
+          file_stream_;  // must outlive buffered_input_stream_
+      std::unique_ptr<io::InputStreamInterface> buffered_input_stream_
+          GUARDED_BY(mu_);
+      int64 file_pos_limit_ GUARDED_BY(mu_) = -1;
+      string lookahead_cache_ GUARDED_BY(mu_);
+    };
+
     const std::vector<string> filenames_;
     const int64 header_bytes_;
     const int64 record_bytes_;
     const int64 footer_bytes_;
     const int64 buffer_size_;
+    const string compression_type_;
   };
+  const int op_version_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU),
                         FixedLengthRecordDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDatasetV2").Device(DEVICE_CPU),
+                        FixedLengthRecordDatasetOp);
 
 class TFRecordDatasetOp : public DatasetOpKernel {
  public:
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
index 08d34eea878b840bf2a92150717f3b8150d576cf..a21b3fc16b7a93978bd2e03081aec9e7aa5e5ba4 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -163,19 +163,13 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
         mutex_lock l(mu_);
         StatsAggregatorResource* stats_aggregator_resource =
             dataset()->stats_aggregator_resource_;
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
+        IteratorContext::Params params(ctx);
         params.stats_aggregator = std::shared_ptr<StatsAggregator>(
             new StatsAggregatorWithTagAndPrefix(
                 stats_aggregator_resource->stats_aggregator(), dataset()->tag_,
                 dataset()->prefix_));
-        params.lib = ctx->lib();
-        params.function_library = ctx->function_library();
-        params.allocator_getter = ctx->allocator_getter();
-        IteratorContext set_stats_aggregator_ctx(params);
-        return input_impl_->GetNext(&set_stats_aggregator_ctx, out_tensors,
-                                    end_of_sequence);
+        IteratorContext iter_ctx(std::move(params));
+        return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence);
       }
 
      protected:
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index bae2d5afc3ec5bc44fb154351793f8094cb557a4..c7d374f489740a62b837690a4a80278212e98cce 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -69,10 +69,10 @@ class TensorDatasetOp : public DatasetOpKernel {
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
-        std::vector<std::pair<string, Tensor>>* input_list = ctx->input_list();
-        if (input_list) {
+        if (ctx->optimization_only()) {
           TF_RETURN_IF_ERROR(b->AddPlaceholder(t, &node));
-          input_list->emplace_back(node->name(), t);
+          DCHECK_NE(ctx->input_list(), nullptr);
+          ctx->input_list()->emplace_back(node->name(), t);
         } else {
           TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
         }
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index bb219847f91d8addec8e60cf45e45f51ce0b6b87..6291bfc110bafe028114b8f9ed010fdd2f97f1cd 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -92,10 +92,10 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
-        std::vector<std::pair<string, Tensor>>* input_list = ctx->input_list();
-        if (input_list) {
+        if (ctx->optimization_only()) {
           TF_RETURN_IF_ERROR(b->AddPlaceholder(t, &node));
-          input_list->emplace_back(node->name(), t);
+          DCHECK_NE(ctx->input_list(), nullptr);
+          ctx->input_list()->emplace_back(node->name(), t);
         } else {
           TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
         }
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
index ab8d8570336000cfb9d7719600e2fdd54cc077e9..b32ab8ba4faa7b762c950f7fa444456ecd0c76d1 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -151,7 +151,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
         // dimension. If it is statically known for any component, we model the
         // transformation using `KnownRatio`. Otherwise, we use `UnknownRatio`.
         for (auto& shape : dataset()->input_->output_shapes()) {
-          if (shape.dims() > 0 && shape.dim_size(0) != -1) {
+          if (shape.dims() > 0 && shape.dim_size(0) > 0) {
             return model::MakeKnownRatioNode(
                 std::move(args), 1.0 / static_cast<double>(shape.dim_size(0)));
           }
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 1aa8c72d667207cf7d24107da235c0006a6f03f7..873663988166252ea2a65be485143fa7ed87634a 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -434,10 +434,9 @@ struct TransformFilters {
         tile_spatial_size, base_filter_spatial_size, transform_matrix);
 
     auto shard = [&ctx, &args, &transform, &base_filter_rows, &base_filter_cols,
-                  &num_filters_transform, &in_depth, &out_depth,
-                  &filter_shards_row, &filter_shards_col, &tile_spatial_size,
-                  &filter_in, &transform_matrix,
-                  &filter_out](int64 start, int64 limit) {
+                  &num_filters_transform, &in_depth, &filter_shards_row,
+                  &filter_shards_col, &tile_spatial_size, &filter_in,
+                  &transform_matrix, &filter_out](int64 start, int64 limit) {
       // Allocate buffer for pre-processed filter:
       //   [base_filter_rows, base_filter_cols, num_filters_transform, in_depth]
       //
@@ -500,8 +499,9 @@ class GemmFilterPacker {
   typedef Eigen::internal::const_blas_data_mapper<T, int64, Eigen::RowMajor>
       LhsMapper;
   typedef Eigen::internal::gebp_traits<T, T> Traits;
-  Eigen::internal::gemm_pack_lhs<T, int64, LhsMapper, Traits::mr,
-                                 Traits::LhsProgress, Eigen::RowMajor>
+  Eigen::internal::gemm_pack_lhs<
+      T, int64, LhsMapper, Traits::mr, Traits::LhsProgress,
+      typename Traits::LhsPacket4Packing, Eigen::RowMajor>
       pack_lhs;
 
   GemmFilterPacker(const int64 rows, const int64 depth, const T* lhs_input,
@@ -532,9 +532,9 @@ struct PackFilters {
     const int64 out_depth = args.out_depth;
     const int64 num_filters = filter_shards_row * filter_shards_col * out_depth;
 
-    auto shard = [&ctx, &packed_filters, &filter_transform_data,
-                  &tile_spatial_size, &in_depth, &out_depth, &filter_shards_row,
-                  &filter_shards_col, &num_filters](int64 start, int64 limit) {
+    auto shard = [&ctx, &packed_filters, &filter_transform_data, &in_depth,
+                  &out_depth, &filter_shards_row, &filter_shards_col,
+                  &num_filters](int64 start, int64 limit) {
       const int64 filter_coord_stride = num_filters * in_depth;
       for (int64 i = start; i < limit; ++i) {
         // Allocate filter buffer [out_depth, shard_rows, shard_cols, in_depth].
@@ -1003,9 +1003,9 @@ struct DeepConv2D<CPUDevice, T> {
         out_tile_spatial_size, tile_spatial_size, output_transform_matrix);
 
     auto shard = [&ctx, &args, &transform, &packed_filters, &in_depth,
-                  out_depth, tile_rows, tile_cols, out_tile_rows, out_tile_cols,
-                  filter_shards_row, filter_shards_col, tile_spatial_size,
-                  &input, &tile_transform_matrix, &output_transform_matrix,
+                  out_depth, out_tile_rows, out_tile_cols, filter_shards_row,
+                  filter_shards_col, tile_spatial_size, &input,
+                  &tile_transform_matrix, &output_transform_matrix,
                   &output](int64 batch_start, int64 batch_limit) {
       const int64 row_tiles =
           (args.out_rows + out_tile_rows - 1) / out_tile_rows +
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 76afd6f18c23157d79375ff1340a0fb655ab6852..1398c87662575ff5d1752b4db03087bd7dabcb83 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -764,7 +764,7 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args,
   const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
                                       kKnownDepthMultiplier < 0
                                   ? std::numeric_limits<int>::max()
-                                  : device.getNumCudaMultiProcessors();
+                                  : device.getNumGpuMultiProcessors();
   kernel<<<std::min(max_block_count, config.block_count),
            config.thread_per_block, 0, device.stream()>>>(args, input, filter,
                                                           output, num_outputs);
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..92d29e39958e3cd30ee80776f2abb5c67f1a07e2
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -0,0 +1,234 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
+
+// Depending on a build configuration this header provides custom kernel for
+// Eigen tensor contractions (small matrix multiplication kernel used to
+// multiple together blocks of the original tensors).
+//
+// 1) --define tensorflow_mkldnn_contraction_kernel=1
+//    Use Mkldnn single threaded sgemm. The mkldnn kernels are generated at
+//    runtime and use avx/avx2/fma/avx512 based on cpu status registers
+//    (https://en.wikipedia.org/wiki/CPUID).
+//
+// If you use `tensor.contract(other_tensor)` in your code, you must include
+// this header to get the benefit of custom contraction kernel:
+//
+//   #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+//   #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+//   #endif
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/intel_mkl_dnn/include/mkldnn.h"
+
+namespace Eigen {
+namespace internal {
+
+// Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
+template <typename Scalar, typename IndexType, typename DataMapper,
+          int StorageOrder>
+struct mkldnn_gemm_pack;
+
+// mkl_gemm_pack for ColMajor storage order.
+template <typename Scalar, typename IndexType, typename DataMapper>
+struct mkldnn_gemm_pack<Scalar, IndexType, DataMapper,
+                        /*StorageOrder*/ ColMajor> {
+  typedef typename internal::packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  enum { PacketSize = internal::packet_traits<Scalar>::size };
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar* block, const DataMapper& data_mapper, IndexType rows,
+                  IndexType cols) {
+    const IndexType unrolled_rows =
+        (rows / (4 * PacketSize)) * (4 * PacketSize);
+    const IndexType vectorized_rows = (rows / PacketSize) * PacketSize;
+
+    for (IndexType col = 0; col < cols; ++col) {
+      LinearMapper lm = data_mapper.getLinearMapper(0, col);
+
+      // Give compiler a strong possibility to unroll the loop.
+      for (IndexType i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+        for (IndexType j = 0; j < 4; ++j) {
+          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+          internal::pstoreu(block + j * PacketSize, p);
+        }
+        block += 4 * PacketSize;
+      }
+
+      // Process remaining rows with packets.
+      for (IndexType i = unrolled_rows; i < vectorized_rows; i += PacketSize) {
+        const Packet p = lm.template loadPacket<Packet>(i);
+        internal::pstoreu(block, p);
+        block += PacketSize;
+      }
+
+      // Finalize with coefficients.
+      for (IndexType i = vectorized_rows; i < rows; ++i) {
+        *block = lm(i);
+        ++block;
+      }
+    }
+  }
+};
+
+template <typename Scalar, typename IndexType, typename OutputMapper,
+          bool ConjugateLhs = false, bool ConjugateRhs = false>
+struct mkldnn_gemm_kernel;
+
+// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
+template <typename IndexType, typename OutputMapper, bool ConjugateLhs,
+          bool ConjugateRhs>
+struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
+                          ConjugateLhs, ConjugateRhs> {
+  EIGEN_DONT_INLINE
+  void operator()(const OutputMapper& output, const float* blockA,
+                  const float* blockB, const IndexType rows,
+                  const IndexType depth, const IndexType cols, float alpha) {
+    static const int max_index = (std::numeric_limits<int>::max)();
+
+    eigen_assert(max_index >= rows);
+    eigen_assert(max_index >= cols);
+    eigen_assert(max_index >= depth);
+    eigen_assert(max_index >= output.stride());
+
+    const int m = static_cast<int>(rows);
+    const int n = static_cast<int>(cols);
+    const int k = static_cast<int>(depth);
+
+    const char transposeA = ConjugateLhs ? 'Y' : 'N';
+    const char transposeB = ConjugateRhs ? 'Y' : 'N';
+
+    const int ldA = ConjugateLhs ? k : m;
+    const int ldB = ConjugateRhs ? n : k;
+    const int ldC = static_cast<int>(output.stride());
+
+    const float beta = 1.0;
+
+    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
+                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
+                                      const_cast<float*>(output.data()), &ldC);
+    eigen_assert(st == 0);
+  }
+};
+
+// For mkldnn_sgemm having the right dimensions (especially for small matrices)
+// is more important than fitting all the working set in L1/L2 caches.
+// TODO(ezhulenev): Do better heuristics.
+template <typename StorageIndex, int sharding_type>
+class TensorContractionBlocking<float, float, float, StorageIndex,
+                                sharding_type> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  using Scalar = float;
+
+  // Adjust the block sizes to work well with mkldnn kernels.
+
+  // Multiply default choice of block size along M and N dimensions.
+  // TODO(ezhulenev): Explore if this can work in general (kScaleM=2.0 worked
+  // well in some of models).
+  static const float kScaleM = 1.5;
+  static const float kScaleN = 1.0;
+
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48.
+  static const StorageIndex kUnrollM = 48;
+
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8.
+  static const StorageIndex kUnrollN = 24;
+
+ public:
+  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
+                            StorageIndex num_threads = 1)
+      : kc_(k), mc_(m), nc_(n) {
+    // 1. Compute block sizes using default Eigen heuristics.
+    if (sharding_type == ShardByCol) {
+      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, mc_, nc_,
+                                                     num_threads);
+    } else {
+      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, nc_, mc_,
+                                                     num_threads);
+    }
+
+    // 2. And refine them to work well with mkldnn sgemm.
+    mc_ = (std::min)(
+        m, Eigen::divup(static_cast<StorageIndex>(mc_ * kScaleM), kUnrollM) *
+               kUnrollM);
+    nc_ = (std::min)(
+        n, Eigen::divup(static_cast<StorageIndex>(nc_ * kScaleN), kUnrollN) *
+               kUnrollN);
+
+    // We split Kth dimensions in roughly equal slices.
+    StorageIndex target_k_slices =
+        (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
+    StorageIndex packet_size = 8;
+    StorageIndex target_bk =
+        Eigen::divup(k / target_k_slices, packet_size) * packet_size;
+    kc_ = (std::min)(k, target_bk);
+  }
+
+  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+
+ private:
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
+};
+
+template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
+                               LhsMapper, RhsMapper> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  using Scalar = float;
+  using Traits = typename internal::gebp_traits<Scalar, Scalar>;
+
+  using LhsPacker = mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename LhsMapper::SubMapper, ColMajor>;
+  using RhsPacker = mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename RhsMapper::SubMapper, ColMajor>;
+  using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packLhs(
+      Scalar* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packRhs(
+      Scalar* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void invoke(
+      const OutputMapper& output_mapper, const Scalar* lhsBlock,
+      const Scalar* rhsBlock, const StorageIndex rows, const StorageIndex depth,
+      const StorageIndex cols, const Scalar alpha) {
+    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+  }
+};
+
+#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h
index 6a9a2accd8d807834930411d2cd1d1e0e9d3c55f..3182307e51e5fc2912ff7e178fbeab6c73d47d03 100644
--- a/tensorflow/core/kernels/eigen_cuboid_convolution.h
+++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -19,6 +19,10 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -51,11 +55,10 @@ namespace internal {
 //   col - index of the extracted patch (in code: patchIndex)
 //         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
 //
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar_,
-          typename Index, typename nocontract_t, typename contract_t, int Side,
-          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
-          int Alignment>
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar_, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionInputMapper<
     Scalar_, Index, Side,
     TensorEvaluator<const TensorReshapingOp<NewDimension,
@@ -332,13 +335,6 @@ class TensorContractionInputMapper<
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
  private:
   friend class TensorContractionSubMapper<
       Scalar, Index, Side,
@@ -681,11 +677,10 @@ class TensorContractionInputMapper<
   const TensorEvaluator<ArgType, Device> m_impl;
 };
 
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t, int Side,
-          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
-          int Alignment>
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionSubMapper<
     Scalar, Index, Side,
     TensorEvaluator<const TensorReshapingOp<NewDimension,
@@ -880,6 +875,12 @@ class TensorContractionSubMapper<
     const Index inputIndex = depth + baseIndex;
     return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
+                                            const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.coeff(inputIndex);
+  }
 
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padPlane(const Index plane) const {
@@ -948,7 +949,9 @@ class TensorContractionSubMapper<
   }
 
  private:
-  const ParentMapper& m_base_mapper;
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
+
   Index m_depth_offset;  // First row in the input matrix
   Index m_col_offset;    // First col in the input matrix
 
@@ -991,11 +994,14 @@ class TensorContractionSubMapper<
 // *) nr - number of registers along the 'n' dimension.
 //    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
 //    Multiplication" paper.
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t,
-          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
-          int Alignment, int nr>
+//
+// TODO(ezhulenev): Add support for squeezing reads along two innermost
+// dimensions (see eigen_spatial_convolutions).
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          int nr>
 struct gemm_pack_rhs<
     Scalar, Index,
     TensorContractionSubMapper<
@@ -1170,11 +1176,13 @@ struct gemm_pack_rhs<
 
 // Template specialization for packet_size = 2. We must special-case packet
 // blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
+//
+// TODO(ezhulenev): Add support for squeezing reads along two innermost
+// dimensions (see eigen_spatial_convolutions).
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
     Scalar, Index,
     TensorContractionSubMapper<
@@ -1351,11 +1359,10 @@ struct gemm_pack_rhs<
 };
 
 // Special case for non-vectorized types such as float16 (packet_size = 1).
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
     Scalar, Index,
     TensorContractionSubMapper<
@@ -1425,6 +1432,170 @@ struct gemm_pack_rhs<
   }
 };
 
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+// Arrange a block of the right input matrix (in our case it's always a "virtual
+// matrix" constructed from extracted volume patches) in contiguous memory.
+//
+// Mkldnn doesn't require Lhs/Rhs blocks to be packed in any specific format, so
+// this is basically the same as taking a slice of the matrix. Knowing
+// properties of the original patch op we can do it more efficient than default
+// mkldnn_gemm_pack.
+//
+// TODO(ezhulenev): mkldnn_gemm_pack for spatial convolutions supports squeezing
+// reads along the 2 innermost dimensions, add it here if needed.
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar,
+          typename StorageIndex, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
+          int Alignment>
+struct mkldnn_gemm_pack<
+    Scalar, StorageIndex,
+    TensorContractionSubMapper<
+        Scalar, StorageIndex, Rhs,
+        TensorEvaluator<const TensorReshapingOp<
+                            NewDimension, const TensorVolumePatchOp<
+                                              Planes, Rows, Cols, ArgType> >,
+                        Device>,
+        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+        inner_dim_reordered, Alignment>,
+    ColMajor> {
+  typedef TensorContractionSubMapper<
+      Scalar, StorageIndex, Rhs,
+      TensorEvaluator<const TensorReshapingOp<
+                          NewDimension, const TensorVolumePatchOp<
+                                            Planes, Rows, Cols, ArgType> >,
+                      Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows,
+                  StorageIndex cols) {
+    const bool standard_patches = !rhs.nonStandardPatches();
+
+    if (standard_patches && rhs.patchDepth() % packet_size == 0) {
+      packStandardPatches<true>(block, rhs, rows, cols);
+
+    } else if (standard_patches) {
+      packStandardPatches<false>(block, rhs, rows, cols);
+
+    } else {
+      // With non-standard patches we don't do any vectorized loads.
+      // TODO(ezhulenev): It doesn't look like that we should completely give up
+      // on packets. Make this code path faster!
+      for (StorageIndex col = 0; col < cols; ++col) {
+        SubMapper lm = rhs.getLinearMapper(0, col);
+        for (StorageIndex i = 0; i < rows; ++i) {
+          *block = lm(i);
+          ++block;
+        }
+      }
+    }
+  }
+
+ private:
+  // Pack standard volume patches:
+  //
+  // - patch_depth_is_multiple_of_packet_size=true: We are guaranteed to have
+  //   depth dimension size to be a multiple of packet size, so we can skip all
+  //   non vectorized loads and checks.
+  //
+  template <bool patch_depth_is_multiple_of_packet_size>
+  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
+                                               const DataMapper& rhs,
+                                               StorageIndex rows,
+                                               StorageIndex cols) {
+    eigen_assert(!rhs.nonStandardPatches());
+
+    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
+    const Index peeled_k = (rows / packet_size) * packet_size;
+
+    const Index start_col = rhs.colOffset();
+    const Index max_col = rhs.maxCol(peeled_k);
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      SubMapper lm = rhs.getLinearMapper(0, col);
+
+      Index k = 0;
+      for (Index c = start_col; c < max_col; ++c) {
+        eigen_assert(k <= peeled_k);
+
+        const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+        const Index max_row = rhs.maxRow(peeled_k, c);
+        const bool pad_col = lm.padCol(c);
+
+        for (Index r = start_row; r < max_row; ++r) {
+          eigen_assert(k <= peeled_k);
+
+          const Index start_plane =
+              ((c == start_col) && (r == start_row)) ? rhs.planeOffset() : 0;
+          const Index max_plane = rhs.maxPlane(peeled_k, c, r);
+          const bool pad_row = pad_col || lm.padRow(r);
+
+          for (Index p = start_plane; p < max_plane; ++p) {
+            eigen_assert(k <= peeled_k);
+
+            const Index start_depth =
+                ((c == start_col) && (r == start_row) && (p == start_plane))
+                    ? rhs.depthOffset()
+                    : 0;
+            const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+
+            const bool pad = pad_col || pad_row || lm.padPlane(p);
+            const Index base_idx = lm.baseIndex(p, r, c);
+
+            if (patch_depth_is_multiple_of_packet_size)
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+            // If patch depth is a multiple of packet size, it's guaranteed that
+            // we can process all values in depth dimension with packets.
+            const Index max_vectorized_depth =
+                patch_depth_is_multiple_of_packet_size
+                    ? max_depth
+                    : max_depth - packet_size;
+
+            Index d = start_depth;
+
+            // 1. Process depth dimension with vectorized instructions.
+            for (; d < max_vectorized_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              const Packet packet = pad ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, base_idx);
+              internal::pstoreu(block, packet);
+              block += packet_size;
+              k += packet_size;
+            }
+
+            // 2. Finish with coefficients.
+            if (!patch_depth_is_multiple_of_packet_size) {
+              for (; d < max_depth; d++) {
+                eigen_assert(k < peeled_k);
+                *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
+                ++block;
+                ++k;
+              }
+            }
+          }
+        }
+      }
+
+      // The loop above should fill peeled_k elements.
+      eigen_assert(peeled_k == k);
+
+      // Fill remaining elements using loadCoeffStandard.
+      for (; k < rows; ++k) {
+        *block = lm.loadCoeffStandard(k);
+        ++block;
+      }
+    }
+  }
+};
+#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
 }  // namespace internal
 
 /** CuboidConvolution
@@ -1476,9 +1647,8 @@ EIGEN_ALWAYS_INLINE static const typename internal::conditional<
                 const DSizes<typename internal::traits<Input>::Index, 2>,
                 const Kernel> > > >::type
 CuboidConvolution(const Input& input, const Kernel& kernel,
-                  const DenseIndex stridePlanes = 1,
-                  const DenseIndex strideRows = 1,
-                  const DenseIndex strideCols = 1,
+                  const Index stridePlanes = 1, const Index strideRows = 1,
+                  const Index strideCols = 1,
                   const PaddingType padding_type = PADDING_SAME) {
   typedef typename internal::traits<Input>::Index TensorIndex;
   TensorRef<Tensor<typename internal::traits<Input>::Scalar,
diff --git a/tensorflow/core/kernels/eigen_mkldnn.h b/tensorflow/core/kernels/eigen_mkldnn.h
deleted file mode 100644
index 5235431f5f36e0ef3787dd11d7fc1bf0596718ee..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/eigen_mkldnn.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_MKLDNN_H_
-#define TENSORFLOW_CORE_KERNELS_EIGEN_MKLDNN_H_
-
-// Support for Mkldnn sgemm kernel in Eigen/Tensor contractions:
-//
-// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
-//    DataMapper (see TensorContractionInputMapper).
-// 2. Invoke gemm kernel with packed blocks (replacement for default
-//    gebp_kernel).
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "third_party/intel_mkl_dnn/include/mkldnn.h"
-
-namespace Eigen {
-namespace internal {
-
-template <typename Scalar, typename IndexType, typename DataMapper,
-          int StorageOrder>
-struct mkldnn_gemm_pack;
-
-// mkl_gemm_pack for ColMajor storage order.
-template <typename Scalar, typename IndexType, typename DataMapper>
-struct mkldnn_gemm_pack<Scalar, IndexType, DataMapper,
-                        /*StorageOrder*/ ColMajor> {
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  typedef typename DataMapper::LinearMapper LinearMapper;
-
-  enum { PacketSize = internal::packet_traits<Scalar>::size };
-
-  EIGEN_DONT_INLINE
-  void operator()(Scalar *block, const DataMapper &data_mapper, IndexType rows,
-                  IndexType cols) {
-    const IndexType unrolled_rows =
-        (rows / (4 * PacketSize)) * (4 * PacketSize);
-    const IndexType vectorized_rows = (rows / PacketSize) * PacketSize;
-
-    for (IndexType col = 0; col < cols; ++col) {
-      LinearMapper lm = data_mapper.getLinearMapper(0, col);
-
-      // Give compiler a strong possibility to unroll the loop.
-      for (IndexType i = 0; i < unrolled_rows; i += 4 * PacketSize) {
-        for (IndexType j = 0; j < 4; ++j) {
-          const Packet p = lm.loadPacket(i + j * PacketSize);
-          internal::pstoreu(block + j * PacketSize, p);
-        }
-        block += 4 * PacketSize;
-      }
-
-      // Process remaining rows with packets.
-      for (IndexType i = unrolled_rows; i < vectorized_rows; i += PacketSize) {
-        const Packet p = lm.loadPacket(i);
-        internal::pstoreu(block, p);
-        block += PacketSize;
-      }
-
-      // Finalize with coefficients.
-      for (IndexType i = vectorized_rows; i < rows; ++i) {
-        *block = lm(i);
-        ++block;
-      }
-    }
-  }
-};
-
-template <typename Scalar, typename IndexType, typename OutputMapper,
-          bool ConjugateLhs = false, bool ConjugateRhs = false>
-struct mkldnn_gemm_kernel;
-
-// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
-template <typename IndexType, typename OutputMapper, bool ConjugateLhs,
-          bool ConjugateRhs>
-struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
-                          ConjugateLhs, ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const OutputMapper &output, const float *blockA,
-                  const float *blockB, const IndexType rows,
-                  const IndexType depth, const IndexType cols, float alpha) {
-    static const int max_index = (std::numeric_limits<int>::max)();
-
-    eigen_assert(max_index >= rows);
-    eigen_assert(max_index >= cols);
-    eigen_assert(max_index >= depth);
-    eigen_assert(max_index >= output.stride());
-
-    const int m = static_cast<int>(rows);
-    const int n = static_cast<int>(cols);
-    const int k = static_cast<int>(depth);
-
-    const char transposeA = ConjugateLhs ? 'Y' : 'N';
-    const char transposeB = ConjugateRhs ? 'Y' : 'N';
-
-    const int ldA = ConjugateLhs ? k : m;
-    const int ldB = ConjugateRhs ? n : k;
-    const int ldC = static_cast<int>(output.stride());
-
-    const float beta = 1.0;
-
-    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
-                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
-                                      const_cast<float *>(output.data()), &ldC);
-    eigen_assert(st == 0);
-  }
-};
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_MKLDNN_H_
diff --git a/tensorflow/core/kernels/eigen_mkldnn_test.cc b/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
similarity index 97%
rename from tensorflow/core/kernels/eigen_mkldnn_test.cc
rename to tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
index a1ca703d0ef0135dfe7c1aa5a2368a6699275fc0..da4a61d1bda1ea1171fdea5c9dffaab8aabd4429 100644
--- a/tensorflow/core/kernels/eigen_mkldnn_test.cc
+++ b/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/eigen_mkldnn.h"
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace Eigen {
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index e926d73f87c0bb936d068312e7973e4ff6513399..1f211b19b4ad982d2ab2a6520bc0e9277e99055a 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -18,6 +18,10 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -52,8 +56,8 @@ namespace internal {
 //
 // TODO(ezhulenev): Consolidate this part of the code with the image patch
 // extraction code since they are both very similar.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar_, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar_, typename Index,
           typename nocontract_t, typename contract_t, int Side, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionInputMapper<
@@ -264,13 +268,6 @@ class TensorContractionInputMapper<
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
  private:
   friend class TensorContractionSubMapper<
       Scalar, Index, Side,
@@ -511,8 +508,8 @@ class TensorContractionInputMapper<
   const TensorEvaluator<ArgType, Device> m_impl;
 };
 
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, int Side, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionSubMapper<
@@ -682,6 +679,12 @@ class TensorContractionSubMapper<
     const Index inputIndex = depth + baseIndex;
     return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
+                                            const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.coeff(inputIndex);
+  }
 
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
@@ -701,6 +704,15 @@ class TensorContractionSubMapper<
            c * m_base_mapper.m_colInputStride + m_otherIndex;
   }
 
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index rowStride() const {
+    return m_base_mapper.m_row_strides;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index colStride() const {
+    return m_base_mapper.m_col_strides;
+  }
+
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index rowOffset() const {
     const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
@@ -726,9 +738,11 @@ class TensorContractionSubMapper<
   }
 
  private:
-  const ParentMapper& m_base_mapper;  // that was a reference before
-  Index m_depth_offset;               // First row in the input matrix
-  Index m_col_offset;                 // First col in the input matrix
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
+
+  Index m_depth_offset;  // First row in the input matrix
+  Index m_col_offset;    // First col in the input matrix
 
   // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
   // indices for the first element in a patch specified by col_offset
@@ -768,8 +782,8 @@ class TensorContractionSubMapper<
 // *) nr - number of registers along the 'n' dimension.
 //    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
 //    Multiplication" paper.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
           int nr>
@@ -835,6 +849,55 @@ struct gemm_pack_rhs<
             const bool pad_col2 = dm2.padCol(c);
             const bool pad_col3 = dm3.padCol(c);
 
+            // We can squeeze reads along the `row` and `depth` dimensions if
+            // the row stride is `1`, which means that `row` and `depth`
+            // dimensions are contiguous (two innermost dimensions).
+            if (rhs.rowStride() == 1 &&                                //
+                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 4> kernel;
+                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
+                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel);
+                pstoreu(block + 0 * packet_size, kernel.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel.packet[1]);
+                pstoreu(block + 2 * packet_size, kernel.packet[2]);
+                pstoreu(block + 3 * packet_size, kernel.packet[3]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
             for (Index r = start_row; r < max_row; ++r) {
               eigen_assert(k <= peeled_k);
 
@@ -929,8 +992,8 @@ struct gemm_pack_rhs<
 
 // Template specialization for packet_size = 2. We must special-case packet
 // blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
           bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
@@ -996,6 +1059,56 @@ struct gemm_pack_rhs<
             const bool pad_col2 = dm2.padCol(c);
             const bool pad_col3 = dm3.padCol(c);
 
+            // We can squeeze reads along the `row` and `depth` dimensions if
+            // the row stride is `1`, which means that `row` and `depth`
+            // dimensions are contiguous (two innermost dimensions).
+            if (rhs.rowStride() == 1 &&                                //
+                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
             for (Index r = start_row; r < max_row; ++r) {
               eigen_assert(k <= peeled_k);
 
@@ -1095,8 +1208,8 @@ struct gemm_pack_rhs<
 };
 
 // Special case for non-vectorized types such as float16.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
           bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
@@ -1168,6 +1281,218 @@ struct gemm_pack_rhs<
   }
 };
 
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+// Arrange a block of the right input matrix (in our case it's always a
+// "virtual matrix" constructed from extracted image patches) in contiguous
+// memory.
+//
+// Mkldnn doesn't require Lhs/Rhs blocks to be packed in any specific format, so
+// this is basically the same as taking a slice of the matrix. Knowing
+// properties of the original patch op we can do it more efficient than default
+// mkldnn_gemm_pack.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename StorageIndex,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+struct mkldnn_gemm_pack<
+    Scalar, StorageIndex,
+    TensorContractionSubMapper<
+        Scalar, StorageIndex, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+        inner_dim_reordered, Alignment>,
+    ColMajor> {
+  typedef TensorContractionSubMapper<
+      Scalar, StorageIndex, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows,
+                  StorageIndex cols) {
+    const bool standard_patches = !rhs.nonStandardPatches();
+
+    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
+      if (rhs.rowStride() == 1) {
+        packStandardPatches<true, /*squeeze*/ true>(block, rhs, rows, cols);
+      } else {
+        packStandardPatches<true, /*squeeze*/ false>(block, rhs, rows, cols);
+      }
+
+    } else if (standard_patches) {
+      if (rhs.rowStride() == 1) {
+        packStandardPatches<false, /*squeeze*/ true>(block, rhs, rows, cols);
+      } else {
+        packStandardPatches<false, /*squeeze*/ false>(block, rhs, rows, cols);
+      }
+
+    } else {
+      // With non-standard patches we don't do any vectorized loads.
+      // TODO(ezhulenev): It doesn't look like that we should completely give up
+      // on packets. Make this code path faster!
+      for (StorageIndex col = 0; col < cols; ++col) {
+        SubMapper lm = rhs.getLinearMapper(0, col);
+        for (StorageIndex i = 0; i < rows; ++i) {
+          *block = lm(i);
+          ++block;
+        }
+      }
+    }
+  }
+
+ private:
+  // Pack standard image patches:
+  //
+  // - patch_depth_is_multiple_of_packet_size=true: We are guaranteed to have
+  //   depth dimension size to be a multiple of packet size, so we can skip all
+  //   non vectorized loads and checks.
+  //
+  // - squeeze_reads=true: If stride along the `row` dimension is `1`, we can
+  //   squeeze reads along the `row` and `depth` dimensions, because they are
+  //   guaranteed to be contiguous in memory (two innermost dimensions).
+  //
+  template <bool patch_depth_is_multiple_of_packet_size, bool squeeze_reads>
+  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
+                                               const DataMapper& rhs,
+                                               StorageIndex rows,
+                                               StorageIndex cols) {
+    eigen_assert(!rhs.nonStandardPatches());
+
+    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
+    const Index peeled_k = (rows / packet_size) * packet_size;
+
+    const Index start_col = rhs.colOffset();
+    const Index max_col = rhs.maxCol(peeled_k);
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      SubMapper lm = rhs.getLinearMapper(0, col);
+
+      Index k = 0;
+      for (Index c = start_col; c < max_col; ++c) {
+        eigen_assert(k <= peeled_k);
+
+        const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+        const Index max_row = rhs.maxRow(peeled_k, c);
+        const bool pad_col = lm.padCol(c);
+
+        // We can squeeze reads for all rows in [start_row, max_row) range.
+        if (squeeze_reads && !pad_col && !lm.padRow(start_row) &&
+            !lm.padRow(max_row - 1)) {
+          const Index start_depth = (c == start_col) ? rhs.depthOffset() : 0;
+
+          // Upper bound on the number of elements in the depth dimension that
+          // we can squeeze read.
+          const Index squeeze_length =
+              (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+          // Do not overshoot beyond the block size.
+          const Index max_depth =
+              start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+
+          const Index base_idx = lm.baseIndex(start_row, c);
+
+          if (patch_depth_is_multiple_of_packet_size)
+            eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+          // If patch depth is a multiple of packet size, it's guaranteed that
+          // we can process all values in depth dimension with packets.
+          const Index max_vectorized_depth =
+              patch_depth_is_multiple_of_packet_size ? max_depth
+                                                     : max_depth - packet_size;
+
+          Index d = start_depth;
+
+          // 1. Process depth dimension with vectorized instructions.
+          for (; d < max_vectorized_depth; d += packet_size) {
+            eigen_assert(k < peeled_k);
+            internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
+            block += packet_size;
+            k += packet_size;
+          }
+
+          // 2. Finish with coefficients.
+          if (!patch_depth_is_multiple_of_packet_size) {
+            for (; d < max_depth; d++) {
+              eigen_assert(k < peeled_k);
+              *block = rhs.coeffNoPadding(d, base_idx);
+              ++block;
+              ++k;
+            }
+          }
+
+          // Go to the next column.
+          continue;
+        }
+
+        // If we are not allowed to squeeze reads along the `row` and `depth`
+        // dimensions, we must process rows one by one.
+        for (Index r = start_row; r < max_row; ++r) {
+          eigen_assert(k <= peeled_k);
+
+          const Index start_depth =
+              ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
+          const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+
+          const bool pad = pad_col || lm.padRow(r);
+          const Index base_idx = lm.baseIndex(r, c);
+
+          if (patch_depth_is_multiple_of_packet_size)
+            eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+          // If patch depth is a multiple of packet size, it's guaranteed that
+          // we can process all values in depth dimension with packets.
+          const Index max_vectorized_depth =
+              patch_depth_is_multiple_of_packet_size ? max_depth
+                                                     : max_depth - packet_size;
+
+          Index d = start_depth;
+
+          // 1. Process depth dimension with vectorized instructions.
+          for (; d < max_vectorized_depth; d += packet_size) {
+            eigen_assert(k < peeled_k);
+            const Packet p = pad ? pset1<Packet>(Scalar(0))
+                                 : rhs.packetNoPadding(d, base_idx);
+            internal::pstoreu(block, p);
+            block += packet_size;
+            k += packet_size;
+          }
+
+          // 2. Finish with coefficients.
+          if (!patch_depth_is_multiple_of_packet_size) {
+            for (; d < max_depth; d++) {
+              eigen_assert(k < peeled_k);
+              *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
+              ++block;
+              ++k;
+            }
+          }
+        }
+      }
+
+      // The loop above should fill peeled_k elements.
+      eigen_assert(peeled_k == k);
+
+      // Fill remaining elements using loadCoeffStandard.
+      for (; k < rows; ++k) {
+        *block = lm.loadCoeffStandard(k);
+        ++block;
+      }
+    }
+  }
+};
+#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
 }  // end namespace internal
 
 /** SpatialConvolution
@@ -1193,8 +1518,12 @@ struct gemm_pack_rhs<
  * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
  *
+ * It is also possible to add an output kernel to the contraction, output
+ * kernel is called by Eigen when it "finalizes" the block of an output tensor.
+ *
  */
-template <typename Input, typename Kernel>
+template <typename Input, typename Kernel,
+          typename OutputKernel = const NoOpOutputKernel>
 EIGEN_DEVICE_FUNC
     EIGEN_ALWAYS_INLINE static const typename internal::conditional<
         internal::traits<Input>::Layout == ColMajor,
@@ -1209,8 +1538,8 @@ EIGEN_DEVICE_FUNC
                     const Kernel>,
                 const TensorReshapingOp<
                     const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic,
-                                             const Input> > > >,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const OutputKernel> >,
         TensorReshapingOp<
             const DSizes<typename internal::traits<Input>::Index,
                          internal::traits<Input>::NumDimensions>,
@@ -1222,13 +1551,14 @@ EIGEN_DEVICE_FUNC
                     const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
                 const TensorReshapingOp<
                     const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel> > > >::type
+                    const Kernel>,
+                const OutputKernel> > >::type
     SpatialConvolution(const Input& input, const Kernel& kernel,
-                       const DenseIndex row_stride = 1,
-                       const DenseIndex col_stride = 1,
+                       const Index row_stride = 1, const Index col_stride = 1,
                        const PaddingType padding_type = PADDING_SAME,
-                       const DenseIndex row_in_stride = 1,
-                       const DenseIndex col_in_stride = 1) {
+                       const Index row_in_stride = 1,
+                       const Index col_in_stride = 1,
+                       const OutputKernel& output_kernel = OutputKernel()) {
   typedef typename internal::traits<Input>::Index TensorIndex;
   TensorRef<Tensor<typename internal::traits<Input>::Scalar,
                    internal::traits<Input>::NumDimensions,
@@ -1258,9 +1588,9 @@ EIGEN_DEVICE_FUNC
   const TensorIndex kernelCols =
       isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
 
-  const DenseIndex kernelRowsEff =
+  const Index kernelRowsEff =
       kernelRows + (kernelRows - 1) * (row_in_stride - 1);
-  const DenseIndex kernelColsEff =
+  const Index kernelColsEff =
       kernelCols + (kernelCols - 1) * (col_in_stride - 1);
 
   array<IndexPair<TensorIndex>, 1> contract_dims;
@@ -1351,13 +1681,13 @@ EIGEN_DEVICE_FUNC
                             kernelRows, kernelCols, row_stride, col_stride,
                             row_in_stride, col_in_stride, padding_type)
                         .reshape(pre_contract_dims),
-                    contract_dims)
+                    contract_dims, output_kernel)
           .reshape(post_contract_dims),
       input
           .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
                                  row_in_stride, col_in_stride, padding_type)
           .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
+          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
           .reshape(post_contract_dims));
 }
 
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 1b29a16352b808528bc16860b5e12b34a31cd2c8..8219fc9025b49ad0de23edbcbcb5324bbf88b22b 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -1378,7 +1378,9 @@ static void PackRhsHelper(int iters,
                           int input_batches, int input_cols, int input_rows,
                           int input_depth,
                           /* Filter (kernel) dimensions: */
-                          int filter_count, int filter_cols, int filter_rows) {
+                          int filter_count, int filter_cols, int filter_rows,
+                          /* Input strides: */
+                          int col_strides, int row_strides) {
   tensorflow::testing::UseRealTime();
   tensorflow::testing::StopTiming();
 
@@ -1392,14 +1394,17 @@ static void PackRhsHelper(int iters,
   static const int packet_size = Eigen::internal::packet_traits<float>::size;
 
   // Reshape dimensions.
-  using NewDimension = Eigen::array<Eigen::Index, 2>;
+  using NewDimension = Eigen::DSizes<Index, 2>;
 
   // Contraction dimensions.
   using nocontract_t = Eigen::array<Eigen::Index, 1>;
   using contract_t = Eigen::array<Eigen::Index, 1>;
 
-  // Input to the TensorImagePatchOp.
-  using ArgType = Tensor<float, 4>;
+  // Input to the TensorImagePatchOp. It is the tensorflow TTypes<float>::Tensor
+  // with ColMajor layout, instead of RowMajor. But that doesn't make any
+  // difference, because TensorContraction swaps LHS with RHS for row major
+  // inputs, and contraction mapper always works with column major data.
+  using ArgType = TensorMap<Tensor<float, 4>, Eigen::Aligned>;
 
   using Evaluator = TensorEvaluator<
       const TensorReshapingOp<
@@ -1422,12 +1427,17 @@ static void PackRhsHelper(int iters,
       /*inner_dim_reordered*/ false,                  //
       /*Alignment*/ 0>;
 
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+  using PackRhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
+                                                        SubMapper, ColMajor>;
+#else
   using PackRhsImpl =
       Eigen::internal::gemm_pack_rhs<float, Eigen::Index, SubMapper,  //
                                      Traits::nr,                      //
                                      ColMajor,                        //
                                      /*Conjugate*/ false,             //
                                      /*PanelMode*/ false>;
+#endif
 
   Eigen::DefaultDevice device;
 
@@ -1454,20 +1464,25 @@ static void PackRhsHelper(int iters,
     inputs.emplace_back(input_dims);
     inputs[i].setRandom();
 
+    ArgType tensor_map(inputs[i].data(), input_dims);
+
     // 1. Extract image patches from input tensor. All strides are `1`.
     const auto image_patch_op = TensorImagePatchOp<Dynamic, Dynamic, ArgType>(
-        inputs[i],                                             //
+        tensor_map,                                            //
         filter_rows, filter_cols,                              //
-        /*row_strides=*/1, /*col_strides=*/1,                  //
+        row_strides, col_strides,                              //
         /*in_row_strides=*/1, /*in_col_strides=*/1,            //
         /*row_inflate_strides=*/1, /*col_inflate_strides=*/1,  //
         Eigen::PADDING_SAME, /*padding_value=*/0.0);
 
     // 2. Reshape extracted patches into "virtual" 2d tensor.
-    NewDimension reshape_dims = {
-        input_depth * filter_rows * filter_cols,  // patch size
-        // PADDING_SAME: output {rows, cols} == input {rows, cols}
-        input_rows * input_cols * input_batches};  // num_patches
+    // NOTE: This is valid for PADDING_SAME only.
+    Index output_rows = input_rows / row_strides;
+    Index output_cols = input_cols / col_strides;
+    NewDimension reshape_dims;
+    reshape_dims[0] = input_depth * filter_rows * filter_cols;    // patch size
+    reshape_dims[1] = output_rows * output_cols * input_batches;  // num_patches
+
     const auto reshape_op =
         TensorReshapingOp<NewDimension, decltype(image_patch_op)>(
             image_patch_op, reshape_dims);
@@ -1516,9 +1531,9 @@ static void PackRhsHelper(int iters,
     Index packed_offset =
         internal::random<Index>(0, packed_total_size - packed_size - 1);
 
-    pack_rhs(packed.data() + packed_offset,
-             input_mappers[input_idx].getSubMapper(depth_offset, col_offset),
-             depth, cols);
+    SubMapper sub_mapper =
+        input_mappers[input_idx].getSubMapper(depth_offset, col_offset);
+    pack_rhs(packed.data() + packed_offset, sub_mapper, depth, cols);
   }
   tensorflow::testing::StopTiming();
 
@@ -1529,14 +1544,14 @@ static void PackRhsHelper(int iters,
   tensorflow::testing::SetLabel(stringStream.str());
 }
 
-#define BM_NAME(prefix, N, H, W, C, FC, FH, FW) \
-  BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW
+#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW) \
+  BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW##_s##SH##x##SW
 
-#define BM_PackRhs(N, H, W, C, FC, FH, FW)                          \
-  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW)(int iters) { \
-    PackRhsHelper(iters, N, H, W, C, FC, FH, FW);                   \
-  }                                                                 \
-  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW))
+#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW)                          \
+  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW)(int iters) { \
+    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW);                   \
+  }                                                                         \
+  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW))
 
 // Number of input channel (input depth) it equal to the number of patch
 // channels (patch depth).
@@ -1547,13 +1562,28 @@ BM_PackRhs(/*batch*/ 32,        //
            /*image*/ 64, 64,    //
            /*channels*/ 32,     //
            /*num_filters*/ 64,  //
-           /*filter*/ 5, 5);
+           /*filter*/ 5, 5,     //
+           /*stride*/ 1, 1);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 32,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5,     //
+           /*stride*/ 2, 2);
 
 // Slow path: input channel dimension is not the multiple of the packet size.
 BM_PackRhs(/*batch*/ 32,        //
            /*image*/ 64, 64,    //
            /*channels*/ 30,     //
            /*num_filters*/ 64,  //
-           /*filter*/ 5, 5);
+           /*filter*/ 5, 5,     //
+           /*stride*/ 1, 1);
 
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 30,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5,     //
+           /*stride*/ 2, 2);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index a5374e02684ea1140e6e521d986a07b8838e45b7..cca3cfbd7c0bc4729016c54bf1c9b417f9d4c28a 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -69,6 +69,7 @@ void RetvalOp::Compute(OpKernelContext* ctx) {
 }
 
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kArgOp).Device(DEVICE_CPU), ArgOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name(kDeviceArgOp).Device(DEVICE_CPU), ArgOp);
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kRetOp).Device(DEVICE_CPU), RetvalOp);
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kDeviceRetOp).Device(DEVICE_CPU), RetvalOp);
 
@@ -99,11 +100,14 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
   REGISTER_KERNEL_BUILDER( \
       Name(kArgOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), ArgOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
+TF_CALL_QUANTIZED_TYPES(REGISTER)
 TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kArgOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("output")
                                                    .TypeConstraint<int32>("T"),
                                                ArgOp);
+REGISTER_KERNEL_BUILDER(
+    Name(kDeviceArgOp).Device(DEVICE_GPU).TypeConstraint<int32>("T"), ArgOp);
 #undef REGISTER
 
 REGISTER_KERNEL_BUILDER(Name(kArgOp)
@@ -122,6 +126,7 @@ REGISTER_KERNEL_BUILDER(Name(kArgOp)
   REGISTER_KERNEL_BUILDER( \
       Name(kRetOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
+TF_CALL_QUANTIZED_TYPES(REGISTER)
 TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("input")
diff --git a/tensorflow/core/kernels/function_ops.h b/tensorflow/core/kernels/function_ops.h
index 0f51eca16380acf98664a6ca255c64691ee57691..9ddd49560392dd4c313877f819c13d2a6b0079ed 100644
--- a/tensorflow/core/kernels/function_ops.h
+++ b/tensorflow/core/kernels/function_ops.h
@@ -22,6 +22,7 @@ limitations under the License.
 namespace tensorflow {
 
 static const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static const char* const kDeviceArgOp = FunctionLibraryDefinition::kDeviceArgOp;
 static const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
 static const char* const kDeviceRetOp = FunctionLibraryDefinition::kDeviceRetOp;
 
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 1529d2e3368266174d3098bad5f4b35bb83b502e..5ecb203cbc7296d75f6a0a68a2189d7bf018c7fe 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -526,21 +526,40 @@ REGISTER_KERNEL_BUILDER(Name("For")
                             .HostMemory("delta"),
                         ForOp);
 
+// FakeParamOp allocates a tensor with a shape conforming to the expected
+// output. This is necessary if the value will be stored in a while_loop's
+// TensorList. The output is otherwise not expected to be consumed by anything
+// else.
 class FakeParamOp : public OpKernel {
  public:
   explicit FakeParamOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+    DataType dtype;
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype));
+
+    // Set shape to the specified shape, setting unknown dimensions to empty.
+    // If the specified shape is unknown, leave as an empty shape.
+    TensorShape shape;
+    PartialTensorShape partial_shape;
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &partial_shape));
+    if (!partial_shape.unknown_rank()) {
+      for (int64 d : partial_shape.dim_sizes()) {
+        shape.AddDim(d == -1 ? 0 : d);
+      }
+    }
+
+    // Create a persistent tensor that we can repeatedly return to save memory.
+    // TODO(b/119612758): add optimization to prevent sending this across
+    // devices on each Compute() call.
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                dtype, shape, &value_handle_, nullptr));
   }
 
   void Compute(OpKernelContext* context) override {
-    // We must produce something (only Switch and Recvs are allowed to output
-    // dead tensors). This output is not expected to be consumed by anything.
-    Tensor output_tensor(dtype_, TensorShape({}));
-    context->set_output(0, output_tensor);
+    context->set_output(0, *value_handle_.AccessTensor(context));
   }
 
  private:
-  DataType dtype_;
+  PersistentTensor value_handle_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 1a254da50e6f32659299f7aed17925a91e10ffa6..6f3a49805ce769645ccc113a59360beab27e8403 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -37,8 +37,6 @@ tf_ops_fuzz_target_lib("decode_bmp")
 
 tf_ops_fuzz_target_lib("decode_png")
 
-tf_ops_fuzz_target_lib("decode_jpeg")
-
 tf_ops_fuzz_target_lib("decode_wav")
 
 tf_ops_fuzz_target_lib("example_proto_fast_parsing")
diff --git a/tensorflow/core/kernels/gemm_functors.h b/tensorflow/core/kernels/gemm_functors.h
index 1c808440851d4c01ea61967bbb15d12fd9b857e2..97e077c096031e260d54dcfcccb03af097b0c71e 100644
--- a/tensorflow/core/kernels/gemm_functors.h
+++ b/tensorflow/core/kernels/gemm_functors.h
@@ -36,6 +36,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 // Apple provides an optimized BLAS library that is better than Eigen for their
 // devices, so use that if possible.
 #if defined(__APPLE__) && defined(USE_GEMM_FOR_CONV)
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 3d0c193d9fce6002f02cc98ac5d2ee05e9836697..5f244b1b10f65c60becc1ce3c0e87836a48e3ae3 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -42,7 +42,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 TensorList::TensorList(const TensorList& other)
     : tensors(other.tensors),
       element_shape(other.element_shape),
-      element_dtype(other.element_dtype) {}
+      element_dtype(other.element_dtype),
+      max_num_elements(other.max_num_elements) {}
 
 void TensorList::Encode(VariantTensorData* data) const {
   data->set_type_name(TypeName());
@@ -63,6 +64,7 @@ void TensorList::Encode(VariantTensorData* data) const {
     core::PutVarint64(&metadata, static_cast<uint64>(i));
   }
   core::PutVarint64(&metadata, static_cast<uint64>(element_dtype));
+  core::PutVarint64(&metadata, static_cast<uint64>(max_num_elements));
   TensorShapeProto element_shape_proto;
   element_shape.AsProto(&element_shape_proto);
   element_shape_proto.AppendToString(&metadata);
@@ -74,6 +76,7 @@ static Status TensorListDeviceCopy(
     const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
   to->element_shape = from.element_shape;
   to->element_dtype = from.element_dtype;
+  to->max_num_elements = from.max_num_elements;
   to->tensors.reserve(from.tensors.size());
   for (const Tensor& t : from.tensors) {
     Tensor tmp(t.dtype());
@@ -140,6 +143,8 @@ bool TensorList::Decode(const VariantTensorData& data) {
 
   core::GetVarint64(&iter, &scratch);
   element_dtype = static_cast<DataType>(scratch);
+  core::GetVarint64(&iter, &scratch);
+  max_num_elements = static_cast<int>(scratch);
   TensorShapeProto element_shape_proto;
   element_shape_proto.ParseFromString(string(iter.data(), iter.size()));
   element_shape = PartialTensorShape(element_shape_proto);
@@ -175,12 +180,19 @@ class EmptyTensorList : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
+    const Tensor& max_num_elements_t = ctx->input(1);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(max_num_elements_t.shape()),
+        errors::InvalidArgument(
+            "max_num_elements expected to be a scalar ",
+            "but got shape: ", max_num_elements_t.shape().DebugString()));
     Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr));
     TensorList empty;
     empty.element_dtype = element_dtype_;
+    empty.max_num_elements = max_num_elements_t.scalar<int32>()();
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(ctx, TensorShapeFromTensor(ctx->input(0), &element_shape));
     empty.element_shape = element_shape;
@@ -198,9 +210,11 @@ REGISTER_KERNEL_BUILDER(Name("EmptyTensorList").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("EmptyTensorList").Device(DEVICE_GPU).HostMemory("element_shape"),
-    EmptyTensorList);
+REGISTER_KERNEL_BUILDER(Name("EmptyTensorList")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape")
+                            .HostMemory("max_num_elements"),
+                        EmptyTensorList);
 
 #endif  // GOOGLE_CUDA
 
@@ -237,6 +251,14 @@ class TensorListPushBack : public OpKernel {
                                         " but list elements ",
                                         DataTypeString(l->element_dtype)));
 
+    if (l->max_num_elements != -1) {
+      OP_REQUIRES(
+          c, l->tensors.size() < l->max_num_elements,
+          errors::InvalidArgument("Tried to push item into a full list",
+                                  " list size: ", l->tensors.size(),
+                                  " max_num_elements: ", l->max_num_elements));
+    }
+
     TensorList output;
     output = *l;
     output.tensors.push_back(input);
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 12581b15b1c9718a8ccc11b206485f0bbbf73553..75d91aff49de08a51d7ab7fb6b63631489ec25bf 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -56,6 +56,9 @@ struct TensorList {
   std::vector<Tensor> tensors;
   PartialTensorShape element_shape;
   DataType element_dtype;
+  // The maximum allowed size of `tensors`. Defaults to -1 meaning that the size
+  // of `tensors` is unbounded.
+  int max_num_elements = -1;
 };
 
 Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
@@ -371,8 +374,12 @@ Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a,
           b_tensor.shape().DebugString(), " in position ", i);
     }
     Tensor out_tensor;
-    TF_RETURN_IF_ERROR(
-        c->allocate_temp(a_tensor.dtype(), a_tensor.shape(), &out_tensor));
+    AllocatorAttributes attr;
+    if (a_tensor.dtype() == DT_VARIANT) {
+      attr.set_on_host(true);
+    }
+    TF_RETURN_IF_ERROR(c->allocate_temp(a_tensor.dtype(), a_tensor.shape(),
+                                        &out_tensor, attr));
     out->tensors.push_back(out_tensor);
     switch (out_tensor.dtype()) {
 #define DTYPE_CASE(dtype)                                        \
@@ -384,6 +391,13 @@ Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a,
       TF_CALL_NUMBER_TYPES(DTYPE_CASE)
 
 #undef DTYPE_CASE
+      case DataTypeToEnum<Variant>::value: {
+        Variant* v_out = &(out_tensor.scalar<Variant>()());
+        TF_RETURN_IF_ERROR(BinaryOpVariants<Device>(
+            c, ADD_VARIANT_BINARY_OP, a_tensor.scalar<Variant>()(),
+            b_tensor.scalar<Variant>()(), v_out));
+        break;
+      }
       default:
         return errors::InvalidArgument("Trying to add unsupported dtype ",
                                        out_tensor.dtype());
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index b4252eb04446895a7c293b62473dba28a06845a1..f405ca3c58cfffc8422dcdd65e66c7fd12784519 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -26,6 +26,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 4b74a64025a19bbac1053efb6081347358fdc0c6..48769f3fe5d6eb4d5bb2856f9dd027253ebd8582 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -21,6 +21,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace functor {
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index b332edad0ae136d5486bd903540dd448a77bd620..dc6f78362349d6c6381b0f68067a2a8b7769ec19 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #ifdef INTEL_MKL
 
 #include <string.h>
+#include <algorithm>
 #include <map>
 #include <vector>
-#include <memory>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -29,6 +29,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/mkl_conv_ops.h"
+#include "tensorflow/core/kernels/mkl_quantized_conv_ops.h"
+#include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -69,6 +71,12 @@ struct MklConvFwdParams {
   memory::dims dilations;
   memory::dims padding_left;
   memory::dims padding_right;
+  string dtypes = string("");
+  struct PostOpParam {
+    string name;
+    std::vector<float> param;
+  };
+  std::vector<PostOpParam> post_op_params;
 
   MklConvFwdParams(memory::dims src_dims, memory::dims filter_dims,
                    memory::dims bias_dims, memory::dims dst_dims,
@@ -83,8 +91,10 @@ struct MklConvFwdParams {
         padding_left(padding_left),
         padding_right(padding_right) {}
 };
-
-template <typename T>
+// With quantization, input, filter, and output can have different types
+// so we use differnt template parameter for each type
+template <typename T, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput>
 class MklConvFwdPrimitive : public MklPrimitive {
  public:
   explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims)
@@ -103,16 +113,16 @@ class MklConvFwdPrimitive : public MklPrimitive {
   //   filter_data: input data buffer of filter (weights)
   //   bias_data:   input data buffer of bias
   //   dst_data:    output data buffer of dst
-  void Execute(const T* src_data, const T* filter_data, const T* bias_data,
-               const T* dst_data) {
+  void Execute(const Tinput* src_data, const Tfilter* filter_data,
+               const Tbias* bias_data, const Toutput* dst_data) {
     context_.src_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(src_data)));
+        static_cast<void*>(const_cast<Tinput*>(src_data)));
     context_.filter_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(filter_data)));
+        static_cast<void*>(const_cast<Tfilter*>(filter_data)));
     context_.bias_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(bias_data)));
+        static_cast<void*>(const_cast<Tbias*>(bias_data)));
     context_.dst_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(dst_data)));
+        static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
     // after exec, set data handle back
@@ -128,13 +138,14 @@ class MklConvFwdPrimitive : public MklPrimitive {
   //   src_data:    input data buffer of src
   //   filter_data: input data buffer of filter (weights)
   //   dst_data:    output data buffer of dst
-  void Execute(const T* src_data, const T* filter_data, const T* dst_data) {
+  void Execute(const Tinput* src_data, const Tfilter* filter_data,
+               const Toutput* dst_data) {
     context_.src_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(src_data)));
+        static_cast<void*>(const_cast<Tinput*>(src_data)));
     context_.filter_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(filter_data)));
+        static_cast<void*>(const_cast<Tfilter*>(filter_data)));
     context_.dst_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(dst_data)));
+        static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
     // after execution, set data handle back
@@ -200,17 +211,17 @@ class MklConvFwdPrimitive : public MklPrimitive {
   void Setup(const MklConvFwdParams& convFwdDims) {
     // create memory descriptors for convolution data w/ no specified format
     context_.src_md.reset(new memory::desc(
-        {convFwdDims.src_dims}, MklDnnType<T>(), memory::format::any));
+        {convFwdDims.src_dims}, MklDnnType<Tinput>(), memory::format::any));
 
     context_.filter_md.reset(new memory::desc(
-        {convFwdDims.filter_dims}, MklDnnType<T>(), memory::format::any));
+        {convFwdDims.filter_dims}, MklDnnType<Tfilter>(), memory::format::any));
 
     context_.dst_md.reset(new memory::desc(
-        {convFwdDims.dst_dims}, MklDnnType<T>(), memory::format::any));
+        {convFwdDims.dst_dims}, MklDnnType<Toutput>(), memory::format::any));
 
     if (!convFwdDims.bias_dims.empty())
       context_.bias_md.reset(new memory::desc(
-          {convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::any));
+          {convFwdDims.bias_dims}, MklDnnType<Tbias>(), memory::format::any));
 
     // create a convolution
     if (!convFwdDims.bias_dims.empty()) {
@@ -230,6 +241,42 @@ class MklConvFwdPrimitive : public MklPrimitive {
     context_.fwd_pd.reset(new convolution_forward::primitive_desc(
         *context_.fwd_desc, cpu_engine_));
 
+    // Check if there is any fusions as post-ops
+    auto const& post_op_params = convFwdDims.post_op_params;
+    mkldnn::primitive_attr post_ops_attr;
+    mkldnn::post_ops post_ops;
+    if (!post_op_params.empty()) {
+      for (auto const& post_op_param : post_op_params) {
+        if (post_op_param.name == "relu") {
+          DCHECK_EQ(post_op_param.param.size(), 3);
+          float op_scale = post_op_param.param[0];
+          float op_alpha = post_op_param.param[1];
+          float op_beta = post_op_param.param[2];
+          post_ops.append_eltwise(op_scale, mkldnn::eltwise_relu, op_alpha,
+                                  op_beta);
+        } else if (post_op_param.name == "sum") {
+          DCHECK_EQ(post_op_param.param.size(), 1);
+          float op_scale = post_op_param.param[0];
+          post_ops.append_sum(op_scale);
+        } else if (post_op_param.name == "output_scale") {
+          DCHECK_EQ(post_op_param.param.size(), 1);
+          std::vector<float> scales;
+          scales.push_back(post_op_param.param[0]);
+          post_ops_attr.set_output_scales(0, scales);
+        } else {
+          DCHECK((post_op_param.name == "relu") ||
+                 (post_op_param.name == "sum") ||
+                 (post_op_param.name == "output_scale"));
+        }
+      }
+      post_ops_attr.set_post_ops(post_ops);
+      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
+          *context_.fwd_desc, post_ops_attr, cpu_engine_));
+    } else {
+      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
+          *context_.fwd_desc, cpu_engine_));
+    }
+
     // store the expected memory format
     context_.src_fmt = static_cast<mkldnn::memory::format>(
         context_.fwd_pd.get()->src_primitive_desc().desc().data.format);
@@ -268,23 +315,30 @@ class MklConvFwdPrimitive : public MklPrimitive {
   engine cpu_engine_;
 };
 
-template <typename T>
+template <typename T, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput>
 class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
  public:
-  static MklConvFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims,
-                                     bool do_not_cache) {
-    MklConvFwdPrimitive<T>* conv_fwd = nullptr;
+  static MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>* Get(
+      const MklConvFwdParams& convFwdDims, bool do_not_cache) {
+    MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>* conv_fwd = nullptr;
 
     if (do_not_cache) { /* Always create new primitive */
-      conv_fwd = new MklConvFwdPrimitive<T>(convFwdDims);
+      conv_fwd = new MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>(
+          convFwdDims);
     } else {
       // try to find a suitable one in pool
-      conv_fwd = dynamic_cast<MklConvFwdPrimitive<T>*>(
-          MklConvFwdPrimitiveFactory<T>::GetInstance().GetConvFwd(convFwdDims));
+      conv_fwd = dynamic_cast<
+          MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>*>(
+          MklConvFwdPrimitiveFactory<T, Tinput, Tfilter, Tbias,
+                                     Toutput>::GetInstance()
+              .GetConvFwd(convFwdDims));
       if (conv_fwd == nullptr) {
-        conv_fwd = new MklConvFwdPrimitive<T>(convFwdDims);
-        MklConvFwdPrimitiveFactory<T>::GetInstance().SetConvFwd(convFwdDims,
-                                                                conv_fwd);
+        conv_fwd = new MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>(
+            convFwdDims);
+        MklConvFwdPrimitiveFactory<T, Tinput, Tfilter, Tbias,
+                                   Toutput>::GetInstance()
+            .SetConvFwd(convFwdDims, conv_fwd);
       }
     }
 
@@ -314,6 +368,29 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
     key_creator.AddAsKey(convFwdDims.dilations);
     key_creator.AddAsKey(convFwdDims.padding_left);
     key_creator.AddAsKey(convFwdDims.padding_right);
+    key_creator.AddAsKey(convFwdDims.dtypes);
+
+    // Generate keys for post-ops
+    for (auto const& post_op_param : convFwdDims.post_op_params) {
+      if (post_op_param.name == "relu") {
+        DCHECK_EQ(post_op_param.param.size(), 3);
+        key_creator.AddAsKey(post_op_param.name);
+        key_creator.AddAsKey(post_op_param.param[0]);
+        key_creator.AddAsKey(post_op_param.param[1]);
+        key_creator.AddAsKey(post_op_param.param[2]);
+      } else if (post_op_param.name == "sum") {
+        DCHECK_EQ(post_op_param.param.size(), 1);
+        key_creator.AddAsKey(post_op_param.name);
+        key_creator.AddAsKey(post_op_param.param[0]);
+      } else if (post_op_param.name == "output_scale") {
+        DCHECK_EQ(post_op_param.param.size(), 1);
+        key_creator.AddAsKey(post_op_param.name);
+        key_creator.AddAsKey(post_op_param.param[0]);
+      } else {
+        return string("not_a_key");
+      }
+    }
+
     return key_creator.GetKey();
   }
 
@@ -757,10 +834,23 @@ class MklConvOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+// FP32 kernel registration for INTEL_MKL_ML
+REGISTER_KERNEL_BUILDER(Name("_MklConv2D")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T")
+                            .Label(mkl_op_registry::kMklOpLabel),
+                        MklConv2DOp<CPUDevice, float, false>);
+REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T")
+                            .Label(mkl_op_registry::kMklOpLabel),
+                        MklConv2DOp<CPUDevice, float, true>);
+
 #else
 
 // Base class for convolution forward operations
-template <typename Device, typename T, bool biasEnabled>
+template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput, typename Ttemp_output, bool biasEnabled>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -828,25 +918,25 @@ class MklConvOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-            errors::InvalidArgument("Filter should not be in "
-            "Mkl Layout"));
+                  errors::InvalidArgument("Filter should not be in "
+                                          "Mkl Layout"));
 
-      MklDnnData<T> src(&cpu_engine);
-      MklDnnData<T> filter(&cpu_engine);
+      MklDnnData<Tinput> src(&cpu_engine_);
+      MklDnnData<Tfilter> filter(&cpu_engine_);
 
       memory::dims src_dims, filter_dims, padding_left, padding_right,
-                   dilations, strides;
+          dilations, strides;
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
-                             dilations_);
+                              dilations_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
       conv_utl.GetConvFwdSizesInMklOrder(
-          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
-          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
-          &padding_left, &padding_right);
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
+          &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
+          &padding_right);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -854,20 +944,25 @@ class MklConvOp : public OpKernel {
 
       // Corner cases: output with 0 elements and 0 batch size.
       Tensor* dst_tensor = nullptr;
-      if (dst_tf_shape.num_elements() == 0 ||
-          dst_dims_tf_order[0] == 0) {
+      if (dst_tf_shape.num_elements() == 0 || dst_dims_tf_order[0] == 0) {
         MklDnnShape dst_mkl_shape;
         dst_mkl_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
-                    &dst_tensor, src_tf_shape, dst_mkl_shape);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor,
+                                  src_tf_shape, dst_mkl_shape);
 
         // MklConv2D/3D also outputs converted filter
         // as 2nd output of Conv2D/3D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
-        AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor,
-                                  filter_tf_shape, filter_mkl_shape);
+        // MklConv2D also outputs converted filter as 2nd output.
+        if (typeid(Tinput) == typeid(float) &&
+            typeid(Tfilter) == typeid(float) &&
+            typeid(Toutput) == typeid(float)) {
+          filter_mkl_shape.SetMklTensor(false);
+          AllocateOutputSetMklShape(context, kOutputIndex_Filter,
+                                    &output_filter_tensor, filter_tf_shape,
+                                    filter_mkl_shape);
+        }
         return;
       }
 
@@ -887,15 +982,17 @@ class MklConvOp : public OpKernel {
       //     Conv3D: NDHWC or NCDHW
       auto src_md = src_mkl_shape.IsMklTensor()
                         ? src_mkl_shape.GetMklLayout()
-                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+                        : memory::desc(src_dims, MklDnnType<Tinput>(), tf_fmt);
+      src.SetUsrMem(src_md, &src_tensor);
 
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
                            ? filter_mkl_shape.GetMklLayout()
-                           : memory::desc(filter_dims, MklDnnType<T>(),
+                           : memory::desc(filter_dims, MklDnnType<Tfilter>(),
                                           isConv2D ? memory::format::hwio
                                                    : memory::format::dhwio);
+      filter.SetUsrMem(filter_md, &filter_tensor);
       // MKLDNN dilation starts from 0.
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
@@ -905,67 +1002,85 @@ class MklConvOp : public OpKernel {
       // in the following cases
       //   1. Legacy CPU without AVX512/AVX2, or
       //   2. 1x1 convolution with stride != 1
-      bool do_not_cache = MklPrimitiveFactory<T>::IsPrimitiveMemOptEnabled() &&
-                    (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) &&
-                    (MklPrimitiveFactory<T>::IsLegacyPlatform() ||
-                     IsConv1x1StrideNot1(filter_dims, strides));
+      bool do_not_cache =
+          MklPrimitiveFactory<Tinput>::IsPrimitiveMemOptEnabled() &&
+          (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) &&
+          (MklPrimitiveFactory<Tinput>::IsLegacyPlatform() ||
+           IsConv1x1StrideNot1(filter_dims, strides));
 
       // get a conv2d fwd from primitive pool
-      MklConvFwdPrimitive<T>* conv_fwd = nullptr;
+      MklConvFwdPrimitive<float, Tinput, Tfilter, Tbias, Ttemp_output>*
+          conv_fwd = nullptr;
       if (biasEnabled) {
         memory::dims bias_dims = {};
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
         MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(
-            convFwdDims, do_not_cache);
+
+        // TODO(mdfaijul):  Extend the basic parameters for data types and
+        // fusions
+        this->ExtendConvFwdParams(context, convFwdDims);
+
+        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
+                                              Ttemp_output>::Get(convFwdDims,
+                                                                 do_not_cache);
       } else {
         MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(
-            convFwdDims, do_not_cache);
+
+        // Extend the basic parameters for data types and fusions
+        this->ExtendConvFwdParams(context, convFwdDims);
+
+        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
+                                              Ttemp_output>::Get(convFwdDims,
+                                                                 do_not_cache);
       }
 
       // allocate output tensors output_tensor and filter_out_tensor
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
           conv_fwd->GetPrimitiveDesc();
-      AllocateOutputTensor(context, *conv_fwd_pd,
-                       dst_dims_mkl_order, tf_fmt, &dst_tensor);
+      AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt,
+                           &dst_tensor);
       Tensor* filter_out_tensor = nullptr;
-      AllocateFilterOutputTensor(context, *conv_fwd_pd,
-                                 TFShapeToMklDnnDims(filter_tf_shape),
-                                 &filter_out_tensor);
+      if (typeid(Tinput) == typeid(float) && typeid(Tfilter) == typeid(float) &&
+          typeid(Toutput) == typeid(float)) {
+        AllocateFilterOutputTensor(context, *conv_fwd_pd,
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+      }
 
-      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());
+      Ttemp_output* dst_data =
+          reinterpret_cast<Ttemp_output*>(dst_tensor->flat<Toutput>().data());
 
       // check whether src/filter need reorder
-      T *src_data = nullptr;
+      Tinput* src_data = nullptr;
       if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) {
         src.SetUsrMem(src_md, &src_tensor);
         src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc());
-        src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
+        src_data = static_cast<Tinput*>(src.GetOpMem().get_data_handle());
       } else {
-        src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        src_data = static_cast<Tinput*>(
+            const_cast<Tinput*>(src_tensor.flat<Tinput>().data()));
       }
-      T* filter_data = nullptr;
+      Tfilter* filter_data = nullptr;
       if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
         filter.SetUsrMem(filter_md, &filter_tensor);
         filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(),
                                    filter.GetTensorBuffer(filter_out_tensor));
-        filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
-      } else {
         filter_data =
-            static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
+            static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
+      } else {
+        filter_data = static_cast<Tfilter*>(
+            const_cast<Tfilter*>(filter_tensor.flat<Tfilter>().data()));
       }
 
       // execute convolution
       if (biasEnabled) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
-        T* bias_data = static_cast<T*>(const_cast<T*>(
-            bias_tensor.flat<T>().data()));
-
+        Tbias* bias_data =
+            this->GetBiasHandle(context, conv_fwd_pd, bias_tensor);
         conv_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
         conv_fwd->Execute(src_data, filter_data, dst_data);
@@ -973,27 +1088,41 @@ class MklConvOp : public OpKernel {
 
       // delete primitive since it is not cached.
       if (do_not_cache) delete conv_fwd;
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = tensorflow::strings::StrCat(
           "Status: ", e.status, ", message: ", string(e.message), ", in file ",
           __FILE__, ":", __LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
- private:
-  std::vector<int32> strides_;
-  std::vector<int32> dilations_;
-  Padding padding_;
-  TensorFormat data_format_;
-  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
-  const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
-  const int kDilationH = 0, kDilationW = 1;
-  engine cpu_engine = engine(engine::cpu, 0);
+ protected:
+  virtual void ExtendConvFwdParams(OpKernelContext* context,
+                                   MklConvFwdParams& params) {
+    // Create a string from data types of input, filter, bias, and output.
+    params.dtypes.append(typeid(Tinput).name());
+    params.dtypes.append(typeid(Tfilter).name());
+    params.dtypes.append(typeid(Tbias).name());
+    params.dtypes.append(typeid(Toutput).name());
+  }
+
+  virtual Tbias* GetBiasHandle(
+      OpKernelContext* context,
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>&
+          conv2d_fwd_pd,
+      const Tensor& bias_tensor) {
+    if (biasEnabled) {
+      return static_cast<Tbias*>(
+          const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
+    } else {
+      return nullptr;
+    }
+  }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(
+  virtual void AllocateOutputTensor(
       OpKernelContext* context,
       const convolution_forward::primitive_desc& conv_prim_desc,
       const memory::dims& output_dims_mkl_order,
@@ -1001,23 +1130,40 @@ class MklConvOp : public OpKernel {
     CHECK_NOTNULL(output_tensor);
     auto dst_pd = conv_prim_desc.dst_primitive_desc();
 
+    auto dst_md = dst_pd.desc();
+    if (!std::is_same<Ttemp_output, Toutput>::value) {
+      dst_md.data.data_type =
+          static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
+      dst_pd = memory::primitive_desc(dst_md, cpu_engine_);
+    }
     // Allocate shape of Mkl tensor.
     MklDnnShape output_mkl_shape;
     output_mkl_shape.SetMklTensor(true);
     output_mkl_shape.SetMklLayout(&dst_pd);
-    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetElemType(MklDnnType<Toutput>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
                                  output_dims_mkl_order, output_tf_format);
 
     // Allocate shape of TF tensor.
     TensorShape output_tf_shape;
-    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput)));
 
     AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
                               output_tf_shape, output_mkl_shape);
   }
 
-  // Allocate output tensor.
+  engine cpu_engine_ = engine(engine::cpu, 0);
+
+ private:
+  std::vector<int32> strides_;
+  std::vector<int32> dilations_;
+  Padding padding_;
+  TensorFormat data_format_;
+  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
+  const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
+  const int kDilationH = 0, kDilationW = 1;
+
+  // Allocate filter output tensor.
   void AllocateFilterOutputTensor(
       OpKernelContext* context,
       const convolution_forward::primitive_desc& conv_prim_desc,
@@ -1029,7 +1175,7 @@ class MklConvOp : public OpKernel {
     MklDnnShape filter_mkl_shape;
     filter_mkl_shape.SetMklTensor(true);
     filter_mkl_shape.SetMklLayout(&filter_pd);
-    filter_mkl_shape.SetElemType(MklDnnType<T>());
+    filter_mkl_shape.SetElemType(MklDnnType<Tfilter>());
 
     // The format of the filter is actually OIhw8i8o, but TF doesn't support
     // this format. Just use format::blocked for now because the layout
@@ -1039,17 +1185,17 @@ class MklConvOp : public OpKernel {
 
     // Allocate the data space for the filter to propagate as TF tensor.
     TensorShape filter_tf_shape;
-    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
+    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(Tfilter)));
 
     AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
                               filter_tf_shape, filter_mkl_shape);
   }
-
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
       const convolution_forward::primitive_desc& conv_prim_desc,
-      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
-      MklDnnData<T>* output, Tensor* filter_out_tensor) {
+      MklDnnData<Tinput>* src, MklDnnData<Tfilter>* filter,
+      MklDnnData<Tbias>* bias, MklDnnData<Toutput>* output,
+      Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -1065,12 +1211,12 @@ class MklConvOp : public OpKernel {
     // Create convolution primitive and add it to net.
     std::vector<primitive> net;
     if (bias) {
-      CHECK_EQ(biasEnabled, true);
+      DCHECK(biasEnabled);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(), bias->GetOpMem(),
                                         output->GetOpMem()));
     } else {
-      CHECK_EQ(biasEnabled, false);
+      DCHECK(!biasEnabled);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(),
                                         output->GetOpMem()));
@@ -1080,24 +1226,581 @@ class MklConvOp : public OpKernel {
   }
 };
 
-#endif
+// We create new class for each verison of Quantized Convolution and inherit
+// from the FP32 version of the base class
+template <typename Device, typename Tbias, typename Toutput,
+          typename Ttemp_output, bool biasEnabled>
+class MklQuantizedConv2DOp
+    : public MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
+                       biasEnabled> {
+ public:
+  virtual ~MklQuantizedConv2DOp() {
+    if (this->input_bias_ != nullptr) {
+      delete this->input_bias_;
+      input_bias_ = nullptr;
+    }
+
+    if (this->scaled_bias_ != nullptr) {
+      delete this->scaled_bias_;
+      scaled_bias_ = nullptr;
+    }
+  }
+
+  explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
+      : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
+                  biasEnabled>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Compute int32 output tensor
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
+              biasEnabled>::Compute(context);
+
+    // Compute additional outputs: min/max scalars.
+    int bias_index_offset;
+    bias_index_offset = biasEnabled ? 1 : 0;
+
+    const float min_input =
+        context->input(2 + bias_index_offset).flat<float>()(0);
+    const float max_input =
+        context->input(3 + bias_index_offset).flat<float>()(0);
+    const float min_filter =
+        context->input(4 + bias_index_offset).flat<float>()(0);
+    const float max_filter =
+        context->input(5 + bias_index_offset).flat<float>()(0);
+
+    float min_output_value;
+    float max_output_value;
+    if (std::is_same<Toutput, quint8>::value ||
+        std::is_same<Toutput, qint8>::value) {
+      // This is the case the convolution and requantization are fused.
+      // min_freezed_output and max_freezed_output are the actual range
+      // for the output
+      min_output_value = context->input(6 + bias_index_offset).flat<float>()(0);
+      max_output_value = context->input(7 + bias_index_offset).flat<float>()(0);
+    } else {
+      MklQuantizationRangeForMultiplication<quint8, qint8, qint32>(
+          min_input, max_input, min_filter, max_filter, &min_output_value,
+          &max_output_value);
+    }
+
+    Tensor* output_min = nullptr;
+    Tensor* output_max = nullptr;
+    MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+    output_min_mkl_shape.SetMklTensor(false);
+    output_max_mkl_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 1, &output_min, {},
+                              output_min_mkl_shape);
+    AllocateOutputSetMklShape(context, 2, &output_max, {},
+                              output_max_mkl_shape);
+    output_min->flat<float>()(0) = min_output_value;
+    output_max->flat<float>()(0) = max_output_value;
+  }
+
+ protected:
+  void ExtendConvFwdParams(OpKernelContext* context,
+                           MklConvFwdParams& params) override {
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
+              biasEnabled>::ExtendConvFwdParams(context, params);
+
+    // When the output type is quint8, the output data id requantized
+    // into quint8. A post_op "output_scale" is added to do the conversion.
+    if (std::is_same<Toutput, quint8>::value ||
+        std::is_same<Toutput, qint8>::value) {
+      int bias_index_offset;
+      bias_index_offset = biasEnabled ? 1 : 0;
+
+      const float min_input =
+          context->input(2 + bias_index_offset).flat<float>()(0);
+      const float max_input =
+          context->input(3 + bias_index_offset).flat<float>()(0);
+      const float min_filter =
+          context->input(4 + bias_index_offset).flat<float>()(0);
+      const float max_filter =
+          context->input(5 + bias_index_offset).flat<float>()(0);
+      const float min_freezed_output =
+          context->input(6 + bias_index_offset).flat<float>()(0);
+      const float max_freezed_output =
+          context->input(7 + bias_index_offset).flat<float>()(0);
+
+      float min_output_value;
+      float max_output_value;
+      MklQuantizationRangeForMultiplication<quint8, qint8, qint32>(
+          min_input, max_input, min_filter, max_filter, &min_output_value,
+          &max_output_value);
+      float scale_int32 =
+          std::max(std::abs(min_output_value), std::abs(max_output_value));
+      float scale_eightbit =
+          std::max(std::abs(min_freezed_output), std::abs(max_freezed_output));
+      float scale = 1.0;
+      if (std::is_same<Toutput, quint8>::value)
+        scale = scale_int32 / scale_eightbit / static_cast<float>(1 << 23);
+      else
+        scale = scale_int32 / scale_eightbit / static_cast<float>(1 << 24);
+
+      std::vector<float> output_scale;
+      output_scale.push_back(scale);
+      params.post_op_params.push_back({"output_scale", output_scale});
+    }
+  }
+
+  Tbias* GetBiasHandle(
+      OpKernelContext* context,
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>& conv_fwd_pd,
+      const Tensor& bias_tensor) override {
+    int bias_index_offset;
+    bias_index_offset = biasEnabled ? 1 : 0;
+
+    const float min_input =
+        context->input(2 + bias_index_offset).flat<float>()(0);
+    const float max_input =
+        context->input(3 + bias_index_offset).flat<float>()(0);
+    const float min_filter =
+        context->input(4 + bias_index_offset).flat<float>()(0);
+    const float max_filter =
+        context->input(5 + bias_index_offset).flat<float>()(0);
+
+    std::vector<mkldnn::primitive> net;
+    if (biasEnabled) {
+      if (std::is_same<Tbias, qint32>::value) {
+        return static_cast<Tbias*>(
+            const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
+      }
+      // If bias is enabled and requantization is not fused, scale the
+      // bias to be consistent with quantized-input and quantized-filter.
+      float bias_scale = 255.0 * 127.0 /
+                         (std::max(std::abs(max_input), std::abs(min_input)) *
+                          std::max(std::abs(max_filter), std::abs(min_filter)));
+      std::vector<float> scales;
+      scales.push_back(bias_scale);
+      mkldnn::primitive_attr bias_attr;
+      bias_attr.set_output_scales(0, scales);
+
+      void* bias_buf = static_cast<void*>(
+          const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
+      input_bias_ = new memory(conv_fwd_pd->bias_primitive_desc(), bias_buf);
+      scaled_bias_ = new memory(conv_fwd_pd->bias_primitive_desc());
+      auto reorder_desc = mkldnn::reorder::primitive_desc(
+          input_bias_->get_primitive_desc(), scaled_bias_->get_primitive_desc(),
+          bias_attr);
+      net.push_back(mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_));
+      stream(stream::kind::eager).submit(net).wait();
+      return reinterpret_cast<Tbias*>(scaled_bias_->get_data_handle());
+    } else {
+      return nullptr;
+    }
+  }
+
+  memory* input_bias_ = nullptr;
+  memory* scaled_bias_ = nullptr;
+};
+
+template <typename Device, typename Tbias, typename Toutput,
+          typename Ttemp_output, bool biasEnabled>
+class MklQuantizedConv2DReluOp
+    : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                                  biasEnabled> {
+ public:
+  virtual ~MklQuantizedConv2DReluOp() {}
+
+  explicit MklQuantizedConv2DReluOp(OpKernelConstruction* context)
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
+            context) {}
+
+ protected:
+  void ExtendConvFwdParams(OpKernelContext* context,
+                           MklConvFwdParams& params) override {
+    MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                         biasEnabled>::ExtendConvFwdParams(context, params);
+    params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
+  }
+};
+
+template <typename Device, typename Tbias, typename Toutput,
+          typename Ttemp_output, bool biasEnabled>
+class MklQuantizedConv2DSumReluOp
+    : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                                  biasEnabled> {
+ public:
+  virtual ~MklQuantizedConv2DSumReluOp() {
+    if (this->summand_ != nullptr) {
+      delete this->summand_;
+      summand_ = nullptr;
+    }
+
+    if (this->dst_ != nullptr) {
+      delete this->dst_;
+      dst_ = nullptr;
+    }
+  }
+
+  explicit MklQuantizedConv2DSumReluOp(OpKernelConstruction* context)
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
+            context) {}
+
+ protected:
+  void ExtendConvFwdParams(OpKernelContext* context,
+                           MklConvFwdParams& params) override {
+    MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                         biasEnabled>::ExtendConvFwdParams(context, params);
+    // Calculate the scale (beta in mkldnn api term) for sum
+    if (std::is_same<Toutput, quint8>::value) {
+      int summand_idx = context->num_inputs() / 2 - 1 - 2;
+      DataType summand_type = this->input_type(summand_idx);
+      bool summand_condition =
+          (summand_type == DT_QINT8) || (summand_type == DT_QUINT8);
+      CHECK((summand_condition));
+      int bias_index_offset = biasEnabled ? 1 : 0;
+      const float min_freezed_output =
+          context->input(6 + bias_index_offset).flat<float>()(0);
+      const float max_freezed_output =
+          context->input(7 + bias_index_offset).flat<float>()(0);
+      const float min_freezed_summand =
+          context->input(9 + bias_index_offset).flat<float>()(0);
+      const float max_freezed_summand =
+          context->input(10 + bias_index_offset).flat<float>()(0);
+
+      float scale_output =
+          std::max(std::abs(min_freezed_output), std::abs(max_freezed_output));
+      float scale_summand = std::max(std::abs(min_freezed_summand),
+                                     std::abs(max_freezed_summand));
+      if (summand_type == DT_QUINT8)
+        params.post_op_params.push_back(
+            {"sum", {scale_summand / scale_output}});
+      else
+        params.post_op_params.push_back(
+            {"sum", {2.0f * scale_summand / scale_output}});
+    } else {
+      params.post_op_params.push_back({"sum", {1.0}});
+    }
+    params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
+  }
+
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) override {
+    int summand_idx = context->num_inputs() / 2 - 1;
+    float reorder_sum_scale = 1.0;
+    if (std::is_same<Toutput, quint8>::value) {
+      summand_idx -= 2;
+      DataType summand_type = this->input_type(summand_idx);
+      bool summand_condition =
+          (summand_type == DT_QINT8) || (summand_type == DT_QUINT8);
+      CHECK((summand_condition));
+      Tensor& summand = const_cast<Tensor&>(MklGetInput(context, summand_idx));
+      MklDnnShape summand_mkl_shape;
+      GetMklShape(context, summand_idx, &summand_mkl_shape);
+      auto dst_md = summand_mkl_shape.GetMklLayout();
+      if (summand_mkl_shape.IsMklTensor()) {
+        if (summand_type == DT_QINT8) {
+          summand.UnsafeCopyFromInternal(summand, DT_QUINT8, summand.shape());
+          dst_md.data.data_type =
+              static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
+          summand_mkl_shape.SetMklLayout(&dst_md);
+          summand_mkl_shape.SetElemType(MklDnnType<Toutput>());
+        }
+        ForwardMklTensorInToOutWithMklShape(context, summand_idx, 0,
+                                            summand_mkl_shape);
+        *output_tensor = const_cast<Tensor*>(&summand);
+        return;
+      } else {
+        TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
+                           "Current fusion is not successful."));
+      }
+    }
+    // TODO(mdfaijul): Add cleaner code for non-mkl tensor
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
+              biasEnabled>::AllocateOutputTensor(context, conv_prim_desc,
+                                                 output_dims_mkl_order,
+                                                 output_tf_format,
+                                                 output_tensor);
+    const Tensor& summand = MklGetInput(context, summand_idx);
+    if (summand.dtype() != DT_FLOAT)
+      TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
+                         "Current fusion requires summand to be float"));
+    MklDnnShape summand_mkl_shape;
+    GetMklShape(context, summand_idx, &summand_mkl_shape);
+    // We need to compute scale for the summand
+    int bias_index_offset = biasEnabled ? 1 : 0;
+    const float min_input =
+        context->input(2 + bias_index_offset).flat<float>()(0);
+    const float max_input =
+        context->input(3 + bias_index_offset).flat<float>()(0);
+    const float min_filter =
+        context->input(4 + bias_index_offset).flat<float>()(0);
+    const float max_filter =
+        context->input(5 + bias_index_offset).flat<float>()(0);
+
+    reorder_sum_scale = 255.0 * 127.0 /
+                        (std::max(std::abs(max_input), std::abs(min_input)) *
+                         std::max(std::abs(max_filter), std::abs(min_filter)));
+    std::vector<float> scales;
+    scales.push_back(reorder_sum_scale);
+    mkldnn::primitive_attr reorder_attr;
+    reorder_attr.set_output_scales(0, scales);
+
+    auto summand_md =
+        summand_mkl_shape.IsMklTensor()
+            ? summand_mkl_shape.GetMklLayout()
+            : memory::desc(output_dims_mkl_order, MklDnnType<Tbias>(),
+                           memory::format::nhwc);
+    auto summand_pd = memory::primitive_desc(summand_md, this->cpu_engine_);
+    void* summand_buf =
+        static_cast<void*>(const_cast<Tbias*>(summand.flat<Tbias>().data()));
+    void* dst_buf =
+        static_cast<void*>((*output_tensor)->flat<Ttemp_output>().data());
+    summand_ = new memory(summand_pd, summand_buf);
+    dst_ = new memory(conv_prim_desc.dst_primitive_desc(), dst_buf);
+    auto reorder_desc = mkldnn::reorder::primitive_desc(
+        summand_pd, conv_prim_desc.dst_primitive_desc(), reorder_attr);
+
+    std::vector<mkldnn::primitive> net;
+    net.push_back(mkldnn::reorder(reorder_desc, *summand_, *dst_));
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  memory* summand_ = nullptr;
+  memory* dst_ = nullptr;
+};
+
+// INT8 kernel registration
+// Register NoOp kernel for QunatizedConv2D for qint8 filter
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2D")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation of MklQuntizedConv2D.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2D")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, float, qint32, qint32, false>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, qint32, qint8, qint8, false>);
+
+// Register NoOp kernel for QuantizedConv2DWithBias to get a python interface.
+// This kernel will be replaced by an MKL kernel during graph
+// optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBias")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation MklQuantizedConv2DWithBias.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBias")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, float, qint32, qint32, true>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("Tbias")
+        .TypeConstraint<qint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, qint32, qint8, qint8, true>);
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<float>("Tbias")
+        .TypeConstraint<qint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, float, qint8, qint8, true>);
+
+// Register NoOp kernel for QuantizedConv2DAndRelu to get a python interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRelu")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndReluAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<quint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation of MklQuantizedConv2DAndRelu.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DAndRelu")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, float, qint32, qint32, false>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, qint32, quint8, quint8, false>);
+
+// Register NoOp kernel for QuantizedConv2DWithBiasAndRelu to get a python
+// interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndRelu")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+// Register NoOp kernel for QuantizedConv2DWithBiasAndReluAndRequantize
+// to get a python interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndReluAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<quint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation of MklQuantizedConv2DWithBiasAndRelu.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndRelu")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, float, qint32, qint32, true>);
+
+// Register a templatized implementation of
+// MklQuantizedConv2DWithBiasAndReluAndRequantize.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<float>("Tbias")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, float, quint8, quint8, true>);
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("Tbias")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, qint32, quint8, quint8, true>);
+
+// Register NoOp kernel for QuantizedConv2DWithBiasSumAndRelu to get a python
+// interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasSumAndRelu")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasSumAndReluAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<quint8>("out_type"),
+                        NoOp);
+REGISTER_KERNEL_BUILDER(
+    Name("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type"),
+    NoOp);
+// Register a templatized implementation of MklQuantizedConv2DWithBiasAndRelu.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSumAndRelu")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, float, qint32, qint32, true>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, qint32, quint8, quint8, true>);
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, qint32, quint8, qint8, true>);
+#endif  // INTEL_MKL_ML
 
 // Register 2D operations
-#define REGISTER_MKL_CPU_2D(T)                                      \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                        \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, false>);          \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, true>);           \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
+#define REGISTER_MKL_CPU_2D(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("_MklConv2D")                                               \
+          .Device(DEVICE_CPU)                                          \
+          .TypeConstraint<float>("T")                                  \
+          .Label(mkl_op_registry::kMklOpLabel),                        \
+      MklConvOp<CPUDevice, float, float, float, float, float, false>); \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("_MklConv2DWithBias")                                       \
+          .Device(DEVICE_CPU)                                          \
+          .TypeConstraint<float>("T")                                  \
+          .Label(mkl_op_registry::kMklOpLabel),                        \
+      MklConvOp<CPUDevice, float, float, float, float, float, true>);  \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")             \
+                              .Device(DEVICE_CPU)                      \
+                              .TypeConstraint<T>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),    \
                           MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D);
@@ -1108,7 +1811,7 @@ TF_CALL_float(REGISTER_MKL_CPU_2D);
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, false>);
+                          MklConvOp<CPUDevice, T, T, T, T, T, false>);
 TF_CALL_float(REGISTER_MKL_CPU_3D);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..10825f696253cc6d38bbdee1e6b660d494c34088
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
@@ -0,0 +1,55 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+
+#ifdef INTEL_MKL
+
+namespace tensorflow {
+template <class T>
+float MklFloatForOneQuantizedLevel(float range_min, float range_max) {
+  const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
+  const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+  const float float_for_one_quantized_level =
+      (range_max - range_min) / (highest - lowest);
+  return float_for_one_quantized_level;
+}
+
+template <class T1, class T2, class T3>
+void MklQuantizationRangeForMultiplication(float min_a, float max_a,
+                                           float min_b, float max_b,
+                                           float* min_c, float* max_c) {
+  const float a_float_for_one_quant_level =
+      MklFloatForOneQuantizedLevel<T1>(min_a, max_a);
+  const float b_float_for_one_quant_level =
+      MklFloatForOneQuantizedLevel<T2>(min_b, max_b);
+
+  const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
+  const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
+  const float c_float_for_one_quant_level =
+      a_float_for_one_quant_level * b_float_for_one_quant_level;
+
+  *min_c = c_float_for_one_quant_level * c_lowest;
+  *max_c = c_float_for_one_quant_level * c_highest;
+}
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
+
+#endif  // TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 71e506e5e6fd66cb8166f8f223e86cf0882fb1c4..89b74495c722ba1e46aa4a432653e3749c21dbd0 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/reffed_status_callback.h"
@@ -50,12 +51,29 @@ class PartitionedCallOp : public AsyncOpKernel {
  public:
   explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    string rewriter_config_serialized;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &rewriter_config_serialized));
+    string deprecated_config_serialized;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized));
+    string config_proto_serialized;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized));
     OP_REQUIRES(
-        ctx, rewriter_config_.ParseFromString(rewriter_config_serialized),
-        errors::InvalidArgument("Unable to parse rewriter_config string as "
-                                "tensorflow::RewriterConfig proto."));
+        ctx,
+        deprecated_config_serialized.empty() || config_proto_serialized.empty(),
+        errors::InvalidArgument("Provided both 'config' and 'config_proto' but "
+                                "only one should be provided.  Note the "
+                                "'config' option is deprecated."));
+    if (!deprecated_config_serialized.empty()) {
+      OP_REQUIRES(ctx,
+                  config_proto_.mutable_graph_options()
+                      ->mutable_rewrite_options()
+                      ->ParseFromString(deprecated_config_serialized),
+                  errors::InvalidArgument("Unable to parse config string as "
+                                          "tensorflow::RewriteOptions proto."));
+    } else {
+      OP_REQUIRES(
+          ctx, config_proto_.ParseFromString(config_proto_serialized),
+          errors::InvalidArgument("Unable to parse config_proto string as "
+                                  "tensorflow::ConfigProto proto."));
+    }
     OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
   }
 
@@ -506,7 +524,7 @@ class PartitionedCallOp : public AsyncOpKernel {
                        FunctionLibraryDefinition* flib,
                        const DeviceSet& device_set, Device* cpu_device,
                        std::unique_ptr<Graph>* graph) {
-    if (!tensorflow::grappler::MetaOptimizerEnabled(rewriter_config_)) {
+    if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto_)) {
       return Status::OK();
     }
 
@@ -530,7 +548,7 @@ class PartitionedCallOp : public AsyncOpKernel {
     // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
     // proto (which also contain the OptimizerOptions).
     TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-        item, rewriter_config_, cpu_device, &cluster, &out_graph));
+        item, config_proto_, cpu_device, &cluster, &out_graph));
 
     std::unique_ptr<Graph> optimized_graph(new Graph(OpRegistry::Global()));
     TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(
@@ -562,7 +580,7 @@ class PartitionedCallOp : public AsyncOpKernel {
   }
 
   NameAttrList func_;
-  RewriterConfig rewriter_config_;
+  ConfigProto config_proto_;
   string executor_type_;
   // Contains maps from device names to handles of function partitions, keyed by
   // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc
index b2a342f63783a72369e63d77c2ba9fde407a3511..903a97a9601a9e8613c3189ef61ed9965c82d3d5 100644
--- a/tensorflow/core/kernels/ragged_gather_op.cc
+++ b/tensorflow/core/kernels/ragged_gather_op.cc
@@ -236,8 +236,10 @@ class RaggedGatherOpBase : public OpKernel {
     values_shape.set_dim(0, num_values);
     TF_RETURN_IF_ERROR(
         context->allocate_output(values_index, values_shape, &values_out));
-    int64 value_size = params_dense_values_in.NumElements() /
-                       params_dense_values_in.dim_size(0);
+    const int64 num_elements = params_dense_values_in.NumElements();
+    const int64 value_size =
+        num_elements == 0 ? 0
+                          : (num_elements / params_dense_values_in.dim_size(0));
     CallWriteValueSlices(params_dense_values_in, value_slices, value_size,
                          values_out);
     return ::tensorflow::Status::OK();
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 3393b39faf4a25791b48af99a5e474f3e9bfbfce..edb2b10e3d69b6ac93c13b875d00fa9de7ed5362 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -217,9 +217,9 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     OpKernelContext*, const GPUDevice& d, random::PhiloxRandom gen,
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
-  const int32 block_size = d.maxCudaThreadsPerBlock();
+  const int32 block_size = d.maxGpuThreadsPerBlock();
   const int32 num_blocks =
-      (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
       block_size;
 
   FillPhiloxRandomKernelLaunch<Distribution>
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index f5644d0da4cee34ab8542edffd9937803f5ecea5..e9cf36c62b966f5f91cf7764421f0c1ff6c131fc 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -218,7 +218,11 @@ __global__ void RowReduceKernel(
     T in, outT out, int num_rows, int num_cols, Op op,
     typename std::iterator_traits<T>::value_type initVal) {
   typedef typename std::iterator_traits<T>::value_type value_type;
-  const int row = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+  // Defensive index computation to avoid integer overflow.
+  assert(blockDim.x % 32 == 0);
+  int warps_per_block = blockDim.x / 32;
+  int warp_index = threadIdx.x / 32;
+  const int row = blockIdx.x * warps_per_block + warp_index;
   const int lane = threadIdx.x % 32;
 
   if (num_cols == 1) {
@@ -526,27 +530,27 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
         init);
     return;
   }
-  std::size_t temp_storage_bytes = 0;
 
-  Tensor temp_storage;
-  // written as a loop because it reduces clutter
-  // first pass allocates memory, second launches kernel(s)
-  for (int i = 0; i < 2; ++i) {
-    auto success = cub::DeviceReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, in, out, in_size, op, init, cu_stream);
+  size_t temp_storage_bytes = 0;
+  auto reduce = [&](void* temp_storage_ptr) {
+    auto success =
+        cub::DeviceReduce::Reduce(temp_storage_ptr, temp_storage_bytes, in, out,
+                                  in_size, op, init, cu_stream);
 
     OP_REQUIRES(
         ctx, success == 0,
         errors::Internal("CUB reduce error", cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -569,25 +573,26 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows,
   cub::TransformInputIterator<int, RowOffset, cub::CountingInputIterator<int>>
       transform_iter(counting_iter, row_offset_op);
 
-  std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-  for (int i = 0; i < 2; ++i) {
+  size_t temp_storage_bytes = 0;
+  auto reduce = [&](void* temp_storage_ptr) {
     auto success = cub::DeviceSegmentedReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, in, out, num_rows, transform_iter,
+        temp_storage_ptr, temp_storage_bytes, in, out, num_rows, transform_iter,
         transform_iter + 1, op, init, cu_stream);
 
     OP_REQUIRES(ctx, success == 0,
                 errors::Internal("CUB segmented reduce error",
                                  cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -720,25 +725,25 @@ void Launch3DXZReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x,
                                                                  gather_iter);
 
   std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-
-  for (int i = 0; i < 2; ++i) {
+  auto reduce = [&](void* temp_storage_ptr) {
     auto success = cub::DeviceSegmentedReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, permute_iter, out, extent_y, transform_iter,
-        transform_iter + 1, op, init, cu_stream);
+        temp_storage_ptr, temp_storage_bytes, permute_iter, out, extent_y,
+        transform_iter, transform_iter + 1, op, init, cu_stream);
 
     OP_REQUIRES(ctx, success == 0,
                 errors::Internal("CUB segmented reduce error",
                                  cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 namespace reduction_op_helper {
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
index ed6c6affce54a7e847ede07b329d31411b713bec..ed66c02dc584541ce4d5eb644630b678c1b05916 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,8 +17,20 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+#if CUDA_VERSION >= 9000
+#define CUB_USE_COOPERATIVE_GROUPS
+#endif  // CUDA_VERSION >= 9000
+
+#include "third_party/cub/block/block_load.cuh"
+#include "third_party/cub/block/block_scan.cuh"
+#include "third_party/cub/block/block_store.cuh"
+#include "third_party/cub/iterator/counting_input_iterator.cuh"
+#include "third_party/cub/iterator/transform_input_iterator.cuh"
+#include "cuda/include/cuComplex.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/permutation_input_iterator.h"
+#include "tensorflow/core/util/permutation_output_iterator.h"
 
 #include "tensorflow/core/kernels/scan_ops.h"
 
@@ -27,6 +39,258 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::Index Index;
 
+namespace functor {
+
+// Map a contiguous range to the actual memory locations depending on which
+// axis the scan is taking place over and whether or not reversed.
+struct MapIndexToLocation {
+  __host__ __device__ MapIndexToLocation(int dimx, int dimy, int dimz,
+                                         bool reverse = false)
+      : dimx_(dimx), dimy_(dimy), dimz_(dimz), reverse_(reverse) {}
+
+  __host__ __device__ int operator()(int id) const {
+    if (dimx_ == 1) {
+      int row = id % dimy_;
+      int col = id / dimy_;
+
+      if (reverse_) return (dimy_ - row - 1) * dimz_ + col;
+
+      return row * dimz_ + col;
+    } else if (dimz_ == 1) {
+      if (reverse_) {
+        int row = id / dimy_;
+        int col = id % dimy_;
+        return row * dimy_ + (dimy_ - col - 1);
+      }
+      return id;
+    } else {
+      int col = id % dimy_;
+      int tmp = id / dimy_;
+
+      int row1 = id / (dimy_ * dimz_);
+      int col1 = tmp % dimz_;
+
+      if (reverse_)
+        return row1 * dimy_ * dimz_ + (dimy_ - col - 1) * dimz_ + col1;
+
+      return row1 * dimy_ * dimz_ + col * dimz_ + col1;
+    }
+  }
+
+  int dimx_;
+  int dimy_;
+  int dimz_;
+  bool reverse_;
+};
+
+template <typename T, typename Op>
+struct BlockPrefixCallbackOp {
+  // Running prefix
+  T running_total_;
+  Op op_;
+
+  __device__ BlockPrefixCallbackOp(T running_total, Op op)
+      : running_total_(running_total), op_(op) {}
+
+  // Callback operator to be entered by the first warp of threads in the block.
+  // tid 0 is responsible for returning a value for seeding the block-wide scan.
+  __device__ T operator()(T block_aggregate) {
+    T old_prefix = running_total_;
+    running_total_ = op_(old_prefix, block_aggregate);
+    return old_prefix;
+  }
+};
+
+template <typename T>
+struct Sum {
+  __host__ __device__ T operator()(const T& a, const T& b) const {
+    return a + b;
+  }
+};
+
+template <typename T>
+struct Prod {
+  __host__ __device__ T operator()(const T& a, const T& b) const {
+    return a * b;
+  }
+};
+
+template <typename T, typename Op>
+struct IsSum {
+  constexpr static bool value =
+      (std::is_same<Op, Sum<T>>::value ||
+       std::is_same<Op, Eigen::internal::SumReducer<T>>::value);
+};
+
+template <typename T, typename Op>
+struct IsProd {
+  constexpr static bool value =
+      (std::is_same<Op, Prod<T>>::value ||
+       std::is_same<Op, Eigen::internal::ProdReducer<T>>::value);
+};
+
+template <typename T, typename Op>
+struct IdentityValue {
+  static_assert(IsSum<T, Op>::value || IsProd<T, Op>::value,
+                "IdentityValue not yet defined for this type.");
+
+  template <typename U = T, typename OpCopy = Op>
+  __host__ __device__ U operator()(
+      typename std::enable_if<IsSum<U, OpCopy>::value, U>::type t = U(0)) {
+    return t;
+  }
+
+  template <typename U = T, typename OpCopy = Op>
+  __host__ __device__ U operator()(
+      typename std::enable_if<IsProd<U, OpCopy>::value, U>::type t = U(1)) {
+    return t;
+  }
+};
+
+// Each block is mapped to one sequence.  A contiguous range is mapped to the
+// appropriate locations in memory by the permutation iterators.  This is
+// ideal for 1-D and row based scans.  Column scans would be better if they
+// did a block load and then locally transposed.  CUB's device wide scan is not
+// used in the large 1D case, even though it would be more efficient, because
+// it is not deterministic.
+template <typename T, typename Op, int BlockDim = 128, int ItemsPerThread = 4>
+__global__ void scan_kernel(const T* in, T* out, int dimx, int dimy, int dimz,
+                            bool exclusive, bool reverse, Op op) {
+  typedef cub::BlockLoad<T, BlockDim, ItemsPerThread, cub::BLOCK_LOAD_TRANSPOSE>
+      BlockLoad;
+  typedef cub::BlockStore<T, BlockDim, ItemsPerThread,
+                          cub::BLOCK_STORE_TRANSPOSE>
+      BlockStore;
+  typedef cub::BlockScan<T, BlockDim> BlockScan;
+
+  // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+  __shared__ union {
+    typename BlockLoad::TempStorage load;
+    typename BlockScan::TempStorage scan;
+    typename BlockStore::TempStorage store;
+  } temp_storage;
+
+  int problem_length = dimy;
+
+  // Initialize running total
+  BlockPrefixCallbackOp<T, Op> prefix_op(IdentityValue<T, Op>()(), op);
+
+  MapIndexToLocation map_op(dimx, dimy, dimz, reverse);
+  int block_start = problem_length * blockIdx.x;
+  // Have the block iterate over segments of items
+  for (int block_offset = block_start;
+       block_offset < block_start + problem_length;
+       block_offset += BlockDim * ItemsPerThread) {
+    int valid_items = min(BlockDim * ItemsPerThread,
+                          problem_length - (block_offset % problem_length));
+
+    // first construct a counting iterator that has the desired start point
+    typedef cub::TransformInputIterator<int, MapIndexToLocation,
+                                        cub::CountingInputIterator<int>>
+        MapIterType;
+
+    cub::CountingInputIterator<int> counting_iter(block_offset);
+
+    // Next map the iterator to the actual locations in memory
+    MapIterType map_iter(counting_iter, map_op);
+
+    PermutationInputIterator<T, const T*, MapIterType> permutein_iter(in,
+                                                                      map_iter);
+    PermutationOutputIterator<T, T*, MapIterType> permuteout_iter(out,
+                                                                  map_iter);
+
+    // Load a segment of consecutive items that are blocked across threads
+    T thread_data[ItemsPerThread];
+    BlockLoad(temp_storage.load).Load(permutein_iter, thread_data, valid_items);
+    __syncthreads();
+
+    // Collectively compute the block-wide scan
+    if (exclusive) {
+      BlockScan(temp_storage.scan)
+          .ExclusiveScan(thread_data, thread_data, op, prefix_op);
+    } else {
+      BlockScan(temp_storage.scan)
+          .InclusiveScan(thread_data, thread_data, op, prefix_op);
+    }
+    __syncthreads();
+
+    // Store scanned items to output segment
+    BlockStore(temp_storage.store)
+        .Store(permuteout_iter, thread_data, valid_items);
+    __syncthreads();
+  }
+}
+
+template <typename T, typename Op>
+void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                typename TTypes<T, 3>::Tensor out, Op op, const bool reverse,
+                const bool exclusive) {
+  const int items_per_thread = 4;
+
+  int dimx = in.dimension(0);
+  int dimy = in.dimension(1);
+  int dimz = in.dimension(2);
+  int num_blocks = dimx * dimz;
+
+  int ideal_block_size = dimy / items_per_thread;
+
+  // There seems to be a bug when the type is not float and block_size 1024.
+  // Launch on the smallest power of 2 block size that we can.
+  if (ideal_block_size >= 1024 && std::is_same<T, float>::value) {
+    const int block_size = 1024;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 512) {
+    const int block_size = 512;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 256) {
+    const int block_size = 256;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 128) {
+    const int block_size = 128;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 64) {
+    const int block_size = 64;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else {
+    const int block_size = 32;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  }
+}
+
+template <typename T>
+struct Scan<GPUDevice, Eigen::internal::SumReducer<T>, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                  typename TTypes<T, 3>::Tensor out,
+                  const Eigen::internal::SumReducer<T>& reducer,
+                  const bool reverse, const bool exclusive) {
+    LaunchScan<T, Sum<T>>(d, in, out, Sum<T>(), reverse, exclusive);
+  }
+};
+
+template <typename T>
+struct Scan<GPUDevice, Eigen::internal::ProdReducer<T>, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                  typename TTypes<T, 3>::Tensor out,
+                  const Eigen::internal::ProdReducer<T>& reducer,
+                  const bool reverse, const bool exclusive) {
+    LaunchScan<T, Prod<T>>(d, in, out, Prod<T>(), reverse, exclusive);
+  }
+};
+
+}  // namespace functor
+
 #define DEFINE(REDUCER, T) template struct functor::Scan<GPUDevice, REDUCER, T>;
 
 #define DEFINE_FOR_ALL_REDUCERS(T)           \
diff --git a/tensorflow/core/kernels/scan_ops_test.cc b/tensorflow/core/kernels/scan_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..588b606a99b73588112aec1ca66cabf8d82dc38e
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+template <typename T>
+static Graph* LargeOneDCumsum(int num_x, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DataTypeToEnum<T>::value, TensorShape({num_x}));
+  data.flat<T>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 0;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* ColCumsum(int num_x, int num_y, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 0;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* RowCumsum(int num_x, int num_y, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 1;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({32, num_y, num_z}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 1;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+template <typename T>
+static void LargeOneDimensional(int iters, const string& device, int num_x,
+                                bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
+  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
+}
+
+static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void BM_OneDCumsumGPU(int iters, int num_x) {
+  LargeOneDimensional<float>(iters, "gpu", num_x);
+}
+BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);
+
+static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
+  LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
+}
+BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);
+
+static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
+  DoRowCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
+  DoColCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
+  Do3DYCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);
+
+static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
+  LargeOneDimensional<float>(iters, "gpu", num_x, true);
+}
+BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);
+
+static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  DoRowCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  DoColCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  Do3DYCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 866c5dcd521b2a33f44e2466262ec72b577ffa23..2ea7a1ed3b9c5c37e0c93edef9431ce0438d380d 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -44,6 +44,10 @@ limitations under the License.
 #include "include/libxsmm_spmdm.h"
 #endif
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace {
 
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c70a2d62d36b94362c6f10473644f2623b77d2a
--- /dev/null
+++ b/tensorflow/core/kernels/stack.cc
@@ -0,0 +1,339 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/stack.h"
+
+#include <limits.h>
+#include <atomic>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class Stack : public ResourceBase {
+ public:
+  static std::atomic<int64> stack_counter;
+
+  struct TensorAndAllocation {
+    Tensor tensor;
+    AllocatorAttributes alloc_attrs;
+    bool swapped_to_cpu;
+  };
+
+  Stack(const DataType& elem_type, const string& stack_name, int max_size)
+      : elem_type_(elem_type),
+        stack_name_(stack_name),
+        max_size_(max_size),
+        closed_(false) {}
+
+  Status Push(const TensorAndAllocation& value) {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(CheckNotClosed());
+    if (max_size_ >= 0 && stack_.size() >= max_size_) {
+      return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ",
+                                     "its max_size (", max_size_, ")");
+    }
+    stack_.push_back(value);
+    return Status::OK();
+  }
+
+  Status Pop(TensorAndAllocation* value) {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(CheckNotClosed());
+    if (stack_.empty()) {
+      return errors::InvalidArgument("Stack[", stack_name_,
+                                     "] is empty when calling Pop().");
+    }
+    *value = stack_.back();
+    stack_.pop_back();
+    return Status::OK();
+  }
+
+  // We don't swap the first tensor on the stack and any subsequent tensors
+  // that share the buffer with the first tensor.
+  bool IsUsefulToSwap(const Tensor& tensor) const {
+    mutex_lock l(mu_);
+    if (stack_.empty()) {
+      return false;
+    }
+    const Tensor& first = stack_.front().tensor;
+    return !tensor.SharesBufferWith(first);
+  }
+
+  void Close() {
+    mutex_lock l(mu_);
+    stack_.clear();
+    closed_ = true;
+  }
+
+  DataType ElemType() { return elem_type_; }
+
+  string DebugString() override {
+    mutex_lock l(mu_);
+    return strings::StrCat("Stack[", stack_name_, "]");
+  }
+
+  const string& stack_name() { return stack_name_; }
+
+ private:
+  friend class StackOp;
+  mutex* mu() { return &mu_; }
+
+  mutable mutex mu_;
+  DataType elem_type_;
+  const string stack_name_;
+  Tensor handle_;
+  int max_size_;
+  bool closed_ GUARDED_BY(mu_);
+  std::vector<TensorAndAllocation> stack_ GUARDED_BY(mu_);
+
+  Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    if (closed_) {
+      return errors::InvalidArgument("Stack[", stack_name_,
+                                     "] has already been closed.");
+    }
+    return Status::OK();
+  }
+};
+
+Status GetStack(OpKernelContext* ctx, Stack** stack) {
+  if (ctx->input_dtype(0) == DT_RESOURCE) {
+    return LookupResource(ctx, HandleFromInput(ctx, 0), stack);
+  } else {
+    Tensor Tstack_handle = ctx->mutable_input(0, false);
+    if (Tstack_handle.NumElements() != 2) {
+      return errors::InvalidArgument(
+          "Stack handle must have two elements, but had shape: ",
+          Tstack_handle.shape().DebugString());
+    }
+    const string& container = Tstack_handle.flat<string>()(0);
+    const string& stack_name = Tstack_handle.flat<string>()(1);
+    string key = strings::StrCat(container, stack_name);
+    ResourceMgr* rm = ctx->resource_manager();
+    if (rm == nullptr) {
+      return errors::Internal("No resource manager.");
+    }
+    auto* step_container = ctx->step_container();
+    if (step_container == nullptr) {
+      return errors::Internal("No step container.");
+    }
+    TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack));
+    return Status::OK();
+  }
+}
+
+std::atomic<int64> Stack::stack_counter{0};
+
+// StackOp
+
+StackOp::StackOp(OpKernelConstruction* context) : OpKernel(context) {
+  OP_REQUIRES_OK(context, context->GetAttr("elem_type", &elem_type_));
+  OP_REQUIRES_OK(context, context->GetAttr("stack_name", &stack_name_));
+  if (stack_name_.empty()) stack_name_ = name();
+}
+
+void StackOp::Compute(OpKernelContext* ctx) {
+  int32 size = std::numeric_limits<int32>::max();
+  if (ctx->num_inputs() > 0) {
+    const Tensor* tensor_size;
+    OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size));
+
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(tensor_size->shape()),
+        errors::InvalidArgument("Stack size must be a scalar, but had shape: ",
+                                tensor_size->shape().DebugString()));
+
+    int32 size_value = tensor_size->scalar<int32>()();
+    if (size_value >= 0) {
+      size = size_value;
+    }
+  }
+
+  static const char kContainer[] = "_stacks";
+  auto stack_id = Stack::stack_counter.fetch_add(1);
+  string stack_name = strings::StrCat(stack_name_, "_", stack_id);
+  // Store the handle in a per-step container.
+  ResourceMgr* rm = ctx->resource_manager();
+  OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));
+  string key = strings::StrCat(kContainer, stack_name);
+  Stack* stack = new Stack(elem_type_, stack_name, size);
+  auto* step_container = ctx->step_container();
+  OP_REQUIRES(ctx, step_container != nullptr,
+              errors::Internal("No step container."));
+  OP_REQUIRES_OK(ctx, rm->Create(step_container->name(), key, stack));
+  if (IsRefType(ctx->expected_output_dtype(0))) {
+    // Create the stack handle.
+    AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING,
+                                           tensorflow::TensorShape({2}),
+                                           &stack->handle_, alloc_attr));
+    auto handle = stack->handle_.flat<string>();
+    handle(0) = kContainer;
+    handle(1) = std::move(stack_name);
+    ctx->set_output_ref(0, stack->mu(), &stack->handle_);
+  } else {
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->flat<ResourceHandle>()(0) =
+        MakePerStepResourceHandle<Stack>(ctx, key);
+  }
+}
+
+// StackPushOp
+
+StackPushOp::StackPushOp(OpKernelConstruction* context, bool allow_swapping)
+    : AsyncOpKernel(context) {
+  if (allow_swapping) {
+    OP_REQUIRES_OK(context, context->GetAttr("swap_memory", &swap_memory_));
+  }
+}
+
+void StackPushOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  // Get the stack from the handle.
+  Stack* stack = nullptr;
+  OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
+  core::ScopedUnref unref(stack);
+
+  if (ctx->input_dtype(1) != stack->ElemType()) {
+    ctx->CtxFailure(errors::InvalidArgument("Must have type ",
+                                            stack->ElemType(), " but got ",
+                                            ctx->input_dtype(1)));
+    done();
+    return;
+  }
+
+  // Push the tensor onto the stack. Swap the tensor to CPU if instructed.
+  const Tensor& tensor = ctx->input(1);
+  AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
+  // For now, we use a simple heuristic for swapping: A GPU tensor is moved
+  // to CPU if the tensor has more than kCopyThreshold bytes and the GPU
+  // allocator says more than kOccupancy of the memory is in use.
+  static constexpr int kCopyThreshold = 2048;
+  static constexpr double kOccupancy = 0.7;
+  if (swap_memory_ && !alloc_attrs.on_host() &&
+      tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
+    DeviceContext* device_ctxt = ctx->op_device_context();
+    auto device = static_cast<tensorflow::Device*>(ctx->device());
+    Allocator* allocator = device->GetAllocator(alloc_attrs);
+    AllocatorStats stats;
+    allocator->GetStats(&stats);
+    if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
+      // Asynchronously copy the tensor from GPU to CPU memory.
+      // TODO(yuanbyu): Swap the oldest tensor first.
+      AllocatorAttributes host_alloc_attrs;
+      host_alloc_attrs.set_gpu_compatible(true);
+      host_alloc_attrs.set_on_host(true);
+      Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs);
+      Tensor* cpu_tensor =
+          new Tensor(cpu_allocator, tensor.dtype(), tensor.shape());
+      device_ctxt->CopyDeviceTensorToCPU(
+          &tensor, "StackPush", device, cpu_tensor,
+          [cpu_tensor, stack, ctx, done](const Status& s) {
+            ctx->SetStatus(s);
+            if (s.ok()) {
+              AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
+              ctx->SetStatus(stack->Push({*cpu_tensor, alloc_attrs, true}));
+            }
+            if (ctx->status().ok()) {
+              ctx->set_output(0, *cpu_tensor);
+            }
+            done();
+            delete cpu_tensor;
+          });
+      return;
+    }
+  }
+
+  // Execute synchronously if not swapped.
+  OP_REQUIRES_OK_ASYNC(ctx, stack->Push({tensor, alloc_attrs, false}), done);
+  ctx->set_output(0, tensor);
+  done();
+}
+
+bool StackPushOp::IsExpensive() { return false; }
+
+// StackPopOp
+
+StackPopOp::StackPopOp(OpKernelConstruction* context)
+    : AsyncOpKernel(context) {}
+
+void StackPopOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  // Get the stack from the handle.
+  Stack* stack = nullptr;
+  OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
+  core::ScopedUnref unref(stack);
+
+  // Pop the tensor. Transfer the tensor back to device if it was
+  // swapped out to CPU.
+  Stack::TensorAndAllocation value;
+  OP_REQUIRES_OK_ASYNC(ctx, stack->Pop(&value), done);
+  if (value.swapped_to_cpu) {
+    // Asynchronously copy the tensor back from CPU to GPU memory.
+    DeviceContext* device_ctxt = ctx->op_device_context();
+    Device* device = static_cast<Device*>(ctx->device());
+    Tensor* cpu_tensor = &value.tensor;
+    Allocator* gpu_allocator = device->GetAllocator(value.alloc_attrs);
+    Tensor* device_tensor =
+        new Tensor(gpu_allocator, cpu_tensor->dtype(), cpu_tensor->shape());
+    device_ctxt->CopyCPUTensorToDevice(
+        cpu_tensor, device, device_tensor,
+        [device_tensor, ctx, done](const Status& s) {
+          ctx->SetStatus(s);
+          if (s.ok()) {
+            ctx->set_output(0, *device_tensor);
+          }
+          done();
+          delete device_tensor;
+        });
+  } else {
+    // Execute synchronously if not swapped.
+    ctx->set_output(0, value.tensor);
+    done();
+  }
+}
+
+bool StackPopOp::IsExpensive() { return false; }
+
+// StackCloseOp
+
+StackCloseOp::StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+void StackCloseOp::Compute(OpKernelContext* ctx) {
+  Stack* stack = nullptr;
+  OP_REQUIRES_OK(ctx, GetStack(ctx, &stack));
+  core::ScopedUnref unref(stack);
+  stack->Close();
+}
+
+bool StackCloseOp::IsExpensive() { return false; }
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stack.h b/tensorflow/core/kernels/stack.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1927e1f28fa217822aabedb0211546dd7c72758
--- /dev/null
+++ b/tensorflow/core/kernels/stack.h
@@ -0,0 +1,76 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STACK_H_
+#define TENSORFLOW_CORE_KERNELS_STACK_H_
+
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// A per-run local stack. The stack uses a "per-step" resource manager which
+// ensures that correct garbage collection on error or successful completion.
+class StackOp : public OpKernel {
+ public:
+  explicit StackOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  DataType elem_type_;
+  string stack_name_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
+};
+
+class StackPushOp : public AsyncOpKernel {
+ public:
+  StackPushOp(OpKernelConstruction* context, bool allow_swapping);
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+  bool IsExpensive() override;
+
+ private:
+  bool swap_memory_ = false;
+};
+
+// Templated helper to make it easier to register kernels with or without
+// swapping.
+template <bool allow_swapping>
+class TemplatedStackPushOp : public StackPushOp {
+ public:
+  TemplatedStackPushOp(OpKernelConstruction* context)
+      : StackPushOp(context, allow_swapping) {}
+};
+
+class StackPopOp : public AsyncOpKernel {
+ public:
+  explicit StackPopOp(OpKernelConstruction* context);
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+  bool IsExpensive() override;
+};
+
+class StackCloseOp : public OpKernel {
+ public:
+  explicit StackCloseOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
+  bool IsExpensive() override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STACK_H_
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index add4afafc92d4eee56447550390e19b89a95141d..df94a8818e7edd1b7313da4c483725e2119997af 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/data_flow_ops.cc.
 
+#include "tensorflow/core/kernels/stack.h"
+
 #include <limits.h>
 #include <atomic>
 #include <vector>
@@ -38,191 +40,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
-#ifdef TENSORFLOW_USE_SYCL
-typedef Eigen::SyclDevice SYCLDevice;
-#endif  // TENSORFLOW_USE_SYCL
-
-class Stack : public ResourceBase {
- public:
-  static std::atomic<int64> stack_counter;
-
-  struct TensorAndAllocation {
-    Tensor tensor;
-    AllocatorAttributes alloc_attrs;
-    bool swapped_to_cpu;
-  };
-
-  Stack(const DataType& elem_type, const string& stack_name, int max_size)
-      : elem_type_(elem_type),
-        stack_name_(stack_name),
-        max_size_(max_size),
-        closed_(false) {}
-
-  Status Push(const TensorAndAllocation& value) {
-    mutex_lock l(mu_);
-    TF_RETURN_IF_ERROR(CheckNotClosed());
-    if (max_size_ >= 0 && stack_.size() >= max_size_) {
-      return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ",
-                                     "its max_size (", max_size_, ")");
-    }
-    stack_.push_back(value);
-    return Status::OK();
-  }
-
-  Status Pop(TensorAndAllocation* value) {
-    mutex_lock l(mu_);
-    TF_RETURN_IF_ERROR(CheckNotClosed());
-    if (stack_.empty()) {
-      return errors::InvalidArgument("Stack[", stack_name_,
-                                     "] is empty when calling Pop().");
-    }
-    *value = stack_.back();
-    stack_.pop_back();
-    return Status::OK();
-  }
-
-  // We don't swap the first tensor on the stack and any subsequent tensors
-  // that share the buffer with the first tensor.
-  bool IsUsefulToSwap(const Tensor& tensor) const {
-    mutex_lock l(mu_);
-    if (stack_.empty()) {
-      return false;
-    }
-    const Tensor& first = stack_.front().tensor;
-    return !tensor.SharesBufferWith(first);
-  }
-
-  void Close() {
-    mutex_lock l(mu_);
-    stack_.clear();
-    closed_ = true;
-  }
-
-  DataType ElemType() { return elem_type_; }
-
-  string DebugString() override {
-    mutex_lock l(mu_);
-    return strings::StrCat("Stack[", stack_name_, "]");
-  }
-
-  const string& stack_name() { return stack_name_; }
-
- private:
-  friend class StackOp;
-  mutex* mu() { return &mu_; }
-
-  mutable mutex mu_;
-  DataType elem_type_;
-  const string stack_name_;
-  Tensor handle_;
-  int max_size_;
-  bool closed_ GUARDED_BY(mu_);
-  std::vector<TensorAndAllocation> stack_ GUARDED_BY(mu_);
-
-  Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (closed_) {
-      return errors::InvalidArgument("Stack[", stack_name_,
-                                     "] has already been closed.");
-    }
-    return Status::OK();
-  }
-};
-
-Status GetStack(OpKernelContext* ctx, Stack** stack) {
-  if (ctx->input_dtype(0) == DT_RESOURCE) {
-    return LookupResource(ctx, HandleFromInput(ctx, 0), stack);
-  } else {
-    Tensor Tstack_handle = ctx->mutable_input(0, false);
-    if (Tstack_handle.NumElements() != 2) {
-      return errors::InvalidArgument(
-          "Stack handle must have two elements, but had shape: ",
-          Tstack_handle.shape().DebugString());
-    }
-    const string& container = Tstack_handle.flat<string>()(0);
-    const string& stack_name = Tstack_handle.flat<string>()(1);
-    string key = strings::StrCat(container, stack_name);
-    ResourceMgr* rm = ctx->resource_manager();
-    if (rm == nullptr) {
-      return errors::Internal("No resource manager.");
-    }
-    auto* step_container = ctx->step_container();
-    if (step_container == nullptr) {
-      return errors::Internal("No step container.");
-    }
-    TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack));
-    return Status::OK();
-  }
-}
-
-std::atomic<int64> Stack::stack_counter{0};
-
-// A per-run local stack. The stack uses a "per-step" resource manager which
-// ensures that correct garbage collection on error or successful completion.
-class StackOp : public OpKernel {
- public:
-  explicit StackOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("elem_type", &elem_type_));
-    OP_REQUIRES_OK(context, context->GetAttr("stack_name", &stack_name_));
-    if (stack_name_.empty()) stack_name_ = name();
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    int32 size = std::numeric_limits<int32>::max();
-    if (ctx->num_inputs() > 0) {
-      const Tensor* tensor_size;
-      OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size));
-
-      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_size->shape()),
-                  errors::InvalidArgument(
-                      "Stack size must be a scalar, but had shape: ",
-                      tensor_size->shape().DebugString()));
-
-      int32 size_value = tensor_size->scalar<int32>()();
-      if (size_value >= 0) {
-        size = size_value;
-      }
-    }
-
-    static const char kContainer[] = "_stacks";
-    auto stack_id = Stack::stack_counter.fetch_add(1);
-    string stack_name = strings::StrCat(stack_name_, "_", stack_id);
-    // Store the handle in a per-step container.
-    ResourceMgr* rm = ctx->resource_manager();
-    OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));
-    string key = strings::StrCat(kContainer, stack_name);
-    Stack* stack = new Stack(elem_type_, stack_name, size);
-    auto* step_container = ctx->step_container();
-    OP_REQUIRES(ctx, step_container != nullptr,
-                errors::Internal("No step container."));
-    OP_REQUIRES_OK(ctx, rm->Create(step_container->name(), key, stack));
-    if (IsRefType(ctx->expected_output_dtype(0))) {
-      // Create the stack handle.
-      AllocatorAttributes alloc_attr;
-      alloc_attr.set_on_host(true);
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING,
-                                             tensorflow::TensorShape({2}),
-                                             &stack->handle_, alloc_attr));
-      auto handle = stack->handle_.flat<string>();
-      handle(0) = kContainer;
-      handle(1) = std::move(stack_name);
-      ctx->set_output_ref(0, stack->mu(), &stack->handle_);
-    } else {
-      Tensor* handle;
-      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
-      handle->flat<ResourceHandle>()(0) =
-          MakePerStepResourceHandle<Stack>(ctx, key);
-    }
-  }
-
- private:
-  DataType elem_type_;
-  string stack_name_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
-};
-
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp);
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"),
                         StackOp);
@@ -242,102 +59,22 @@ REGISTER_KERNEL_BUILDER(Name("StackV2")
                         StackOp);
 #endif  // TENSORFLOW_USE_SYCL
 
-template <typename Device>
-class StackPushOp : public AsyncOpKernel {
- public:
-  explicit StackPushOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("swap_memory", &swap_memory_));
-  }
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    // Get the stack from the handle.
-    Stack* stack = nullptr;
-    OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
-    core::ScopedUnref unref(stack);
-
-    if (ctx->input_dtype(1) != stack->ElemType()) {
-      ctx->CtxFailure(errors::InvalidArgument("Must have type ",
-                                              stack->ElemType(), " but got ",
-                                              ctx->input_dtype(1)));
-      done();
-      return;
-    }
-
-    // Push the tensor onto the stack. Swap the tensor to CPU if instructed.
-    const Tensor& tensor = ctx->input(1);
-    AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
-    // For now, we use a simple heuristic for swapping: A GPU tensor is moved
-    // to CPU if the tensor has more than kCopyThreshold bytes and the GPU
-    // allocator says more than kOccupancy of the memory is in use.
-    static constexpr int kCopyThreshold = 2048;
-    static constexpr double kOccupancy = 0.7;
-    if (swap_memory_ && !alloc_attrs.on_host() &&
-        (std::is_same<Device, GPUDevice>::value
-#ifdef TENSORFLOW_USE_SYCL
-         || std::is_same<Device, SYCLDevice>::value
-#endif  // TENSORFLOW_USE_SYCL
-         ) &&
-        tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
-      DeviceContext* device_ctxt = ctx->op_device_context();
-      auto device = static_cast<tensorflow::Device*>(ctx->device());
-      Allocator* allocator = device->GetAllocator(alloc_attrs);
-      AllocatorStats stats;
-      allocator->GetStats(&stats);
-      if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
-        // Asynchronously copy the tensor from GPU to CPU memory.
-        // TODO(yuanbyu): Swap the oldest tensor first.
-        AllocatorAttributes host_alloc_attrs;
-        host_alloc_attrs.set_gpu_compatible(true);
-        host_alloc_attrs.set_on_host(true);
-        Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs);
-        Tensor* cpu_tensor =
-            new Tensor(cpu_allocator, tensor.dtype(), tensor.shape());
-        device_ctxt->CopyDeviceTensorToCPU(
-            &tensor, "StackPush", device, cpu_tensor,
-            [cpu_tensor, stack, ctx, done](const Status& s) {
-              ctx->SetStatus(s);
-              if (s.ok()) {
-                AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
-                ctx->SetStatus(stack->Push({*cpu_tensor, alloc_attrs, true}));
-              }
-              if (ctx->status().ok()) {
-                ctx->set_output(0, *cpu_tensor);
-              }
-              done();
-              delete cpu_tensor;
-            });
-        return;
-      }
-    }
-
-    // Execute synchronously if not swapped.
-    OP_REQUIRES_OK_ASYNC(ctx, stack->Push({tensor, alloc_attrs, false}), done);
-    ctx->set_output(0, tensor);
-    done();
-  }
-
-  bool IsExpensive() override { return false; }
-
- private:
-  bool swap_memory_;
-};
-
 REGISTER_KERNEL_BUILDER(Name("StackPush").Device(DEVICE_CPU),
-                        StackPushOp<CPUDevice>);
+                        TemplatedStackPushOp</*allow_swapping=*/false>);
 REGISTER_KERNEL_BUILDER(Name("StackPushV2").Device(DEVICE_CPU),
-                        StackPushOp<CPUDevice>);
-
-#define REGISTER_GPU_KERNEL(type)                         \
-  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);        \
-  REGISTER_KERNEL_BUILDER(Name("StackPushV2")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);
+                        TemplatedStackPushOp</*allow_swapping=*/false>);
+
+#define REGISTER_GPU_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")                               \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>); \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")                             \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
@@ -345,21 +82,21 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 // Special GPU kernels for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-#define REGISTER_GPU_HOST_KERNEL(type)                    \
-  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .HostMemory("elem")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);        \
-  REGISTER_KERNEL_BUILDER(Name("StackPushV2")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .HostMemory("elem")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);
+#define REGISTER_GPU_HOST_KERNEL(type)                                    \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")                               \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .HostMemory("elem")                         \
+                              .HostMemory("output")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>); \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")                             \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .HostMemory("elem")                         \
+                              .HostMemory("output")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(bool);
@@ -372,7 +109,7 @@ REGISTER_GPU_HOST_KERNEL(bool);
                               .Device(DEVICE_SYCL)        \
                               .HostMemory("handle")       \
                               .TypeConstraint<type>("T"), \
-                          StackPushOp<SYCLDevice>);
+                          TemplatedStackPushOp</*allow_swapping=*/true>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
 
@@ -383,7 +120,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
                               .HostMemory("elem")         \
                               .HostMemory("output")       \
                               .TypeConstraint<type>("T"), \
-                          StackPushOp<SYCLDevice>)
+                          TemplatedStackPushOp</*allow_swapping=*/true>)
 
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(bool);
@@ -391,48 +128,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-class StackPopOp : public AsyncOpKernel {
- public:
-  explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    // Get the stack from the handle.
-    Stack* stack = nullptr;
-    OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
-    core::ScopedUnref unref(stack);
-
-    // Pop the tensor. Transfer the tensor back to device if it was
-    // swapped out to CPU.
-    Stack::TensorAndAllocation value;
-    OP_REQUIRES_OK_ASYNC(ctx, stack->Pop(&value), done);
-    if (value.swapped_to_cpu) {
-      // Asynchronously copy the tensor back from CPU to GPU memory.
-      DeviceContext* device_ctxt = ctx->op_device_context();
-      Device* device = static_cast<Device*>(ctx->device());
-      Tensor* cpu_tensor = &value.tensor;
-      Allocator* gpu_allocator = device->GetAllocator(value.alloc_attrs);
-      Tensor* device_tensor =
-          new Tensor(gpu_allocator, cpu_tensor->dtype(), cpu_tensor->shape());
-      device_ctxt->CopyCPUTensorToDevice(
-          cpu_tensor, device, device_tensor,
-          [device_tensor, ctx, done](const Status& s) {
-            ctx->SetStatus(s);
-            if (s.ok()) {
-              ctx->set_output(0, *device_tensor);
-            }
-            done();
-            delete device_tensor;
-          });
-    } else {
-      // Execute synchronously if not swapped.
-      ctx->set_output(0, value.tensor);
-      done();
-    }
-  }
-
-  bool IsExpensive() override { return false; }
-};
-
 REGISTER_KERNEL_BUILDER(Name("StackPop").Device(DEVICE_CPU), StackPopOp);
 REGISTER_KERNEL_BUILDER(Name("StackPopV2").Device(DEVICE_CPU), StackPopOp);
 
@@ -498,20 +193,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-class StackCloseOp : public OpKernel {
- public:
-  explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    Stack* stack = nullptr;
-    OP_REQUIRES_OK(ctx, GetStack(ctx, &stack));
-    core::ScopedUnref unref(stack);
-    stack->Close();
-  }
-
-  bool IsExpensive() override { return false; }
-};
-
 REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp);
 REGISTER_KERNEL_BUILDER(
     Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp);
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index a97a71b344d64be09daf919c387d55a5c06db5aa..aa85f546a81d0e6b8cf41fc23532fd4a11fe42ec 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -352,9 +352,9 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
     }
 
     const auto key = strings::StrCat(output_handle(0), output_handle(1));
-    auto creator = [this, key, tensor_array, array_size, marked_size,
-                    element_shape, shape_to_prepend, tensor_array_output_handle,
-                    output_handle](TensorArray** ret) -> Status {
+    auto creator = [key, tensor_array, array_size, marked_size, element_shape,
+                    shape_to_prepend,
+                    tensor_array_output_handle](TensorArray** ret) -> Status {
       *ret = new TensorArray(
           key, tensor_array->ElemType(), *tensor_array_output_handle,
           array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
diff --git a/tensorflow/core/kernels/tensor_forest/BUILD b/tensorflow/core/kernels/tensor_forest/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..df035506f7698d1d213efad6088e9bfb53d97282
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/BUILD
@@ -0,0 +1,53 @@
+# Description:
+#   OpKernels for tensor forest ops.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+
+cc_library(
+    name = "resources",
+    srcs = ["resources.cc"],
+    hdrs = ["resources.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "resource_ops",
+    srcs = ["resource_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "prediction_ops",
+    srcs = ["prediction_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_forest_ops",
+    deps = [
+        ":prediction_ops",
+        ":resource_ops",
+    ],
+)
diff --git a/tensorflow/core/kernels/tensor_forest/prediction_ops.cc b/tensorflow/core/kernels/tensor_forest/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e75421fb95791c9dc8aa3b3baf13cffed50d3da
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/prediction_ops.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+class TensorForestTreePredictOp : public OpKernel {
+ public:
+  explicit TensorForestTreePredictOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* dense_features_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->input("dense_features", &dense_features_t));
+
+    auto dense_features = dense_features_t->matrix<float>();
+    const int32 batch_size = dense_features_t->dim_size(0);
+
+    Tensor* output_predictions = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, {batch_size, logits_dimension_},
+                                            &output_predictions));
+    auto out = output_predictions->matrix<float>();
+
+    if (decision_tree_resource->get_size() <= 0) {
+      out.setZero();
+      return;
+    }
+    auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    const int32 num_threads = worker_threads->num_threads;
+
+    // TODO(yupbank): This was from contrib version.
+    //  This cost would probably depend on the depth of the tree we have.
+    //  We will need to run it on a number of trees of diff depth
+    //  and see the num of cpu cycles
+    const int64 cost_per_traverse = 500;
+    auto traverse = [this, &out, &dense_features, decision_tree_resource,
+                     batch_size](int64 start, int64 end) {
+      DCHECK_LE(start, end) << "Start exceeding End";
+      DCHECK_LE(end, batch_size) << "End exceeding batch size";
+      for (int example_id = start; example_id < end; ++example_id) {
+        const int32 leaf_id =
+            decision_tree_resource->TraverseTree(example_id, &dense_features);
+        set_output_value(example_id, leaf_id, decision_tree_resource, &out);
+      }
+    };
+    Shard(num_threads, worker_threads->workers, batch_size, cost_per_traverse,
+          traverse);
+  };
+
+  void set_output_value(const int32 example_id, const int32 leaf_id,
+                        const TensorForestTreeResource* decision_tree_resource,
+                        TTypes<float>::Matrix* out) const {
+    for (int j = 0; j < logits_dimension_; ++j) {
+      const float logit = decision_tree_resource->get_prediction(leaf_id, j);
+      (*out)(example_id, j) = logit;
+    }
+  }
+
+ private:
+  int32 logits_dimension_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreePredict").Device(DEVICE_CPU),
+                        TensorForestTreePredictOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resource_ops.cc b/tensorflow/core/kernels/tensor_forest/resource_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0474d56098f50412345fe017c8bdfb09e908be0b
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resource_ops.cc
@@ -0,0 +1,136 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+
+namespace tensorflow {
+
+class TensorForestCreateTreeVariableOp : public OpKernel {
+ public:
+  explicit TensorForestCreateTreeVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+
+    auto* const result = new TensorForestTreeResource();
+
+    if (!result->InitFromSerialized(tree_config_t->scalar<string>()())) {
+      result->Unref();
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree config."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+};
+
+// Op for serializing a model.
+class TensorForestTreeSerializeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_config_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, TensorShape(), &output_config_t));
+    output_config_t->scalar<string>()() =
+        decision_tree_resource->decision_tree().SerializeAsString();
+  }
+};
+
+// Op for deserializing a tree variable from a checkpoint.
+class TensorForestTreeDeserializeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+
+    // Deallocate all the previous objects on the resource.
+    decision_tree_resource->Reset();
+
+    if (!decision_tree_resource->InitFromSerialized(
+            tree_config_t->scalar<string>()())) {
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree config."));
+    }
+  }
+};
+
+// Op for getting tree size.
+class TensorForestTreeSizeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeSizeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape(), &output_t));
+    output_t->scalar<int32>()() = decision_tree_resource->get_size();
+  }
+};
+
+REGISTER_RESOURCE_HANDLE_KERNEL(TensorForestTreeResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorForestTreeIsInitializedOp").Device(DEVICE_CPU),
+    IsResourceInitialized<TensorForestTreeResource>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorForestCreateTreeVariable").Device(DEVICE_CPU),
+    TensorForestCreateTreeVariableOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeSerialize").Device(DEVICE_CPU),
+                        TensorForestTreeSerializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeDeserialize").Device(DEVICE_CPU),
+                        TensorForestTreeDeserializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeSize").Device(DEVICE_CPU),
+                        TensorForestTreeSizeOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resources.cc b/tensorflow/core/kernels/tensor_forest/resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcd1a1e904171c6c97a6c1cb5ce0809e393be015
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resources.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+const boosted_trees::Tree& TensorForestTreeResource::decision_tree() const {
+  return *decision_tree_;
+}
+
+const int32 TensorForestTreeResource::get_size() const {
+  return decision_tree_->nodes_size();
+}
+
+TensorForestTreeResource::TensorForestTreeResource()
+    : decision_tree_(
+          protobuf::Arena::CreateMessage<boosted_trees::Tree>(&arena_)) {}
+
+const float TensorForestTreeResource::get_prediction(
+    const int32 id, const int32 dimension_id) const {
+  return decision_tree_->nodes(id).leaf().vector().value(dimension_id);
+}
+
+const int32 TensorForestTreeResource::TraverseTree(
+    const int32 example_id,
+    const TTypes<float>::ConstMatrix* dense_data) const {
+  using boosted_trees::Node;
+  using boosted_trees::Tree;
+  int32 current_id = 0;
+  while (true) {
+    const Node& current = decision_tree_->nodes(current_id);
+    if (current.has_leaf()) {
+      return current_id;
+    }
+    DCHECK_EQ(current.node_case(), Node::kDenseSplit);
+    const auto& split = current.dense_split();
+
+    if ((*dense_data)(example_id, split.feature_id()) <= split.threshold()) {
+      current_id = split.left_id();
+    } else {
+      current_id = split.right_id();
+    }
+  }
+}
+
+bool TensorForestTreeResource::InitFromSerialized(const string& serialized) {
+  return ParseProtoUnlimited(decision_tree_, serialized);
+}
+
+void TensorForestTreeResource::Reset() {
+  arena_.Reset();
+  DCHECK_EQ(0, arena_.SpaceAllocated());
+  decision_tree_ = protobuf::Arena::CreateMessage<boosted_trees::Tree>(&arena_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resources.h b/tensorflow/core/kernels/tensor_forest/resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..da258e5017ca8cc9b996d83bcd767e89d61322d7
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resources.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Forward declaration for proto class Tree.
+namespace boosted_trees {
+class Tree;
+}  // namespace boosted_trees
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class TensorForestTreeResource : public ResourceBase {
+ public:
+  TensorForestTreeResource();
+
+  string DebugString() override {
+    return strings::StrCat("TensorForestTree[size=", get_size(), "]");
+  }
+
+  mutex* get_mutex() { return &mu_; }
+
+  bool InitFromSerialized(const string& serialized);
+
+  // Resets the resource and frees the proto.
+  // Caller needs to hold the mutex lock while calling this.
+  void Reset();
+
+  const int32 get_size() const;
+
+  const boosted_trees::Tree& decision_tree() const;
+
+  const float get_prediction(const int32 id, const int32 dimension_id) const;
+
+  const int32 TraverseTree(const int32 example_id,
+                           const TTypes<float>::ConstMatrix* dense_data) const;
+
+ protected:
+  mutex mu_;
+  protobuf::Arena arena_;
+  boosted_trees::Tree* decision_tree_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index dd4415711b1b36ca570a9af72a5829ae030a5d6a..eb0c1d12285be85f45d325410c9623db4f45e66c 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -33,6 +33,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
 // This error callback is only useful for finding illegal encoding errors when
 // we want to be strict -- otherwise illegal encodings are replaced on read
 // with 0xFFFD and signaled to the callback.
@@ -146,40 +148,64 @@ class WrappedConverter {
   string name_;
 };
 
+struct ErrorOptions {
+  UChar32 subst = 0xFFFD;
+  bool elide_replacement = false;
+  bool replace_control_chars = false;
+  bool error_on_malformatting = false;
+};
+
+Status GetErrorOptions(OpKernelConstruction* ctx, ErrorOptions* out) {
+  *out = ErrorOptions();
+
+  string error_policy;
+  TF_RETURN_IF_ERROR(ctx->GetAttr("errors", &error_policy));
+
+  if (error_policy == "replace") {
+    out->elide_replacement = false;
+  } else if (error_policy == "ignore") {
+    out->elide_replacement = true;
+  } else if (error_policy == "strict") {
+    out->error_on_malformatting = true;
+  } else {
+    return errors::InvalidArgument(
+        "errors policy must be one of 'strict', 'replace', or 'ignore'");
+  }
+
+  int32 replacement_char;
+  TF_RETURN_IF_ERROR(ctx->GetAttr("replacement_char", &replacement_char));
+
+  if (replacement_char >= UCHAR_MIN_VALUE &&
+      replacement_char <= UCHAR_MAX_VALUE) {
+    out->subst = replacement_char;
+  } else {
+    return errors::InvalidArgument(
+        "replacement_char out of unicode codepoint range");
+  }
+
+  TF_RETURN_IF_ERROR(ctx->GetAttr("replace_control_characters",
+                                  &(out->replace_control_chars)));
+
+  return Status::OK();
+}
+
+inline bool ShouldHandleFormatError(const ErrorOptions& error_options,
+                                    UChar32 ch, bool format_error) {
+  return ((error_options.replace_control_chars && ch <= 0x1F) || format_error);
+}
+
+}  // namespace
+
 class UnicodeTranscodeOp : public OpKernel {
  public:
   explicit UnicodeTranscodeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    string error_policy;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("errors", &error_policy));
-    if (error_policy == "replace") {
-      elide_replacement_ = false;
-    } else if (error_policy == "ignore") {
-      elide_replacement_ = true;
-    } else if (error_policy == "strict") {
-      error_on_malformatting_ = true;
-    } else {
-      ctx->CtxFailure(errors::InvalidArgument(
-          "errors policy must be one of 'strict', 'replace', or 'ignore'"));
-    }
-
-    int32 replacement_char;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("replacement_char", &replacement_char));
-    if (replacement_char >= UCHAR_MIN_VALUE &&
-        replacement_char <= UCHAR_MAX_VALUE) {
-      subst_ = replacement_char;
-    } else {
-      ctx->CtxFailure(errors::InvalidArgument(
-          "replacement_char out of unicode codepoint range"));
-    }
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
 
     string output_encoding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_encoding", &output_encoding));
     OP_REQUIRES_OK(ctx,
                    ParseUnicodeEncoding(output_encoding, &output_encoding_));
 
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("replace_control_characters",
-                                     &replace_control_chars_));
-
     OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
     // Make a temporary UConverter to ensure it will create without error
     // at execution time (and to warm any data caches the converter needs).
@@ -228,7 +254,7 @@ class UnicodeTranscodeOp : public OpKernel {
       Transcode(&(output_flat(i)), input_encoder->converter_,
                 &found_any_format_error);
     }
-    if (error_on_malformatting_ && found_any_format_error) {
+    if (error_options_.error_on_malformatting && found_any_format_error) {
       ctx->CtxFailure(
           errors::InvalidArgument("Invalid formatting on input string"));
     }
@@ -240,12 +266,12 @@ class UnicodeTranscodeOp : public OpKernel {
   // out-of-range inputs.
   void TranslateCodepoints(icu::UnicodeString* s, bool* found_any_format_error,
                            UChar32 ch, int src_bytes, bool format_error) {
-    if ((replace_control_chars_ && ch <= 0x1F) || format_error) {
+    if (ShouldHandleFormatError(error_options_, ch, format_error)) {
       *found_any_format_error = true;
-      if (elide_replacement_) {
+      if (error_options_.elide_replacement) {
         return;
       } else {
-        ch = subst_;
+        ch = error_options_.subst;
       }
     }
     s->append(ch);
@@ -292,16 +318,129 @@ class UnicodeTranscodeOp : public OpKernel {
     }
   }
 
-  UChar32 subst_ = 0xFFFD;
-  bool elide_replacement_ = false;
-  bool replace_control_chars_ = false;
-  bool error_on_malformatting_ = false;
-
   string input_encoding_;
+  ErrorOptions error_options_;
   UnicodeEncoding output_encoding_ = UnicodeEncoding::UTF8;
 };
 
 REGISTER_KERNEL_BUILDER(Name("UnicodeTranscode").Device(DEVICE_CPU),
                         UnicodeTranscodeOp);
 
+class UnicodeDecodeWithOffsetsOp : public OpKernel {
+ public:
+  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
+    // Make a temporary UConverter to ensure it will create without error
+    // at execution time (and to warm any data caches the converter needs).
+    // This instance is not used.
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+  }
+
+  void Decode(OpKernelContext* ctx, std::vector<UChar32>* char_values,
+              std::vector<int64>* offset_values, int* string_length,
+              int64* next_row_split, UChar32 char_value, int char_length,
+              bool found_any_format_error) {
+    if (error_options_.error_on_malformatting && found_any_format_error) {
+      ctx->CtxFailure(
+          errors::InvalidArgument("Invalid formatting on input string"));
+    }
+    UChar32 decoded_value = char_value;
+    if (ShouldHandleFormatError(error_options_, char_value,
+                                found_any_format_error)) {
+      if (error_options_.elide_replacement) {
+        return;
+      } else {
+        decoded_value = error_options_.subst;
+      }
+    }
+
+    // Emit the char value.
+    char_values->push_back(decoded_value);
+
+    // Emit the byte offset
+    offset_values->push_back(*string_length);
+    *string_length += char_length;
+    *next_row_split += 1;
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+
+    // Go through all the strings in `input`.
+    const auto& input_vec = input_tensor->flat<string>();
+
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+
+    std::vector<UChar32> char_values;
+    std::vector<int64> offset_values;
+
+    Tensor* output_row_splits;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("row_splits",
+                                             {input_tensor->NumElements() + 1},
+                                             &output_row_splits));
+    auto out_row_splits = output_row_splits->vec<int64>();
+
+    int row_split_index = 0;
+    int64 next_row_split = 0;
+    for (int i = 0; i < input_vec.size(); ++i) {
+      const string& input = input_vec(i);
+      // Convert input strings into unicode values. Output to a list of
+      // char_values, record row splits and char_to_byte_starts, which are all
+      // the fields needed to construct a RaggedTensor.
+      out_row_splits(row_split_index) = next_row_split;
+      row_split_index++;
+      int string_length = 0;
+      IterateUnicodeString(
+          input, input_encoder->converter_,
+          std::bind(&UnicodeDecodeWithOffsetsOp::Decode, this, ctx,
+                    &char_values, &offset_values, &string_length,
+                    &next_row_split, std::placeholders::_1,
+                    std::placeholders::_2, std::placeholders::_3));
+    }
+    out_row_splits(row_split_index) = next_row_split;
+
+    DCHECK(offset_values.size() == char_values.size());
+    Tensor* output_char_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("char_values",
+                                  {static_cast<int64>(char_values.size())},
+                                  &output_char_values));
+    Tensor* output_offset_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("char_to_byte_starts",
+                                  {static_cast<int64>(offset_values.size())},
+                                  &output_offset_values));
+    auto out_char_values = output_char_values->vec<int32>();
+    auto out_offset_values = output_offset_values->vec<int64>();
+
+    // Load output tensors from intermediate value arrays.
+    for (int i = 0; i < char_values.size(); ++i) {
+      out_char_values(i) = static_cast<int32>(char_values[i]);
+      out_offset_values(i) = offset_values[i];
+    }
+  }
+
+ private:
+  string input_encoding_;
+  ErrorOptions error_options_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets").Device(DEVICE_CPU),
+                        UnicodeDecodeWithOffsetsOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 9ccd911b0efbcd047fcfb278cd4e91e2dd768488..e929ff45a1fb8656d5762a8793cb17175f04c1f9 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/setround.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
@@ -54,6 +55,9 @@ struct EigenEnvironment {
       port::ScopedFlushDenormal flush;
       // Set the processor rounding mode to ROUND TO NEAREST.
       port::ScopedSetRound round(FE_TONEAREST);
+      if (thread_options_.numa_node != port::kNUMANoAffinity) {
+        port::NUMASetThreadNodeAffinity(thread_options_.numa_node);
+      }
       f();
     });
   }
@@ -83,35 +87,38 @@ struct EigenEnvironment {
 
 struct ThreadPool::Impl : Eigen::ThreadPoolTempl<EigenEnvironment> {
   Impl(Env* env, const ThreadOptions& thread_options, const string& name,
-       int num_threads, bool low_latency_hint)
+       int num_threads, bool low_latency_hint, Eigen::Allocator* allocator)
       : Eigen::ThreadPoolTempl<EigenEnvironment>(
             num_threads, low_latency_hint,
-            EigenEnvironment(env, thread_options, name)) {}
+            EigenEnvironment(env, thread_options, name)),
+        allocator_(allocator) {}
 
   void ParallelFor(int64 total, int64 cost_per_unit,
                    std::function<void(int64, int64)> fn) {
     CHECK_GE(total, 0);
     CHECK_EQ(total, (int64)(Eigen::Index)total);
-    Eigen::ThreadPoolDevice device(this, this->NumThreads());
+    Eigen::ThreadPoolDevice device(this, this->NumThreads(), allocator_);
     device.parallelFor(
         total, Eigen::TensorOpCost(0, 0, cost_per_unit),
         [&fn](Eigen::Index first, Eigen::Index last) { fn(first, last); });
   }
+
+  Eigen::Allocator* allocator_;
 };
 
 ThreadPool::ThreadPool(Env* env, const string& name, int num_threads)
-    : ThreadPool(env, ThreadOptions(), name, num_threads, true) {}
+    : ThreadPool(env, ThreadOptions(), name, num_threads, true, nullptr) {}
 
 ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
                        const string& name, int num_threads)
-    : ThreadPool(env, thread_options, name, num_threads, true) {}
+    : ThreadPool(env, thread_options, name, num_threads, true, nullptr) {}
 
 ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
                        const string& name, int num_threads,
-                       bool low_latency_hint) {
+                       bool low_latency_hint, Eigen::Allocator* allocator) {
   CHECK_GE(num_threads, 1);
   impl_.reset(new ThreadPool::Impl(env, thread_options, "tf_" + name,
-                                   num_threads, low_latency_hint));
+                                   num_threads, low_latency_hint, allocator));
 }
 
 ThreadPool::~ThreadPool() {}
@@ -192,5 +199,14 @@ int ThreadPool::NumThreads() const { return impl_->NumThreads(); }
 
 int ThreadPool::CurrentThreadId() const { return impl_->CurrentThreadId(); }
 
+void ThreadPool::ScheduleWithHint(std::function<void()> fn, int start,
+                                  int limit) {
+  impl_->ScheduleWithHint(std::move(fn), start, limit);
+}
+
+void ThreadPool::SetStealPartitions(
+    const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+  impl_->SetStealPartitions(partitions);
+}
 }  // namespace thread
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index e14ad7ac641ac764ae3326cbedb9998e48e1b070..90c9f294472f1475c99494bc276ce475d5cded81 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -22,6 +22,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+namespace Eigen {
+class Allocator;
+}  // namespace Eigen
 namespace tensorflow {
 namespace thread {
 
@@ -37,7 +40,8 @@ class ThreadPool {
   //
   // REQUIRES: num_threads > 0
   ThreadPool(Env* env, const ThreadOptions& thread_options, const string& name,
-             int num_threads, bool low_latency_hint);
+             int num_threads, bool low_latency_hint,
+             Eigen::Allocator* allocator = nullptr);
 
   // Constructs a pool for low-latency ops that contains "num_threads" threads
   // with specified "name". env->StartThread() is used to create individual
@@ -59,6 +63,10 @@ class ThreadPool {
   // Schedules fn() for execution in the pool of threads.
   void Schedule(std::function<void()> fn);
 
+  void SetStealPartitions(
+      const std::vector<std::pair<unsigned, unsigned>>& partitions);
+
+  void ScheduleWithHint(std::function<void()> fn, int start, int limit);
   // Requires 0 < block_size <= total.
   // Spawns k threads and calls fn(i*block_size, (i+1)*block_size) from the
   // ith thread (i>=0). When (i+1)*block_size > total, fn(i*block_size, total)
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index bc52180265c70ce2953e6818c1ca414f86feee6f..e8dbcb97b94475f91345676bade0a9d220560741 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -92,7 +92,11 @@ void StringReader(png_structp png_ptr, png_bytep data, png_size_t length) {
   DecodeContext* const ctx =
       absl::bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
   if (static_cast<png_size_t>(ctx->data_left) < length) {
-    memset(data, 0, length);
+    // Don't zero out the data buffer as it has been lazily allocated (copy on
+    // write) and zeroing it out here can produce an OOM. Since the buffer is
+    // only used for reading data from the image, this doesn't result in any
+    // data leak, so it is safe to just leave the buffer be as it is and just
+    // exit with error.
     png_error(png_ptr, "More bytes requested to read than available");
   } else {
     memcpy(data, ctx->data, length);
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index b8cf5385548918434fe8fac31c92608e86c89519..1c854f661931a6ef26d69752708d7764107b49c6 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 
@@ -400,10 +401,7 @@ REGISTER_OP("BoostedTreesMakeQuantileSummaries")
       for (int i = 0; i < num_features; ++i) {
         ShapeHandle feature_shape;
         DimensionHandle unused_dim;
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 2, &feature_shape));
-        TF_RETURN_IF_ERROR(c->Merge(c->Dim(feature_shape, 0),
-                                    c->Dim(example_weights_shape, 0),
-                                    &unused_dim));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &feature_shape));
         // the columns are value, weight, min_rank, max_rank.
         c->set_output(i, c->MakeShape({c->UnknownDim(), 4}));
       }
@@ -431,6 +429,17 @@ REGISTER_OP("BoostedTreesQuantileStreamResourceAddSummaries")
       return Status::OK();
     });
 
+REGISTER_OP("BoostedTreesQuantileStreamResourceDeserialize")
+    .Attr("num_streams: int")
+    .Input("quantile_stream_resource_handle: resource")
+    .Input("bucket_boundaries: num_streams * float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      return Status::OK();
+    });
+
 REGISTER_OP("BoostedTreesQuantileStreamResourceFlush")
     .Attr("generate_quantiles: bool = False")
     .Input("quantile_stream_resource_handle: resource")
@@ -470,13 +479,13 @@ REGISTER_OP("BoostedTreesBucketize")
       ShapeHandle feature_shape;
       DimensionHandle unused_dim;
       for (int i = 0; i < num_features; i++) {
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 2, &feature_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &feature_shape));
         TF_RETURN_IF_ERROR(c->Merge(c->Dim(feature_shape, 0),
                                     c->Dim(c->input(0), 0), &unused_dim));
       }
       // Bucketized result should have same dimension as input.
       for (int i = 0; i < num_features; i++) {
-        c->set_output(i, c->MakeShape({c->Dim(c->input(i), 0), 1}));
+        c->set_output(i, c->MakeShape({c->Dim(c->input(i), 0)}));
       }
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 49240c3602bd53ed7fdfc3b822cf382070283508..dd1aaf966eaa75ff75ce415e554a8531a3f80f1e 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -11765,6 +11765,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "BoostedTreesQuantileStreamResourceDeserialize"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_streams"
+  }
+  attr {
+    name: "num_streams"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "BoostedTreesQuantileStreamResourceFlush"
   input_arg {
@@ -20924,6 +20943,10 @@ op {
     name: "element_shape"
     type_attr: "shape_type"
   }
+  input_arg {
+    name: "max_num_elements"
+    type: DT_INT32
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -21875,6 +21898,49 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ExperimentalMaterializedIndexDatasetHandle"
   output_arg {
@@ -21903,6 +21969,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalNumaMapAndBatchDataset"
   input_arg {
@@ -23481,6 +23570,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FixedLengthRecordDatasetV2"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "FixedLengthRecordReader"
   output_arg {
@@ -40198,6 +40319,52 @@ op {
     }
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
@@ -61513,17 +61680,6 @@ op {
     }
   }
 }
-op {
-  name: "SinkDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-}
 op {
   name: "Size"
   input_arg {
@@ -71760,6 +71916,53 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -75311,6 +75514,108 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -76807,6 +77112,57 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UnicodeScript"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 98a76962611ba72b8d89321a69c53e32f8d859d9..8402f250f9fe77319e74887c1957d71773b36b87 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -674,6 +674,29 @@ REGISTER_OP("FixedLengthRecordDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("FixedLengthRecordDatasetV2")
+    .Input("filenames: string")
+    .Input("header_bytes: int64")
+    .Input("record_bytes: int64")
+    .Input("footer_bytes: int64")
+    .Input("buffer_size: int64")
+    .Input("compression_type: string")
+    .Output("handle: variant")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // header_bytes, record_bytes, footer_bytes, buffer_size should be
+      // scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("TFRecordDataset")
     .Input("filenames: string")
     .Input("compression_type: string")
@@ -867,11 +890,6 @@ REGISTER_OP("DatasetToGraph")
     .Output("graph: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("SinkDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("OptimizeDataset")
     .Input("input_dataset: variant")
     .Input("optimizations: string")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index d077954f9eb04178a0bcf5410e99972d4882ca04..9733cf27768c2963d5b76247af9b71693f4256ae 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -75,6 +75,24 @@ REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalMapDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("use_inter_op_parallelism: bool = true")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalNonSerializableDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalSleepDataset")
     .Input("input_dataset: variant")
     .Input("sleep_microseconds: int64")
diff --git a/tensorflow/core/ops/function_ops.cc b/tensorflow/core/ops/function_ops.cc
index 6edd86b3ad0eae3b0eaa360e5fef9983d6cd3dc4..8e86dd9f780c8eac3dd813c996288a9707247bc4 100644
--- a/tensorflow/core/ops/function_ops.cc
+++ b/tensorflow/core/ops/function_ops.cc
@@ -35,6 +35,22 @@ output: The argument.
 index: This argument is the index-th argument of the function.
 )doc");
 
+REGISTER_SYSTEM_OP("_DeviceArg")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("index: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* context) {
+      context->set_output(0, context->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A graph node which represents an argument to a function.
+
+output: The argument.
+index: This argument is the index-th argument of the function.
+)doc");
+
 REGISTER_SYSTEM_OP("_Retval")
     .Input("input: T")
     .Attr("T: type")
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index ee14a851eb9f62f978817236aef98fd7e3a3df0c..5e0bdd888cea1c508a38afe2f40c7c9f17d28269 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -226,6 +226,7 @@ REGISTER_OP("PartitionedCall")
     .Attr("Tout: list(type) >= 0")
     .Attr("f: func")
     .Attr("config: string = ''")
+    .Attr("config_proto: string = ''")
     .Attr("executor_type: string = ''")
     .SetShapeFn(shape_inference::UnknownShape);
 
@@ -235,7 +236,8 @@ REGISTER_OP("StatefulPartitionedCall")
     .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type) >= 0")
     .Attr("f: func")
-    .Attr("config: string = ''")
+    .Attr("config: string = ''")  // Deprecated in favor of config_proto
+    .Attr("config_proto: string = ''")
     .Attr("executor_type: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 7d79df9c1cc37f0cb7ea5be6c5067c2ccae2233e..88d6d14c306f5f6e3bd2317692524d6bdce62621 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -22,6 +22,7 @@ namespace {
 
 REGISTER_OP("EmptyTensorList")
     .Input("element_shape: shape_type")
+    .Input("max_num_elements: int32")
     .Output("handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9be3470820eb523e8d41f8bf63434cbb534034d8
--- /dev/null
+++ b/tensorflow/core/ops/mkl_nn_ops.cc
@@ -0,0 +1,612 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+// For now, this file only includes MKL quantized ops. In the
+// future, we will move all other MKL ops from nn_ops.cc to this file.
+
+#ifdef INTEL_MKL
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("_MklQuantizedMaxPool")
+    .Input("input:         T")
+    .Input("min_input:     float")
+    .Input("max_input:     float")
+    .Input("mkl_input:     uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Output("output:       T")
+    .Output("min_output:   float")
+    .Output("max_output:   float")
+    .Output("mkl_output:     uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("T: quantizedtype")
+    .Attr("ksize: list(int) >= 4")
+    .Attr("strides: list(int) >= 4")
+    .Attr(GetPaddingAttrString())
+    .SetShapeFn(shape_inference::MaxPoolShape)
+    .Doc(R"doc(
+MKL version of QuantizedMaxPool operator. Uses MKL DNN APIs to perform max pooling
+on the quantized input.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklQuantizedAvgPool")
+    .Input("input:           T")
+    .Input("min_input:       float")
+    .Input("max_input:       float")
+    .Input("mkl_input:       uint8")
+    .Input("mkl_min_input:   uint8")
+    .Input("mkl_max_input:   uint8")
+    .Output("output:         T")
+    .Output("min_output:     float")
+    .Output("max_output:     float")
+    .Output("mkl_output:     uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("T: quantizedtype")
+    .Attr("ksize: list(int) >= 4")
+    .Attr("strides: list(int) >= 4")
+    .Attr(GetPaddingAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::AvgPoolShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of QuantizedAvgPool operator. Uses MKL DNN APIs to perform average pooling
+on the quantized input.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklQuantizedConv2D")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBias")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("summand: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_summand: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("summand: Tsummand")
+    .Input("min_summand: float")
+    .Input("max_summand: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Input("mkl_summand: uint8")
+    .Input("mkl_min_summand: uint8")
+    .Input("mkl_max_summand: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("Tsummand: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("summand: Tsummand")
+    .Input("min_summand: float")
+    .Input("max_summand: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Input("mkl_summand: uint8")
+    .Input("mkl_min_summand: uint8")
+    .Input("mkl_max_summand: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("Tsummand: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 9796587709bba3733a0da72ddb596155101c7346..efa84d6c22c6de6d5fdd576d834f6b660ead61e1 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -315,6 +315,27 @@ REGISTER_OP("Conv2DBackpropFilter")
       return Status::OK();
     });
 
+REGISTER_OP("_FusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. Grappler is
+expected to create these operators.
+)doc");
+
 namespace {
 
 Status CommonFusedConvCalculations(InferenceContext* c, bool has_resize) {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d75c0a699cd972ea3b89d61d545d053253055dc8..bc35ce751393468345bbf054607764342d2f9b25 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4633,6 +4633,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "BoostedTreesQuantileStreamResourceDeserialize"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_streams"
+  }
+  attr {
+    name: "num_streams"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "BoostedTreesQuantileStreamResourceFlush"
   input_arg {
@@ -9605,6 +9624,10 @@ op {
     name: "element_shape"
     type_attr: "shape_type"
   }
+  input_arg {
+    name: "max_num_elements"
+    type: DT_INT32
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -10338,6 +10361,49 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ExperimentalMaterializedIndexDatasetHandle"
   output_arg {
@@ -10366,6 +10432,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalNumaMapAndBatchDataset"
   input_arg {
@@ -11283,6 +11372,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FixedLengthRecordDatasetV2"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "FixedLengthRecordReader"
   output_arg {
@@ -20262,6 +20383,13 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   attr {
     name: "executor_type"
     type: "string"
@@ -29540,17 +29668,6 @@ op {
     }
   }
 }
-op {
-  name: "SinkDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-}
 op {
   name: "Size"
   input_arg {
@@ -33446,6 +33563,13 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   attr {
     name: "executor_type"
     type: "string"
@@ -35856,6 +35980,108 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -36819,6 +37045,57 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UnicodeScript"
   input_arg {
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 352253135c406285459527fa1af45237cd9f4207..fbecff11dfaf80942160a80025347a0abf89b7ed 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -259,4 +259,28 @@ REGISTER_OP("UnicodeTranscode")
     .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
     .Attr("replace_control_characters: bool = false")
     .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("UnicodeDecodeWithOffsets")
+    .Input("input: string")
+    .Output("row_splits: int64")
+    .Output("char_values: int32")
+    .Output("char_to_byte_starts: int64")
+    .Attr("input_encoding: string")
+    .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("replace_control_characters: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      // row_splits.shape == [input.size() + 1]
+      DimensionHandle num_row_splits;
+      DimensionHandle input_size = c->NumElements(c->input(0));
+      TF_RETURN_IF_ERROR(c->Add(input_size, 1, &num_row_splits));
+      c->set_output(0, c->Vector(num_row_splits));
+
+      // char_values.shape == offset_values.shape == [num_chars]
+      DimensionHandle num_chars = c->UnknownDim();
+      c->set_output(1, c->Vector(num_chars));
+      c->set_output(2, c->Vector(num_chars));
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/tensor_forest_ops.cc b/tensorflow/core/ops/tensor_forest_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4b6ba318e9d981af2797a54eca7f9caf049f6b0
--- /dev/null
+++ b/tensorflow/core/ops/tensor_forest_ops.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_OP(TensorForestTreeResource);
+
+REGISTER_OP("TensorForestTreeIsInitializedOp")
+    .Input("tree_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorForestCreateTreeVariable")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs);
+
+REGISTER_OP("TensorForestTreeSerialize")
+    .Input("tree_handle: resource")
+    .Output("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("TensorForestTreeDeserialize")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorForestTreeSize")
+    .Input("tree_handle: resource")
+    .Output("tree_size: int32")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("TensorForestTreePredict")
+    .Attr("logits_dimension: int")
+    .Input("tree_handle: resource")
+    .Input("dense_features: float")
+    .Output("logits: float")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle shape_handle;
+      shape_inference::DimensionHandle batch_size = c->UnknownDim();
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &shape_handle));
+
+      batch_size = c->Dim(shape_handle, 0);
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      c->set_output(0, c->Matrix(batch_size, logits_dimension));
+      return Status::OK();
+    });
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index c6e5777c265137ca1b215e14a7be0c6422804b4b..133ae45a5526d57632aa462168c7ecb0c4563b22 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -117,13 +117,33 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
 }  // namespace
 
 int64 MinLogLevelFromEnv() {
+  // We don't want to print logs during fuzzing as that would slow fuzzing down
+  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
+  // return a value so that nothing is actually printed. Since LOG uses >=
+  // (see ~LogMessage in this file) to see if log messages need to be printed,
+  // the value we're interested on to disable printing is the maximum severity.
+  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  return tensorflow::NUM_SEVERITIES;
+#else
   const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
+#endif
 }
 
 int64 MinVLogLevelFromEnv() {
+  // We don't want to print logs during fuzzing as that would slow fuzzing down
+  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
+  // return a value so that nothing is actually printed. Since VLOG uses <=
+  // (see VLOG_IS_ON in logging.h) to see if log messages need to be printed,
+  // the value we're interested on to disable printing is 0.
+  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  return 0;
+#else
   const char* tf_env_var_val = getenv("TF_CPP_MIN_VLOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
+#endif
 }
 
 LogMessage::~LogMessage() {
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 5732271f150a64e22f7eea2eea243e3c6c75631f..7374fccdc2cd2af4cfaec5a83b93fdb8d368cf2c 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -395,6 +396,7 @@ struct ThreadOptions {
   size_t stack_size = 0;  // 0: use system default value
   /// Guard area size to use near thread stacks to use (in bytes)
   size_t guard_size = 0;  // 0: use system default value
+  int numa_node = port::kNUMANoAffinity;
 };
 
 /// A utility routine: copy contents of `src` in file system `src_fs`
diff --git a/tensorflow/core/platform/env_time.cc b/tensorflow/core/platform/env_time.cc
index 76a227b69a10224681ce430b88a56fa2caabd264..10ba2abe7cb6485b1974eca85cc634b35cba23e8 100644
--- a/tensorflow/core/platform/env_time.cc
+++ b/tensorflow/core/platform/env_time.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/platform/platform_strings.cc b/tensorflow/core/platform/platform_strings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1852633d595e0b65415284a3233ba11385a3c44
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings.cc
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/platform_strings.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+int GetPlatformStrings(const std::string& path,
+                       std::vector<std::string>* found) {
+  int result;
+  FILE* ifp = fopen(path.c_str(), "rb");
+  if (ifp != nullptr) {
+    static const char prefix[] = TF_PLAT_STR_MAGIC_PREFIX_;
+    int first_char = prefix[1];
+    int last_char = -1;
+    int c;
+    while ((c = getc(ifp)) != EOF) {
+      if (c == first_char && last_char == 0) {
+        int i = 2;
+        while (prefix[i] != 0 && (c = getc(ifp)) == prefix[i]) {
+          i++;
+        }
+        if (prefix[i] == 0) {
+          std::string str;
+          while ((c = getc(ifp)) != EOF && c != 0) {
+            str.push_back(c);
+          }
+          if (!str.empty()) {
+            found->push_back(str);
+          }
+        }
+      }
+      last_char = c;
+    }
+
+    result = (ferror(ifp) == 0) ? 0 : errno;
+    fclose(ifp);
+  } else {
+    result = errno;
+  }
+  return result;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/platform_strings.h b/tensorflow/core/platform/platform_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b1dbd130e0df0e991ac3e2dcce2840e66b1f9b9
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings.h
@@ -0,0 +1,364 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
+#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
+
+// This header defines the macro TF_PLATFORM_STRINGS() which should be used
+// once in each dynamically loadable TensorFlow module.  It embeds static
+// strings into the compilation unit that allow TensorFlow to determine what
+// compilation options were in effect when the compilation unit was built.  All
+// compilation units within the same dynamically loadable library should be
+// built with the same options (or at least, the strings should be embedded in
+// the compilation unit built with the most restrictive options).
+
+// The platform strings embedded into a binary may be retrieved with the
+// GetPlatformStrings function.
+
+// Rationale:
+// We wish to load only those libraries that this CPU can execute.  For
+// example, we should not load a library compiled with avx256 instructions on a
+// CPU that cannot execute them.
+//
+// One might think that one could dlopen() the library, and call a routine that
+// would return which cpu type it was compiled for.  Alas, this does not work,
+// because at dlopen() time, a library containing C++ will execute constructors
+// of class variables with static storage class.  Even code that looks
+// innocuous may use optional platform-specific instructions.  For example,
+// the fastest way to zero a region of memory might use optional instructions.
+//
+// One might think one could run a tool such as "objdump" to read flags from
+// the libraries' headers, or perhaps disassemble each library to look for
+// particular instructions.  Unfortunately, the desired flags are not present
+// in the headers, and disassembly can be prohibitively slow ("objdump -d" is
+// very slow, for example).  Moreover, a tool to examine the library may not
+// be present on the system unless the user has installed special packages (for
+// example, on Windows).
+//
+// Instead, we adopt a crude but straightforward solution:  We require
+// developers to use the macro TF_PLATFORM_STRINGS() in their library, to
+// embed the compilation options as constant strings.  The compiler's
+// predefined macros pick which strings are included.  We then search for the
+// strings in the files, and then dlopen() only those libraries that have or
+// lack strings as needed.
+//
+// We adopt the approach of placing in the binary a fairly raw copy of the
+// predefined macros, rather than trying to interpret them in complex ways at
+// compile time.  This allows the loading binary to alter its interpretation of
+// the strings without library developers having to recompile.
+
+#include <stdio.h>
+
+#include <string>
+#include <vector>
+
+// Aside from the header guard, the internal macros defined here have the form:
+//   TF_PLAT_STR_*
+
+// If a macro is removed from the list of tested macros, the major version in
+// the following version number should be incremented, and the minor version
+// set to zero.  Otherwise, if a macro is added to the list of tested macros,
+// the minor number should be incremented.
+#define TF_PLAT_STR_VERSION_ "1.0"
+
+// Prefix of each option string indicator in the binary.
+// After the prefix, such strings have the form:
+//    [A-Za-z_0-9]=<value>
+// followed by a terminating nul.  To simplify searching, this prefix is all
+// ASCII, starts with a nul, and contains no character twice.
+#define TF_PLAT_STR_MAGIC_PREFIX_ "\0S\\s\":^p*L}"
+
+// A helper macro for TF_PLAT_STR_AS_STR_().
+#define TF_PLAT_STR_STR_1_(x) #x
+
+// Yield a constant string corresponding to x, after macro expansion.
+#define TF_PLAT_STR_AS_STR_(x) TF_PLAT_STR_STR_1_(x)
+
+// An empty definition to make lists more uniform.
+#define TF_PLAT_STR_TERMINATOR_
+
+// TF_PLAT_STR_(x) introduces a constant string indicating whether a
+// particular compilation option has been turned on.
+//
+// In gcc and clang, we might imagine using something like
+// #define TF_PLAT_STR_(x) \
+//     (sizeof (#x) != sizeof (TF_PLAT_STR_AS_STR_ (x))? \
+//      TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_ (x) : \
+//      TF_PLAT_STR_MAGIC_PREFIX_ #x "=0"),
+// but some compilers (notably MSVC) place both "foo" and "bar" in the binary
+// when presented with
+//    (true?  "foo" : "bar")
+// so we must use #if to select the strings we need, which is rather verbose.
+#define TF_PLAT_STR_(x) TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_(x)
+
+// Include the #if machinery that sets the macros used below.
+// platform_strings_computed.h can be generated by filtering this header file
+// through:
+// awk '
+// header == "" { print; }
+// /\*\// && header == "" {
+//     print "// Generated from platform_strings.h.";
+//     print "";
+//     print "#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+//     print "#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+//     print "";
+//     header = 1;
+// }
+// /^#define TF_PLAT_STR_LIST_[a-zA-Z0-9_]*\(\) *\\$/ { active = 1; }
+// /TF_PLAT_STR_TERMINATOR_/ { active = 0; }
+// /^ *TF_PLAT_STR_[A-Za-z0-9_]* *\\$/ && active {
+//     x = $0;
+//     sub(/^ *TF_PLAT_STR_/, "", x);
+//     sub(/ *\\$/, "", x);
+//     printf ("#if defined(%s)\n", x);
+//     printf ("#define TF_PLAT_STR_%s TF_PLAT_STR_(%s)\n", x, x);
+//     printf ("#else\n");
+//     printf ("#define TF_PLAT_STR_%s\n", x);
+//     printf ("#endif\n");
+// }
+// END {
+//     print "";
+//     print "#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+// }'
+#include "tensorflow/core/platform/platform_strings_computed.h"
+
+// clang-format butchers the following lines.
+// clang-format off
+
+// x86_64 and x86_32 optional features.
+#define TF_PLAT_STR_LIST___x86_64__()                                      \
+        TF_PLAT_STR__M_IX86_FP                                             \
+        TF_PLAT_STR__NO_PREFETCHW                                          \
+        TF_PLAT_STR___3dNOW_A__                                            \
+        TF_PLAT_STR___3dNOW__                                              \
+        TF_PLAT_STR___ABM__                                                \
+        TF_PLAT_STR___ADX__                                                \
+        TF_PLAT_STR___AES__                                                \
+        TF_PLAT_STR___AVX2__                                               \
+        TF_PLAT_STR___AVX512BW__                                           \
+        TF_PLAT_STR___AVX512CD__                                           \
+        TF_PLAT_STR___AVX512DQ__                                           \
+        TF_PLAT_STR___AVX512ER__                                           \
+        TF_PLAT_STR___AVX512F__                                            \
+        TF_PLAT_STR___AVX512IFMA__                                         \
+        TF_PLAT_STR___AVX512PF__                                           \
+        TF_PLAT_STR___AVX512VBMI__                                         \
+        TF_PLAT_STR___AVX512VL__                                           \
+        TF_PLAT_STR___AVX__                                                \
+        TF_PLAT_STR___BMI2__                                               \
+        TF_PLAT_STR___BMI__                                                \
+        TF_PLAT_STR___CLFLUSHOPT__                                         \
+        TF_PLAT_STR___CLZERO__                                             \
+        TF_PLAT_STR___F16C__                                               \
+        TF_PLAT_STR___FMA4__                                               \
+        TF_PLAT_STR___FMA__                                                \
+        TF_PLAT_STR___FP_FAST_FMA                                          \
+        TF_PLAT_STR___FP_FAST_FMAF                                         \
+        TF_PLAT_STR___FSGSBASE__                                           \
+        TF_PLAT_STR___FXSR__                                               \
+        TF_PLAT_STR___LWP__                                                \
+        TF_PLAT_STR___LZCNT__                                              \
+        TF_PLAT_STR___MMX__                                                \
+        TF_PLAT_STR___MWAITX__                                             \
+        TF_PLAT_STR___PCLMUL__                                             \
+        TF_PLAT_STR___PKU__                                                \
+        TF_PLAT_STR___POPCNT__                                             \
+        TF_PLAT_STR___PRFCHW__                                             \
+        TF_PLAT_STR___RDRND__                                              \
+        TF_PLAT_STR___RDSEED__                                             \
+        TF_PLAT_STR___RTM__                                                \
+        TF_PLAT_STR___SHA__                                                \
+        TF_PLAT_STR___SSE2_MATH__                                          \
+        TF_PLAT_STR___SSE2__                                               \
+        TF_PLAT_STR___SSE_MATH__                                           \
+        TF_PLAT_STR___SSE__                                                \
+        TF_PLAT_STR___SSE3__                                               \
+        TF_PLAT_STR___SSE4A__                                              \
+        TF_PLAT_STR___SSE4_1__                                             \
+        TF_PLAT_STR___SSE4_2__                                             \
+        TF_PLAT_STR___SSSE3__                                              \
+        TF_PLAT_STR___TBM__                                                \
+        TF_PLAT_STR___XOP__                                                \
+        TF_PLAT_STR___XSAVEC__                                             \
+        TF_PLAT_STR___XSAVEOPT__                                           \
+        TF_PLAT_STR___XSAVES__                                             \
+        TF_PLAT_STR___XSAVE__                                              \
+        TF_PLAT_STR_TERMINATOR_
+
+// PowerPC (64- and 32-bit) optional features.
+#define TF_PLAT_STR_LIST___powerpc64__()                                   \
+        TF_PLAT_STR__SOFT_DOUBLE                                           \
+        TF_PLAT_STR__SOFT_FLOAT                                            \
+        TF_PLAT_STR___ALTIVEC__                                            \
+        TF_PLAT_STR___APPLE_ALTIVEC__                                      \
+        TF_PLAT_STR___CRYPTO__                                             \
+        TF_PLAT_STR___FLOAT128_HARDWARE__                                  \
+        TF_PLAT_STR___FLOAT128_TYPE__                                      \
+        TF_PLAT_STR___FP_FAST_FMA                                          \
+        TF_PLAT_STR___FP_FAST_FMAF                                         \
+        TF_PLAT_STR___HTM__                                                \
+        TF_PLAT_STR___NO_FPRS__                                            \
+        TF_PLAT_STR___NO_LWSYNC__                                          \
+        TF_PLAT_STR___POWER8_VECTOR__                                      \
+        TF_PLAT_STR___POWER9_VECTOR__                                      \
+        TF_PLAT_STR___PPC405__                                             \
+        TF_PLAT_STR___QUAD_MEMORY_ATOMIC__                                 \
+        TF_PLAT_STR___RECIPF__                                             \
+        TF_PLAT_STR___RECIP_PRECISION__                                    \
+        TF_PLAT_STR___RECIP__                                              \
+        TF_PLAT_STR___RSQRTEF__                                            \
+        TF_PLAT_STR___RSQRTE__                                             \
+        TF_PLAT_STR___TM_FENCE__                                           \
+        TF_PLAT_STR___UPPER_REGS_DF__                                      \
+        TF_PLAT_STR___UPPER_REGS_SF__                                      \
+        TF_PLAT_STR___VEC__                                                \
+        TF_PLAT_STR___VSX__                                                \
+        TF_PLAT_STR_TERMINATOR_
+
+// aarch64 and 32-bit arm optional features
+#define TF_PLAT_STR_LIST___aarch64__()                                     \
+        TF_PLAT_STR___ARM_ARCH                                             \
+        TF_PLAT_STR___ARM_FEATURE_CLZ                                      \
+        TF_PLAT_STR___ARM_FEATURE_CRC32                                    \
+        TF_PLAT_STR___ARM_FEATURE_CRC32                                    \
+        TF_PLAT_STR___ARM_FEATURE_CRYPTO                                   \
+        TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING                        \
+        TF_PLAT_STR___ARM_FEATURE_DSP                                      \
+        TF_PLAT_STR___ARM_FEATURE_FMA                                      \
+        TF_PLAT_STR___ARM_FEATURE_IDIV                                     \
+        TF_PLAT_STR___ARM_FEATURE_LDREX                                    \
+        TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN                           \
+        TF_PLAT_STR___ARM_FEATURE_QBIT                                     \
+        TF_PLAT_STR___ARM_FEATURE_QRDMX                                    \
+        TF_PLAT_STR___ARM_FEATURE_SAT                                      \
+        TF_PLAT_STR___ARM_FEATURE_SIMD32                                   \
+        TF_PLAT_STR___ARM_FEATURE_UNALIGNED                                \
+        TF_PLAT_STR___ARM_FP                                               \
+        TF_PLAT_STR___ARM_NEON_FP                                          \
+        TF_PLAT_STR___ARM_NEON__                                           \
+        TF_PLAT_STR___ARM_WMMX                                             \
+        TF_PLAT_STR___IWMMXT2__                                            \
+        TF_PLAT_STR___IWMMXT__                                             \
+        TF_PLAT_STR___VFP_FP__                                             \
+        TF_PLAT_STR_TERMINATOR_
+
+// Generic features, including indication of architecture and OS.
+// The _M_* macros are defined by Visual Studio.
+// It doesn't define __LITTLE_ENDIAN__ or __BYTE_ORDER__;
+// Windows is assumed to be little endian.
+#define TF_PLAT_STR_LIST___generic__()                                     \
+        TF_PLAT_STR_TARGET_IPHONE_SIMULATOR                                \
+        TF_PLAT_STR_TARGET_OS_IOS                                          \
+        TF_PLAT_STR_TARGET_OS_IPHONE                                       \
+        TF_PLAT_STR__MSC_VER                                               \
+        TF_PLAT_STR__M_ARM                                                 \
+        TF_PLAT_STR__M_ARM64                                               \
+        TF_PLAT_STR__M_ARM_ARMV7VE                                         \
+        TF_PLAT_STR__M_ARM_FP                                              \
+        TF_PLAT_STR__M_IX86                                                \
+        TF_PLAT_STR__M_X64                                                 \
+        TF_PLAT_STR__WIN32                                                 \
+        TF_PLAT_STR__WIN64                                                 \
+        TF_PLAT_STR___ANDROID__                                            \
+        TF_PLAT_STR___APPLE__                                              \
+        TF_PLAT_STR___BYTE_ORDER__                                         \
+        TF_PLAT_STR___CYGWIN__                                             \
+        TF_PLAT_STR___FreeBSD__                                            \
+        TF_PLAT_STR___LITTLE_ENDIAN__                                      \
+        TF_PLAT_STR___NetBSD__                                             \
+        TF_PLAT_STR___OpenBSD__                                            \
+        TF_PLAT_STR_____MSYS__                                             \
+        TF_PLAT_STR___aarch64__                                            \
+        TF_PLAT_STR___alpha__                                              \
+        TF_PLAT_STR___arm__                                                \
+        TF_PLAT_STR___i386__                                               \
+        TF_PLAT_STR___i686__                                               \
+        TF_PLAT_STR___ia64__                                               \
+        TF_PLAT_STR___linux__                                              \
+        TF_PLAT_STR___mips32__                                             \
+        TF_PLAT_STR___mips64__                                             \
+        TF_PLAT_STR___powerpc64__                                          \
+        TF_PLAT_STR___powerpc__                                            \
+        TF_PLAT_STR___riscv___                                             \
+        TF_PLAT_STR___s390x__                                              \
+        TF_PLAT_STR___sparc64__                                            \
+        TF_PLAT_STR___sparc__                                              \
+        TF_PLAT_STR___x86_64__                                             \
+        TF_PLAT_STR_TERMINATOR_
+
+#if !defined(__x86_64__) && !defined(_M_X64) && \
+    !defined(__i386__) && !defined(_M_IX86)
+#undef TF_PLAT_STR_LIST___x86_64__
+#define TF_PLAT_STR_LIST___x86_64__()
+#endif
+#if !defined(__powerpc64__) && !defined(__powerpc__)
+#undef TF_PLAT_STR_LIST___powerpc64__
+#define TF_PLAT_STR_LIST___powerpc64__()
+#endif
+#if !defined(__aarch64__) && !defined(_M_ARM64) && \
+    !defined(__arm__) && !defined(_M_ARM)
+#undef TF_PLAT_STR_LIST___aarch64__
+#define TF_PLAT_STR_LIST___aarch64__()
+#endif
+
+// Macro to be used in each dynamically loadable library.
+//
+// The BSS global variable tf_cpu_option_global and the class
+// instance tf_cpu_option_avoid_omit_class are needed to prevent
+// compilers/linkers such as clang from omitting the static variable
+// tf_cpu_option[], which would otherwise appear to be unused.  We cannot make
+// tf_cpu_option[] global, because we then might get multiply-defined symbols
+// if TF_PLAT_STR() is used twice in the same library.
+// (tf_cpu_option_global doesn't see such errors because it is
+// defined in BSS, so multiple definitions are combined by the linker.)  gcc's
+// __attribute__((used)) is insufficient because it seems to be ignored by
+// linkers.
+#define TF_PLATFORM_STRINGS()                                                  \
+    static const char tf_cpu_option[] =                                        \
+        TF_PLAT_STR_MAGIC_PREFIX_ "TF_PLAT_STR_VERSION=" TF_PLAT_STR_VERSION_  \
+        TF_PLAT_STR_LIST___x86_64__()                                          \
+        TF_PLAT_STR_LIST___powerpc64__()                                       \
+        TF_PLAT_STR_LIST___aarch64__()                                         \
+        TF_PLAT_STR_LIST___generic__()                                         \
+    ;                                                                          \
+    const char *tf_cpu_option_global;                                          \
+    namespace {                                                                \
+    class TFCPUOptionHelper {                                                  \
+     public:                                                                   \
+      TFCPUOptionHelper() {                                                    \
+        /* Compilers/linkers remove unused variables aggressively.  The */     \
+        /* following gyrations subvert most such optimizations. */             \
+        tf_cpu_option_global = tf_cpu_option;                                  \
+        /* Nothing is printed because the string starts with a nul. */         \
+        printf("%s", tf_cpu_option);                                           \
+      }                                                                        \
+    } tf_cpu_option_avoid_omit_class;                                          \
+    }  /* anonymous namespace */
+// clang-format on
+
+namespace tensorflow {
+
+class Status;
+
+// Retrieves the platform strings from the file at the given path and appends
+// them to the given vector. If the returned int is non-zero, an error occurred
+// reading the file and vector may or may not be modified. The returned error
+// code is suitable for use with strerror().
+int GetPlatformStrings(const std::string& path,
+                       std::vector<std::string>* found);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
diff --git a/tensorflow/core/platform/platform_strings_computed.h b/tensorflow/core/platform/platform_strings_computed.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a17f3bfc3a866ee1fd4945e9ade5a3e379eefa3
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings_computed.h
@@ -0,0 +1,735 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Generated from platform_strings.h.
+
+#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
+#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
+
+#if defined(_M_IX86_FP)
+#define TF_PLAT_STR__M_IX86_FP TF_PLAT_STR_(_M_IX86_FP)
+#else
+#define TF_PLAT_STR__M_IX86_FP
+#endif
+#if defined(_NO_PREFETCHW)
+#define TF_PLAT_STR__NO_PREFETCHW TF_PLAT_STR_(_NO_PREFETCHW)
+#else
+#define TF_PLAT_STR__NO_PREFETCHW
+#endif
+#if defined(__3dNOW_A__)
+#define TF_PLAT_STR___3dNOW_A__ TF_PLAT_STR_(__3dNOW_A__)
+#else
+#define TF_PLAT_STR___3dNOW_A__
+#endif
+#if defined(__3dNOW__)
+#define TF_PLAT_STR___3dNOW__ TF_PLAT_STR_(__3dNOW__)
+#else
+#define TF_PLAT_STR___3dNOW__
+#endif
+#if defined(__ABM__)
+#define TF_PLAT_STR___ABM__ TF_PLAT_STR_(__ABM__)
+#else
+#define TF_PLAT_STR___ABM__
+#endif
+#if defined(__ADX__)
+#define TF_PLAT_STR___ADX__ TF_PLAT_STR_(__ADX__)
+#else
+#define TF_PLAT_STR___ADX__
+#endif
+#if defined(__AES__)
+#define TF_PLAT_STR___AES__ TF_PLAT_STR_(__AES__)
+#else
+#define TF_PLAT_STR___AES__
+#endif
+#if defined(__AVX2__)
+#define TF_PLAT_STR___AVX2__ TF_PLAT_STR_(__AVX2__)
+#else
+#define TF_PLAT_STR___AVX2__
+#endif
+#if defined(__AVX512BW__)
+#define TF_PLAT_STR___AVX512BW__ TF_PLAT_STR_(__AVX512BW__)
+#else
+#define TF_PLAT_STR___AVX512BW__
+#endif
+#if defined(__AVX512CD__)
+#define TF_PLAT_STR___AVX512CD__ TF_PLAT_STR_(__AVX512CD__)
+#else
+#define TF_PLAT_STR___AVX512CD__
+#endif
+#if defined(__AVX512DQ__)
+#define TF_PLAT_STR___AVX512DQ__ TF_PLAT_STR_(__AVX512DQ__)
+#else
+#define TF_PLAT_STR___AVX512DQ__
+#endif
+#if defined(__AVX512ER__)
+#define TF_PLAT_STR___AVX512ER__ TF_PLAT_STR_(__AVX512ER__)
+#else
+#define TF_PLAT_STR___AVX512ER__
+#endif
+#if defined(__AVX512F__)
+#define TF_PLAT_STR___AVX512F__ TF_PLAT_STR_(__AVX512F__)
+#else
+#define TF_PLAT_STR___AVX512F__
+#endif
+#if defined(__AVX512IFMA__)
+#define TF_PLAT_STR___AVX512IFMA__ TF_PLAT_STR_(__AVX512IFMA__)
+#else
+#define TF_PLAT_STR___AVX512IFMA__
+#endif
+#if defined(__AVX512PF__)
+#define TF_PLAT_STR___AVX512PF__ TF_PLAT_STR_(__AVX512PF__)
+#else
+#define TF_PLAT_STR___AVX512PF__
+#endif
+#if defined(__AVX512VBMI__)
+#define TF_PLAT_STR___AVX512VBMI__ TF_PLAT_STR_(__AVX512VBMI__)
+#else
+#define TF_PLAT_STR___AVX512VBMI__
+#endif
+#if defined(__AVX512VL__)
+#define TF_PLAT_STR___AVX512VL__ TF_PLAT_STR_(__AVX512VL__)
+#else
+#define TF_PLAT_STR___AVX512VL__
+#endif
+#if defined(__AVX__)
+#define TF_PLAT_STR___AVX__ TF_PLAT_STR_(__AVX__)
+#else
+#define TF_PLAT_STR___AVX__
+#endif
+#if defined(__BMI2__)
+#define TF_PLAT_STR___BMI2__ TF_PLAT_STR_(__BMI2__)
+#else
+#define TF_PLAT_STR___BMI2__
+#endif
+#if defined(__BMI__)
+#define TF_PLAT_STR___BMI__ TF_PLAT_STR_(__BMI__)
+#else
+#define TF_PLAT_STR___BMI__
+#endif
+#if defined(__CLFLUSHOPT__)
+#define TF_PLAT_STR___CLFLUSHOPT__ TF_PLAT_STR_(__CLFLUSHOPT__)
+#else
+#define TF_PLAT_STR___CLFLUSHOPT__
+#endif
+#if defined(__CLZERO__)
+#define TF_PLAT_STR___CLZERO__ TF_PLAT_STR_(__CLZERO__)
+#else
+#define TF_PLAT_STR___CLZERO__
+#endif
+#if defined(__F16C__)
+#define TF_PLAT_STR___F16C__ TF_PLAT_STR_(__F16C__)
+#else
+#define TF_PLAT_STR___F16C__
+#endif
+#if defined(__FMA4__)
+#define TF_PLAT_STR___FMA4__ TF_PLAT_STR_(__FMA4__)
+#else
+#define TF_PLAT_STR___FMA4__
+#endif
+#if defined(__FMA__)
+#define TF_PLAT_STR___FMA__ TF_PLAT_STR_(__FMA__)
+#else
+#define TF_PLAT_STR___FMA__
+#endif
+#if defined(__FP_FAST_FMA)
+#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA)
+#else
+#define TF_PLAT_STR___FP_FAST_FMA
+#endif
+#if defined(__FP_FAST_FMAF)
+#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF)
+#else
+#define TF_PLAT_STR___FP_FAST_FMAF
+#endif
+#if defined(__FSGSBASE__)
+#define TF_PLAT_STR___FSGSBASE__ TF_PLAT_STR_(__FSGSBASE__)
+#else
+#define TF_PLAT_STR___FSGSBASE__
+#endif
+#if defined(__FXSR__)
+#define TF_PLAT_STR___FXSR__ TF_PLAT_STR_(__FXSR__)
+#else
+#define TF_PLAT_STR___FXSR__
+#endif
+#if defined(__LWP__)
+#define TF_PLAT_STR___LWP__ TF_PLAT_STR_(__LWP__)
+#else
+#define TF_PLAT_STR___LWP__
+#endif
+#if defined(__LZCNT__)
+#define TF_PLAT_STR___LZCNT__ TF_PLAT_STR_(__LZCNT__)
+#else
+#define TF_PLAT_STR___LZCNT__
+#endif
+#if defined(__MMX__)
+#define TF_PLAT_STR___MMX__ TF_PLAT_STR_(__MMX__)
+#else
+#define TF_PLAT_STR___MMX__
+#endif
+#if defined(__MWAITX__)
+#define TF_PLAT_STR___MWAITX__ TF_PLAT_STR_(__MWAITX__)
+#else
+#define TF_PLAT_STR___MWAITX__
+#endif
+#if defined(__PCLMUL__)
+#define TF_PLAT_STR___PCLMUL__ TF_PLAT_STR_(__PCLMUL__)
+#else
+#define TF_PLAT_STR___PCLMUL__
+#endif
+#if defined(__PKU__)
+#define TF_PLAT_STR___PKU__ TF_PLAT_STR_(__PKU__)
+#else
+#define TF_PLAT_STR___PKU__
+#endif
+#if defined(__POPCNT__)
+#define TF_PLAT_STR___POPCNT__ TF_PLAT_STR_(__POPCNT__)
+#else
+#define TF_PLAT_STR___POPCNT__
+#endif
+#if defined(__PRFCHW__)
+#define TF_PLAT_STR___PRFCHW__ TF_PLAT_STR_(__PRFCHW__)
+#else
+#define TF_PLAT_STR___PRFCHW__
+#endif
+#if defined(__RDRND__)
+#define TF_PLAT_STR___RDRND__ TF_PLAT_STR_(__RDRND__)
+#else
+#define TF_PLAT_STR___RDRND__
+#endif
+#if defined(__RDSEED__)
+#define TF_PLAT_STR___RDSEED__ TF_PLAT_STR_(__RDSEED__)
+#else
+#define TF_PLAT_STR___RDSEED__
+#endif
+#if defined(__RTM__)
+#define TF_PLAT_STR___RTM__ TF_PLAT_STR_(__RTM__)
+#else
+#define TF_PLAT_STR___RTM__
+#endif
+#if defined(__SHA__)
+#define TF_PLAT_STR___SHA__ TF_PLAT_STR_(__SHA__)
+#else
+#define TF_PLAT_STR___SHA__
+#endif
+#if defined(__SSE2_MATH__)
+#define TF_PLAT_STR___SSE2_MATH__ TF_PLAT_STR_(__SSE2_MATH__)
+#else
+#define TF_PLAT_STR___SSE2_MATH__
+#endif
+#if defined(__SSE2__)
+#define TF_PLAT_STR___SSE2__ TF_PLAT_STR_(__SSE2__)
+#else
+#define TF_PLAT_STR___SSE2__
+#endif
+#if defined(__SSE_MATH__)
+#define TF_PLAT_STR___SSE_MATH__ TF_PLAT_STR_(__SSE_MATH__)
+#else
+#define TF_PLAT_STR___SSE_MATH__
+#endif
+#if defined(__SSE__)
+#define TF_PLAT_STR___SSE__ TF_PLAT_STR_(__SSE__)
+#else
+#define TF_PLAT_STR___SSE__
+#endif
+#if defined(__SSE3__)
+#define TF_PLAT_STR___SSE3__ TF_PLAT_STR_(__SSE3__)
+#else
+#define TF_PLAT_STR___SSE3__
+#endif
+#if defined(__SSE4A__)
+#define TF_PLAT_STR___SSE4A__ TF_PLAT_STR_(__SSE4A__)
+#else
+#define TF_PLAT_STR___SSE4A__
+#endif
+#if defined(__SSE4_1__)
+#define TF_PLAT_STR___SSE4_1__ TF_PLAT_STR_(__SSE4_1__)
+#else
+#define TF_PLAT_STR___SSE4_1__
+#endif
+#if defined(__SSE4_2__)
+#define TF_PLAT_STR___SSE4_2__ TF_PLAT_STR_(__SSE4_2__)
+#else
+#define TF_PLAT_STR___SSE4_2__
+#endif
+#if defined(__SSSE3__)
+#define TF_PLAT_STR___SSSE3__ TF_PLAT_STR_(__SSSE3__)
+#else
+#define TF_PLAT_STR___SSSE3__
+#endif
+#if defined(__TBM__)
+#define TF_PLAT_STR___TBM__ TF_PLAT_STR_(__TBM__)
+#else
+#define TF_PLAT_STR___TBM__
+#endif
+#if defined(__XOP__)
+#define TF_PLAT_STR___XOP__ TF_PLAT_STR_(__XOP__)
+#else
+#define TF_PLAT_STR___XOP__
+#endif
+#if defined(__XSAVEC__)
+#define TF_PLAT_STR___XSAVEC__ TF_PLAT_STR_(__XSAVEC__)
+#else
+#define TF_PLAT_STR___XSAVEC__
+#endif
+#if defined(__XSAVEOPT__)
+#define TF_PLAT_STR___XSAVEOPT__ TF_PLAT_STR_(__XSAVEOPT__)
+#else
+#define TF_PLAT_STR___XSAVEOPT__
+#endif
+#if defined(__XSAVES__)
+#define TF_PLAT_STR___XSAVES__ TF_PLAT_STR_(__XSAVES__)
+#else
+#define TF_PLAT_STR___XSAVES__
+#endif
+#if defined(__XSAVE__)
+#define TF_PLAT_STR___XSAVE__ TF_PLAT_STR_(__XSAVE__)
+#else
+#define TF_PLAT_STR___XSAVE__
+#endif
+#if defined(_SOFT_DOUBLE)
+#define TF_PLAT_STR__SOFT_DOUBLE TF_PLAT_STR_(_SOFT_DOUBLE)
+#else
+#define TF_PLAT_STR__SOFT_DOUBLE
+#endif
+#if defined(_SOFT_FLOAT)
+#define TF_PLAT_STR__SOFT_FLOAT TF_PLAT_STR_(_SOFT_FLOAT)
+#else
+#define TF_PLAT_STR__SOFT_FLOAT
+#endif
+#if defined(__ALTIVEC__)
+#define TF_PLAT_STR___ALTIVEC__ TF_PLAT_STR_(__ALTIVEC__)
+#else
+#define TF_PLAT_STR___ALTIVEC__
+#endif
+#if defined(__APPLE_ALTIVEC__)
+#define TF_PLAT_STR___APPLE_ALTIVEC__ TF_PLAT_STR_(__APPLE_ALTIVEC__)
+#else
+#define TF_PLAT_STR___APPLE_ALTIVEC__
+#endif
+#if defined(__CRYPTO__)
+#define TF_PLAT_STR___CRYPTO__ TF_PLAT_STR_(__CRYPTO__)
+#else
+#define TF_PLAT_STR___CRYPTO__
+#endif
+#if defined(__FLOAT128_HARDWARE__)
+#define TF_PLAT_STR___FLOAT128_HARDWARE__ TF_PLAT_STR_(__FLOAT128_HARDWARE__)
+#else
+#define TF_PLAT_STR___FLOAT128_HARDWARE__
+#endif
+#if defined(__FLOAT128_TYPE__)
+#define TF_PLAT_STR___FLOAT128_TYPE__ TF_PLAT_STR_(__FLOAT128_TYPE__)
+#else
+#define TF_PLAT_STR___FLOAT128_TYPE__
+#endif
+#if defined(__FP_FAST_FMA)
+#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA)
+#else
+#define TF_PLAT_STR___FP_FAST_FMA
+#endif
+#if defined(__FP_FAST_FMAF)
+#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF)
+#else
+#define TF_PLAT_STR___FP_FAST_FMAF
+#endif
+#if defined(__HTM__)
+#define TF_PLAT_STR___HTM__ TF_PLAT_STR_(__HTM__)
+#else
+#define TF_PLAT_STR___HTM__
+#endif
+#if defined(__NO_FPRS__)
+#define TF_PLAT_STR___NO_FPRS__ TF_PLAT_STR_(__NO_FPRS__)
+#else
+#define TF_PLAT_STR___NO_FPRS__
+#endif
+#if defined(__NO_LWSYNC__)
+#define TF_PLAT_STR___NO_LWSYNC__ TF_PLAT_STR_(__NO_LWSYNC__)
+#else
+#define TF_PLAT_STR___NO_LWSYNC__
+#endif
+#if defined(__POWER8_VECTOR__)
+#define TF_PLAT_STR___POWER8_VECTOR__ TF_PLAT_STR_(__POWER8_VECTOR__)
+#else
+#define TF_PLAT_STR___POWER8_VECTOR__
+#endif
+#if defined(__POWER9_VECTOR__)
+#define TF_PLAT_STR___POWER9_VECTOR__ TF_PLAT_STR_(__POWER9_VECTOR__)
+#else
+#define TF_PLAT_STR___POWER9_VECTOR__
+#endif
+#if defined(__PPC405__)
+#define TF_PLAT_STR___PPC405__ TF_PLAT_STR_(__PPC405__)
+#else
+#define TF_PLAT_STR___PPC405__
+#endif
+#if defined(__QUAD_MEMORY_ATOMIC__)
+#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__ TF_PLAT_STR_(__QUAD_MEMORY_ATOMIC__)
+#else
+#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__
+#endif
+#if defined(__RECIPF__)
+#define TF_PLAT_STR___RECIPF__ TF_PLAT_STR_(__RECIPF__)
+#else
+#define TF_PLAT_STR___RECIPF__
+#endif
+#if defined(__RECIP_PRECISION__)
+#define TF_PLAT_STR___RECIP_PRECISION__ TF_PLAT_STR_(__RECIP_PRECISION__)
+#else
+#define TF_PLAT_STR___RECIP_PRECISION__
+#endif
+#if defined(__RECIP__)
+#define TF_PLAT_STR___RECIP__ TF_PLAT_STR_(__RECIP__)
+#else
+#define TF_PLAT_STR___RECIP__
+#endif
+#if defined(__RSQRTEF__)
+#define TF_PLAT_STR___RSQRTEF__ TF_PLAT_STR_(__RSQRTEF__)
+#else
+#define TF_PLAT_STR___RSQRTEF__
+#endif
+#if defined(__RSQRTE__)
+#define TF_PLAT_STR___RSQRTE__ TF_PLAT_STR_(__RSQRTE__)
+#else
+#define TF_PLAT_STR___RSQRTE__
+#endif
+#if defined(__TM_FENCE__)
+#define TF_PLAT_STR___TM_FENCE__ TF_PLAT_STR_(__TM_FENCE__)
+#else
+#define TF_PLAT_STR___TM_FENCE__
+#endif
+#if defined(__UPPER_REGS_DF__)
+#define TF_PLAT_STR___UPPER_REGS_DF__ TF_PLAT_STR_(__UPPER_REGS_DF__)
+#else
+#define TF_PLAT_STR___UPPER_REGS_DF__
+#endif
+#if defined(__UPPER_REGS_SF__)
+#define TF_PLAT_STR___UPPER_REGS_SF__ TF_PLAT_STR_(__UPPER_REGS_SF__)
+#else
+#define TF_PLAT_STR___UPPER_REGS_SF__
+#endif
+#if defined(__VEC__)
+#define TF_PLAT_STR___VEC__ TF_PLAT_STR_(__VEC__)
+#else
+#define TF_PLAT_STR___VEC__
+#endif
+#if defined(__VSX__)
+#define TF_PLAT_STR___VSX__ TF_PLAT_STR_(__VSX__)
+#else
+#define TF_PLAT_STR___VSX__
+#endif
+#if defined(__ARM_ARCH)
+#define TF_PLAT_STR___ARM_ARCH TF_PLAT_STR_(__ARM_ARCH)
+#else
+#define TF_PLAT_STR___ARM_ARCH
+#endif
+#if defined(__ARM_FEATURE_CLZ)
+#define TF_PLAT_STR___ARM_FEATURE_CLZ TF_PLAT_STR_(__ARM_FEATURE_CLZ)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CLZ
+#endif
+#if defined(__ARM_FEATURE_CRC32)
+#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRC32
+#endif
+#if defined(__ARM_FEATURE_CRC32)
+#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRC32
+#endif
+#if defined(__ARM_FEATURE_CRYPTO)
+#define TF_PLAT_STR___ARM_FEATURE_CRYPTO TF_PLAT_STR_(__ARM_FEATURE_CRYPTO)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRYPTO
+#endif
+#if defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING \
+  TF_PLAT_STR_(__ARM_FEATURE_DIRECTED_ROUNDING)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING
+#endif
+#if defined(__ARM_FEATURE_DSP)
+#define TF_PLAT_STR___ARM_FEATURE_DSP TF_PLAT_STR_(__ARM_FEATURE_DSP)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_DSP
+#endif
+#if defined(__ARM_FEATURE_FMA)
+#define TF_PLAT_STR___ARM_FEATURE_FMA TF_PLAT_STR_(__ARM_FEATURE_FMA)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_FMA
+#endif
+#if defined(__ARM_FEATURE_IDIV)
+#define TF_PLAT_STR___ARM_FEATURE_IDIV TF_PLAT_STR_(__ARM_FEATURE_IDIV)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_IDIV
+#endif
+#if defined(__ARM_FEATURE_LDREX)
+#define TF_PLAT_STR___ARM_FEATURE_LDREX TF_PLAT_STR_(__ARM_FEATURE_LDREX)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_LDREX
+#endif
+#if defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN \
+  TF_PLAT_STR_(__ARM_FEATURE_NUMERIC_MAXMIN)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN
+#endif
+#if defined(__ARM_FEATURE_QBIT)
+#define TF_PLAT_STR___ARM_FEATURE_QBIT TF_PLAT_STR_(__ARM_FEATURE_QBIT)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_QBIT
+#endif
+#if defined(__ARM_FEATURE_QRDMX)
+#define TF_PLAT_STR___ARM_FEATURE_QRDMX TF_PLAT_STR_(__ARM_FEATURE_QRDMX)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_QRDMX
+#endif
+#if defined(__ARM_FEATURE_SAT)
+#define TF_PLAT_STR___ARM_FEATURE_SAT TF_PLAT_STR_(__ARM_FEATURE_SAT)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_SAT
+#endif
+#if defined(__ARM_FEATURE_SIMD32)
+#define TF_PLAT_STR___ARM_FEATURE_SIMD32 TF_PLAT_STR_(__ARM_FEATURE_SIMD32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_SIMD32
+#endif
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED \
+  TF_PLAT_STR_(__ARM_FEATURE_UNALIGNED)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED
+#endif
+#if defined(__ARM_FP)
+#define TF_PLAT_STR___ARM_FP TF_PLAT_STR_(__ARM_FP)
+#else
+#define TF_PLAT_STR___ARM_FP
+#endif
+#if defined(__ARM_NEON_FP)
+#define TF_PLAT_STR___ARM_NEON_FP TF_PLAT_STR_(__ARM_NEON_FP)
+#else
+#define TF_PLAT_STR___ARM_NEON_FP
+#endif
+#if defined(__ARM_NEON__)
+#define TF_PLAT_STR___ARM_NEON__ TF_PLAT_STR_(__ARM_NEON__)
+#else
+#define TF_PLAT_STR___ARM_NEON__
+#endif
+#if defined(__ARM_WMMX)
+#define TF_PLAT_STR___ARM_WMMX TF_PLAT_STR_(__ARM_WMMX)
+#else
+#define TF_PLAT_STR___ARM_WMMX
+#endif
+#if defined(__IWMMXT2__)
+#define TF_PLAT_STR___IWMMXT2__ TF_PLAT_STR_(__IWMMXT2__)
+#else
+#define TF_PLAT_STR___IWMMXT2__
+#endif
+#if defined(__IWMMXT__)
+#define TF_PLAT_STR___IWMMXT__ TF_PLAT_STR_(__IWMMXT__)
+#else
+#define TF_PLAT_STR___IWMMXT__
+#endif
+#if defined(__VFP_FP__)
+#define TF_PLAT_STR___VFP_FP__ TF_PLAT_STR_(__VFP_FP__)
+#else
+#define TF_PLAT_STR___VFP_FP__
+#endif
+#if defined(TARGET_IPHONE_SIMULATOR)
+#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR \
+  TF_PLAT_STR_(TARGET_IPHONE_SIMULATOR)
+#else
+#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR
+#endif
+#if defined(TARGET_OS_IOS)
+#define TF_PLAT_STR_TARGET_OS_IOS TF_PLAT_STR_(TARGET_OS_IOS)
+#else
+#define TF_PLAT_STR_TARGET_OS_IOS
+#endif
+#if defined(TARGET_OS_IPHONE)
+#define TF_PLAT_STR_TARGET_OS_IPHONE TF_PLAT_STR_(TARGET_OS_IPHONE)
+#else
+#define TF_PLAT_STR_TARGET_OS_IPHONE
+#endif
+#if defined(_MSC_VER)
+#define TF_PLAT_STR__MSC_VER TF_PLAT_STR_(_MSC_VER)
+#else
+#define TF_PLAT_STR__MSC_VER
+#endif
+#if defined(_M_ARM)
+#define TF_PLAT_STR__M_ARM TF_PLAT_STR_(_M_ARM)
+#else
+#define TF_PLAT_STR__M_ARM
+#endif
+#if defined(_M_ARM64)
+#define TF_PLAT_STR__M_ARM64 TF_PLAT_STR_(_M_ARM64)
+#else
+#define TF_PLAT_STR__M_ARM64
+#endif
+#if defined(_M_ARM_ARMV7VE)
+#define TF_PLAT_STR__M_ARM_ARMV7VE TF_PLAT_STR_(_M_ARM_ARMV7VE)
+#else
+#define TF_PLAT_STR__M_ARM_ARMV7VE
+#endif
+#if defined(_M_ARM_FP)
+#define TF_PLAT_STR__M_ARM_FP TF_PLAT_STR_(_M_ARM_FP)
+#else
+#define TF_PLAT_STR__M_ARM_FP
+#endif
+#if defined(_M_IX86)
+#define TF_PLAT_STR__M_IX86 TF_PLAT_STR_(_M_IX86)
+#else
+#define TF_PLAT_STR__M_IX86
+#endif
+#if defined(_M_X64)
+#define TF_PLAT_STR__M_X64 TF_PLAT_STR_(_M_X64)
+#else
+#define TF_PLAT_STR__M_X64
+#endif
+#if defined(_WIN32)
+#define TF_PLAT_STR__WIN32 TF_PLAT_STR_(_WIN32)
+#else
+#define TF_PLAT_STR__WIN32
+#endif
+#if defined(_WIN64)
+#define TF_PLAT_STR__WIN64 TF_PLAT_STR_(_WIN64)
+#else
+#define TF_PLAT_STR__WIN64
+#endif
+#if defined(__ANDROID__)
+#define TF_PLAT_STR___ANDROID__ TF_PLAT_STR_(__ANDROID__)
+#else
+#define TF_PLAT_STR___ANDROID__
+#endif
+#if defined(__APPLE__)
+#define TF_PLAT_STR___APPLE__ TF_PLAT_STR_(__APPLE__)
+#else
+#define TF_PLAT_STR___APPLE__
+#endif
+#if defined(__BYTE_ORDER__)
+#define TF_PLAT_STR___BYTE_ORDER__ TF_PLAT_STR_(__BYTE_ORDER__)
+#else
+#define TF_PLAT_STR___BYTE_ORDER__
+#endif
+#if defined(__CYGWIN__)
+#define TF_PLAT_STR___CYGWIN__ TF_PLAT_STR_(__CYGWIN__)
+#else
+#define TF_PLAT_STR___CYGWIN__
+#endif
+#if defined(__FreeBSD__)
+#define TF_PLAT_STR___FreeBSD__ TF_PLAT_STR_(__FreeBSD__)
+#else
+#define TF_PLAT_STR___FreeBSD__
+#endif
+#if defined(__LITTLE_ENDIAN__)
+#define TF_PLAT_STR___LITTLE_ENDIAN__ TF_PLAT_STR_(__LITTLE_ENDIAN__)
+#else
+#define TF_PLAT_STR___LITTLE_ENDIAN__
+#endif
+#if defined(__NetBSD__)
+#define TF_PLAT_STR___NetBSD__ TF_PLAT_STR_(__NetBSD__)
+#else
+#define TF_PLAT_STR___NetBSD__
+#endif
+#if defined(__OpenBSD__)
+#define TF_PLAT_STR___OpenBSD__ TF_PLAT_STR_(__OpenBSD__)
+#else
+#define TF_PLAT_STR___OpenBSD__
+#endif
+#if defined(____MSYS__)
+#define TF_PLAT_STR_____MSYS__ TF_PLAT_STR_(____MSYS__)
+#else
+#define TF_PLAT_STR_____MSYS__
+#endif
+#if defined(__aarch64__)
+#define TF_PLAT_STR___aarch64__ TF_PLAT_STR_(__aarch64__)
+#else
+#define TF_PLAT_STR___aarch64__
+#endif
+#if defined(__alpha__)
+#define TF_PLAT_STR___alpha__ TF_PLAT_STR_(__alpha__)
+#else
+#define TF_PLAT_STR___alpha__
+#endif
+#if defined(__arm__)
+#define TF_PLAT_STR___arm__ TF_PLAT_STR_(__arm__)
+#else
+#define TF_PLAT_STR___arm__
+#endif
+#if defined(__i386__)
+#define TF_PLAT_STR___i386__ TF_PLAT_STR_(__i386__)
+#else
+#define TF_PLAT_STR___i386__
+#endif
+#if defined(__i686__)
+#define TF_PLAT_STR___i686__ TF_PLAT_STR_(__i686__)
+#else
+#define TF_PLAT_STR___i686__
+#endif
+#if defined(__ia64__)
+#define TF_PLAT_STR___ia64__ TF_PLAT_STR_(__ia64__)
+#else
+#define TF_PLAT_STR___ia64__
+#endif
+#if defined(__linux__)
+#define TF_PLAT_STR___linux__ TF_PLAT_STR_(__linux__)
+#else
+#define TF_PLAT_STR___linux__
+#endif
+#if defined(__mips32__)
+#define TF_PLAT_STR___mips32__ TF_PLAT_STR_(__mips32__)
+#else
+#define TF_PLAT_STR___mips32__
+#endif
+#if defined(__mips64__)
+#define TF_PLAT_STR___mips64__ TF_PLAT_STR_(__mips64__)
+#else
+#define TF_PLAT_STR___mips64__
+#endif
+#if defined(__powerpc64__)
+#define TF_PLAT_STR___powerpc64__ TF_PLAT_STR_(__powerpc64__)
+#else
+#define TF_PLAT_STR___powerpc64__
+#endif
+#if defined(__powerpc__)
+#define TF_PLAT_STR___powerpc__ TF_PLAT_STR_(__powerpc__)
+#else
+#define TF_PLAT_STR___powerpc__
+#endif
+#if defined(__riscv___)
+#define TF_PLAT_STR___riscv___ TF_PLAT_STR_(__riscv___)
+#else
+#define TF_PLAT_STR___riscv___
+#endif
+#if defined(__s390x__)
+#define TF_PLAT_STR___s390x__ TF_PLAT_STR_(__s390x__)
+#else
+#define TF_PLAT_STR___s390x__
+#endif
+#if defined(__sparc64__)
+#define TF_PLAT_STR___sparc64__ TF_PLAT_STR_(__sparc64__)
+#else
+#define TF_PLAT_STR___sparc64__
+#endif
+#if defined(__sparc__)
+#define TF_PLAT_STR___sparc__ TF_PLAT_STR_(__sparc__)
+#else
+#define TF_PLAT_STR___sparc__
+#endif
+#if defined(__x86_64__)
+#define TF_PLAT_STR___x86_64__ TF_PLAT_STR_(__x86_64__)
+#else
+#define TF_PLAT_STR___x86_64__
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
diff --git a/tensorflow/core/platform/platform_strings_test.cc b/tensorflow/core/platform/platform_strings_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5251f10d4124650dd7b2d260b1665b988bb663c9
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Test for the platform_strings.h header file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/platform_strings.h"
+
+// Embed the platform strings in this binary.
+TF_PLATFORM_STRINGS()
+
+// A vector of strings.
+typedef std::vector<std::string> string_vec;
+
+// Append to *found the strings within the named file with the platform_strings
+// magic prefix, and return true; or return false on error.
+
+// Print the platform strings embedded in the binary file_name and return 0,
+// on on error return 2.
+static int PrintStrings(const std::string file_name) {
+  int rc = 0;
+  string_vec str;
+  if (!tensorflow::GetPlatformStrings(file_name, &str)) {
+    for (int i = 0; i != str.size(); i++) {
+      printf("%s\n", str[i].c_str());
+    }
+  } else {
+    perror(file_name.c_str());
+    rc = 2;
+  }
+  return rc;
+}
+
+// Return whether str[] conatins a string with prefix "macro_name="; if so,
+// set *pvalue to the suffix.
+static bool GetValue(const string_vec &str, const std::string &macro_name,
+                     std::string *pvalue) {
+  std::string nam_eq = macro_name + "=";
+  int i = 0;
+  while (i != str.size() && !tensorflow::str_util::StartsWith(str[i], nam_eq)) {
+    i++;
+  }
+  bool found = (i != str.size());
+  if (found) {
+    *pvalue = str[i].substr(nam_eq.size());
+  }
+  return found;
+}
+
+// If macro_name[] is not equal to value[], check that str[] contains the
+// string "macro_name=value".  Otherwise, check that str[] does not contain any
+// string starting with macro_name=".
+static void CheckStr(const string_vec &str, const std::string &macro_name,
+                     const std::string &value) {
+  std::string value_from_str;
+  if (GetValue(str, macro_name, &value_from_str)) {
+    if (value != value_from_str) {
+      // Output everything found, to aid debugging.
+      LOG(ERROR) << "===== value=" << value
+                 << "  value_from_str=" << value_from_str;
+      for (int i = 0; i != str.size(); i++) {
+        LOG(ERROR) << "% " << str[i];
+      }
+      LOG(ERROR) << "=====";
+    }
+    CHECK_EQ(value, value_from_str) << " " << macro_name << ": bad value";
+  } else {
+    // If the string is not found, we expect value to be macro_name.
+    if (value != macro_name) {
+      // Output everything found, to aid debugging.
+      LOG(ERROR) << "===== value=" << value << "  macro_name=" << macro_name;
+      for (int i = 0; i != str.size(); i++) {
+        LOG(ERROR) << "% " << str[i];
+      }
+      LOG(ERROR) << "=====";
+    }
+    CHECK_EQ(value, macro_name) << " " << macro_name << ": not found in binary";
+  }
+}
+
+// Helper for AS_STR(), below, to perform macro expansion.
+#define AS_STR_1_(x) #x
+
+// Yield x after macro expansion as a nul-terminated constant string.
+#define AS_STR(x) AS_STR_1_(x)
+
+// Run the test, and return 0 on success, 2 otherwise.
+static int RunTest(const std::string &binary_name) {
+  int rc = 0;
+  string_vec str;
+
+  if (!tensorflow::GetPlatformStrings(binary_name, &str)) {
+    CheckStr(str, "__linux__", AS_STR(__linux__));
+    CheckStr(str, "_WIN32", AS_STR(_WIN32));
+    CheckStr(str, "__APPLE__", AS_STR(__APPLE__));
+    CheckStr(str, "__x86_64__", AS_STR(__x86_64__));
+    CheckStr(str, "__aarch64__", AS_STR(__aarch64__));
+    CheckStr(str, "__powerpc64__", AS_STR(__powerpc64__));
+    CheckStr(str, "TF_PLAT_STR_VERSION", TF_PLAT_STR_VERSION_);
+  } else {
+    perror(binary_name.c_str());
+    rc = 2;
+  }
+
+  return rc;
+}
+
+int main(int argc, char *argv[]) {
+  tensorflow::Env *env = tensorflow::Env::Default();
+  static const char usage[] = "usage: platform_strings_test [file...]";
+  int rc = 0;
+  tensorflow::port::InitMain(usage, &argc, &argv);
+  if (argc == 1) {
+    printf("rc=%d\n", PrintStrings(env->GetExecutablePath()));
+    rc = RunTest(env->GetExecutablePath());
+  } else {
+    for (int argn = 1; argn != argc; argn++) {
+      rc |= PrintStrings(argv[argn]);
+    }
+  }
+  return rc;
+}
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 4689af06afedb2339449e2f01e6d325ea26cd4c9..174b58866150d1ccd366fb138d87c3b7c5fd58c0 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -413,6 +413,11 @@ message ConfigProto {
     // Any positive value sets the max chunk size.  0 defaults to 4096.
     // Any negative value indicates no max, i.e. one chunk only.
     int32 recv_buf_max_chunk = 4;
+
+    // If true, and supported by the platform, the runtime will attempt to
+    // use NUMA affinity where applicable.  One consequence will be the
+    // existence of as many CPU devices as there are available NUMA nodes.
+    bool use_numa_affinity = 5;
   };
 
   Experimental experimental = 16;
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 143df115f424ae0c5ee7b212806e16a45c1b9fad..515d673828e3792ac6f4268fd55b58e43aab509b 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -38,7 +38,7 @@ message RewriterConfig {
   }
 
   // Enum controlling the number of times to run optimizers. The default is to
-  // run them once.
+  // run them twice.
   enum NumIterationsType {
     DEFAULT_NUM_ITERS = 0;
     ONE = 1;
@@ -137,6 +137,11 @@ message RewriterConfig {
   // meta-optimizer or when manually specified through the optimizers field.
   AutoParallelOptions auto_parallel = 5;
 
+  // If true, any optimization pass failing will cause the MetaOptimizer to
+  // stop with an error. By default - or when set to false, failing passes are
+  // skipped silently.
+  bool fail_on_optimizer_errors = 21;
+
   ScopedAllocatorOptions scoped_allocator_opts = 16;
 
   // If non-empty, will use this as an alternative way to specify a list of
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index 732ed33ede17bc90d3301d3f1eee6302a96028d7..2b035ab0e9c8500931665890a637ea6f3242ba22 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -131,7 +131,7 @@ class CudaLaunchConfigTest : public ::testing::Test {
  protected:
   const int bufsize = 1024;
   int* outbuf = nullptr;
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
 
   virtual void SetUp() {
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
index d0d95736d3f1c37055b5383aa4e3141145838aab..080d4067cec69084b54ba1c096d01198a8e48d20 100644
--- a/tensorflow/core/util/cuda_launch_config.h
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -128,12 +128,12 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
   CudaLaunchConfig config;
   const int virtual_thread_count = work_element_count;
   const int physical_thread_count = std::min(
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
+  const int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
   const int block_count =
       std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumCudaMultiProcessors());
+               d.getNumGpuMultiProcessors());
 
   config.virtual_thread_count = virtual_thread_count;
   config.thread_per_block = thread_per_block;
@@ -184,7 +184,7 @@ inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
   cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
-  block_count = std::min(block_count * d.getNumCudaMultiProcessors(),
+  block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
 
   config.virtual_thread_count = work_element_count;
@@ -213,7 +213,7 @@ inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
   const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
 
   const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
 
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 798836471663ab4f154c62a587d1b63d3a1c098e..b7a6e0b690282b95f08183b5b7a11abd0a5972b6 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
-#include <string>
 #include <memory>
+#include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -54,9 +54,9 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/util/env_var.h"
 
 #ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
@@ -83,7 +83,12 @@ namespace tensorflow {
 // MKL operation, and did not go through a conversion to a standard
 // Tensorflow tensor.
 
+// For use with MKL ML, has been deprecated
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
+
+// The dimensions order that MKL DNN internally uses for 2D activations
+// [Batch, Channel, Height, Width] and
+// for 2D filters [Out_Channel, In_Channel, Height, Width].
 typedef enum {
   Dim_N = 0,
   Dim_C = 1,
@@ -93,6 +98,9 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
+// The dimensions order that MKL DNN internally uses for 3D activations
+// [Batch, Channel, Depth, Height, Width] and
+// for 3D filters [Out_Channel, In_Channel, Depth, Height, Width].
 typedef enum {
   Dim3d_N = 0,
   Dim3d_C = 1,
@@ -103,6 +111,13 @@ typedef enum {
   Dim3d_I = 1
 } MklDnnDims3D;
 
+// Enum used to templatize MklOp kernel implementations
+// that support both fp32 and int8 versions.
+enum class MklQuantization {
+  QUANTIZED_VERSION,
+  FP_VERSION,
+};
+
 static const int kSmallBatchSize = 32;
 
 #ifdef INTEL_MKL_ML_ONLY
@@ -653,7 +668,6 @@ class MklDnnShape {
     }
   }
 
-
   inline void SetTfDimOrder(const size_t dimension, memory::format format) {
     TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format);
     SetTfDimOrder(dimension, data_format);
@@ -782,7 +796,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 }
 #else
 using mkldnn::stream;
-template <typename T> class MklDnnData;
+template <typename T>
+class MklDnnData;
 
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
@@ -792,11 +807,12 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
     if (!mkl_shape.IsMklTensor())
       return mkl_tensor;  // return input since it is already TF tensor
 
-    TensorShape output_shape = mkl_shape.GetTfShape();;
+    TensorShape output_shape = mkl_shape.GetTfShape();
+    ;
 
     // Allocate output tensor.
-    context->allocate_temp(DataTypeToEnum<T>::v(),
-        output_shape, &output_tensor);
+    context->allocate_temp(DataTypeToEnum<T>::v(), output_shape,
+                           &output_tensor);
 
     auto cpu_engine = engine(engine::cpu, 0);
     MklDnnData<T> input(&cpu_engine);
@@ -811,7 +827,7 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
     if (input.IsReorderNeeded(output_tf_pd)) {
       std::vector<primitive> net;
       CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net),
-             true);
+               true);
       stream(stream::kind::eager).submit(net).wait();
     } else {
       // If not, just forward input tensor to output tensor.
@@ -1386,6 +1402,18 @@ template <>
 memory::data_type MklDnnType<float>() {
   return memory::data_type::f32;
 }
+template <>
+memory::data_type MklDnnType<quint8>() {
+  return memory::data_type::u8;
+}
+template <>
+memory::data_type MklDnnType<qint8>() {
+  return memory::data_type::s8;
+}
+template <>
+memory::data_type MklDnnType<qint32>() {
+  return memory::data_type::s32;
+}
 
 /// Map TensorFlow's data format into MKL-DNN 3D data format
 /// @input: TensorFlow data format
@@ -2003,8 +2031,7 @@ const mkldnn::memory::dims NONE_DIMS = {};
 template <typename T>
 class MklPrimitiveFactory {
  public:
-  MklPrimitiveFactory() {
-  }
+  MklPrimitiveFactory() {}
 
   ~MklPrimitiveFactory() {}
 
@@ -2032,8 +2059,8 @@ class MklPrimitiveFactory {
   /// For those legacy device(w/o AVX512 and AVX2),
   /// MKL-DNN GEMM will be used.
   static inline bool IsLegacyPlatform() {
-    return (!port::TestCPUFeature(port::CPUFeature::AVX512F)
-                   && !port::TestCPUFeature(port::CPUFeature::AVX2));
+    return (!port::TestCPUFeature(port::CPUFeature::AVX512F) &&
+            !port::TestCPUFeature(port::CPUFeature::AVX2));
   }
 
   /// Fuction to check whether primitive memory optimization is enabled
@@ -2054,15 +2081,13 @@ class MklPrimitiveFactory {
 // utility class for creating keys of MKL primitive pool.
 class FactoryKeyCreator {
  public:
-  FactoryKeyCreator() {
-    key_.reserve(kMaxKeyLength);
-  }
+  FactoryKeyCreator() { key_.reserve(kMaxKeyLength); }
 
   ~FactoryKeyCreator() {}
 
   void AddAsKey(const string& str) { Append(str); }
 
-  void AddAsKey(const mkldnn::memory::dims &dims) {
+  void AddAsKey(const mkldnn::memory::dims& dims) {
     for (unsigned int i = 0; i < dims.size(); i++) {
       AddAsKey<int>(dims[i]);
     }
@@ -2070,7 +2095,7 @@ class FactoryKeyCreator {
 
   template <typename T>
   void AddAsKey(const T data) {
-    auto buffer = reinterpret_cast<const char *>(&data);
+    auto buffer = reinterpret_cast<const char*>(&data);
     Append(StringPiece(buffer, sizeof(T)));
   }
 
@@ -2086,7 +2111,6 @@ class FactoryKeyCreator {
   }
 };
 
-
 static inline memory::format get_desired_format(int channel,
                                                 bool is_2d = true) {
   memory::format fmt_desired = memory::format::any;
@@ -2108,37 +2132,34 @@ class MklReorderPrimitive : public MklPrimitive {
   explicit MklReorderPrimitive(const memory* from, const memory* to) {
     Setup(from, to);
   }
-    ~MklReorderPrimitive() {}
+  ~MklReorderPrimitive() {}
 
-    std::shared_ptr<primitive> GetPrimitive() {
-      return context_.reorder_prim;
-    }
+  std::shared_ptr<primitive> GetPrimitive() { return context_.reorder_prim; }
 
-    void SetMemory(const memory* from, const memory* to) {
-      context_.src_mem->set_data_handle(from->get_data_handle());
-      context_.dst_mem->set_data_handle(to->get_data_handle());
-    }
+  void SetMemory(const memory* from, const memory* to) {
+    context_.src_mem->set_data_handle(from->get_data_handle());
+    context_.dst_mem->set_data_handle(to->get_data_handle());
+  }
 
  private:
-    struct ReorderContext {
-      std::shared_ptr<mkldnn::memory> src_mem;
-      std::shared_ptr<mkldnn::memory> dst_mem;
-      std::shared_ptr<primitive> reorder_prim;
-      ReorderContext():
-        src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {
-      }
-    } context_;
-
-    engine cpu_engine_ = engine(engine::cpu, 0);
-
-    void Setup(const memory* from, const memory* to) {
-      context_.src_mem.reset(new memory(
-            {from->get_primitive_desc().desc(), cpu_engine_}, DummyData));
-      context_.dst_mem.reset(new memory(
-            {to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
-      context_.reorder_prim = std::make_shared<mkldnn::reorder>(
-          reorder(*context_.src_mem, *context_.dst_mem));
-    }
+  struct ReorderContext {
+    std::shared_ptr<mkldnn::memory> src_mem;
+    std::shared_ptr<mkldnn::memory> dst_mem;
+    std::shared_ptr<primitive> reorder_prim;
+    ReorderContext()
+        : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {}
+  } context_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+
+  void Setup(const memory* from, const memory* to) {
+    context_.src_mem.reset(new memory(
+        {from->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    context_.dst_mem.reset(
+        new memory({to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    context_.reorder_prim = std::make_shared<mkldnn::reorder>(
+        reorder(*context_.src_mem, *context_.dst_mem));
+  }
 };
 
 template <typename T>
@@ -2156,52 +2177,51 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory<T> {
     return reorderPrim;
   }
 
-    static MklReorderPrimitiveFactory & GetInstance() {
-      static MklReorderPrimitiveFactory instance_;
-      return instance_;
-    }
+  static MklReorderPrimitiveFactory& GetInstance() {
+    static MklReorderPrimitiveFactory instance_;
+    return instance_;
+  }
 
  private:
-    MklReorderPrimitiveFactory() {}
-    ~MklReorderPrimitiveFactory() {}
-
-    static string CreateKey(const memory* from, const memory* to) {
-      string prefix = "reorder";
-      FactoryKeyCreator key_creator;
-      auto const &from_desc =  from->get_primitive_desc().desc().data;
-      auto const &to_desc =  to->get_primitive_desc().desc().data;
-      const int KIdxFirstStride = 0;
-      memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
-      memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
-      memory::dims from_strides(
-          from_desc.layout_desc.blocking.strides[KIdxFirstStride],
-          &from_desc.layout_desc.blocking
-               .strides[KIdxFirstStride][from_desc.ndims]);
-      memory::dims to_strides(
-          to_desc.layout_desc.blocking.strides[KIdxFirstStride],
-          &to_desc.layout_desc.blocking
-               .strides[KIdxFirstStride][to_desc.ndims]);
-      key_creator.AddAsKey(prefix);
-      key_creator.AddAsKey(static_cast<int>(from_desc.format));
-      key_creator.AddAsKey(static_cast<int>(from_desc.data_type));
-      key_creator.AddAsKey(from_dims);
-      key_creator.AddAsKey(from_strides);
-      key_creator.AddAsKey(static_cast<int>(to_desc.format));
-      key_creator.AddAsKey(static_cast<int>(to_desc.data_type));
-      key_creator.AddAsKey(to_dims);
-      key_creator.AddAsKey(to_strides);
-      return key_creator.GetKey();
-    }
-
-    MklPrimitive* GetReorder(const memory* from, const memory* to) {
-      string key = CreateKey(from, to);
-      return this->GetOp(key);
-    }
-
-    void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
-      string key = CreateKey(from, to);
-      this->SetOp(key, op);
-    }
+  MklReorderPrimitiveFactory() {}
+  ~MklReorderPrimitiveFactory() {}
+
+  static string CreateKey(const memory* from, const memory* to) {
+    string prefix = "reorder";
+    FactoryKeyCreator key_creator;
+    auto const& from_desc = from->get_primitive_desc().desc().data;
+    auto const& to_desc = to->get_primitive_desc().desc().data;
+    const int KIdxFirstStride = 0;
+    memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
+    memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
+    memory::dims from_strides(
+        from_desc.layout_desc.blocking.strides[KIdxFirstStride],
+        &from_desc.layout_desc.blocking
+             .strides[KIdxFirstStride][from_desc.ndims]);
+    memory::dims to_strides(
+        to_desc.layout_desc.blocking.strides[KIdxFirstStride],
+        &to_desc.layout_desc.blocking.strides[KIdxFirstStride][to_desc.ndims]);
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(static_cast<int>(from_desc.format));
+    key_creator.AddAsKey(static_cast<int>(from_desc.data_type));
+    key_creator.AddAsKey(from_dims);
+    key_creator.AddAsKey(from_strides);
+    key_creator.AddAsKey(static_cast<int>(to_desc.format));
+    key_creator.AddAsKey(static_cast<int>(to_desc.data_type));
+    key_creator.AddAsKey(to_dims);
+    key_creator.AddAsKey(to_strides);
+    return key_creator.GetKey();
+  }
+
+  MklPrimitive* GetReorder(const memory* from, const memory* to) {
+    string key = CreateKey(from, to);
+    return this->GetOp(key);
+  }
+
+  void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
+    string key = CreateKey(from, to);
+    this->SetOp(key, op);
+  }
 };
 
 /// Fuction to find(or create) a reorder from memory pointed by
diff --git a/tensorflow/core/util/permutation_input_iterator.h b/tensorflow/core/util/permutation_input_iterator.h
index f6375b25157644cda97aa195958b60ac27b8a4d6..649318ebf3b4542a244f98342702cef087d28fce 100644
--- a/tensorflow/core/util/permutation_input_iterator.h
+++ b/tensorflow/core/util/permutation_input_iterator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
-#define TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#ifndef TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#define TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
 
 #include <iostream>
 #include <iterator>
@@ -131,4 +131,4 @@ class PermutationInputIterator {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#endif  // TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
diff --git a/tensorflow/core/util/permutation_output_iterator.h b/tensorflow/core/util/permutation_output_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..638c0f4545853b28dd5822817c1ec8759bb3a80b
--- /dev/null
+++ b/tensorflow/core/util/permutation_output_iterator.h
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
+#define TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
+
+#include <iostream>
+#include <iterator>
+
+namespace tensorflow {
+
+template <typename ValueType, typename OutputIteratorT, typename IndexIteratorT,
+          typename OffsetT = ptrdiff_t>
+class PermutationOutputIterator {
+ public:
+  // Required iterator traits
+  typedef PermutationOutputIterator self_type;  ///< My own type
+  typedef OffsetT difference_type;  ///< Type to express the result of
+                                    ///< subtracting one iterator from another
+  typedef ValueType
+      value_type;  ///< The type of the element the iterator can point to
+  typedef ValueType* pointer;    ///< The type of a pointer to an element the
+                                 ///< iterator can point to
+  typedef ValueType& reference;  ///< The type of a reference to an element the
+                                 ///< iterator can point to
+
+  typedef std::random_access_iterator_tag
+      iterator_category;  ///< The iterator category
+
+ private:
+  OutputIteratorT output_itr;
+  IndexIteratorT index_itr;
+
+ public:
+  /// Constructor
+  __host__ __device__ __forceinline__ PermutationOutputIterator(
+      OutputIteratorT output_itr,  ///< Input iterator to wrap
+      IndexIteratorT index_itr)    ///< Conversion functor to wrap
+      : output_itr(output_itr), index_itr(index_itr) {}
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_type operator++(int) {
+    self_type retval = *this;
+    index_itr++;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_type operator++() {
+    index_itr++;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const {
+    return output_itr[*index_itr];
+  }
+
+  /// Addition
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type operator+(Distance n) const {
+    self_type retval(output_itr, index_itr + n);
+    return retval;
+  }
+
+  /// Addition assignment
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type& operator+=(Distance n) {
+    index_itr += n;
+    return *this;
+  }
+
+  /// Subtraction
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type operator-(Distance n) const {
+    self_type retval(output_itr, index_itr - n);
+    return retval;
+  }
+
+  /// Subtraction assignment
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type& operator-=(Distance n) {
+    index_itr -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type
+  operator-(self_type other) const {
+    return index_itr - other.index_itr;
+  }
+
+  /// Array subscript
+  template <typename Distance>
+  __host__ __device__ __forceinline__ reference operator[](Distance n) const {
+    return output_itr[index_itr[n]];
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) {
+    return (index_itr == rhs.index_itr && output_itr == rhs.output_itr);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) {
+    return !(*this == rhs);
+  }
+
+  /// ostream operator
+  friend std::ostream& operator<<(std::ostream& os, const self_type& itr) {
+    return os;
+  }
+};
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index b9ca8ab395bb85048e9dfca1db48303ce92e8316..89c163aa5133fafc23b01c7153ac40d32efcaaf6 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -238,15 +238,6 @@ class SparseTensor {
   static Status Split(const SparseTensor& tensor, const int split_dim,
                       const int num_split, std::vector<SparseTensor>* result);
 
-  template <typename T>
-  ABSL_DEPRECATED(
-      "Use the form of Split() that takes an output pointer and returns a "
-      "status instead.")
-  static std::vector<SparseTensor> Split(const SparseTensor& tensor,
-                                         const int split_dim,
-                                         const int num_split,
-                                         Status* status = nullptr);
-
   // Slice() will slice the input SparseTensor into a SparseTensor based on
   // specified start and size. Both start and size are 1-D array with each
   // element of the array representing one dimension. The start is the start
@@ -578,10 +569,9 @@ SparseTensor SparseTensor::Concat(
 }
 
 template <typename T>
-std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
-                                              const int split_dim,
-                                              const int num_split,
-                                              Status* status /* = nullptr */) {
+Status SparseTensor::Split(const SparseTensor& input_tensor,
+                           const int split_dim, const int num_split,
+                           std::vector<SparseTensor>* result) {
   std::vector<Tensor> output_indices;
   std::vector<Tensor> output_values;
   std::vector<TensorShape> output_shapes;
@@ -601,17 +591,15 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
   const int split_dim_size = input_tensor.shape()[split_dim];
   const int split_size = split_dim_size / num_split;
 
-  if (!(num_split > 0 && num_split <= split_dim_size) && status != nullptr) {
-    *status = Status(error::INVALID_ARGUMENT,
-                     strings::StrCat("num_split must be in the interval (0, ",
-                                     split_dim_size, "]"));
-    return {};
+  if (!(num_split > 0 && num_split <= split_dim_size)) {
+    return Status(error::INVALID_ARGUMENT,
+                  strings::StrCat("num_split must be in the interval (0, ",
+                                  split_dim_size, "]"));
   }
   if (!(split_dim >= 0 && split_dim < num_dim)) {
-    *status = Status(
+    return Status(
         error::INVALID_ARGUMENT,
         strings::StrCat("num_dim must be in the interval [0, ", num_dim, ")"));
-    return {};
   }
 
   const int residual = split_dim_size % num_split;
@@ -649,28 +637,18 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
     }
   }
 
-  std::vector<SparseTensor> output_tensors;
-  output_tensors.reserve(num_split);
+  result->clear();
+  result->reserve(num_split);
   for (int i = 0; i < num_split; ++i) {
     SparseTensor tensor;
     Status create_status =
         Create(output_indices[i], output_values[i], output_shapes[i], &tensor);
-    if (!create_status.ok() && status != nullptr) {
-      *status = create_status;
-      return {};
+    if (!create_status.ok()) {
+      return create_status;
     }
-    output_tensors.push_back(std::move(tensor));
+    result->push_back(std::move(tensor));
   }
-  return output_tensors;
-}
-
-template <typename T>
-Status SparseTensor::Split(const SparseTensor& input_tensor,
-                           const int split_dim, const int num_split,
-                           std::vector<SparseTensor>* result) {
-  Status status;
-  *result = Split<T>(input_tensor, split_dim, num_split, &status);
-  return status;
+  return Status::OK();
 }
 
 template <typename T>
diff --git a/tensorflow/examples/adding_an_op/zero_out_3_test.py b/tensorflow/examples/adding_an_op/zero_out_3_test.py
index 15d62495aaee769f8aad79b844e3bb9b0a1e0df2..2327e7cd8fa44845682051ebd39e697e39efc64b 100644
--- a/tensorflow/examples/adding_an_op/zero_out_3_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_3_test.py
@@ -39,13 +39,13 @@ class ZeroOut3Test(tf.test.TestCase):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=-1)
       with self.assertRaisesOpError("Need preserve_index >= 0, got -1"):
-        result.eval()
+        self.evaluate(result)
 
   def testLarge(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=17)
       with self.assertRaisesOpError("preserve_index out of range"):
-        result.eval()
+        self.evaluate(result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/examples/autograph/integration_tests/keras_test.py b/tensorflow/examples/autograph/integration_tests/keras_test.py
index dca7c07b470498394593756a93a69af48c4ece43..9828ac34dc94f534a786a3729b47b5ae7fd255c5 100644
--- a/tensorflow/examples/autograph/integration_tests/keras_test.py
+++ b/tensorflow/examples/autograph/integration_tests/keras_test.py
@@ -96,7 +96,7 @@ class KerasTest(tf.test.TestCase):
           sess.run(init)
           sample_input = tf.random_uniform((1, 10, 10, 1))
           output = model(sample_input)  # pylint: disable=not-callable
-          self.assertEqual(sess.run(output).shape, (1, 3))
+          self.assertEqual(self.evaluate(output).shape, (1, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/autograph/integration_tests/list_literals_test.py b/tensorflow/examples/autograph/integration_tests/list_literals_test.py
index 917f5ff9d849d131d18e7e6c748d9c679b1b119e..e85d4abcfc9adfbb4bc6390589b846f7e59f3739 100644
--- a/tensorflow/examples/autograph/integration_tests/list_literals_test.py
+++ b/tensorflow/examples/autograph/integration_tests/list_literals_test.py
@@ -34,7 +34,7 @@ class ListLiteralsTest(tf.test.TestCase):
     result = converted()
 
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(result), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(result), [1, 2, 3])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/saved_model/saved_model_half_plus_two.py b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
index e1231708fa0c02caa04e40868918794bd2f03fb1..dfdde445404a5ec99f3d821dff6d9f217bfadefc 100644
--- a/tensorflow/examples/saved_model/saved_model_half_plus_two.py
+++ b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
@@ -160,7 +160,7 @@ def _generate_saved_model_for_half_plus_two(export_dir,
 
       x2 = tf.identity(tf_example["x2"], name="x2")
       y3 = tf.add(tf.multiply(a, x2), c)
-      y2 = tf.identity(y3, name="y3")
+      y3 = tf.identity(y3, name="y3")
 
     # Create an assets file that can be saved and restored as part of the
     # SavedModel.
diff --git a/tensorflow/examples/speech_commands/input_data_test.py b/tensorflow/examples/speech_commands/input_data_test.py
index b766ba6de0de93fa160b3464e5a860b5f665a76d..33b58b9d09bae78f2eafc51fd77acb4bebcc4d70 100644
--- a/tensorflow/examples/speech_commands/input_data_test.py
+++ b/tensorflow/examples/speech_commands/input_data_test.py
@@ -35,7 +35,7 @@ class InputDataTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([32000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
diff --git a/tensorflow/examples/speech_commands/label_wav_test.py b/tensorflow/examples/speech_commands/label_wav_test.py
index f0af2a4798785d53fe937fde45dbc9c9d67acfbc..77a88f98e165758994ddbbd21acab8823dcf5686 100644
--- a/tensorflow/examples/speech_commands/label_wav_test.py
+++ b/tensorflow/examples/speech_commands/label_wav_test.py
@@ -33,7 +33,7 @@ class LabelWavTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([1000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
diff --git a/tensorflow/examples/speech_commands/wav_to_features_test.py b/tensorflow/examples/speech_commands/wav_to_features_test.py
index 87f298769390a7e9b3d3e8bada70770ba7452172..cb8ea912fa293ddb6dbcd8ffff72754cbe554649 100644
--- a/tensorflow/examples/speech_commands/wav_to_features_test.py
+++ b/tensorflow/examples/speech_commands/wav_to_features_test.py
@@ -33,7 +33,7 @@ class WavToFeaturesTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([32000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
diff --git a/tensorflow/examples/tf2_showcase/BUILD b/tensorflow/examples/tf2_showcase/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..922bc96b25bcc940685c7e01f033856e7d39f5f8
--- /dev/null
+++ b/tensorflow/examples/tf2_showcase/BUILD
@@ -0,0 +1,32 @@
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+test_suite(
+    name = "all_tests",
+    tags = [
+        "manual",
+        "no_oss",
+        "notap",
+    ],
+    tests = [
+        ":mnist",
+    ],
+)
+
+py_test(
+    name = "mnist",
+    srcs = ["mnist.py"],
+    tags = [
+        "manual",
+        "no_oss",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/absl:app",
+        "//third_party/py/absl/flags",
+    ],
+)
diff --git a/tensorflow/examples/tf2_showcase/README.md b/tensorflow/examples/tf2_showcase/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8211fb1d30d8f9da8966d4babed05c33367bcecc
--- /dev/null
+++ b/tensorflow/examples/tf2_showcase/README.md
@@ -0,0 +1,25 @@
+# TF 2.0 Showcase
+
+The code here shows idiomatic ways to write TensorFlow 2.0 code. It doubles as
+an integration test.
+
+## General guidelines for showcase code:
+
+- Code should minimize dependencies and be self-contained in one file. A user
+  should be able to copy-paste the example code into their project and have it
+  just work.
+- Code should emphasize simplicity over performance, as long as it performs
+  within a factor of 2-3x of the optimized implementation.
+- Code should work on CPU and single GPU.
+- Code should run in Python 3.
+- Code should conform to the [Google Python Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md)
+
+
+- Code should follow these guidelines:
+  - Prefer Keras.
+  - Split code into separate input pipeline and model code segments.
+  - Don't use tf.cond or tf.while_loop; instead, make use of AutoGraph's
+    functionality to compile Python `for`, `while`, and `if` statements.
+  - Prefer a simple training loop over Estimator
+  - Save and restore a SavedModel.
+  - Write basic TensorBoard metrics - loss, accuracy,
diff --git a/tensorflow/examples/tf2_showcase/mnist.py b/tensorflow/examples/tf2_showcase/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4bfe4e53a8e16f7bb615ce481018cb3dce02150
--- /dev/null
+++ b/tensorflow/examples/tf2_showcase/mnist.py
@@ -0,0 +1,262 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MNIST model training with TensorFlow eager execution.
+
+See:
+https://research.googleblog.com/2017/10/eager-execution-imperative-define-by.html
+
+This program demonstrates training, export, and inference of a convolutional
+neural network model with eager execution enabled.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow as tf
+
+tfe = tf.contrib.eager
+
+flags.DEFINE_integer(
+    name='log_interval',
+    default=10,
+    help='batches between logging training status')
+
+flags.DEFINE_float(name='learning_rate', default=0.01, help='Learning rate.')
+
+flags.DEFINE_float(
+    name='momentum', short_name='m', default=0.5, help='SGD momentum.')
+
+flags.DEFINE_integer(
+    name='batch_size',
+    default=100,
+    help='Batch size to use during training / eval')
+
+flags.DEFINE_integer(
+    name='train_epochs', default=10, help='Number of epochs to train')
+
+flags.DEFINE_string(
+    name='model_dir',
+    default='/tmp/tensorflow/mnist',
+    help='Where to save checkpoints, tensorboard summaries, etc.')
+
+flags.DEFINE_bool(
+    name='clean',
+    default=False,
+    help='Whether to clear model directory before training')
+
+FLAGS = flags.FLAGS
+
+
+def create_model():
+  """Model to recognize digits in the MNIST dataset.
+
+  Network structure is equivalent to:
+  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  and
+  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
+  But uses the tf.keras API.
+  Returns:
+    A tf.keras.Model.
+  """
+  # Assumes data_format == 'channel_last'.
+  # See https://www.tensorflow.org/performance/performance_guide#data_formats
+
+  input_shape = [28, 28, 1]
+
+  l = tf.keras.layers
+  max_pool = l.MaxPooling2D((2, 2), (2, 2), padding='same')
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  model = tf.keras.Sequential(
+      [
+          l.Reshape(
+              target_shape=input_shape,
+              input_shape=(28 * 28,)),
+          l.Conv2D(2, 5, padding='same', activation=tf.nn.relu),
+          max_pool,
+          l.Conv2D(4, 5, padding='same', activation=tf.nn.relu),
+          max_pool,
+          l.Flatten(),
+          l.Dense(32, activation=tf.nn.relu),
+          l.Dropout(0.4),
+          l.Dense(10)
+      ])
+  # TODO(brianklee): Remove when @kaftan makes this happen by default.
+  # TODO(brianklee): remove `autograph=True` when kwarg default is flipped.
+  model.call = tfe.function(model.call, autograph=True)
+  # Needs to have input_signature specified in order to be exported
+  # since model.predict() is never called before saved_model.export()
+  # TODO(brianklee): Update with input signature, depending on how the impl of
+  # saved_model.restore() pans out.
+  model.predict = tfe.function(model.predict, autograph=True)
+  # ,input_signature=(tensor_spec.TensorSpec(shape=[28, 28, None], dtype=tf.float32),) # pylint: disable=line-too-long
+  return model
+
+
+def mnist_datasets():
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32.
+  x_train, x_test = x_train / np.float32(255), x_test / np.float32(255)
+  y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  return train_dataset, test_dataset
+
+
+def loss(logits, labels):
+  return tf.reduce_mean(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=labels))
+
+
+def compute_accuracy(logits, labels):
+  predictions = tf.argmax(logits, axis=1, output_type=tf.int64)
+  labels = tf.cast(labels, tf.int64)
+  return tf.reduce_mean(
+      tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
+
+
+# TODO(brianklee): Enable @tf.function on the training loop when zip, enumerate
+# are supported by autograph.
+def train(model, optimizer, dataset, step_counter, log_interval=None,
+          num_steps=None):
+  """Trains model on `dataset` using `optimizer`."""
+  start = time.time()
+  for (batch, (images, labels)) in enumerate(dataset):
+    if num_steps is not None and batch > num_steps:
+      break
+    with tf.contrib.summary.record_summaries_every_n_global_steps(
+        10, global_step=step_counter):
+      # Record the operations used to compute the loss given the input,
+      # so that the gradient of the loss with respect to the variables
+      # can be computed.
+      with tf.GradientTape() as tape:
+        logits = model(images, training=True)
+        loss_value = loss(logits, labels)
+        tf.contrib.summary.scalar('loss', loss_value)
+        tf.contrib.summary.scalar('accuracy', compute_accuracy(logits, labels))
+      grads = tape.gradient(loss_value, model.variables)
+      optimizer.apply_gradients(
+          zip(grads, model.variables), global_step=step_counter)
+      if log_interval and batch % log_interval == 0:
+        rate = log_interval / (time.time() - start)
+        print('Step #%d\tLoss: %.6f (%d steps/sec)' % (batch, loss_value, rate))
+        start = time.time()
+
+
+def test(model, dataset):
+  """Perform an evaluation of `model` on the examples from `dataset`."""
+  avg_loss = tfe.metrics.Mean('loss', dtype=tf.float32)
+  accuracy = tfe.metrics.Accuracy('accuracy', dtype=tf.float32)
+
+  for (images, labels) in dataset:
+    logits = model(images, training=False)
+    avg_loss(loss(logits, labels))
+    accuracy(
+        tf.argmax(logits, axis=1, output_type=tf.int64),
+        tf.cast(labels, tf.int64))
+  print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' %
+        (avg_loss.result(), 100 * accuracy.result()))
+  with tf.contrib.summary.always_record_summaries():
+    tf.contrib.summary.scalar('loss', avg_loss.result())
+    tf.contrib.summary.scalar('accuracy', accuracy.result())
+
+
+def train_and_export(flags_obj):
+  """Run MNIST training and eval loop in eager mode.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+  """
+  # Load the datasets
+  train_ds, test_ds = mnist_datasets()
+  train_ds = train_ds.shuffle(60000).batch(flags_obj.batch_size)
+  test_ds = test_ds.batch(flags_obj.batch_size)
+
+  # Create the model and optimizer
+  model = create_model()
+  optimizer = tf.train.MomentumOptimizer(
+      flags_obj.learning_rate, flags_obj.momentum)
+
+  # See summaries with `tensorboard --logdir=<model_dir>`
+  train_dir = os.path.join(flags_obj.model_dir, 'summaries', 'train')
+  test_dir = os.path.join(flags_obj.model_dir, 'summaries', 'eval')
+  summary_writer = tf.contrib.summary.create_file_writer(
+      train_dir, flush_millis=10000)
+  test_summary_writer = tf.contrib.summary.create_file_writer(
+      test_dir, flush_millis=10000, name='test')
+
+  # Create and restore checkpoint (if one exists on the path)
+  checkpoint_dir = os.path.join(flags_obj.model_dir, 'checkpoints')
+  checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
+  step_counter = tf.train.get_or_create_global_step()
+  checkpoint = tf.train.Checkpoint(
+      model=model, optimizer=optimizer, step_counter=step_counter)
+  # Restore variables on creation if a checkpoint exists.
+  checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
+
+  # Train and evaluate for a set number of epochs.
+  for _ in range(flags_obj.train_epochs):
+    start = time.time()
+    with summary_writer.as_default():
+      train(model, optimizer, train_ds, step_counter,
+            flags_obj.log_interval, num_steps=1)
+    end = time.time()
+    print('\nTrain time for epoch #%d (%d total steps): %f' %
+          (checkpoint.save_counter.numpy() + 1,
+           step_counter.numpy(),
+           end - start))
+    with test_summary_writer.as_default():
+      test(model, test_ds)
+    checkpoint.save(checkpoint_prefix)
+
+  # TODO(brianklee): Enable this functionality after @allenl implements this.
+  # export_path = os.path.join(flags_obj.model_dir, 'export')
+  # tf.saved_model.save(export_path, model)
+
+
+def import_and_eval(flags_obj):
+  export_path = os.path.join(flags_obj.model_dir, 'export')
+  model = tf.saved_model.restore(export_path)
+  _, (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  x_test = x_test / np.float32(255)
+  y_predict = model(x_test)
+  accuracy = compute_accuracy(y_predict, y_test)
+  print('Model accuracy: {:0.2f}%'.format(accuracy.numpy() * 100))
+
+
+def apply_clean(flags_obj):
+  if flags_obj.clean and tf.gfile.Exists(flags_obj.model_dir):
+    tf.logging.info('--clean flag set. Removing existing model dir: {}'.format(
+        flags_obj.model_dir))
+    tf.gfile.DeleteRecursively(flags_obj.model_dir)
+
+
+def main(_):
+  apply_clean(flags.FLAGS)
+  train_and_export(flags.FLAGS)
+  # TODO(brianklee): Enable this functionality after @allenl implements this.
+  # import_and_eval(flags.FLAGS)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index d9706b5478335ad054d53543b40fe5664e9e2554..9b59c03e557ed73ade142d2e8383a26f0e024b81 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5475,19 +5475,187 @@ func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []
 	return values
 }
 
-// Returns the truth value of x OR y element-wise.
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
+
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
+//
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			x, y,
+			key, indices,
 		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
+	}
+	return values
+}
+
+// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapIncompleteSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
+
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapSize",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -6994,6 +7162,36 @@ func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y
 	return op.Output(0)
 }
 
+// Outputs a tensor containing the reduction across all input tensors.
+//
+// Outputs a tensor containing the reduction across all input tensors passed to ops
+// within the same `shared_name.
+//
+// The graph should be constructed so if one op runs with shared_name value `c`,
+// then `num_devices` ops will run with shared_name value `c`.  Failure to do so
+// will cause the graph execution to fail to complete.
+//
+// input: the input to the reduction
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+// num_devices: The number of devices participating in this reduction.
+// shared_name: Identifier that shared between ops of the same reduction.
+func NcclAllReduce(scope *Scope, input tf.Output, reduction string, num_devices int64, shared_name string) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"reduction": reduction, "num_devices": num_devices, "shared_name": shared_name}
+	opspec := tf.OpSpec{
+		Type: "NcclAllReduce",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // RegexReplaceAttr is an optional argument to RegexReplace.
 type RegexReplaceAttr func(optionalAttr)
 
@@ -8270,6 +8468,53 @@ func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	return op.Output(0)
 }
 
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
+
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D average pooling on the input.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool3D",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns element-wise remainder of division. This emulates C semantics in that
 //
 // the result here is consistent with a truncating divide. E.g.
@@ -8971,20 +9216,46 @@ func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dens
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
-
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"reduction": reduction}
+	opspec := tf.OpSpec{
+		Type: "NcclReduce",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
+
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
@@ -9284,6 +9555,149 @@ func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
+//
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
+//
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeCSV",
+		Input: []tf.Input{
+			records, tf.OutputList(record_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
+}
+
 // Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
@@ -10738,6 +11152,31 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+//
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
+//
+// Arguments:
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
+//
+// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RaggedTensorToSparse",
+		Input: []tf.Input{
+			tf.OutputList(rt_nested_splits), rt_dense_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Check if the input matches the regex pattern.
 //
 // The input is a string tensor of any shape. The pattern is a scalar
@@ -15397,147 +15836,75 @@ func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
 
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// LRNBias sets the optional bias attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["bias"] = value
 	}
 }
 
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["alpha"] = value
 	}
 }
 
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["beta"] = value
 	}
 }
 
-// Op peeks at the values at the specified key.  If the
+// Local Response Normalization.
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
-	}
-	return values
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
-//
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
-//
-// Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "LRN",
 		Input: []tf.Input{
 			input,
 		},
@@ -16586,6 +16953,124 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Returns immutable tensor from memory region.
 //
 // The current implementation memmaps the tensor from a file.
@@ -16657,7 +17142,7 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o
 // handle: an empty tensor list.
 // element_dtype: the type of elements in the list.
 // element_shape: a shape compatible with that of elements in the list.
-func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16665,7 +17150,7 @@ func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.Dat
 	opspec := tf.OpSpec{
 		Type: "EmptyTensorList",
 		Input: []tf.Input{
-			element_shape,
+			element_shape, max_num_elements,
 		},
 		Attrs: attrs,
 	}
@@ -17907,170 +18392,57 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			y, dy,
+			tag, tensor,
 		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
-
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
-//
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
-	}
-}
-
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
-//
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
-	}
-}
-
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
-//
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
-	}
-}
-
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
-//
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
-	}
-}
-
-// Transforms a spectrogram into a form that's useful for speech recognition.
-//
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
-//
-// Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Mfcc",
-		Input: []tf.Input{
-			spectrogram, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AudioSummary",
-		Input: []tf.Input{
-			tag, tensor,
-		},
-		Attrs: attrs,
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -19051,149 +19423,6 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
-//
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
-//
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
-		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
-}
-
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
-
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
 type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
@@ -19413,6 +19642,119 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.O
 	return op.Output(0)
 }
 
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
+
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["upper_frequency_limit"] = value
+	}
+}
+
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["lower_frequency_limit"] = value
+	}
+}
+
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+//
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
+	}
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+//
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
+//
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
+//
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Mfcc",
+		Input: []tf.Input{
+			spectrogram, sample_rate,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
 // The Hurwitz zeta function is defined as:
@@ -19508,70 +19850,13 @@ func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			input,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // Returns element-wise remainder of division. This emulates C semantics in that
@@ -20650,67 +20935,6 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
 // The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
@@ -22120,6 +22344,108 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 	return op.Output(0)
 }
 
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
+
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+//
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Transcode the input text from a source encoding to a destination encoding.
+//
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
+//
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
+//
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
+//
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
+//
+// Arguments:
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeTranscode",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes inverse hyperbolic sine of x element-wise.
 func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -22428,6 +22754,24 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Returns the truth value of x OR y element-wise.
+//
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Selects elements from `x` or `y`, depending on `condition`.
 //
 // The `x`, and `y` tensors must all have the same shape, and the
@@ -24464,6 +24808,33 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
+// Sends `input` to all devices that are connected to the output.
+//
+// Sends `input` to all devices that are connected to the output.
+//
+// The graph should be constructed so that all ops connected to the output have a
+// valid device assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the broadcast.
+// output: The same as input.
+// shape: The shape of the input tensor.
+//
+func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "NcclBroadcast",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ResizeAreaAttr is an optional argument to ResizeArea.
 type ResizeAreaAttr func(optionalAttr)
 
@@ -26065,24 +26436,6 @@ func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
 // This operation computes
@@ -26124,6 +26477,24 @@ func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
+// Computes the gradient for the tanh of `x` wrt its input.
+//
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Outputs a `Summary` protocol buffer with scalar values.
 //
 // The input `tags` and `values` must have the same shape.  The generated summary
@@ -29815,6 +30186,83 @@ func UnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.Dat
 	return op.Output(0)
 }
 
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // RpcAttr is an optional argument to Rpc.
 type RpcAttr func(optionalAttr)
 
@@ -29934,83 +30382,6 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // StackPushV2Attr is an optional argument to StackPushV2.
 type StackPushV2Attr func(optionalAttr)
 
@@ -30398,63 +30769,6 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
-
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SparseToDenseAttr is an optional argument to SparseToDense.
 type SparseToDenseAttr func(optionalAttr)
 
@@ -31344,73 +31658,6 @@ func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
-
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs 3D average pooling on the input.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A placeholder for input pipeline graph optimizations.
-//
-// A placeholder for input pipeline graph optimizations.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-func SinkDataset(scope *Scope, input_dataset tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SinkDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Constructs an Optional variant from a tuple of tensors.
 func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
@@ -33750,60 +33997,3 @@ func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.Data
 	}
 	return values
 }
-
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
-
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 9dce78b9a367cdf5243dfab621cc6fc77d732ee5..10808e162ee4cc679430c0573e5bff8322ad6fff 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -381,6 +381,7 @@ tf_cc_binary(
     linkshared = 1,
     linkstatic = 1,
     deps = [
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/java/src/main/native",
         LINKER_VERSION_SCRIPT,
         LINKER_EXPORTED_SYMBOLS,
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 2fa81ed88f6187c5306584b705522f9fcf3aeac1..951e8bdd0dd8aae46a361a8ffcff276579433641 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -1,12 +1,13 @@
 # TensorFlow for Java
 
 > *WARNING*: The TensorFlow Java API is not currently covered by the TensorFlow
-> [API stability guarantees](https://www.tensorflow.org/guide/version_semantics).
+> [API stability guarantees](https://www.tensorflow.org/guide/version_compat).
 >
 > For using TensorFlow on Android refer instead to
 > [contrib/android](https://www.tensorflow.org/code/tensorflow/contrib/android),
 > [makefile](https://www.tensorflow.org/code/tensorflow/contrib/makefile#android)
-> and/or the [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
+> and/or the
+> [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
 
 ## Quickstart
 
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index c2ece557d5bed86c5d0c2ba045c4ca208e56d908..db3a3609f1ac4fda18ff5a1248e61c675a8bf9f9 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 0d6f46c6fe62f42d0fa0f2f75d626c0b34374cbd..53f7a2d63ef5bc8cfe4fbe372cf2fd3f58a0fe33 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index ab54a61076aa2e13f98e81957e24e41ce45f2938..a17724c805e38239c61dd27a5cc9ec918bbb2e0f 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 557a755236ffab9f5d7dc94a9d22d96acd1ae5bc..30831f90b9f7b4beb5ae3f2ceebadcb6e1f8771e 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.12.0-rc2</version>
+  <version>1.12.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 2f435a6da0cf2b9c8b78db3f7c95db8eb809e6eb..dd6b52be62487ba6cb989b4917a15df7f473a848 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
index da94c58c42f782d9802ec4b13dbf7ac648bace0c..f47c11809d58464953028c388d491b91f67c3510 100644
--- a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
+++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-tensorflow-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow-hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
index 73ce7a9ffd818669900cb47ddd6f582935175f1b..11aaba983f6ded9a6e757703fd9a2411db82ceb6 100644
--- a/tensorflow/java/maven/tensorflow-hadoop/pom.xml
+++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>tensorflow-hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index fa137e9440990bea5dd2484b11c23e0ae8dafbf7..07fcfa5144600f7d9cbf6edbfbecbecc7c115631 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0-rc2</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java
new file mode 100644
index 0000000000000000000000000000000000000000..6adcdba17b3b56ef5b65314e1d225c41c7d63fd3
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+/**
+ * An in-process TensorFlow server, for use in distributed training.
+ *
+ * <p>A {@code Server} instance encapsulates a set of devices and a {@link org.tensorflow.Session}
+ * target that can participate in distributed training. A server belongs to a cluster (specified by
+ * a {@code ClusterSpec}), and corresponds to a particular task in a named job. The server can
+ * communicate with any other server in the same cluster. The server will not serve any requests
+ * until {@link #start()} is invoked. The server will stop serving requests once {@link #stop()} or
+ * {@link #close()} is invoked. Be aware that {@link #close()} method stops the server if it is
+ * running.
+ *
+ * <p><b>WARNING:</b> A {@code Server} owns resources that <b>must</b> be explicitly freed by
+ * invoking {@link #close()}.
+ *
+ * <p>Instances of a {@code Server} are thread-safe.
+ *
+ * <p>Using example:
+ *
+ * <pre>{@code
+ * import org.tensorflow.Server;
+ * import org.tensorflow.distruntime.ClusterDef;
+ * import org.tensorflow.distruntime.JobDef;
+ * import org.tensorflow.distruntime.ServerDef;
+ *
+ * ClusterDef clusterDef = ClusterDef.newBuilder()
+ *   .addJob(JobDef.newBuilder()
+ *   .setName("worker")
+ *   .putTasks(0, "localhost:4321")
+ *   .build()
+ * ).build();
+ *
+ * ServerDef serverDef = ServerDef.newBuilder()
+ *   .setCluster(clusterDef)
+ *   .setJobName("worker")
+ *   .setTaskIndex(0)
+ *   .setProtocol("grpc")
+ * .build();
+ *
+ * try (Server srv = new Server(serverDef.toByteArray())) {
+ *   srv.start();
+ *   srv.join();
+ * }
+ * }</pre>
+ */
+public final class Server implements AutoCloseable {
+  /**
+   * Constructs a new instance of server.
+   *
+   * @param serverDef Server definition specified as a serialized <a
+   *     href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/tensorflow_server.proto">ServerDef</a>
+   *     protocol buffer.
+   */
+  public Server(byte[] serverDef) {
+    nativeHandle = allocate(serverDef);
+  }
+
+  /** Starts an in-process TensorFlow server. */
+  public synchronized void start() {
+    start(nativeHandle);
+  }
+
+  /** Stops an in-process TensorFlow server. */
+  public synchronized void stop() {
+    stop(nativeHandle);
+  }
+
+  /** Blocks until the server has been successfully stopped. */
+  public void join() {
+    long handle = 0;
+    synchronized (this) {
+      handle = nativeHandle;
+      if (handle != 0) {
+        numJoining++;
+      }
+    }
+    try {
+      join(handle);
+    } finally {
+      synchronized (this) {
+        if (handle != 0) {
+          numJoining--;
+        }
+        notifyAll();
+      }
+    }
+  }
+
+  /** Destroy an in-process TensorFlow server, frees memory. */
+  @Override
+  public synchronized void close() throws InterruptedException {
+    stop();
+    while (numJoining > 0) {
+      wait();
+    }
+    delete(nativeHandle);
+    nativeHandle = 0;
+  }
+
+  private static native long allocate(byte[] serverDef);
+
+  private static native void start(long nativeHandle);
+
+  private static native void stop(long nativeHandle);
+
+  private static native void join(long nativeHandle);
+
+  private static native void delete(long nativeHandle);
+
+  private long nativeHandle;
+
+  private int numJoining;
+
+  static {
+    TensorFlow.init();
+  }
+}
diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d92685740458831011c3f6cc1ee8df8a995e9363
--- /dev/null
+++ b/tensorflow/java/src/main/native/server_jni.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/java/src/main/native/server_jni.h"
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/java/src/main/native/exception_jni.h"
+#include "tensorflow/java/src/main/native/utils_jni.h"
+
+namespace {
+
+TF_Server* requireHandle(JNIEnv* env, jlong handle) {
+  static_assert(sizeof(jlong) >= sizeof(TF_Server*),
+                "Cannot package C object pointers as a Java long");
+  if (handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "close() has been called on the Server");
+    return nullptr;
+  }
+
+  return reinterpret_cast<TF_Server*>(handle);
+}
+
+}  // namespace
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate(
+    JNIEnv* env, jclass clazz, jbyteArray server_def) {
+  TF_Status* status = TF_NewStatus();
+
+  jbyte* server_def_ptr = env->GetByteArrayElements(server_def, nullptr);
+
+  TF_Server* server = TF_NewServer(
+      server_def_ptr, static_cast<size_t>(env->GetArrayLength(server_def)),
+      status);
+
+  env->ReleaseByteArrayElements(server_def, server_def_ptr, JNI_ABORT);
+  bool ok = throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+
+  return ok ? reinterpret_cast<jlong>(server) : 0;
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env,
+                                                        jclass clazz,
+                                                        jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_Status* status = TF_NewStatus();
+
+  TF_ServerStart(server, status);
+  throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env,
+                                                       jclass clazz,
+                                                       jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_Status* status = TF_NewStatus();
+
+  TF_ServerStop(server, status);
+  throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env,
+                                                       jclass clazz,
+                                                       jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_Status* status = TF_NewStatus();
+
+  TF_ServerJoin(server, status);
+  throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env,
+                                                         jclass clazz,
+                                                         jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_DeleteServer(server);
+}
diff --git a/tensorflow/java/src/main/native/server_jni.h b/tensorflow/java/src/main/native/server_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bfe90b7a85b1a21f91ffe136c8bbf717da59d78
--- /dev/null
+++ b/tensorflow/java/src/main/native/server_jni.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Class:     org_tensorflow_Server
+ * Method:    allocate
+ * Signature: ([B)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_Server_allocate(JNIEnv *, jclass, jbyteArray server_def);
+
+/*
+ * Class:     org_tensorflow_Server
+ * Method:    start
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv *, jclass,
+                                                        jlong);
+
+/*
+ * Class:     org_tensorflow_Server
+ * Method:    stop
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_Session
+ * Method:    join
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_Session
+ * Method:    delete
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv *, jclass,
+                                                         jlong);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index f8bb7191c4eb8f93f6062384874ac3f5c00f7f26..bb2c53b8c9e4300f67fa8badbdbaaf73532005fe 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -27,11 +27,11 @@ config_setting(
     },
 )
 
-# Enables inclusion of TensorFlow kernels via the TF Lite Flex delegate.
+# Enables inclusion of select TensorFlow kernels via the TF Lite Flex delegate.
 # WARNING: This build flag is experimental and subject to change.
 config_setting(
-    name = "with_tflite_flex",
-    define_values = {"with_tflite_flex": "true"},
+    name = "with_select_tf_ops",
+    define_values = {"with_select_tf_ops": "true"},
     visibility = ["//visibility:public"],
 )
 
@@ -131,6 +131,7 @@ cc_library(
     name = "framework",
     srcs = [
         "allocation.cc",
+        "core/subgraph.cc",
         "graph_info.cc",
         "interpreter.cc",
         "model.cc",
@@ -155,6 +156,7 @@ cc_library(
         "allocation.h",
         "context.h",
         "context_util.h",
+        "core/subgraph.h",
         "error_reporter.h",
         "graph_info.h",
         "interpreter.h",
@@ -190,7 +192,7 @@ cc_library(
         "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/schema:schema_fbs",
     ] + select({
-        ":with_tflite_flex": [
+        ":with_select_tf_ops": [
             "//tensorflow/lite/delegates/flex:delegate",
         ],
         "//conditions:default": [],
@@ -201,6 +203,7 @@ cc_library(
     name = "string_util",
     srcs = ["string_util.cc"],
     hdrs = ["string_util.h"],
+    copts = tflite_copts(),
     deps = [
         ":framework",
         ":string",
@@ -323,6 +326,7 @@ cc_library(
     name = "util",
     srcs = ["util.cc"],
     hdrs = ["util.h"],
+    copts = tflite_copts(),
     deps = [
         "//tensorflow/lite/c:c_api_internal",
     ],
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index 3b0af52fb93690e7b2623bb39ef5bb6639cc030c..fcd72559103ee046026c50eff6354e1a86a36556 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -29,8 +29,11 @@ def tflite_copts():
         ],
         str(Label("//tensorflow:windows")): [
             "/DTF_COMPILE_LIBRARY",
+            "/wd4018",  # -Wno-sign-compare
+        ],
+        "//conditions:default": [
+            "-Wno-sign-compare",
         ],
-        "//conditions:default": [],
     }) + select({
         str(Label("//tensorflow:with_default_optimizations")): [],
         "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
@@ -245,6 +248,7 @@ def generated_test_models():
         "sum",
         "l2norm",
         "l2_pool",
+        "leaky_relu",
         "less",
         "less_equal",
         "local_response_norm",
@@ -289,15 +293,18 @@ def generated_test_models():
         "split",
         "sqrt",
         "square",
+        "squared_difference",
         "squeeze",
         "strided_slice",
         "strided_slice_1d_exhaustive",
+        "strided_slice_buggy",
         "sub",
         "tile",
         "topk",
         "transpose",
         "transpose_conv",
         "unpack",
+        "unroll_batch_matmul",
         "where",
         "zeros_like",
     ]
@@ -417,7 +424,13 @@ def gen_selected_ops(name, model):
         tools = [tool],
     )
 
-def gen_model_coverage_test(model_name, data, failure_type):
+def flex_dep(target_op_sets):
+    if "SELECT_TF_OPS" in target_op_sets:
+        return ["//tensorflow/lite/delegates/flex:delegate"]
+    else:
+        return []
+
+def gen_model_coverage_test(model_name, data, failure_type, tags):
     """Generates Python test targets for testing TFLite models.
 
     Args:
@@ -427,6 +440,7 @@ def gen_model_coverage_test(model_name, data, failure_type):
       failure_type: List of failure types (none, toco, crash, inference)
         expected for the corresponding combinations of op sets
         ("TFLITE_BUILTINS", "TFLITE_BUILTINS,SELECT_TF_OPS", "SELECT_TF_OPS").
+      tags: List of strings of additional tags.
     """
     i = 0
     for target_op_sets in ["TFLITE_BUILTINS", "TFLITE_BUILTINS,SELECT_TF_OPS", "SELECT_TF_OPS"]:
@@ -448,10 +462,10 @@ def gen_model_coverage_test(model_name, data, failure_type):
                 "no_oss",
                 "no_windows",
                 "notap",
-            ],
+            ] + tags,
             deps = [
                 "//tensorflow/lite/testing/model_coverage:model_coverage_lib",
                 "//tensorflow/lite/python:lite",
                 "//tensorflow/python:client_testlib",
-            ],
+            ] + flex_dep(target_op_sets),
         )
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index b8c05f57bb59b5770ec2dca00d41e1ebd8ca23c4..2300ff4ed21fa9df9d289c0ede23c2afe3d6b90b 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -123,6 +123,9 @@ typedef enum {
   kTfLiteBuiltinFloorMod = 95,
   kTfLiteBuiltinRange = 96,
   kTfLiteBuiltinResizeNearestNeighbor = 97,
+  kTfLiteBuiltinLeakyRelu = 98,
+  kTfLiteBuiltinSquaredDifference = 99,
+  kTfLiteBuiltinMirrorPad = 100,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 855983d60dfd18d3b35ced7fed93f8fa3dfca80a..33aaac3c80310ed4463bd209a003a495ad8fa0b9 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -35,11 +35,21 @@ typedef enum {
   kTfLitePaddingValid,
 } TfLitePadding;
 
+typedef enum {
+  kTfLiteMirrorPaddingUnknown = 0,
+  kTfLiteMirrorPaddingReflect,
+  kTfLiteMirrorPaddingSymmetric,
+} TfLiteMirrorPaddingMode;
+
 typedef struct {
   int width;
   int height;
 } TfLitePaddingValues;
 
+typedef struct {
+  TfLiteMirrorPaddingMode mode;
+} TfLiteMirrorPaddingParams;
+
 // Possible fused activation functions.
 // TODO(aselle): rename to TfLiteActivation
 typedef enum {
@@ -328,6 +338,10 @@ typedef struct {
   int axis;
 } TfLiteUnpackParams;
 
+typedef struct {
+  float alpha;
+} TfLiteLeakyReluParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index b131f0677467b3504bda84d96a95707cb1587884..7f67b1c27223f0bdc8c3bf663b39c7d5580609da 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -125,6 +125,8 @@ const char* TfLiteTypeGetName(TfLiteType type) {
       return "INT32";
     case kTfLiteUInt8:
       return "UINT8";
+    case kTfLiteInt8:
+      return "INT8";
     case kTfLiteInt64:
       return "INT64";
     case kTfLiteBool:
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index e76971f57df5ed22830af73b9ea0e46d8a060799..d7bf06442bf227290db830828474c2cbb9ee5303 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -179,6 +179,7 @@ typedef enum {
   kTfLiteBool = 6,
   kTfLiteInt16 = 7,
   kTfLiteComplex64 = 8,
+  kTfLiteInt8 = 9,
 } TfLiteType;
 
 // Return the name of a given type, for error reporting purposes.
@@ -203,6 +204,7 @@ typedef union {
   bool* b;
   int16_t* i16;
   TfLiteComplex64* c64;
+  int8_t* int8;
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
@@ -373,7 +375,7 @@ typedef struct TfLiteContext {
 
   // Replace ops with one or more stub delegate operations. This function
   // does not take ownership of `nodes_to_replace`.
-  TfLiteStatus (*ReplaceSubgraphsWithDelegateKernels)(
+  TfLiteStatus (*ReplaceNodeSubsetsWithDelegateKernels)(
       struct TfLiteContext*, TfLiteRegistration registration,
       const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
 
@@ -456,6 +458,22 @@ typedef struct _TfLiteRegistration {
   int version;
 } TfLiteRegistration;
 
+// The flags used in `TfLiteDelegate`. Note that this is a bitmask, so the
+// values should be 1, 2, 4, 8, ...etc.
+typedef enum {
+  kTfLiteDelegateFlagsNone = 0,
+  // The flag is set if the delegate can handle dynamic sized tensors.
+  // For example, the output shape of a `Resize` op with non-constant shape
+  // can only be inferred when the op is invoked.
+  // In this case, the Delegate is responsible for calling
+  // `SetTensorToDynamic` to mark the tensor as a dynamic tensor, and calling
+  // `ResizeTensor` when invoking the op.
+  //
+  // If the delegate isn't capable to handle dynamic tensors, this flag need
+  // to be set to false.
+  kTfLiteDelegateFlagsAllowDynamicTensors = 1
+} TfLiteDelegateFlags;
+
 // WARNING: This is an experimental interface that is subject to change.
 typedef struct _TfLiteDelegate {
   // Data that delegate needs to identify itself. This data is owned by the
@@ -465,7 +483,7 @@ typedef struct _TfLiteDelegate {
 
   // Invoked by ModifyGraphWithDelegate. This prepare is called, giving the
   // delegate a view of the current graph through TfLiteContext*. It typically
-  // will look at the nodes and call ReplaceSubgraphsWithDelegateKernels()
+  // will look at the nodes and call ReplaceNodeSubsetsWithDelegateKernels()
   // to ask the TensorFlow lite runtime to create macro-nodes to represent
   // delegated subgraphs of the original graph.
   TfLiteStatus (*Prepare)(TfLiteContext* context, TfLiteDelegate* delegate);
@@ -490,6 +508,9 @@ typedef struct _TfLiteDelegate {
   // This can be null if the delegate doesn't use its own buffer.
   void (*FreeBufferHandle)(TfLiteContext* context, TfLiteDelegate* delegate,
                            TfLiteBufferHandle* handle);
+
+  // Bitmask flags. See the comments in `TfLiteDelegateFlags`.
+  int64_t flags;
 } TfLiteDelegate;
 
 // WARNING: This is an experimental interface that is subject to change.
diff --git a/tensorflow/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc
index e21823c41f0b43e7395b19f241d6a628b8a78f41..acf0dfc5be8e233b642ccea42f72cbf6af2d4c5d 100644
--- a/tensorflow/lite/c/c_api_internal_test.cc
+++ b/tensorflow/lite/c/c_api_internal_test.cc
@@ -74,6 +74,7 @@ TEST(Types, TestTypeNames) {
   EXPECT_EQ(type_name(kTfLiteInt16), "INT16");
   EXPECT_EQ(type_name(kTfLiteInt32), "INT32");
   EXPECT_EQ(type_name(kTfLiteUInt8), "UINT8");
+  EXPECT_EQ(type_name(kTfLiteInt8), "INT8");
   EXPECT_EQ(type_name(kTfLiteInt64), "INT64");
   EXPECT_EQ(type_name(kTfLiteBool), "BOOL");
   EXPECT_EQ(type_name(kTfLiteComplex64), "COMPLEX64");
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 8cd3faabb728097832796bb7c9d56e5f2e9632b0..aa9b3723985d2e2b57a87e43d84b829e85592297 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -61,6 +61,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
     case TensorType_UINT8:
       *type = kTfLiteUInt8;
       break;
+    case TensorType_INT8:
+      *type = kTfLiteInt8;
+      break;
     case TensorType_INT64:
       *type = kTfLiteInt64;
       break;
@@ -617,6 +620,28 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_LEAKY_RELU: {
+      TfLiteLeakyReluParams* params =
+          allocator->AllocatePOD<TfLiteLeakyReluParams>();
+      if (auto* leaky_relu_params = op->builtin_options_as_LeakyReluOptions()) {
+        params->alpha = leaky_relu_params->alpha();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MIRROR_PAD: {
+      TfLiteMirrorPaddingParams* params =
+          allocator->AllocatePOD<TfLiteMirrorPaddingParams>();
+      auto* mirror_pad_params = op->builtin_options_as_MirrorPadOptions();
+      if (mirror_pad_params != nullptr) {
+        params->mode =
+            mirror_pad_params->mode() == tflite::MirrorPadMode_REFLECT
+                ? TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect
+                : TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingSymmetric;
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_BATCH_TO_SPACE_ND:
@@ -668,6 +693,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_FILL:
     case BuiltinOperator_FLOOR_MOD:
     case BuiltinOperator_RANGE:
+    case BuiltinOperator_SQUARED_DIFFERENCE:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05a60962b197fecc7b93d1fcba59157f350f722b
--- /dev/null
+++ b/tensorflow/lite/core/subgraph.cc
@@ -0,0 +1,976 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/arena_planner.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/graph_info.h"
+#include "tensorflow/lite/nnapi_delegate.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+namespace {
+TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
+                           const TfLiteRegistration& registration,
+                           int node_index, const char* message) {
+  context->ReportError(
+      context, "Node number %d (%s) %s.\n", node_index,
+      registration.custom_name
+          ? registration.custom_name
+          : EnumNameBuiltinOperator(
+                static_cast<BuiltinOperator>(registration.builtin_code)),
+      message);
+  return kTfLiteError;
+}
+
+// Stub method which returns kTfLiteError when the function is forbidden.
+// We're registrating this function to several different function to save
+// compiled binary size. Please note the restrictions:
+// * The type of first parameter have to be `TfLiteContext*`.
+// * All paramteters must be trivailly destructible. (E.g. No C++ class)
+TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
+  context->ReportError(context,
+                       "The function is forbidden if not calling in delegate.");
+  return kTfLiteError;
+}
+
+// Set the ForbiddenContextFunction to a compatible function pointer.
+template <typename FunctionType>
+void SetForbiddenContextFunction(FunctionType* func) {
+  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
+}
+
+// Returns true if at least one tensor in the given list is kTfLiteDynamic.
+template <typename TensorIntArray>
+bool HasDynamicTensorImpl(const TfLiteContext& context,
+                          const TensorIntArray& int_array) {
+  for (int i : int_array) {
+    const TfLiteTensor& tensor = context.tensors[i];
+    if (tensor.allocation_type == kTfLiteDynamic) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool HasDynamicTensor(const TfLiteContext& context,
+                      const TfLiteIntArray* int_array) {
+  return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
+}
+
+}  // namespace
+
+// A trivial implementation of GraphInfo around the Interpreter.
+// NOTE: this interpreter info represents the subset of the
+// graph that is executed according to execution plan. Thus,
+// the indices are execution plan indices rather than raw node
+// indices.
+class InterpreterInfo : public GraphInfo {
+ public:
+  explicit InterpreterInfo(Subgraph* subgraph) : subgraph_(subgraph) {}
+
+  size_t num_tensors() const override { return subgraph_->tensors().size(); }
+  TfLiteTensor* tensor(size_t index) override {
+    return &subgraph_->tensors()[index];
+  }
+  size_t num_nodes() const override {
+    return subgraph_->execution_plan().size();
+  }
+  const TfLiteNode& node(size_t index) const override {
+    int node_index = subgraph_->execution_plan()[index];
+    return subgraph_->nodes_and_registration()[node_index].first;
+  }
+  const std::vector<int>& inputs() const override {
+    return subgraph_->inputs();
+  }
+  const std::vector<int>& outputs() const override {
+    return subgraph_->outputs();
+  }
+  const std::vector<int>& variables() const override {
+    return subgraph_->variables();
+  }
+
+ public:
+  Subgraph* subgraph_;
+};
+
+Subgraph::Subgraph(ErrorReporter* error_reporter,
+                   TfLiteExternalContext** external_contexts)
+    : context_(&owned_context_),
+      error_reporter_(error_reporter),
+      next_execution_plan_index_to_prepare_(0),
+      external_contexts_(external_contexts) {
+  context_->impl_ = static_cast<void*>(this);
+  context_->ResizeTensor = ResizeTensor;
+  context_->ReportError = ReportErrorC;
+  context_->AddTensors = AddTensors;
+  context_->tensors = nullptr;
+  context_->tensors_size = 0;
+  context_->allow_fp32_relax_to_fp16 = false;
+  context_->recommended_num_threads = -1;
+  context_->GetExternalContext = GetExternalContext;
+  context_->SetExternalContext = SetExternalContext;
+
+  // Reserve some space for the tensors to avoid excessive resizing.
+  tensors_.reserve(kTensorsReservedCapacity);
+  nodes_and_registration().reserve(kTensorsReservedCapacity);
+  // Invalid to call these these except from TfLiteDelegate
+  SwitchToKernelContext();
+}
+
+Subgraph::~Subgraph() {
+  for (auto& node_and_reg : nodes_and_registration_) {
+    TfLiteNode& node = node_and_reg.first;
+    TfLiteIntArrayFree(node.inputs);
+    TfLiteIntArrayFree(node.outputs);
+    TfLiteIntArrayFree(node.temporaries);
+    if (node.builtin_data) free(node.builtin_data);
+    OpFree(node_and_reg.second, node.user_data);
+    node.builtin_data = nullptr;
+  }
+
+  for (size_t i = 0; i < context_->tensors_size; i++) {
+    TfLiteTensor* tensor = &context_->tensors[i];
+    if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
+        tensor->delegate->FreeBufferHandle != nullptr) {
+      tensor->delegate->FreeBufferHandle(context_, tensor->delegate,
+                                         &tensor->buffer_handle);
+    }
+    TfLiteTensorFree(tensor);
+  }
+}
+
+TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
+    TfLiteContext* context, TfLiteRegistration registration,
+    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->ReplaceNodeSubsetsWithDelegateKernels(registration, nodes_to_replace,
+                                              delegate);
+}
+
+namespace {
+
+// Copy a std::vector<int> to an existing TfLiteIntArray.
+// This is a low-level data manipulation function, and it's caller's
+// responsibility to ensure TfLiteIntArray has enough size.
+void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
+                                TfLiteIntArray* arr) {
+  arr->size = vec.size();
+  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
+}
+
+// This function allocates a continuous memory space that contains a
+// TfLiteDelegateParams followed by a several TfLiteIntArray.
+// When calling `free` at TfLiteDelegateParams*, all the allocated space
+// will be freed together.
+//
+// +-----------------------------------+
+// | TfLiteDelegateParams              |
+// | TfLiteDelegate* delegate;         |
+// | TfLiteIntArray* nodes_to_replace; |--\
+// | TfLiteIntArray* input_tensors;    |--+--\
+// | TfLiteIntArray* output_tensors;   |--+--+--\
+// +-----------------------------------+  |  |  |
+// | TfLiteIntArray (variable size)    |<-/  |  |
+// +-----------------------------------+     |  |
+// | TfLiteIntArray (variable size)    |<----/  |
+// +-----------------------------------+        |
+// | TfLiteIntArray (variable size)    |<-------/
+// +-----------------------------------+
+TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
+                                           const NodeSubset& node_subset) {
+  // Step 1: Calculate the allocation size.
+  int allocation_size = sizeof(TfLiteDelegateParams);
+
+  int nodes_to_replace_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.nodes.size());
+  allocation_size += nodes_to_replace_size;
+
+  int input_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.input_tensors.size());
+  allocation_size += input_tensors_size;
+
+  int output_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.output_tensors.size());
+  allocation_size += output_tensors_size;
+
+  // Step 2: Allocate the memory.
+  // Use `char*` for conveniently step through the allocated space by bytes.
+  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
+
+  // Step 3: Fill all data structures structures.
+  TfLiteDelegateParams* params =
+      reinterpret_cast<TfLiteDelegateParams*>(allocation);
+  params->delegate = delegate;
+  allocation += sizeof(TfLiteDelegateParams);
+
+  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.nodes, params->nodes_to_replace);
+  allocation += nodes_to_replace_size;
+
+  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.input_tensors, params->input_tensors);
+  allocation += input_tensors_size;
+
+  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.output_tensors,
+                             params->output_tensors);
+  allocation += output_tensors_size;
+
+  return params;
+}
+
+}  // namespace
+
+TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
+    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+    TfLiteDelegate* delegate) {
+  // Annotate the registration as DELEGATE op.
+  registration.builtin_code = BuiltinOperator_DELEGATE;
+
+  // Analyze the graph to find all independent node_subsets that are either
+  // fully not-this-delegate or this-delegate computation.
+  InterpreterInfo info(this);
+  std::vector<NodeSubset> node_subsets;
+  PartitionGraphIntoIndependentNodeSubsets(&info, nodes_to_replace,
+                                           &node_subsets);
+
+  execution_plan_.clear();
+
+  for (auto& node_subset : node_subsets) {
+    // Subsets calimed by the delegate should have a "macro" op created, the
+    // other node_subsets (kTfNonPartition) just have their nodes added back to
+    // the execution plan.
+    switch (node_subset.type) {
+      case NodeSubset::kTfNonPartition:
+        for (auto it = node_subset.nodes.begin(); it != node_subset.nodes.end();
+             ++it) {
+          execution_plan_.push_back(*it);
+        }
+        break;
+      case NodeSubset::kTfPartition: {
+        int node_index;
+
+        TfLiteDelegateParams* params =
+            CreateDelegateParams(delegate, node_subset);
+        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
+            node_subset.input_tensors, node_subset.output_tensors, nullptr, 0,
+            params, &registration, &node_index));
+
+        // Initialize the output tensors's delegate-related fields.
+        for (int tensor_index : node_subset.output_tensors) {
+          TfLiteTensor* tensor = &tensors_[tensor_index];
+          TF_LITE_ENSURE(context_, tensor->delegate == nullptr ||
+                                       tensor->delegate == delegate);
+          tensor->delegate = delegate;
+        }
+
+        // Associate the node with the delegate.
+        TfLiteNode* node = &nodes_and_registration_[node_index].first;
+        node->delegate = delegate;
+      } break;
+      case NodeSubset::kTfUnexplored:
+        return kTfLiteError;
+        break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteExternalContext* Subgraph::GetExternalContext(
+    TfLiteExternalContextType type) {
+  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
+    return external_contexts_[type];
+  }
+  return nullptr;
+}
+
+TfLiteExternalContext* Subgraph::GetExternalContext(
+    struct TfLiteContext* context, TfLiteExternalContextType type) {
+  return static_cast<Subgraph*>(context->impl_)->GetExternalContext(type);
+}
+
+void Subgraph::SetExternalContext(TfLiteExternalContextType type,
+                                  TfLiteExternalContext* ctx) {
+  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
+    external_contexts_[type] = ctx;
+  }
+}
+
+void Subgraph::SetExternalContext(struct TfLiteContext* context,
+                                  TfLiteExternalContextType type,
+                                  TfLiteExternalContext* ctx) {
+  return static_cast<Subgraph*>(context->impl_)->SetExternalContext(type, ctx);
+}
+
+// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
+// this memory and it is only guaranteed to exist during the invocation of the
+// delegate prepare.
+TfLiteStatus Subgraph::GetExecutionPlan(TfLiteIntArray** execution_plan) {
+  // TODO(aselle): Do not make a copy here
+  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
+  *execution_plan = plan_cache_.get();
+  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
+                "TfLiteIntArray and execution_plan do not contain same type.");
+  std::memcpy(plan_cache_->data, execution_plan_.data(),
+              sizeof(plan_cache_->data[0]) * execution_plan_.size());
+  return kTfLiteOk;
+}
+
+// WARNING: This is an experimental interface that is subject to change.
+// Entry point for C node plugin API to get the execution plan
+TfLiteStatus Subgraph::GetExecutionPlan(struct TfLiteContext* context,
+                                        TfLiteIntArray** execution_plan) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->GetExecutionPlan(execution_plan);
+}
+
+TfLiteStatus Subgraph::SetInputs(std::vector<int> inputs) {
+  TF_LITE_ENSURE_OK(&context_,
+                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
+  inputs_ = std::move(inputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetOutputs(std::vector<int> outputs) {
+  TF_LITE_ENSURE_OK(
+      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
+  outputs_ = std::move(outputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetVariables(std::vector<int> variables) {
+  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("variables", variables.data(),
+                                                  variables.size()));
+  variables_ = std::move(variables);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
+                                          int length) {
+  // Making sure kOptionalTensor is not re-defined to something other than -1.
+  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
+
+  for (int i = 0; i < length; i++) {
+    int index = indices[i];
+    // Continue if index == kOptionalTensor before additional comparisons below,
+    // size_t(-1) is always >= context_tensors_size.
+    if (index == kOptionalTensor) {
+      continue;
+    }
+    if (index < 0 || static_cast<size_t>(index) >= context_->tensors_size) {
+      ReportError("Invalid tensor index %d in %s\n", index, label);
+      consistent_ = false;
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::BytesRequired(TfLiteType type, const int* dims,
+                                     size_t dims_size, size_t* bytes) {
+  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
+  // MultiplyWithoutOverflow.
+  TF_LITE_ENSURE(context_, bytes != nullptr);
+  size_t count = 1;
+  for (int k = 0; k < dims_size; k++) count *= dims[k];
+  switch (type) {
+    case kTfLiteFloat32:
+      *bytes = sizeof(float) * count;
+      break;
+    case kTfLiteInt16:
+      *bytes = sizeof(int16_t) * count;
+      break;
+    case kTfLiteInt32:
+      *bytes = sizeof(int32_t) * count;
+      break;
+    case kTfLiteUInt8:
+      *bytes = sizeof(uint8_t) * count;
+      break;
+    case kTfLiteInt64:
+      *bytes = sizeof(int64_t) * count;
+      break;
+    case kTfLiteBool:
+      *bytes = sizeof(bool) * count;
+      break;
+    case kTfLiteComplex64:
+      *bytes = sizeof(std::complex<float>) * count;
+      break;
+    case kTfLiteInt8:
+      *bytes = sizeof(int8_t) * count;
+      break;
+    default:
+      ReportError(
+          "Only float32, int8, int16, int32, int64, uint8, bool, complex64 "
+          "supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AllocateTensors() {
+  if (!consistent_) {
+    ReportError("AllocateTensors() called on inconsistent model.");
+    return kTfLiteError;
+  }
+
+  // Explicit (re)allocation is necessary if nodes have been changed or tensors
+  // have been resized. For inputs marked as dynamic, we can't short-circuit the
+  // allocation as the client may have done the resize manually.
+  if (state_ != kStateUninvokable &&
+      !HasDynamicTensorImpl(*context_, inputs())) {
+    return kTfLiteOk;
+  }
+
+  next_execution_plan_index_to_prepare_ = 0;
+  if (memory_planner_) {
+    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
+  }
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+
+  state_ = kStateInvokable;
+
+  // Reset the variable tensors to zero after (re)allocating the tensors.
+  // Developers shouldn't rely on the side effect of this function to reset
+  // variable tesnsors. They should call `ResetVariableTensors` directly
+  // instead.
+  ResetVariableTensors();
+
+  return kTfLiteOk;
+}
+
+// TODO(ycling): Support non-zero default values.
+TfLiteStatus Subgraph::ResetVariableTensors() {
+  for (auto& tensor : tensors_) {
+    if (!tensor.is_variable) {
+      continue;
+    }
+
+    // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be
+    // allocated after the initial `PrepareOpsAndTensors()` is called.
+    TF_LITE_ENSURE_EQ(context_, tensor.allocation_type,
+                      kTfLiteArenaRwPersistent);
+    TF_LITE_ENSURE(context_, tensor.data.raw != nullptr);
+
+    memset(tensor.data.raw, 0, tensor.bytes);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AddNodeWithParameters(
+    const std::vector<int>& inputs, const std::vector<int>& outputs,
+    const char* init_data, size_t init_data_size, void* builtin_data,
+    const TfLiteRegistration* registration, int* node_index) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError("AddNodeWithParameters is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
+
+  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
+                                                              free);
+
+  TF_LITE_ENSURE_OK(context_, CheckTensorIndices("node inputs", inputs.data(),
+                                                 inputs.size()));
+  TF_LITE_ENSURE_OK(
+      &context_,
+      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
+
+  int new_node_index = nodes_and_registration_.size();
+  if (node_index) *node_index = new_node_index;
+  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
+  auto& node_and_reg = nodes_and_registration_.back();
+  TfLiteNode& node = node_and_reg.first;
+  if (node.inputs) TfLiteIntArrayFree(node.inputs);
+  if (node.outputs) TfLiteIntArrayFree(node.outputs);
+  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
+
+  // NOTE, here we are not using move semantics yet, since our internal
+  // representation isn't std::vector, but in the future we would like to avoid
+  // copies, so we want the interface to take r-value references now.
+  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
+  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
+  node.temporaries = TfLiteIntArrayCreate(0);
+  if (init_data) {
+    node.user_data = OpInit(*registration, init_data, init_data_size);
+  } else {
+    node.user_data =
+        OpInit(*registration,
+               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
+  }
+
+  node.builtin_data = builtin_data_deleter.release();
+  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
+  // properly for nodes generated by ReplaceNodeSubsetsWithDelegateKernels.
+
+  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
+    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
+    // `Operator` table is passed in.
+    node.custom_initial_data = init_data;
+    node.custom_initial_data_size = init_data_size;
+  } else {
+    node.custom_initial_data = nullptr;
+    node.custom_initial_data_size = 0;
+  }
+
+  node.delegate = nullptr;
+  node_and_reg.second = *registration;
+  execution_plan_.push_back(new_node_index);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::ResizeInputTensor(int tensor_index,
+                                         const std::vector<int>& dims) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError("ResizeInputTensor is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
+  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
+  // checks by casting to unsigned for efficiency. Profile before doing this.
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  TfLiteTensor* tensor = &context_->tensors[tensor_index];
+
+  // Short-circuit the state change if the dimensions don't change, avoiding
+  // unnecessary (re)allocations.
+  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
+    return kTfLiteOk;
+  }
+
+  state_ = kStateUninvokable;
+  return ResizeTensorImpl(tensor, ConvertVectorToTfLiteIntArray(dims));
+}
+
+TfLiteStatus Subgraph::PrepareOpsStartingAt(
+    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
+  for (int execution_plan_index = first_execution_plan_index;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    EnsureTensorsVectorCapacity();
+    if (OpPrepare(registration, &node) == kTfLiteError) {
+      return ReportOpError(context_, node, registration, node_index,
+                           "failed to prepare");
+    }
+
+    *last_execution_plan_index_prepared = execution_plan_index;
+
+    // Discontinue if the node has dynamic outputs. Note that we don't
+    // stop for dynamic temporary tensors since they won't affect the
+    // sizes of other tensors in the graph.
+    if (HasDynamicTensor(*context_, node.outputs)) {
+      break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::PrepareOpsAndTensors() {
+  if (!memory_planner_) {
+    memory_planner_.reset(new ArenaPlanner(
+        context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)),
+        /*preserve_inputs=*/true, /*preserve_intermediates*/ false));
+    memory_planner_->PlanAllocations();
+  }
+
+  int last_exec_plan_index_prepared = 0;
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
+      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
+  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
+      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
+
+  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::Invoke() {
+  if (!consistent_) {
+    ReportError("Invoke called on model that is not consistent.");
+    return kTfLiteError;
+  }
+
+  TfLiteStatus status = kTfLiteOk;
+  if (state_ == kStateUninvokable) {
+    ReportError("Invoke called on model that is not ready.");
+    return kTfLiteError;
+  }
+
+  if (nnapi_delegate_) {
+    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
+      TF_LITE_ENSURE_OK(context_, nnapi_delegate_->Invoke(this));
+      return kTfLiteOk;
+    } else {
+      // TODO(aselle): In the future, we would like this to be an
+      // automatic tflite CPU fallback.
+      ReportError(
+          "NNAPI was requested, but dependent sized tensors "
+          "being used.\n");
+      return kTfLiteError;
+    }
+  }
+
+  // Invocations are always done in node order.
+  // Note that calling Invoke repeatedly will cause the original memory plan to
+  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
+  // called.
+  for (int execution_plan_index = 0;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
+      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+      TF_LITE_ENSURE(context_, next_execution_plan_index_to_prepare_ >=
+                                   execution_plan_index);
+    }
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
+
+    // TODO(ycling): This is an extra loop through inputs to check if the data
+    // need to be copied from Delegate buffer to raw memory, which is often not
+    // needed. We may want to cache this in prepare to know if this needs to be
+    // done for a node or not.
+    for (int i = 0; i < node.inputs->size; ++i) {
+      int tensor_index = node.inputs->data[i];
+      if (tensor_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor* tensor = &tensors_[tensor_index];
+      if (tensor->delegate && tensor->delegate != node.delegate &&
+          tensor->data_is_stale) {
+        EnsureTensorDataIsReadable(tensor_index);
+      }
+    }
+
+    EnsureTensorsVectorCapacity();
+    tensor_resized_since_op_invoke_ = false;
+    if (OpInvoke(registration, &node) == kTfLiteError) {
+      status = ReportOpError(context_, node, registration, node_index,
+                             "failed to invoke");
+    }
+
+    // Force execution prep for downstream ops if the latest op triggered the
+    // resize of a dynamic tensor.
+    if (tensor_resized_since_op_invoke_ &&
+        HasDynamicTensor(*context_, node.outputs)) {
+      next_execution_plan_index_to_prepare_ = execution_plan_index + 1;
+    }
+  }
+
+  return status;
+}
+
+TfLiteStatus Subgraph::ResizeTensor(TfLiteContext* context,
+                                    TfLiteTensor* tensor,
+                                    TfLiteIntArray* new_size) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function ResizeTensorImpl
+  // (this function is static).
+  return static_cast<Subgraph*>(context->impl_)
+      ->ResizeTensorImpl(tensor, new_size);
+}
+
+void Subgraph::ReportErrorImpl(const char* format, va_list args) {
+  error_reporter_->Report(format, args);
+}
+
+void Subgraph::ReportErrorC(TfLiteContext* context, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Subgraph*>(context->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Subgraph to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+// Entry point for C node plugin API to report an error.
+void Subgraph::ReportError(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Subgraph*>(context_->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Subgraph to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+TfLiteStatus Subgraph::AddTensors(int tensors_to_add,
+                                  int* first_new_tensor_index) {
+  const size_t base_index = tensors_.size();
+  if (first_new_tensor_index) *first_new_tensor_index = base_index;
+  tensors_.resize(tensors_.size() + tensors_to_add);
+  for (size_t i = base_index; i < tensors_.size(); i++) {
+    memset(&tensors_[i], 0, sizeof(tensors_[i]));
+    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
+  }
+  context_->tensors = tensors_.data();
+  context_->tensors_size = tensors_.size();
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AddTensors(TfLiteContext* context, int tensors_to_add,
+                                  int* first_new_tensor_index) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function AddTensors
+  // (this function is static).
+  return static_cast<Subgraph*>(context->impl_)
+      ->AddTensors(tensors_to_add, first_new_tensor_index);
+}
+
+TfLiteStatus Subgraph::GetNodeAndRegistration(
+    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
+  TF_LITE_ENSURE(context_, node_index >= 0);
+  auto nodes_size = nodes_and_registration_.size();
+  TF_LITE_ENSURE(context_, static_cast<size_t>(node_index) < nodes_size);
+  TF_LITE_ENSURE(context_, node != nullptr && registration != nullptr);
+  auto& node_and_reg = nodes_and_registration_[node_index];
+  *node = &node_and_reg.first;
+  *registration = &node_and_reg.second;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::GetNodeAndRegistration(
+    struct TfLiteContext* context, int node_index, TfLiteNode** node,
+    TfLiteRegistration** registration) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->GetNodeAndRegistration(node_index, node, registration);
+}
+
+TfLiteStatus Subgraph::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    size_t bytes, const Allocation* allocation) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  // For most tensors we know exactly how much memory is necessary so we can
+  // ensure the buffer is large enough. However, we need to skip string tensors
+  // because their sizes change with the contents of the individual strings.
+  if (type != kTfLiteString) {
+    size_t required_bytes;
+    TF_LITE_ENSURE_OK(context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
+    TF_LITE_ENSURE_EQ(context_, required_bytes, bytes);
+  }
+
+  TfLiteTensor& tensor = context_->tensors[tensor_index];
+  if (type == tensor.type &&
+      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
+    // Fast path which does not invalidate the invokable property.
+    TfLiteTensorDataFree(&tensor);
+    tensor.data.raw = const_cast<char*>(buffer);
+    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
+    tensor.params = quantization;
+    tensor.allocation_type = kTfLiteMmapRo;
+    tensor.allocation = allocation;
+  } else {
+    state_ = kStateUninvokable;
+    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                      quantization, const_cast<char*>(buffer), bytes,
+                      kTfLiteMmapRo, allocation, false, &tensor);
+  }
+  return kTfLiteOk;
+}
+
+// Set description of inputs/outputs/data/fptrs for node `node_index`.
+// This variant assumes an external buffer has been allocated of size
+// bytes. The lifetime of buffer must be ensured to be greater or equal
+// to Interpreter.
+TfLiteStatus Subgraph::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  size_t required_bytes = 0;
+  if (type != kTfLiteString) {
+    // These types will be allocated in our arena so we need to record how
+    // many bytes we will need based on the dimensions. String tensors are
+    // allocated dynamically and we can't know ahead of time how much space
+    // they will require.
+    TF_LITE_ENSURE_OK(context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
+  }
+
+  TfLiteAllocationType allocation_type = kTfLiteArenaRw;
+  if (type == kTfLiteString) {
+    if (is_variable) {
+      // We don't have a real use case for string variable tensor.
+      ReportError("String variable tensor isn't supported.");
+      return kTfLiteError;
+    }
+    allocation_type = kTfLiteDynamic;
+  } else if (is_variable) {
+    allocation_type = kTfLiteArenaRwPersistent;
+  }
+
+  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                    quantization,
+                    /*buffer=*/nullptr, required_bytes, allocation_type,
+                    nullptr, is_variable, &context_->tensors[tensor_index]);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetExecutionPlan(const std::vector<int>& new_plan) {
+  for (int node_index : new_plan) {
+    TF_LITE_ENSURE(context_, node_index >= 0 &&
+                                 node_index < nodes_and_registration_.size());
+  }
+  execution_plan_ = new_plan;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor,
+                                        TfLiteIntArray* new_size) {
+  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
+  if (tensor->allocation_type == kTfLiteArenaRw ||
+      tensor->allocation_type == kTfLiteDynamic ||
+      tensor->allocation_type == kTfLiteArenaRwPersistent) {
+    tensor_resized_since_op_invoke_ |=
+        TfLiteIntArrayEqual(tensor->dims, new_size) == 0;
+    if (tensor->type != kTfLiteString) {
+      size_t bytesRequired;
+      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
+                                          new_size->size, &bytesRequired);
+      if (status != kTfLiteOk) {
+        TfLiteIntArrayFree(new_size);
+        return kTfLiteError;
+      }
+
+      // Realloc space for kTfLiteDynamic tensors.
+      TfLiteTensorRealloc(bytesRequired, tensor);
+      tensor->bytes = bytesRequired;
+    }
+    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
+    tensor->dims = new_size;
+
+    if (tensor->allocation_type != kTfLiteDynamic) {
+      tensor->data.raw = nullptr;
+    }
+  } else {
+    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
+    // of fixed size.
+    TfLiteIntArrayFree(new_size);
+    ReportError("Attempting to resize a fixed-size tensor.");
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+void Subgraph::UseNNAPI(bool enable) {
+  // TODO(aselle): This is a workaround for finding if NNAPI exists.
+  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
+  // prefixed.
+  if (!NNAPIDelegate::IsSupported()) enable = false;
+  if (!enable) {
+    nnapi_delegate_.reset();
+  } else if (!nnapi_delegate_) {
+    nnapi_delegate_.reset(new NNAPIDelegate);
+  }
+}
+
+void Subgraph::SwitchToDelegateContext() {
+  context_->GetNodeAndRegistration = GetNodeAndRegistration;
+  context_->ReplaceNodeSubsetsWithDelegateKernels =
+      ReplaceNodeSubsetsWithDelegateKernels;
+  context_->GetExecutionPlan = GetExecutionPlan;
+}
+
+void Subgraph::SwitchToKernelContext() {
+  context_->GetNodeAndRegistration = [](struct TfLiteContext* context,
+                                        int node_index, TfLiteNode** node,
+                                        TfLiteRegistration** registration) {
+    return ForbiddenContextFunction(context);
+  };
+  context_->ReplaceNodeSubsetsWithDelegateKernels =
+      [](TfLiteContext* context, TfLiteRegistration registration,
+         const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
+        return ForbiddenContextFunction(context);
+      };
+  context_->GetExecutionPlan = [](struct TfLiteContext* context,
+                                  TfLiteIntArray**) {
+    return ForbiddenContextFunction(context);
+  };
+}
+
+TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
+    int last_execution_plan_index_prepared;
+    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
+                                     0, &last_execution_plan_index_prepared));
+
+    bool has_dynamic_tensors = true;
+    // Dynamic tensors exist if not all nodes can be prepared.
+    if (last_execution_plan_index_prepared + 1 == execution_plan_.size()) {
+      // If all the nodes can be prepared, check if the last node has dynamic
+      // tensors.
+      int node_index = execution_plan_[last_execution_plan_index_prepared];
+      TfLiteNode& node = nodes_and_registration_[node_index].first;
+      if (!HasDynamicTensor(*context_, node.outputs)) {
+        has_dynamic_tensors = false;
+      }
+    }
+    if (has_dynamic_tensors) {
+      ReportError(
+          "Attempting to use a delegate that only supports static-sized "
+          "tensors with a graph that has dynamic-sized tensors.");
+      return kTfLiteError;
+    }
+  }
+
+  // TODO(aselle): Consider if it is worth storing pointers to delegates.
+  // Setup additional context interface.
+  SwitchToDelegateContext();
+
+  TfLiteStatus status = delegate->Prepare(context_, delegate);
+
+  // Remove additional context info.
+  SwitchToKernelContext();
+
+  TF_LITE_ENSURE_OK(context_, status);
+
+  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
+    // Reset the state to force tensor/op reallocation.
+    state_ = kStateUninvokable;
+    TF_LITE_ENSURE_OK(context_, AllocateTensors());
+    TF_LITE_ENSURE_EQ(context_, state_, kStateInvokable);
+    // After using a delegate which doesn't support dynamic tensors, make the
+    // entire graph immutable.
+    state_ = kStateInvokableAndImmutable;
+  }
+
+  return status;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
new file mode 100644
index 0000000000000000000000000000000000000000..e85d6df97484d5e8c7530647d5951717c58e6357
--- /dev/null
+++ b/tensorflow/lite/core/subgraph.h
@@ -0,0 +1,481 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_SUBGRAPH_H_
+#define TENSORFLOW_LITE_CORE_SUBGRAPH_H_
+
+#include <cstdlib>
+#include <vector>
+
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/profiling/profiler.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+
+// Forward declare since NNAPIDelegate uses Interpreter.
+class NNAPIDelegate;
+
+class Subgraph {
+ public:
+  friend class Interpreter;
+
+  Subgraph(ErrorReporter* error_reporter,
+           TfLiteExternalContext** external_contexts);
+  Subgraph(const Subgraph&) = delete;
+
+  // Subgraphs should be movable but not copyable.
+  Subgraph(Subgraph&&) = default;
+  Subgraph& operator=(const Subgraph&) = delete;
+  virtual ~Subgraph();
+
+  // Provide a list of tensor indexes that are inputs to the model.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetInputs(std::vector<int> inputs);
+
+  // Provide a list of tensor indexes that are outputs to the model
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetOutputs(std::vector<int> outputs);
+
+  // Provide a list of tensor indexes that are variable tensors.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetVariables(std::vector<int> variables);
+
+  // Adds a node with the given parameters and returns the index of the new
+  // node in `node_index` (optionally). Interpreter will take ownership of
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
+  // remains with the caller.
+  TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
+                                     const std::vector<int>& outputs,
+                                     const char* init_data,
+                                     size_t init_data_size, void* builtin_data,
+                                     const TfLiteRegistration* registration,
+                                     int* node_index);
+
+  // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
+  // The value pointed to by `first_new_tensor_index` will be set to the
+  // index of the first new tensor if `first_new_tensor_index` is non-null.
+  TfLiteStatus AddTensors(int tensors_to_add, int* first_new_tensor_index);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization, bool is_variable);
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
+  // Get a mutable tensor data structure.
+  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
+  // read/write access to structure
+  TfLiteTensor* tensor(int tensor_index) {
+    if (tensor_index < 0 ||
+        static_cast<size_t>(tensor_index) >= context_->tensors_size) {
+      return nullptr;
+    }
+    return &context_->tensors[tensor_index];
+  }
+
+  // Get an immutable tensor data structure.
+  const TfLiteTensor* tensor(int tensor_index) const {
+    if (tensor_index < 0 ||
+        static_cast<size_t>(tensor_index) >= context_->tensors_size) {
+      return nullptr;
+    }
+    return &context_->tensors[tensor_index];
+  }
+
+  // Read only access to list of inputs.
+  std::vector<int>& inputs() { return inputs_; }
+
+  // Read only access to list of inputs.
+  const std::vector<int>& inputs() const { return inputs_; }
+
+  // Read only access to list of outputs.
+  std::vector<int>& outputs() { return outputs_; }
+
+  // Read only access to list of outputs.
+  const std::vector<int>& outputs() const { return outputs_; }
+
+  // Read only access to list of variable tensors.
+  std::vector<int>& variables() { return variables_; }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& variables() const { return variables_; }
+
+  size_t tensors_size() const { return tensors_.size(); }
+
+  // Return the number of ops in the model.
+  size_t nodes_size() const { return nodes_and_registration_.size(); }
+
+  // Read only access to list of variable tensors.
+  std::vector<int>& execution_plan() { return execution_plan_; }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& execution_plan() const { return execution_plan_; }
+
+  // Mutable form of tensors (TEMPORARY for refactor).
+  // TODO(b/119495520): remove when refactoring complete.
+  std::vector<TfLiteTensor>& tensors() { return tensors_; }
+  // Mutable form of tensors (TEMPORARY for refactor).
+  // TODO(b/119495520): remove when refactoring complete.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>&
+  nodes_and_registration() {
+    return nodes_and_registration_;
+  }
+
+  const std::vector<std::pair<TfLiteNode, TfLiteRegistration>>&
+  nodes_and_registration() const {
+    return nodes_and_registration_;
+  }
+
+  // Get a pointer to an operation and registration data structure if in bounds.
+  const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
+      int node_index) const {
+    if (node_index < 0 || static_cast<size_t>(node_index) >= nodes_size())
+      return nullptr;
+    return &nodes_and_registration_[node_index];
+  }
+
+  // Change the dimensionality of a given tensor. Note, this is only acceptable
+  // for tensor indices that are inputs.
+  // Returns status of failure or success.
+  // TODO(aselle): Consider implementing ArraySlice equivalent to make this
+  //   more adept at accepting data without an extra copy. Use absl::ArraySlice
+  //   if our partners determine that dependency is acceptable.
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims);
+
+  // Update allocations for all tensors. This will redim dependent tensors using
+  // the input tensor dimensionality as given. This is relatively expensive.
+  // If you know that your sizes are not changing, you need not call this.
+  // Returns status of success or failure.
+  TfLiteStatus AllocateTensors();
+
+  // Invoke the subgraph (run the whole graph in dependency order).
+  //
+  // NOTE: It is possible that the interpreter is not in a ready state
+  // to evaluate (i.e. if a ResizeTensor() has been performed without an
+  // AllocateTensors().
+  // Returns status of success or failure.
+  TfLiteStatus Invoke();
+
+  // Entry point for C node plugin API to report an error.
+  void ReportError(const char* format, ...);
+
+  void UseNNAPI(bool enable);
+
+  // Return the subgraph specific context.
+  TfLiteContext* context() { return context_; }
+
+  // Set the value of an external context.
+  void SetExternalContext(TfLiteExternalContextType type,
+                          TfLiteExternalContext* ctx);
+  // Get the half precision flag.
+  // WARNING: This is an experimental API and subject to change.
+  bool GetAllowFp16PrecisionForFp32() const {
+    return context_->allow_fp32_relax_to_fp16;
+  }
+
+  // Ensure the data in `tensor.data` is readable. In case delegate is used,
+  // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
+  // TODO(b/119495520): make this private when refactoring complete.
+  TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
+    TfLiteTensor* t = &tensors_[tensor_index];
+    TF_LITE_ENSURE(context_, t != nullptr);
+    if (t->data_is_stale) {
+      TF_LITE_ENSURE(context_, t->delegate != nullptr);
+      TF_LITE_ENSURE(context_, t->buffer_handle != kTfLiteNullBufferHandle);
+      // This can be null if the delegate doesn't use its own buffer.
+      TF_LITE_ENSURE(context_, t->delegate->CopyFromBufferHandle != nullptr);
+      t->delegate->CopyFromBufferHandle(context_, t->delegate, t->buffer_handle,
+                                        t->data.raw, t->bytes);
+      t->data_is_stale = false;
+    }
+    return kTfLiteOk;
+  }
+
+  // The default capacity of `tensors_` vector.
+  static constexpr int kTensorsReservedCapacity = 128;
+  // The capacity headroom of `tensors_` vector before calling ops'
+  // `prepare` and `invoke` function. In these functions, it's guaranteed
+  // allocating up to `kTensorsCapacityHeadroom` more tensors won't invalidate
+  // pointers to existing tensors.
+  static constexpr int kTensorsCapacityHeadroom = 16;
+
+  // Reset all variable tensors to the default value.
+  // If a variable tensor doesn't have a buffer, reset it to zero.
+  // TODO(b/115961645): Implement - If a variable tensor has a buffer, reset it
+  // to the value of the buffer.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ResetVariableTensors();
+
+  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+
+  profiling::Profiler* GetProfiler() { return profiler_; }
+
+ private:
+  // Prevent 'context_' from accessing functions that are only available to
+  // delegated kernels.
+  void SwitchToKernelContext();
+
+  // Add delegate-only functions to 'context_'.
+  void SwitchToDelegateContext();
+
+  // Give 'op_reg' a chance to initialize itself using the contents of
+  // 'buffer'.
+  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
+               size_t length) {
+    if (op_reg.init == nullptr) return nullptr;
+    return op_reg.init(context_, buffer, length);
+  }
+
+  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
+  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
+    if (op_reg.free == nullptr) return;
+    if (buffer) {
+      op_reg.free(context_, buffer);
+    }
+  }
+
+  // Prepare the given 'node' for execution.
+  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.prepare == nullptr) return kTfLiteOk;
+    return op_reg.prepare(context_, node);
+  }
+
+  // Invoke the operator represented by 'node'.
+  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.invoke == nullptr) return kTfLiteError;
+    return op_reg.invoke(context_, node);
+  }
+
+  // Call OpPrepare() for as many ops as possible, allocating memory for their
+  // tensors. If an op containing dynamic tensors is found, preparation will be
+  // postponed until this function is called again. This allows the interpreter
+  // to wait until Invoke() to resolve the sizes of dynamic tensors.
+  TfLiteStatus PrepareOpsAndTensors();
+
+  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
+  // dynamic tensors is found or all ops have been prepared. Fill
+  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
+  // the last in the graph.
+  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
+                                    int* last_execution_plan_index_prepared);
+
+  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
+  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
+  // `context_` whenever this std::vector is reallocated. Currently this
+  // only happens in `AddTensors()`.
+  std::vector<TfLiteTensor> tensors_;
+
+  // Check if an array of tensor indices are valid with respect to the Tensor
+  // array.
+  // NOTE: this changes consistent_ to be false if indices are out of bounds.
+  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
+                                  int length);
+
+  // Compute the number of bytes required to represent a tensor with dimensions
+  // specified by the array dims (of length dims_size). Returns the status code
+  // and bytes.
+  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
+                             size_t* bytes);
+
+  // Request an tensor be resized implementation. If the given tensor is of
+  // type kTfLiteDynamic it will also be allocated new memory.
+  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
+
+  // Report a detailed error string (will be printed to stderr).
+  // TODO(aselle): allow user of class to provide alternative destinations.
+  void ReportErrorImpl(const char* format, va_list args);
+
+  // Entry point for C node plugin API to request an tensor be resized.
+  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
+                                   TfLiteIntArray* new_size);
+  // Entry point for C node plugin API to report an error.
+  static void ReportErrorC(TfLiteContext* context, const char* format, ...);
+
+  // Entry point for C node plugin API to add new tensors.
+  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
+                                 int* first_new_tensor_index);
+
+  // WARNING: This is an experimental API and subject to change.
+  // Entry point for C API ReplaceNodeSubsetsWithDelegateKernels
+  static TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
+      TfLiteContext* context, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
+
+  // Update the execution graph to replace some of the nodes with stub
+  // nodes. Specifically any node index that has `nodes[index]==1` will be
+  // slated for replacement with a delegate kernel specified by registration.
+  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
+      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+      TfLiteDelegate* delegate);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets the internal pointer to a TensorFlow lite node by node_index.
+  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
+                                      TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get a node by index.
+  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
+                                             int node_index, TfLiteNode** node,
+                                             TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets an TfLiteIntArray* representing the execution plan. The interpreter
+  // owns this memory and it is only guaranteed to exist during the invocation
+  // of the delegate prepare.
+  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get the execution plan.
+  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
+                                       TfLiteIntArray** execution_plan);
+
+  // Retrieve an existing external context by type.
+  TfLiteExternalContext* GetExternalContext(TfLiteExternalContextType type);
+  static TfLiteExternalContext* GetExternalContext(
+      struct TfLiteContext* context, TfLiteExternalContextType type);
+
+  // Set the value of an external context.
+  static void SetExternalContext(struct TfLiteContext* context,
+                                 TfLiteExternalContextType type,
+                                 TfLiteExternalContext* ctx);
+
+  // Allow a delegate to look at the graph and modify the graph to handle
+  // parts of the graph themselves. After this is called, the graph may
+  // contain new nodes that replace 1 more nodes.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+
+  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
+  // capacity. Calling this function may invalidate existing pointers to
+  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
+  // more tensors won't invalidate the pointer to existing tensors.
+  void EnsureTensorsVectorCapacity() {
+    const size_t required_capacity = tensors_.size() + kTensorsCapacityHeadroom;
+    if (required_capacity > tensors_.capacity()) {
+      tensors_.reserve(required_capacity);
+      context_->tensors = tensors_.data();
+    }
+  }
+
+  // The state of the Interpreter.
+  enum State {
+    // The interpreter isn't ready to be invoked.
+    // `AllocateTensor` need to be called to enter an invokable state.
+    kStateUninvokable = 0,
+    // The interpreter is ready to be invoked.
+    kStateInvokable,
+    // The interpreter is ready to be invoked, and graph can't be further
+    // modified. The interpreter will enter this state when calling
+    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
+    kStateInvokableAndImmutable,
+  };
+  State state_ = kStateUninvokable;
+
+  // A pure C data structure used to communicate with the pure C plugin
+  // interface. To avoid copying tensor metadata, this is also the definitive
+  // structure to store tensors.
+  // TODO(b/119495520): Get rid of owned and just make context_ a instance.
+  TfLiteContext owned_context_;
+  TfLiteContext* context_;
+
+  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
+  // function pointers to actual implementation.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
+      nodes_and_registration_;
+
+  // Whether the model is consistent. That is to say if the inputs and outputs
+  // of every node and the global inputs and outputs are valid indexes into
+  // the tensor array.
+  bool consistent_ = true;
+
+  // Array of indices representing the tensors that are inputs to the
+  // interpreter.
+  std::vector<int> inputs_;
+
+  // Array of indices representing the tensors that are outputs to the
+  // interpreter.
+  std::vector<int> outputs_;
+
+  // Array of indices representing the tensors that are variable tensors.
+  std::vector<int> variables_;
+
+  // The error reporter delegate that tflite will forward queries errors to.
+  ErrorReporter* error_reporter_;
+
+  // Index of the next node to prepare.
+  // During Invoke(), Interpreter will allocate input tensors first, which are
+  // known to be fixed size. Then it will allocate outputs from nodes as many
+  // as possible. When there is a node that produces dynamic sized tensor.
+  // Interpreter will stop allocating tensors, set the value of next allocate
+  // node id, and execute the node to generate the output tensor before continue
+  // to allocate successors. This process repeats until all nodes are executed.
+  // NOTE: this relies on the order of nodes that is in topological order.
+  int next_execution_plan_index_to_prepare_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // This is a list of node indices (to index into nodes_and_registration).
+  // This represents a valid topological sort (dependency ordered) execution
+  // plan. In particular, it is valid for this ordering to contain only a
+  // subset of the node indices.
+  std::vector<int> execution_plan_;
+
+  // In the future, we'd like a TfLiteIntArray compatible representation.
+  // TODO(aselle): replace execution_plan_ with this.
+  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
+
+  // Whether to delegate to NN API
+  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
+
+  std::unique_ptr<MemoryPlanner> memory_planner_;
+
+  // Tracking bit for whether a tensor was resized in the course of an op
+  // invocation. This is a useful hint to ensure that dynamic tensor outputs
+  // trigger downstream reallocation after op invocation.
+  bool tensor_resized_since_op_invoke_ = false;
+
+  // External contexts (kTfLiteMaxExternalContexts).
+  TfLiteExternalContext** external_contexts_;
+
+  // Profiler for this interpreter instance.
+  profiling::Profiler* profiler_ = nullptr;
+};
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_CORE_SUBGRAPH_H_
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index f9850112abd2fd3189187ebe0fb948007bc41166..222a043a88e8804c6cad85150c55261f6bec9973 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -53,7 +53,9 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":delegate_data",
         ":delegate_only_runtime",
+        "//tensorflow/lite/c:c_api_internal",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib",
diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc
index 2c4aa7075db3c3a76e48353e34e56809acfc57ff..9a6c5e74a7b8d71a04c20bbcb969cfe0b0ce3478 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map.cc
@@ -130,6 +130,10 @@ bool BufferMap::HasTensor(int tensor_index) const {
   return id_to_tensor_.count(tensor_index) != 0;
 }
 
+bool BufferMap::IsTensorFlowTensor(int tensor_index) const {
+  return HasTensor(tensor_index) && owned_by_tf_.count(tensor_index) > 0;
+}
+
 tensorflow::Tensor BufferMap::GetTensor(int tensor_index) const {
   return id_to_tensor_.at(tensor_index);
 }
@@ -154,11 +158,13 @@ void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) {
       GetTensorFlowDataType(tensor->type), shape, buf);
   buf->Unref();
 
-  SetFromTensorFlow(tensor_index, std::move(t));
+  id_to_tensor_[tensor_index] = std::move(t);
+  owned_by_tf_.erase(tensor_index);
 }
 
 void BufferMap::SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor) {
   id_to_tensor_[tensor_index] = std::move(tensor);
+  owned_by_tf_.insert(tensor_index);
 }
 
 }  // namespace flex
diff --git a/tensorflow/lite/delegates/flex/buffer_map.h b/tensorflow/lite/delegates/flex/buffer_map.h
index 269a0a2a27659d18fe21238fea3bcbdeb442031b..b73ed88d3789d5df8dadaee19d468596ccd4c782 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.h
+++ b/tensorflow/lite/delegates/flex/buffer_map.h
@@ -38,12 +38,17 @@ class BufferMap {
   // tensorflow::Tensor.
   bool HasTensor(int tensor_index) const;
 
+  // Returns true if the given 'tensor_index' has a corresponding
+  // tensorflow::Tensor *and* the content is owned by TensorFlow (that is, the
+  // mapping was added by SetFromTensorFlow()).
+  bool IsTensorFlowTensor(int tensor_index) const;
+
   // Returns the tensorflow::Tensor associated with the given 'tensor_index'.
   // Precondition: HasTensor() is true.
   tensorflow::Tensor GetTensor(int tensor_index) const;
 
   // Associates the given tensorflow::Tensor with the given 'tensor_index'.
-  // Note that tensorflow Tensors share data buffers, so this method is only a
+  // Note that TensorFlow Tensors share data buffers, so this method is only a
   // shallow copy.
   void SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor);
 
@@ -52,7 +57,17 @@ class BufferMap {
   void SetFromTfLite(int tensor_index, const TfLiteTensor* tensor);
 
  private:
+  // Mapping from TL Lite tensor ID to TensorFlow's Tensor. All tensors that
+  // are inputs or outputs of a subgraph will be added here, irrespective of
+  // whether their data are managed by TF Lite or TensorFlow.
   std::map<int, tensorflow::Tensor> id_to_tensor_;
+  // A list of tensors that are completely managed by TensorFlow. Most of the
+  // time, TF Lite will populate tensors that are inputs to subgraphs, while
+  // TensorFlow will populate output tensors. Occasionally, however, an input
+  // tensor is coming from a previous subgraph and could have been populated by
+  // TensorFlow. This set keeps track of all input or output tensors that have
+  // been populated by tensorflow.
+  std::set<int> owned_by_tf_;
 };
 
 }  // namespace flex
diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc
index fd52273fb40f208e8664e055f305e38c311bfc82..9e8472f1e7d2c3e0f5e73f3e5ce98bae7f15063f 100644
--- a/tensorflow/lite/delegates/flex/buffer_map_test.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc
@@ -203,6 +203,7 @@ TEST(BufferMapTest, TfLiteOverwritesTensorFlow) {
   buffer_map.SetFromTensorFlow(0, t1);
   buffer_map.SetFromTfLite(0, t2.get());
 
+  EXPECT_FALSE(buffer_map.IsTensorFlowTensor(0));
   EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
               ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
 }
@@ -216,6 +217,7 @@ TEST(BufferMapTest, TensorFlowOverwritesTfLite) {
   buffer_map.SetFromTfLite(0, t2.get());
   buffer_map.SetFromTensorFlow(0, t1);
 
+  EXPECT_TRUE(buffer_map.IsTensorFlowTensor(0));
   EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
               ElementsAre(0, 0, 0, 0.123f, 0, 0));
 }
diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc
index c31d48539bb7c844f16e9c8a987d5f1224c2fb2f..4fc2d82b494a4cd8165ae2d070aad1cc9e2440f4 100644
--- a/tensorflow/lite/delegates/flex/delegate.cc
+++ b/tensorflow/lite/delegates/flex/delegate.cc
@@ -46,11 +46,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
   }
 
   // Request TFLite to partition the graph and make kernels for each independent
-  // subgraph.
+  // node sub set.
   TfLiteIntArray* size_and_nodes =
       ConvertVectorToTfLiteIntArray(supported_nodes);
-  context->ReplaceSubgraphsWithDelegateKernels(context, GetKernel(),
-                                               size_and_nodes, delegate);
+  context->ReplaceNodeSubsetsWithDelegateKernels(context, GetKernel(),
+                                                 size_and_nodes, delegate);
   TfLiteIntArrayFree(size_and_nodes);
   return kTfLiteOk;
 }
@@ -109,7 +109,8 @@ FlexDelegate::FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data)
           /*nullptr,*/ &flex::delegate::Prepare,
           /*CopyFromBufferHandle=*/&flex::delegate::CopyFromBufferHandle,
           /*CopyToBufferHandle=*/nullptr,
-          /*FreeBufferHandle=*/nullptr},
+          /*FreeBufferHandle=*/nullptr,
+          /*flags=*/kTfLiteDelegateFlagsAllowDynamicTensors},
       delegate_data_(std::move(delegate_data)) {}
 
 FlexDelegate::~FlexDelegate() {}
diff --git a/tensorflow/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc
index f9087d5d857f0d433946397f3ccaa972c1f7d7ba..e13029d9a514e7207c69a530713d2dcb6ec11ad5 100644
--- a/tensorflow/lite/delegates/flex/delegate_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_test.cc
@@ -40,8 +40,7 @@ class DelegateTest : public testing::FlexModelTest {
   }
 
   void ConfigureDelegate() {
-    ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(
-                  delegate_.get(), /*allow_dynamic_tensors=*/true),
+    ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()),
               kTfLiteOk);
   }
 
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index 2c19580235fedbf3b4bf2a65b453c766a0235723..02da1d1a224ee87c34c2a019bff6430fd0e7d88a 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -15,6 +15,12 @@ limitations under the License.
 #include "tensorflow/lite/delegates/flex/kernel.h"
 
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
@@ -22,11 +28,6 @@ limitations under the License.
 #include "tensorflow/lite/delegates/flex/util.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/string.h"
-#include "tensorflow/core/common_runtime/eager/context.h"
-#include "tensorflow/core/common_runtime/eager/execute.h"
-#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_def_util.h"
 
 // Note: this is part of TF Lite's Flex delegation code which is to be
 // completed soon.
@@ -78,11 +79,18 @@ tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
                                  const std::vector<int>& inputs,
                                  const std::vector<int>& outputs) {
   const tensorflow::AttrTypeMap* attr_types;
+  bool is_function = false;
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types),
+      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types, &is_function),
       " (while processing attributes of '", op_name, "')");
-
-  tensorflow::EagerOperation op(eager_context, op_name.c_str(), attr_types);
+  if (is_function) {
+    return tensorflow::errors::NotFound(
+        "Operation '", op_name,
+        "' is not registered.  (while processing attributes of '", op_name,
+        "')");
+  }
+  tensorflow::EagerOperation op(eager_context, op_name.c_str(),
+                                /*is_function=*/false, attr_types);
   for (const auto& attr : nodedef.attr()) {
     op.MutableAttrs()->Set(attr.first, attr.second);
   }
@@ -251,7 +259,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   for (auto tensor_index : op_data->subgraph_inputs) {
     TfLiteTensor* tensor = &context->tensors[tensor_index];
     if (!IsConstantTensor(tensor)) {
-      buffer_map->SetFromTfLite(tensor_index, tensor);
+      // If this tensor is part of an earlier TF subgraph we should not add it
+      // to the BufferMap again, because TF already knows about it and its
+      // contents are kept automatically up-to-date.
+      if (!buffer_map->IsTensorFlowTensor(tensor_index)) {
+        buffer_map->SetFromTfLite(tensor_index, tensor);
+      }
     }
   }
 
diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc
index 93d472d3f4b93a2c421fd667fd0f02333fad4948..f55759594df51356986c2a328165c17b3ead2d80 100644
--- a/tensorflow/lite/delegates/flex/kernel_test.cc
+++ b/tensorflow/lite/delegates/flex/kernel_test.cc
@@ -30,7 +30,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
                             const std::vector<int>& supported_nodes) {
   TfLiteIntArray* size_and_nodes =
       ConvertVectorToTfLiteIntArray(supported_nodes);
-  TF_LITE_ENSURE_STATUS(context->ReplaceSubgraphsWithDelegateKernels(
+  TF_LITE_ENSURE_STATUS(context->ReplaceNodeSubsetsWithDelegateKernels(
       context, flex::GetKernel(), size_and_nodes, delegate));
   TfLiteIntArrayFree(size_and_nodes);
   return kTfLiteOk;
@@ -53,6 +53,7 @@ class KernelTest : public testing::FlexModelTest {
   template <typename T>
   void ConfigureDelegate(T prepare_function) {
     delegate_.data_ = delegate_data_.get();
+    delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors;
     delegate_.FreeBufferHandle = nullptr;
     delegate_.Prepare = prepare_function;
     delegate_.CopyFromBufferHandle = [](TfLiteContext* context,
@@ -66,8 +67,7 @@ class KernelTest : public testing::FlexModelTest {
       memcpy(data, values.data(), values.size());
       return kTfLiteOk;
     };
-    CHECK(interpreter_->ModifyGraphWithDelegate(
-              &delegate_, /*allow_dynamic_tensors=*/true) == kTfLiteOk);
+    CHECK(interpreter_->ModifyGraphWithDelegate(&delegate_) == kTfLiteOk);
   }
 
  private:
@@ -100,6 +100,17 @@ TEST_F(KernelTest, FullGraph) {
 
   ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
   ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+
+  // Try again with different inputs
+  SetShape(0, {2, 3, 1});
+  SetValues(0, {2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f});
+  SetShape(3, {2, 3, 1});
+  SetValues(3, {2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(3, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(24.0f, 32.0f, 48.0f));
 }
 
 TEST_F(KernelTest, BadTensorFlowOp) {
@@ -194,29 +205,69 @@ TEST_F(KernelTest, MixedGraph) {
   ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
 }
 
+// We will build a complex graph where most of the ops are TF ops, but one
+// of them, right in the middle is handle natively by TF Lite. This results
+// in two flex subgraphs to handle the TF ops, and some of the tensors
+// connect those two subgraphs directly.
 TEST_F(KernelTest, SplitGraph) {
-  AddTensors(10, {0}, {9}, kTfLiteFloat32, {3});
+  std::vector<float> a = {3.0f, 1.0f, 0.5f, -1.0f, 4.0f, -1.0f, -2.0f, 5.0f};
+  std::vector<float> b = {0.0f, 1.0f, 1.5f, 3.0f};
 
-  AddTfOp(testing::kUnpack, {0}, {1, 2});
-  AddTfOp(testing::kAdd, {1, 2}, {3});
-  AddTfOp(testing::kUnpack, {3}, {4, 5});
+  AddTensors(18, {0, 1}, {17}, kTfLiteFloat32, {3});
+
+  // Split the first input. Each branch below uses one half of it.
+  AddTfOp(testing::kUnpack, {0}, {2, 10});
 
-  AddTfLiteMulOp({4, 5}, {6});
+  // The left branch: l = (a0 + b0) * (a2 + b2) + (a1 + b1) * (a3 + b3) = 10
+  AddTfOp(testing::kAdd, {1, 2}, {3});     // => 3, 2, 2, 2
+  AddTfOp(testing::kUnpack, {3}, {4, 5});  // => 3, 2 --- 2, 2
+  AddTfLiteMulOp({4, 5}, {6});             // => 6, 4
+  AddTfOp(testing::kUnpack, {6}, {7, 8});  // => 6 -- 4
+  AddTfOp(testing::kAdd, {7, 8}, {9});     // => 10
 
-  AddTfOp(testing::kUnpack, {6}, {7, 8});
-  AddTfOp(testing::kAdd, {7, 8}, {9});
+  // The right branch: r = (a4 + a6) + (a5 + a7) = 6
+  AddTfOp(testing::kUnpack, {10}, {11, 12});  // => 4, -1 --- -2, 5
+  AddTfOp(testing::kAdd, {11, 12}, {13});     // => 2, 4
+  AddTfOp(testing::kUnpack, {13}, {14, 15});  // => 2 --- 4
+  AddTfOp(testing::kAdd, {14, 15}, {16});     // => 6
+
+  // The two branches added together:
+  AddTfOp(testing::kAdd, {9, 16}, {17});  // => 16
 
   ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0, 1, 2, 4, 5});
+    // All ops by #3 are TF ops, handled by the delegate. However, because #4
+    // depends on the non-TF op, two subgraphs are necessary:
+    //    TF subgraph 1: 0, 1, 2, 6, 7, 8, 9
+    //    TF Lite Op: 3
+    //    TF subgraph 2: 4, 5, 10
+    return GenericPrepare(context, delegate, {0, 1, 2, 4, 5, 6, 7, 8, 9, 10});
   });
 
   SetShape(0, {2, 2, 2, 1});
-  SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f});
+  SetValues(0, a);
+  SetShape(1, {2, 2, 1});
+  SetValues(1, b);
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(17), ElementsAre(1));
+  ASSERT_THAT(GetValues(17), ElementsAre(16.0f));
+
+  // Same as above but with slightly different output.
+  // We still expect the result to be l + r where
+  //     l = (a0 + b0) * (a2 + b2) + (a1 + b1) * (a3 + b3)
+  //     r = (a4 + a6) + (a5 + a7)
+  SetShape(0, {2, 2, 2, 1});
+  SetValues(0, {4.0f, 1.0f, 1.5f, -2.0f, 2.0f, 0.0f, -2.0f, 3.0f});
+  SetShape(1, {2, 2, 1});
+  SetValues(1, {0.0f, 2.0f, 1.5f, 3.0f});
+  // So l = (4 + 0) * (1.5 + 1.5) + (1 + 2) * (-2 + 3) =  12 + 3 = 15
+  //    r = (2 - 2) + (0 + 3) = 3
 
   ASSERT_TRUE(Invoke());
 
-  ASSERT_THAT(GetShape(9), ElementsAre(1));
-  ASSERT_THAT(GetValues(9), ElementsAre(10.0f));
+  ASSERT_THAT(GetShape(17), ElementsAre(1));
+  ASSERT_THAT(GetValues(17), ElementsAre(18.0f));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc
index c786ffa1a2150b24ec9b283f5fb254813d1d4ba2..c995b360f9d5ecfaced217a372af38690aee74f6 100644
--- a/tensorflow/lite/delegates/flex/util.cc
+++ b/tensorflow/lite/delegates/flex/util.cc
@@ -66,6 +66,8 @@ TF_DataType GetTensorFlowDataType(TfLiteType type) {
       return TF_INT32;
     case kTfLiteUInt8:
       return TF_UINT8;
+    case kTfLiteInt8:
+      return TF_INT8;
     case kTfLiteInt64:
       return TF_INT64;
     case kTfLiteComplex64:
@@ -87,6 +89,8 @@ TfLiteType GetTensorFlowLiteType(TF_DataType type) {
       return kTfLiteInt32;
     case TF_UINT8:
       return kTfLiteUInt8;
+    case TF_INT8:
+      return kTfLiteInt8;
     case TF_INT64:
       return kTfLiteInt64;
     case TF_COMPLEX64:
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 5f17e113751026ed8f14523e960b490fac74e985..4fe07004a82ff30228d866bcc7a90067e5940aca 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -374,7 +374,7 @@ struct NNAPIOpMappingArgs {
   std::vector<int>* model_state_tfl_inputs;
 };
 
-// The kernel that represents the subgraph of TF Lite being run on NN API.
+// The kernel that represents the node sub set of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
   NNAPIDelegateKernel() = default;
@@ -1141,6 +1141,7 @@ class NNAPIDelegateKernel {
 TfLiteDelegate* NnApiDelegate() {
   static TfLiteDelegate delegate = {
       .data_ = nullptr,
+      .flags = kTfLiteDelegateFlagsNone,
       .Prepare = [](TfLiteContext* context,
                     TfLiteDelegate* delegate) -> TfLiteStatus {
         // Do not check nodes_ if NN API is unavailable.
@@ -1173,7 +1174,7 @@ TfLiteDelegate* NnApiDelegate() {
         supported_nodes[0] = supported_nodes.size() - 1;
 
         // NN API Delegate Registration (the pseudo kernel that will invoke NN
-        // API subgraphs)
+        // API node sub sets)
         static const TfLiteRegistration nnapi_delegate_kernel = {
             .init = [](TfLiteContext* context, const char* buffer,
                        size_t length) -> void* {
@@ -1206,8 +1207,8 @@ TfLiteDelegate* NnApiDelegate() {
         };
 
         // Request TFLite to partition the graph and make kernels
-        // for each independent subgraph a new nnapi_delegate_kernel.
-        context->ReplaceSubgraphsWithDelegateKernels(
+        // for each independent node sub set a new nnapi_delegate_kernel.
+        context->ReplaceNodeSubsetsWithDelegateKernels(
             context, nnapi_delegate_kernel,
             reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
             delegate);
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 75399fd5e46e9f616cc566cccf02af3ec592d4b0..ca48af0c95211e644fc7e2a1a1472a2f1b46ad35 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -31,9 +31,14 @@ class SingleOpModelWithNNAPI : public SingleOpModel {
  public:
   SingleOpModelWithNNAPI() {
     this->SetApplyDelegate([](Interpreter* interpreter) {
-      interpreter->ModifyGraphWithDelegate(NnApiDelegate(), false);
+      interpreter->ModifyGraphWithDelegate(NnApiDelegate());
     });
   }
+
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims) {
+    return interpreter_->ResizeInputTensor(tensor_index, dims);
+  }
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -97,6 +102,17 @@ TEST(NNAPIDelegate, AddWithRelu) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
 }
 
+// Verify that resize attempts fail.
+// TODO(b/113110851): Verify success after the delegate supports resizing.
+TEST(NNAPIDelegate, ResizeFails) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  EXPECT_EQ(m.ResizeInputTensor(m.input1(), {1, 3, 3, 1}), kTfLiteError);
+}
+
 class FloatMulOpModel : public SingleOpModelWithNNAPI {
  public:
   FloatMulOpModel(const TensorData& input1, const TensorData& input2,
diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle
index 66a62a921a7f492df30b3de2e5dc4b68fc84f1d9..74dacbcddbdafa65d061e83a0199bfc2d60a361b 100644
--- a/tensorflow/lite/examples/android/build.gradle
+++ b/tensorflow/lite/examples/android/build.gradle
@@ -22,3 +22,7 @@ allprojects {
 task clean(type: Delete) {
     delete rootProject.buildDir
 }
+
+// Changed since default name 'build' conflicts with
+// bazel BUILD file name.
+buildDir = "gradle-build"
diff --git a/tensorflow/lite/experimental/c/BUILD b/tensorflow/lite/experimental/c/BUILD
index 5dd62194deac2e035bf9b1c1e46b4fbfbe93c66a..cde53e283830aca9c7990e3d8c4901f997621bc2 100644
--- a/tensorflow/lite/experimental/c/BUILD
+++ b/tensorflow/lite/experimental/c/BUILD
@@ -58,6 +58,7 @@ cc_library(
     srcs = ["c_api.cc"],
     hdrs = ["c_api.h"],
     copts = tflite_copts(),
+    tags = ["swift_module=TensorFlowLiteCAPI"],
     visibility = [
         ":experimental",
     ],
diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD
index 7a475a24d36b6afaab62cac943ebc960369bb4f2..0c351ee4eccee515ed34ec5e8607914f7064ffbf 100644
--- a/tensorflow/lite/experimental/examples/lstm/BUILD
+++ b/tensorflow/lite/experimental/examples/lstm/BUILD
@@ -23,10 +23,8 @@ py_test(
     srcs = ["unidirectional_sequence_lstm_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "manual",
         "no_oss",
         "no_pip",
-        "notap",
     ],
     deps = [
         ":tflite_lstm",
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
index 81ab6691df7389de900ca34f69ae122648e56d42..eeb48d123113c5924a74286ad1e0851eb484cdb8 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
@@ -19,8 +19,9 @@ import tempfile
 import numpy as np
 import tensorflow as tf
 
-from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
 from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
@@ -50,17 +51,17 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     # Batch size
     self.batch_size = 16
     # Lstm Units.
-    self.num_units = 64
+    self.num_units = 16
 
   def buildLstmLayer(self):
     return tf.nn.rnn_cell.MultiRNNCell([
         TFLiteLSTMCell(
             self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
-        TFLiteLSTMCell(self.num_units, num_proj=64, forget_bias=0, name="rnn2"),
+        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
         TFLiteLSTMCell(
             self.num_units // 2,
             use_peepholes=True,
-            num_proj=64,
+            num_proj=8,
             forget_bias=0,
             name="rnn3"),
         TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
@@ -150,7 +151,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
     with tf.Session() as sess:
       curr = sess.graph_def
-      curr = tf.lite.convert_op_hints_to_stubs(graph_def=curr)
+      curr = convert_op_hints_to_stubs(graph_def=curr)
 
     curr = optimize_for_inference_lib.optimize_for_inference(
         curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
@@ -189,7 +190,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
         x, output_class, new_sess)
 
     result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
-    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-3))
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
 
   def testDynamicRnnMultiRnnCell(self):
     sess = tf.Session(config=CONFIG)
@@ -219,7 +220,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
         x, output_class, new_sess)
 
     result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
-    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-3))
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 69022b611ed14adcef7dea54ec135988a7452823..07fb87641133edb5550844dd5920cf712f0fe262 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -32,7 +32,7 @@ tflite_micro_cc_test(
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_test",
+    name = "preprocessor_reference_test",
     srcs = [
         "no_30ms_sample_data.cc",
         "no_30ms_sample_data.h",
@@ -52,3 +52,25 @@ tflite_micro_cc_test(
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
+
+tflite_micro_cc_test(
+    name = "preprocessor_fixed_test",
+    srcs = [
+        "fixed_point/preprocessor.cc",
+        "no_30ms_sample_data.cc",
+        "no_30ms_sample_data.h",
+        "no_power_spectrum_data.cc",
+        "no_power_spectrum_data.h",
+        "preprocessor.h",
+        "preprocessor_test.cc",
+        "yes_30ms_sample_data.cc",
+        "yes_30ms_sample_data.h",
+        "yes_power_spectrum_data.cc",
+        "yes_power_spectrum_data.h",
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de60c982f3a062a6a1f32369f388f5ed3b10f6ac
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
@@ -0,0 +1,218 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the preprocessing pipeline, with the same
+// results as the audio tutorial at
+// https://www.tensorflow.org/tutorials/sequences/audio_recognition
+// This module takes 30ms of PCM-encoded signed 16-bit audio samples (at 16KHz,
+// so 480 values), and extracts a power spectrum of frequencies. There are 43
+// frequency bands in the result, derived from the original 256 output from the
+// discrete Fourier transform, and averaged together in groups of 6.
+// It's expected that most platforms will have optimized versions of the
+// functions used here, for example replacing the DFT with an FFT, so this
+// version shouldn't be used where performance is critical.
+// This implementation uses fixed point for any non-constant calculations,
+// instead of floating point, to help show how this can work on platforms that
+// don't have good float support.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+
+#include <cmath>
+
+namespace {
+
+// q format notation: qx.y => 1 sign bit, x-1 integer bits, y fraction bits.
+// Use standard (non-saturating) arithmetic with signed ints of size x+y bits.
+// Sacrifice some precision to avoid use of 64-bit ints.
+
+// q1.15 * q1.15 => q2.30
+inline int32_t Q1_15_FixedMultiply_Q2_30(int16_t a, int16_t b) {
+  int32_t big_a = a;
+  int32_t big_b = b;
+  return big_a * big_b;
+}
+
+// q2.30 * q2.30 => q10.22
+inline int32_t Q2_30_FixedMultiply_Q10_22(int32_t a, int32_t b) {
+  // q2.30 result
+  int32_t tmp = (a >> 15) * (b >> 15);
+  // q10.22 result
+  return tmp >> 8;
+}
+
+// q10.22 * q10.22 => q10.22
+// Will overflow if product is >= 512.
+// Largest product in small test set is 465.25
+inline int32_t Q10_22_FixedMultiply_Q10_22(int32_t a, int32_t b) {
+  // q10.22 result
+  return (a >> 11) * (b >> 11);
+}
+
+// float => q2.30
+// No checking for saturation.  Only used for inputs in range [-1, 1].
+inline int32_t FloatToFixed_Q2_30(float input) {
+  return static_cast<int32_t>(roundf(input * (1 << 30)));
+}
+
+// These constants allow us to allocate fixed-sized arrays on the stack for our
+// working memory.
+constexpr int kInputSize = 512;
+constexpr int kAverageWindowSize = 6;
+constexpr int kOutputSize =
+    ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+
+// Performs a discrete Fourier transform on the real inputs. This corresponds to
+// rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
+// and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
+// It takes in an array of float real values, and returns a result of the same
+// length with q10.22 fixed point real and imaginary components interleaved, so
+// fourier_output[0] is the first real value, fourier_output[1] is the first
+// imaginary, fourier_output[2] is the second real, and so on.
+// The calling function should ensure that the array passed in as fourier_output
+// is at least time_series_size in length. Most optimized FFT implementations
+// require the length to be a power of two as well, but this version doesn't
+// enforce that.
+
+// input: q2.30 fixed point.  output: q10.22 fixed point.
+// Outputs interpreted as q10.22 fixed point are un-scaled.
+void CalculateDiscreteFourierTransform(int32_t* time_series,
+                                       int time_series_size,
+                                       int32_t* fourier_output) {
+  for (int i = 0; i < time_series_size / 2; ++i) {
+    int32_t real = 0;
+    for (int j = 0; j < time_series_size; ++j) {
+      const int32_t real_scale =
+          FloatToFixed_Q2_30(cos(j * i * M_PI * 2 / time_series_size));
+      real += Q2_30_FixedMultiply_Q10_22(time_series[j], real_scale);
+    }
+    int32_t imaginary = 0;
+    for (int j = 0; j < time_series_size; ++j) {
+      const int32_t imaginary_scale =
+          FloatToFixed_Q2_30(sin(j * i * M_PI * 2 / time_series_size));
+      imaginary -= Q2_30_FixedMultiply_Q10_22(time_series[j], imaginary_scale);
+    }
+    fourier_output[(i * 2) + 0] = real;
+    fourier_output[(i * 2) + 1] = imaginary;
+  }
+}
+
+// Produces a simple sine curve that is used to ensure frequencies at the center
+// of the current sample window are weighted more heavily than those at the end.
+// q1.15 output format.
+void CalculatePeriodicHann(int window_length, int16_t* window_function) {
+  for (int i = 0; i < window_length; ++i) {
+    const float real_value = (0.5 - 0.5 * cos((2 * M_PI * i) / window_length));
+    int tmp = static_cast<int32_t>(roundf(real_value * (1 << 15)));
+    // Saturate the 0x8000 value to 0x7fff
+    if (tmp > 0x7fff) tmp = 0x7fff;
+    window_function[i] = tmp;
+  }
+}
+
+}  // namespace
+
+TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                        const int16_t* input, int input_size, int output_size,
+                        uint8_t* output) {
+  // Ensure our input and output data arrays are valid.
+  if (input_size > kInputSize) {
+    error_reporter->Report("Input size %d larger than %d", input_size,
+                           kInputSize);
+    return kTfLiteError;
+  }
+  if (output_size != kOutputSize) {
+    error_reporter->Report("Requested output size %d doesn't match %d",
+                           output_size, kOutputSize);
+    return kTfLiteError;
+  }
+
+  // Pre-calculate the window function we'll be applying to the input data.
+  // In a real application, we'd calculate this table once in an initialization
+  // function and store it for repeated reuse.
+  // q1.15 format.
+  int16_t window_function[kInputSize];
+  CalculatePeriodicHann(input_size, window_function);
+
+  // Apply the window function to our time series input, and pad it with zeroes
+  // to the next power of two.
+  int32_t fixed_input[kInputSize];
+  for (int i = 0; i < kInputSize; ++i) {
+    if (i < input_size) {
+      // input is int16_t.  Treat as q1.15 fixed point value in range [-1,1)
+      // window_function is also q1.15 fixed point number
+      fixed_input[i] =
+          Q1_15_FixedMultiply_Q2_30(input[i], window_function[i]);
+    } else {
+      fixed_input[i] = 0;
+    }
+  }
+
+  // Pull the frequency data from the time series sample.
+  // Calculated in q10.22 format from q2.30 inputs.
+  int32_t fourier_values[kInputSize];
+  CalculateDiscreteFourierTransform(fixed_input, kInputSize, fourier_values);
+
+  // We have the complex numbers giving us information about each frequency
+  // band, but all we want to know is how strong each frequency is, so calculate
+  // the squared magnitude by adding together the squares of each component.
+  int32_t power_spectrum[kInputSize / 2];
+  for (int i = 0; i < (kInputSize / 2); ++i) {
+    const int32_t real = fourier_values[(i * 2) + 0];
+    const int32_t imaginary = fourier_values[(i * 2) + 1];
+    // q10.22 results
+    power_spectrum[i] =
+        Q10_22_FixedMultiply_Q10_22(real, real) +
+        Q10_22_FixedMultiply_Q10_22(imaginary, imaginary);
+  }
+
+  // Finally, reduce the size of the output by averaging together six adjacent
+  // frequencies into each slot, producing an array of 43 values.
+  // Power_spectrum numbers are q10.22.  Divide by kAverageWindowSize inside
+  // loop to prevent overflow.
+  for (int i = 0; i < kOutputSize; ++i) {
+    int32_t average = 0;
+    for (int j = 0; j < kAverageWindowSize; ++j) {
+      const int index = (i * kAverageWindowSize) + j;
+      if (index < (kInputSize / 2)) {
+        average += power_spectrum[index] / kAverageWindowSize;
+      }
+    }
+    // Quantize the result into eight bits, effectively multiplying by two.
+    // The 127.5 constant here has to match the features_max value defined in
+    // tensorflow/examples/speech_commands/input_data.py, and this also assumes
+    // that features_min is zero.
+    //
+    // q10.22 input
+    // integer output
+    //
+    // output = (input - features_min) *
+    //     (output_max - output_min) / (features_max - features_min)
+    // == (input) * (255) / (127.5)
+    // == input * 2
+    // == input << 1
+    // Also want to round to nearest integer and only keep integer bits
+    // => ((input << 1) + 0x200000) >> 22
+    // == (input + 0x100000) >> 21
+    int32_t quantized_average = (average + 0x100000) >> 21;
+    if (quantized_average < 0) {
+      quantized_average = 0;
+    }
+    if (quantized_average > 255) {
+      quantized_average = 255;
+    }
+    output[i] = quantized_average;
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/kernels/test_utils.h b/tensorflow/lite/experimental/micro/kernels/test_utils.h
index 4207c609812f16b3051ca3871b167d557fb8a3c0..95f2d8a9d217a1b1f23c0198ddce5156e1c6cb36 100644
--- a/tensorflow/lite/experimental/micro/kernels/test_utils.h
+++ b/tensorflow/lite/experimental/micro/kernels/test_utils.h
@@ -89,7 +89,7 @@ inline void PopulateContext(TfLiteTensor* tensors, int tensors_size,
   context->ReportError = ReportOpError;
   context->AddTensors = nullptr;
   context->GetNodeAndRegistration = nullptr;
-  context->ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context->ReplaceNodeSubsetsWithDelegateKernels = nullptr;
   context->recommended_num_threads = 1;
   context->GetExternalContext = nullptr;
   context->SetExternalContext = nullptr;
diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc
index e0460c5d3e5cd800c34345b210e0c20cb8bd0d28..f1c236fb62f002fc17b06852d09c8675f4ccb755 100644
--- a/tensorflow/lite/experimental/micro/micro_interpreter.cc
+++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc
@@ -149,7 +149,7 @@ MicroInterpreter::MicroInterpreter(const Model* model,
   context_.ReportError = ReportOpError;
   context_.AddTensors = nullptr;
   context_.GetNodeAndRegistration = nullptr;
-  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.ReplaceNodeSubsetsWithDelegateKernels = nullptr;
   context_.recommended_num_threads = 1;
   context_.GetExternalContext = nullptr;
   context_.SetExternalContext = nullptr;
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 5492003e5af2f33dc907c4a6ba1674a78fcd1557..0caf0ca099e0520f90530b02f9a95efbe6e3d299 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -62,12 +62,19 @@ tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
 # Test binary for the microcontroller speech model.
 PREPROCESSOR_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
 
+PREPROCESSOR_REFERENCE_TEST_SRCS = \
+$(PREPROCESSOR_TEST_SRCS) \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+
+PREPROCESSOR_FIXED_TEST_SRCS += \
+$(PREPROCESSOR_TEST_SRCS) \
+tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+
 MICROLITE_TEST_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*test.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc)
@@ -91,7 +98,8 @@ include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
 
 ALL_SRCS := \
 	$(MICRO_SPEECH_TEST_SRCS) \
-	$(PREPROCESSOR_TEST_SRCS) \
+	$(PREPROCESSOR_REFERENCE_TEST_SRCS) \
+	$(PREPROCESSOR_FIXED_TEST_SRCS) \
 	$(MICROLITE_CC_SRCS) \
 	$(MICROLITE_TEST_SRCS)
 
@@ -104,7 +112,8 @@ LIBDIR := $(GENDIR)lib/
 MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
 
 MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-PREPROCESSOR_TEST_BINARY := $(BINDIR)preprocessor_test
+PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
+PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
 
 CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
 CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
@@ -113,8 +122,11 @@ AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
 MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
 
-PREPROCESSOR_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_TEST_SRCS))))
+PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
+
+PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
 
 MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
@@ -158,18 +170,29 @@ micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
 test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
 	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
 
-$(PREPROCESSOR_TEST_BINARY): $(PREPROCESSOR_TEST_OBJS) $(MICROLITE_LIB_PATH)
+$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_TEST_BINARY) $(PREPROCESSOR_TEST_OBJS) \
+	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
 	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
 
-preprocessor_test: $(PREPROCESSOR_TEST_BINARY)
-preprocessor_test_bin: $(PREPROCESSOR_TEST_BINARY).bin
+preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
+
+test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
 
-test_preprocessor: $(PREPROCESSOR_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
+preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
 
+test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
 
 $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 	@mkdir -p $(dir $@)
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..8ea78e8f3e3db75f86ce39e6adf9b82ff4080ff1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc
@@ -0,0 +1,9 @@
+# Settings for x86 on Linux
+ifeq ($(TARGET), linux)
+  ifeq ($(TARGET_ARCH), x86_64)
+    PLATFORM_FLAGS = \
+      -DTF_LITE_DISABLE_X86_NEON
+    CXXFLAGS += $(PLATFORM_FLAGS)
+    CCFLAGS += $(PLATFORM_FLAGS)
+  endif
+endif
diff --git a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
index 020d40bc13c1bbd3d950bd505508ef654ac756ec..561f5f7a50e0207ab64fd06211e94e406208e894 100644
--- a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
+++ b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
@@ -110,7 +110,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
           left_context=1,
           right_context=1)
       self.assertAllEqual(
-          filterbanks.eval(),
+          self.evaluate(filterbanks),
           [[479, 425, 479, 425, 436, 378], [479, 425, 436, 378, 410, 350],
            [436, 378, 410, 350, 391, 325], [410, 350, 391, 325, 391, 325]])
 
@@ -153,7 +153,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
           frame_stride=3,
           zero_padding=True)
       self.assertAllEqual(
-          filterbanks.eval(),
+          self.evaluate(filterbanks),
           [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
            [374, 308, 362, 292, 352, 275]])
 
diff --git a/tensorflow/lite/experimental/writer/BUILD b/tensorflow/lite/experimental/writer/BUILD
index 506c668cf2c70f1e294bcf2039fbb88ec9c4fd96..57ce63636714aa616cb50e04fe2c15210cc2eb1c 100644
--- a/tensorflow/lite/experimental/writer/BUILD
+++ b/tensorflow/lite/experimental/writer/BUILD
@@ -1,6 +1,9 @@
-package(default_visibility = [
-    "//visibility:public",
-])
+package(
+    default_visibility = [
+        "//visibility:public",
+    ],
+    features = ["-parse_headers"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index 036809e94abcfc20df315b973c855152f923181b..b44750e8b21e1bcfcde6176052fc0e18b2e66122 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -66,6 +66,8 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteFakeQuantParams",
                                       "TfLitePackParams",
                                       "TfLiteOneHotParams",
+                                      "TfLiteLeakyReluParams",
+                                      "TfLiteMirrorPaddingParams",
                                       nullptr};
 }  // namespace
 
@@ -152,6 +154,7 @@ class OpOptionData {
     op_to_option_["BIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
+    op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
     // Manually specified mappings between ops and options (none)
     op_to_option_["EMBEDDING_LOOKUP"] =
         "";  // TODO(aselle): maybe something else.
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index ab0d186848fcac1f11ddfe3b55e4ffa2292b8395..a51c7a667f355f04e272d9868f996225444557fb 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -3,11 +3,11 @@ upper_tabs:
 - include: /_upper_tabs_left.yaml
 - include: /api_docs/_upper_tabs_api.yaml
 # Dropdown menu
-- name: Ecosystem
-  path: /ecosystem
+- name: Resources
+  path: /resources
   is_default: true
   menu:
-  - include: /ecosystem/_menu_toc.yaml
+  - include: /resources/_menu_toc.yaml
   lower_tabs:
     # Subsite tabs
     other:
diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml
index 43b5e3cfc01b458718bbe1f3ae7d18b27a81fa6e..1b3f1d616ae953e3c6a659301d7a7dd6860dcbf2 100644
--- a/tensorflow/lite/g3doc/_index.yaml
+++ b/tensorflow/lite/g3doc/_index.yaml
@@ -182,35 +182,41 @@ landing_page:
     background: grey
     heading: Updates
     items:
-    - heading: Introducing the Model Optimization Toolkit
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
+    - heading: "AI in motion: react in the real world"
+      image_path: ./images/landing-page/ai_in_motion.png
+      path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
+      buttons:
+      - label: Read more
+        path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
+    - heading: "Introducing the Model Optimization Toolkit"
+      image_path: /resources/images/tf-logo-card-16x9.png
       path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
       buttons:
       - label: Read on TensorFlow blog
         path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
-    - heading: East Africa Cassava App
+    - heading: "East Africa Cassava App"
       image_path: ./images/landing-page/detect_crop_disease_in_africa.png
       path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
       buttons:
       - label: Read more
         path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
-    - heading: Using TensorFlow Lite on Android
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
-      path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
-      buttons:
-      - label: Read on TensorFlow blog
-        path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
 
   - classname: devsite-landing-row-cards
     background: grey
     items:
-    - heading: TensorFlow Lite at the Dev Summit
+    - heading: "Using TensorFlow Lite on Android"
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+      buttons:
+      - label: Read on TensorFlow blog
+        path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+    - heading: "TensorFlow Lite at the Dev Summit"
       youtube_id: FAMfy7izB6A
       buttons:
       - label: Watch the video
         path: https://www.youtube.com/watch?v=FAMfy7izB6A
-    - heading: TensorFlow Lite on GitHub
-      image_path: /ecosystem/images/github-card-16x9.png
+    - heading: "TensorFlow Lite on GitHub"
+      image_path: /resources/images/github-card-16x9.png
       path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite
       buttons:
       - label: View on GitHub
diff --git a/tensorflow/lite/g3doc/convert/cmdline_examples.md b/tensorflow/lite/g3doc/convert/cmdline_examples.md
index 341f7120fc8ba2bd8520cbf6c5dc55980f5eeaea..59f26b35051ce2ec410e25a5c877344ffe96dc45 100644
--- a/tensorflow/lite/g3doc/convert/cmdline_examples.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_examples.md
@@ -95,11 +95,10 @@ tflite_convert \
 
 The TensorFlow Lite Converter is compatible with fixed point quantization models
 described [here](https://www.tensorflow.org/performance/quantization). These are
-float models with
-[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
-ops inserted at the boundaries of fused layers to record min-max range
-information. This generates a quantized inference workload that reproduces the
-quantization behavior that was used during training.
+float models with `FakeQuant*` ops inserted at the boundaries of fused layers
+to record min-max range information. This generates a quantized inference
+workload that reproduces the quantization behavior that was used during
+training.
 
 The following command generates a quantized TensorFlow Lite FlatBuffer from a
 "quantized" TensorFlow GraphDef.
diff --git a/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png b/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png
new file mode 100644
index 0000000000000000000000000000000000000000..b8eedce7eaeb0f0440d7c36a243cfd729c3699d0
Binary files /dev/null and b/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png differ
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
index b0dfb0fed1f7a072487a06c11bddf5545911ffdf..2864c6aaf438b54ac0ef4ec08a2722e1c6738c38 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -1,4 +1,3 @@
-
 # TensorFlow Lite & TensorFlow Compatibility Guide
 
 TensorFlow Lite supports a number of TensorFlow operations used in common
@@ -75,6 +74,7 @@ counterparts:
     0D tensor*
 *   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze) - *as
     long as axis is not provided*
+*   [tf.squared_difference](https://www.tensorflow.org/versions/master/api_docs/python/tf/squared_difference)
 *   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice) -
     *as long as ellipsis_mask and new_axis_mask are not used*
 *   [tf.transpose](https://www.tensorflow.org/versions/master/api_docs/python/tf/transpose) -
@@ -154,6 +154,30 @@ Options {
 }
 ```
 
+**ARG_MAX**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: A tensor of indices of maximum values.
+}
+```
+
+**ARG_MIN**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: A tensor of indices of minium values.
+}
+```
+
 **AVERAGE_POOL_2D**
 
 ```
@@ -280,6 +304,18 @@ Outputs {
 }
 ```
 
+**FILL**
+
+```
+Inputs {
+  0: a 1D tensor
+  1: a 0D (scalar) tensor
+}
+Outputs {
+  0: A tensor of shape `tensor 0` filled with the value in `tensor 1`.
+}
+```
+
 **FLOOR**
 
 ```
@@ -291,6 +327,30 @@ outputs: {
 }
 ```
 
+**FLOOR_DIV**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: result of computing element-wise floor of `tensor 0` divided by `tensor 1`.
+}
+```
+
+**FLOOR_MOD**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: result of computing element-wise floor of `tensor 0` modulo `tensor 1`.
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -378,6 +438,34 @@ Options {
 }
 ```
 
+**LEAKY_RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(input, input * alpha)
+}
+Options {
+  alpha: slope of the activation at x < 0 (provided alpha <= 1)
+}
+```
+
+**LEAKY_RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(input, input * alpha)
+}
+Options {
+  alpha
+}
+```
+
 **LESS**
 
 ```
@@ -421,6 +509,18 @@ Options {
 }
 ```
 
+**LOGICAL_OR**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: a list of tensors.
+}
+Outputs {
+  0: A tensor of logical_or output tensors.
+}
+```
+
 **LOGISTIC**
 
 ```
@@ -498,6 +598,18 @@ Outputs {
 }
 ```
 
+**PACK**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: an integer.
+}
+Outputs {
+  0: A tensor of stacked tensors.
+}
+```
+
 **PAD**
 
 ```
@@ -539,6 +651,35 @@ Outputs {
 }
 ```
 
+**POW**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: elementwise pow of the input tensors
+}
+```
+
+**RANGE**
+
+```
+Inputs {
+  0: a 0D (scalar) tensor
+  1: a 0D (scalar) tensor
+  2: a 0D (scalar) tensor
+}
+Outputs {
+  0: A 1D tensor of type `dtype` defined by a sequence where `tensor 0` is the
+  start, `tensor 1` is the limit, and `tensor 2` is the delta.
+}
+Options {
+  dtype
+}
+```
+
 **RELU**
 
 ```
@@ -587,6 +728,22 @@ Options {
 }
 ```
 
+**RESIZE_NEAREST_NEIGHBOR**
+
+```
+Inputs {
+  0: a 4D tensor
+  1: a 1D tensor with 2 elements
+}
+Outputs {
+  0: A tensor of type `tensor 0` resized according to `tensor 1` heigh/width values
+  using nearest neighbors interpolation.
+}
+Options {
+  align_corners
+}
+```
+
 **RSQRT**
 
 ```
@@ -781,66 +938,6 @@ Outputs {
 }
 ```
 
-**POW**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: elementwise pow of the input tensors
-}
-```
-
-**ARG_MAX**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: A tensor of indices of maximum values.
-}
-```
-
-**ARG_MIN**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: A tensor of indices of minium values.
-}
-```
-
-**PACK**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: an integer.
-}
-Outputs {
-  0: A tensor of stacked tensors.
-}
-```
-
-**LOGICAL_OR**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: a list of tensors.
-}
-Outputs {
-  0: A tensor of logical_or output tensors.
-}
-```
-
 **UNPACK**
 
 ```
@@ -854,18 +951,6 @@ Outputs {
 }
 ```
 
-**FLOOR_DIV**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: a list of tensors.
-}
-Outputs {
-  0: A tensor of floor_div output tensors.
-}
-```
-
 **ZEROS_LIKE**
 
 ```
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa51f58baa4ecf01fbe75d2ce9095bb1a5286ae8
--- /dev/null
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -0,0 +1,249 @@
+# [Experimental] Using TensorFlow Lite with select TensorFlow ops
+
+The TensorFlow Lite builtin op library has grown rapidly, and will continue to
+grow, but there remains a long tail of TensorFlow ops that are not yet natively
+supported by TensorFlow Lite . These unsupported ops can be a point of friction
+in the TensorFlow Lite model conversion process. To that end, the team has
+recently been working on an experimental mechanism for reducing this friction.
+
+This document outlines how to use TensorFlow Lite with select TensorFlow ops.
+*Note that this feature is experimental and is under active development.* As you
+use this feature, keep in mind the [known limitations](#known-limitations), and
+please send feedback about models that work and issues you are facing to
+tflite@tensorflow.org.
+
+TensorFlow Lite will continue to have
+[TensorFlow Lite builtin ops](tf_ops_compatibility.md) optimized for mobile and
+embedded devices. However, TensorFlow Lite models can now use a subset of
+TensorFlow ops when TFLite builtin ops are not sufficient.
+
+Models converted with TensorFlow ops will require a TensorFlow Lite interpreter
+that has a larger binary size than the interpreter with only TFLite builtin ops.
+Additionally, performance optimizations will not be available for any TensorFlow
+ops in the TensorFlow Lite model.
+
+This document outlines how to [convert](#converting-the-model) and
+[run](#running-the-model) a TFLite model with TensorFlow ops on your platform of
+choice. It also discusses some [known limitations](#known-limitations), the
+[future plans](#future-plans) for this feature, and basic
+[performance and size metrics](#metrics).
+
+## Converting the model
+
+To convert a TensorFlow model to a TensorFlow Lite model with TensorFlow ops,
+use the `target_ops` argument in the
+[TensorFlow Lite converter](https://www.tensorflow.org/lite/convert/). The
+following values are valid options for `target_ops`:
+
+*   `TFLITE_BUILTINS` - Converts models using TensorFlow Lite builtin ops.
+*   `SELECT_TF_OPS` - Converts models using TensorFlow ops. The exact subset of
+    supported ops can be found in the whitelist at
+    `lite/toco/tflite/whitelisted_flex_ops.cc`.
+
+The recommended approach is to convert the model with `TFLITE_BUILTINS`, then
+with both `TFLITE_BUILTINS,SELECT_TF_OPS`, and finally with only
+`SELECT_TF_OPS`. Using both options (i.e. `TFLITE_BUILTINS,SELECT_TF_OPS`)
+creates models with TensorFlow Lite ops where possible. Using only
+`SELECT_TF_OPS` is useful when the model contains TensorFlow ops that are only
+partially supported by TensorFlow Lite, and one would like to avoid those
+limitations.
+
+The following example shows how to use `target_ops` in the
+[`TFLiteConverter`](https://www.tensorflow.org/lite/convert/python_api) Python
+API.
+
+```
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.target_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
+                        tf.lite.OpsSet.SELECT_TF_OPS]
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+The following example shows how to use `target_ops` in the
+[`tflite_convert`](https://www.tensorflow.org/lite/convert/cmdline_examples)
+command line tool.
+
+```
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/foo.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --target_ops=TFLITE_BUILTINS,SELECT_TF_OPS
+```
+
+When building and running `tflite_convert` directly with `bazel`, please pass
+`--define=with_select_tf_ops=true` as an additional argument.
+
+```
+bazel run --define=with_select_tf_ops=true tflite_convert -- \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/foo.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --target_ops=TFLITE_BUILTINS,SELECT_TF_OPS
+```
+
+## Running the model
+
+When using a TensorFlow Lite model that has been converted with support for
+select TensorFlow ops, the client must also use a TensorFlow Lite runtime that
+includes the necessary library of TensorFlow ops.
+
+### Android AAR
+
+A new Android AAR target with select TensorFlow ops has been added for
+convenience. Assuming a <a href="./demo_android.md">working TensorFlow Lite
+build environment</a>, build the Android AAR with select TensorFlow ops as
+follows:
+
+```sh
+bazel build --cxxopt='--std=c++11' -c opt             \
+  --config=android_arm --config=monolithic          \
+  //tensorflow/lite/java:tensorflow-lite-with-select-tf-ops
+```
+
+This will generate an AAR file in `bazel-genfiles/tensorflow/lite/java/`. From
+there, you can either import the AAR directly into your project, or publish the
+custom AAR to your local Maven repository:
+
+```sh
+mvn install:install-file \
+  -Dfile=bazel-genfiles/tensorflow/lite/java/tensorflow-lite-with-select-tf-ops.aar \
+  -DgroupId=org.tensorflow \
+  -DartifactId=tensorflow-lite-with-select-tf-ops -Dversion=0.1.100 -Dpackaging=aar
+```
+
+Finally, in your app's `build.gradle`, ensure you have the `mavenLocal()`
+dependency and replace the standard TensorFlow Lite dependency with the one that
+has support for select TensorFlow ops:
+
+```
+allprojects {
+    repositories {
+        jcenter()
+        mavenLocal()
+    }
+}
+
+dependencies {
+    compile 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
+}
+```
+
+### iOS
+
+With XCode Command Line Tools installed, TensorFlow Lite with select TensorFlow
+ops support can be built with the following command:
+
+```sh
+tensorflow/contrib/makefile/build_all_ios_with_tflite.sh
+```
+
+This will generate the required static linking libraries in the
+`tensorflow/contrib/makefile/gen/lib/` directory.
+
+The TensorFlow Lite camera example app can be used to test this. A new
+TensorFlow Lite XCode project with support for select TensorFlow ops has been
+added to
+`tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj`.
+
+To use this feature in a your own project, either clone the example project or
+set the project settings for a new or existing project to the following:
+
+*   In Build Phases -> Link Binary With Libraries, add the static libraries
+    under `tensorflow/contrib/makefile/gen/lib/` directory:
+    *   `libtensorflow-lite.a`
+    *   `libprotobuf.a`
+    *   `nsync.a`
+*   In Build Settings -> Header Search Paths, add the following directories:
+    *   `tensorflow/lite/`
+    *   `tensorflow/contrib/makefile/downloads/flatbuffer/include`
+    *   `tensorflow/contrib/makefile/downloads/eigen`
+*   In Build Settings -> Other Linker Flags, add `-force_load
+    tensorflow/contrib/makefile/gen/lib/libtensorflow-lite.a`.
+
+A CocoaPod with support for select TensorFlow ops will also be released in the
+future.
+
+### C++
+
+When building TensorFlow Lite libraries using the bazel pipeline, the additional
+TensorFlow ops library can be included and enabled as follows:
+
+*   Enable monolithic builds if necessary by adding the `--config=monolithic`
+    build flag.
+*   Do one of the following:
+    *   Include the `--define=with_select_tf_ops=true` build flag in the `bazel
+        build` invocation when building TensorFlow Lite.
+    *   Add the TensorFlow ops delegate library dependency to the build
+        dependencies: `tensorflow/lite/delegates/flex:delegate`.
+
+Note that the necessary `TfLiteDelegate` will be installed automatically when
+creating the interpreter at runtime as long as the delegate is linked into the
+client library. It is not necessary to explicitly install the delegate instance
+as is typically required with other delegate types.
+
+### Python pip Package
+
+Python support is actively under development.
+
+## Metrics
+
+### Performance
+
+When using a mixture of both builtin and select TensorFlow ops, all of the same
+TensorFlow Lite optimizations and optimized builtin kernels will be be available
+and usable with the converted model. For the TensorFlow ops, performance should
+generally be comparable to that of
+[TensorFlow Mobile](https://www.tensorflow.org/lite/tfmobile/).
+
+The following table describes the average time taken to run inference on
+MobileNet on a Pixel 2. The listed times are an average of 100 runs. These
+targets were built for Android using the flags: `--config=android_arm64 -c opt`.
+
+Build                                | Time (milliseconds)
+------------------------------------ | -------------------
+Only built-in ops (`TFLITE_BUILTIN`) | 260.7
+Using only TF ops (`SELECT_TF_OPS`)  | 264.5
+
+### Binary Size
+
+The following table describes the binary size of TensorFlow Lite for each build.
+These targets were built for Android using `--config=android_arm -c opt`.
+
+Build                 | C++ Binary Size | Android APK Size
+--------------------- | --------------- | ----------------
+Only built-in ops     | 796 KB          | 561 KB
+Built-in ops + TF ops | 23.0 MB         | 8.0 MB
+
+## Known Limitations
+
+The following is a list of some of the known limitations:
+
+*   Control flow ops are not yet supported.
+*   The
+    [`post_training_quantization`](https://www.tensorflow.org/performance/post_training_quantization)
+    flag is currently not supported for TensorFlow ops so it will not quantize
+    weights for any TensorFlow ops. In models with both TensorFlow Lite builtin
+    ops and TensorFlow ops, the weights for the builtin ops will be quantized.
+*   Ops that require explicit initialization from resources, like HashTableV2,
+    are not yet supported.
+*   Certain TensorFlow ops may not support the full set of input/output types
+    that are typically available on stock TensorFlow.
+
+## Future Plans
+
+The following is a list of improvements to this pipeline that are in progress:
+
+*   *Selective registration* - There is work being done to make it simple to
+    generate TFLite interpreter binaries that only contain the TensorFlow ops
+    required for a particular set of models.
+*   *Improved usability* - The conversion process will be simplified to only
+    require a single pass through the converter. Additionally, pre-built Android
+    AAR and iOS CocoaPod binaries will be provided.
+*   *Improved performance* - There is work being done to ensure TensorFlow Lite
+    with TensorFlow ops has performance parity to TensorFlow Mobile.
diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc
index cdbe66a3a4fa906d64d8598715c4a1a0eabd24cb..1cec0d0c290679c7755cbf84858317489c0ba159 100644
--- a/tensorflow/lite/graph_info.cc
+++ b/tensorflow/lite/graph_info.cc
@@ -41,31 +41,31 @@ class TfLiteIntArrayView {
   const TfLiteIntArray* int_array_;
 };
 
-// Helper class that actually performs partitioning by subgraph.
-// Outputs to a provided `subgraphs` structure.
+// Helper class that actually performs partitioning by node sub set.
+// Outputs to a provided `NodeSubset` structure.
 //
 // Example usage:
-// PartitionGraphIntoIndependentSubgraphsImpl partitioner(
-//     info, nodes_to_part, subgraphs);
+// PartitionGraphIntoIndependentNodeSubsetsImpl partitioner(
+//     info, nodes_to_part, node_subsets);
 // partitioner.Partition();
-class PartitionGraphIntoIndependentSubgraphsImpl {
+class PartitionGraphIntoIndependentNodeSubsetsImpl {
  public:
-  PartitionGraphIntoIndependentSubgraphsImpl(
+  PartitionGraphIntoIndependentNodeSubsetsImpl(
       const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
-      std::vector<Subgraph>* subgraphs)
+      std::vector<NodeSubset>* node_subsets)
       : info_(info),
-        subgraphs_(subgraphs),
-        node_type_(info->num_nodes(), Subgraph::kTfNonPartition) {
+        node_subsets_(node_subsets),
+        node_type_(info->num_nodes(), NodeSubset::kTfNonPartition) {
     // Populate the node_type_ map.
     for (auto node_index : TfLiteIntArrayView(nodes_to_partition)) {
-      node_type_[node_index] = Subgraph::kTfPartition;
+      node_type_[node_index] = NodeSubset::kTfPartition;
     }
   }
 
   // Actually partition the graph.
   void Partition() {
     // Initialize here to make Partition() re-entrant.
-    subgraphs_->clear();
+    node_subsets_->clear();
     tensor_epochs_.clear();
     tensor_epochs_.resize(info_->num_tensors(), kEpochAlwaysReady);
     node_epochs_.clear();
@@ -80,35 +80,35 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
     }
 
     // Do a graph traversal where each iteration in the loop is an epoch
-    // that corresponds to a subgraph that only contains nodes that are of
+    // that corresponds to a node sub set that only contains nodes that are of
     // the same node_type_.
     while (true) {
-      BuildSubgraph();
-      if (subgraphs_->back().nodes.empty()) {
-        subgraphs_->pop_back();
+      BuildNodeSubset();
+      if (node_subsets_->back().nodes.empty()) {
+        node_subsets_->pop_back();
         break;
       }
     }
 
-    // Mark model outputs as subgraph outputs. All the rest have already been
-    // identified.
+    // Mark model outputs as node sub set outputs. All the rest have already
+    // been identified.
     for (int output_index : info_->outputs()) {
       int output_epoch = tensor_epochs_[output_index];
-      Subgraph& output_subgraph = (*subgraphs_)[output_epoch];
-      output_subgraph.output_tensors.push_back(output_index);
+      NodeSubset& output_subset = (*node_subsets_)[output_epoch];
+      output_subset.output_tensors.push_back(output_index);
     }
-    // Make sure every subgraph's inputs and outputs are unique. Since the
+    // Make sure every node sub set's inputs and outputs are unique. Since the
     // list of inputs and outputs is generated in a way that produces
     // duplicates.
-    for (Subgraph& subgraph : *subgraphs_) {
+    for (NodeSubset& node_subset : *node_subsets_) {
       // Sort and uniquefy using standard library algorithms.
       auto uniquefy = [](std::vector<int>* items) {
         std::sort(items->begin(), items->end());
         auto last = std::unique(items->begin(), items->end());
         items->erase(last, items->end());
       };
-      uniquefy(&subgraph.input_tensors);
-      uniquefy(&subgraph.output_tensors);
+      uniquefy(&node_subset.input_tensors);
+      uniquefy(&node_subset.output_tensors);
     }
   }
 
@@ -129,14 +129,14 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
   // epoch since the epoch's node_type doesn't match.
   bool UpdateNode(int node_index) {
     const TfLiteNode& node = info_->node(node_index);
-    Subgraph& current_subgraph = subgraphs_->back();
-    int current_epoch = subgraphs_->size() - 1;
+    NodeSubset& current_subset = node_subsets_->back();
+    int current_epoch = node_subsets_->size() - 1;
     // Check if node is already done.
     if (node_epochs_[node_index] != kEpochNotReady) {
       return false;
     }
     // See if all dependencies of this node are already assigned to a
-    // subgraph.
+    // node sub set.
     for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
       if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
         return false;
@@ -144,16 +144,16 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
     }
     // When we are starting a new epoch, the first ready node defines
     // the type of that epoch.
-    if (current_subgraph.type == Subgraph::kTfUnexplored) {
-      current_subgraph.type = node_type_[node_index];
+    if (current_subset.type == NodeSubset::kTfUnexplored) {
+      current_subset.type = node_type_[node_index];
     }
     // The node gets assigned to this epoch if it is the same type as
     // the epoch's assigned type. Note, if this is the current ready
     // node encountered during this epoch, this condition will be
     // automatically true.
-    if (current_subgraph.type == node_type_[node_index]) {
+    if (current_subset.type == node_type_[node_index]) {
       node_epochs_[node_index] = current_epoch;
-      current_subgraph.nodes.push_back(node_index);
+      current_subset.nodes.push_back(node_index);
       // All outputs of this node now are assigned to this epoch as
       // well.
       for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
@@ -165,13 +165,13 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
         int input_epoch = tensor_epochs_[input_tensor_index];
         int node_epoch = current_epoch;
         if (input_epoch != node_epoch) {
-          current_subgraph.input_tensors.push_back(input_tensor_index);
-          // Set inputs to be outputs of the subgraph where they reside.
+          current_subset.input_tensors.push_back(input_tensor_index);
+          // Set inputs to be outputs of the node sub set where they reside.
           // the if condition makes sure inputs to the whole computation
           // are not included (i.e. those initialized to -2 above).
           if (input_epoch >= 0) {
-            Subgraph& input_subgraph = (*subgraphs_)[input_epoch];
-            input_subgraph.output_tensors.push_back(input_tensor_index);
+            NodeSubset& input_subset = (*node_subsets_)[input_epoch];
+            input_subset.output_tensors.push_back(input_tensor_index);
           }
         }
       }
@@ -181,9 +181,9 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
     }
   }
 
-  // Completely populates the current subgraph by doing graph traversal
-  void BuildSubgraph() {
-    subgraphs_->emplace_back(Subgraph());
+  // Completely populates the current node_subset by doing graph traversal
+  void BuildNodeSubset() {
+    node_subsets_->emplace_back(NodeSubset());
     // loop until no more nodes can be updated.
     while (true) {
       bool did_something = false;
@@ -198,9 +198,9 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
 
   // Temporary data needed for partitioning.
   const GraphInfo* info_;
-  // List of subgraphs to populate
-  std::vector<Subgraph>* subgraphs_;
-  std::vector<Subgraph::Type> node_type_;
+  // List of node_subsets to populate
+  std::vector<NodeSubset>* node_subsets_;
+  std::vector<NodeSubset::Type> node_type_;
   // Maps from tensor index to the epoch in which it is assigned. Also special
   // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it
   // is an input or constant.
@@ -212,11 +212,11 @@ class PartitionGraphIntoIndependentSubgraphsImpl {
 
 }  // namespace
 
-TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
+TfLiteStatus PartitionGraphIntoIndependentNodeSubsets(
     const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
-    std::vector<Subgraph>* subgraphs) {
-  PartitionGraphIntoIndependentSubgraphsImpl(info, nodes_to_partition,
-                                             subgraphs)
+    std::vector<NodeSubset>* node_subsets) {
+  PartitionGraphIntoIndependentNodeSubsetsImpl(info, nodes_to_partition,
+                                               node_subsets)
       .Partition();
   return kTfLiteOk;
 }
diff --git a/tensorflow/lite/graph_info.h b/tensorflow/lite/graph_info.h
index ff7ce669aceccf4295af499c3bfbb2182a672384..4da696c132e27ce4a57fccd7935c78dd015e6850 100644
--- a/tensorflow/lite/graph_info.h
+++ b/tensorflow/lite/graph_info.h
@@ -51,31 +51,32 @@ class GraphInfo {
   virtual const std::vector<int>& variables() const = 0;
 };
 
-// Represents a subgraph of a TensorFlow Lite graph.
-struct Subgraph {
+// Represents a subset of nodes in a TensorFlow Lite graph.
+struct NodeSubset {
   enum Type {
     kTfUnexplored = 0,  // temporarily used during creation
     kTfPartition,
     kTfNonPartition
   };
   Type type = kTfUnexplored;
-  // Nodes within the subgraph
+  // Nodes within the node sub set
   std::vector<int> nodes;
-  // Tensors that stride output from another subgraph that this depends on,
+  // Tensors that stride output from another node sub set that this depends on,
   // or global inputs to the TensorFlow Lite full graph.
   std::vector<int> input_tensors;
-  // Outputs that are consumed by other subgraphs or are global output tensors.
-  // All output tensors of the nodes in the subgraph that do not appear in this
-  // list are intermediate results that can be potentially elided.
+  // Outputs that are consumed by other node sub sets or are global output
+  // tensors. All output tensors of the nodes in the node sub set that do not
+  // appear in this list are intermediate results that can be potentially
+  // elided.
   std::vector<int> output_tensors;
 };
 
-// Partitions a list of node indices `nodes_to_partition` into subgraphs.
-// Each subgraph is in dependency order (i.e. all members of the subgraph).
-// `subgraphs` is assumed to be empty.
-TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
+// Partitions a list of node indices `nodes_to_partition` into node sub sets.
+// Each node sub set is in dependency order (i.e. all members of the node sub
+// sets). `node_subsets` is assumed to be empty.
+TfLiteStatus PartitionGraphIntoIndependentNodeSubsets(
     const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
-    std::vector<Subgraph>* subgraphs);
+    std::vector<NodeSubset>* node_subsets);
 
 }  // namespace tflite
 
diff --git a/tensorflow/lite/graph_info_test.cc b/tensorflow/lite/graph_info_test.cc
index 5ecc3774e13060466634d81c3915f5046ad91795..4d8bbdc0eef49b3f79b3c74c1d07fd86467e1d65 100644
--- a/tensorflow/lite/graph_info_test.cc
+++ b/tensorflow/lite/graph_info_test.cc
@@ -76,17 +76,18 @@ class SimpleTestGraph : public GraphInfo {
 // TfLiteIntArray. Populates `subgraphs` with resulting generated subgraphs.
 void PartitionGraph(const SimpleTestGraph& graph,
                     const std::vector<int>& nodes_to_partition,
-                    std::vector<Subgraph>* subgraphs) {
+                    std::vector<NodeSubset>* subgraphs) {
   TfLiteIntArray* nodes_to_partition_int_array =
       ConvertVector(nodes_to_partition);
-  PartitionGraphIntoIndependentSubgraphs(&graph, nodes_to_partition_int_array,
-                                         subgraphs);
+  PartitionGraphIntoIndependentNodeSubsets(&graph, nodes_to_partition_int_array,
+                                           subgraphs);
   TfLiteIntArrayFree(nodes_to_partition_int_array);
 }
 
 // Check a generated list of subgraphs against the expected list of subgraphs.
-void CheckPartitionSubgraphs(const std::vector<Subgraph>& generated_subgraphs,
-                             const std::vector<Subgraph>& expected_subgraphs) {
+void CheckPartitionSubgraphs(
+    const std::vector<NodeSubset>& generated_subgraphs,
+    const std::vector<NodeSubset>& expected_subgraphs) {
   ASSERT_EQ(generated_subgraphs.size(), expected_subgraphs.size());
   for (int subgraph_index = 0; subgraph_index < generated_subgraphs.size();
        subgraph_index++) {
@@ -103,7 +104,7 @@ void CheckPartitionSubgraphs(const std::vector<Subgraph>& generated_subgraphs,
 TEST(PartitionTest, Nodes0_PartitionNodes0) {
   SimpleTestGraph graph;
   std::vector<int> nodes_to_partition = {};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
   CheckPartitionSubgraphs(generated_subgraphs, {});
 }
@@ -117,11 +118,11 @@ TEST(PartitionTest, Nodes1PartitionNodes0) {
   graph.AddNode({0}, {1});
   graph.SetInputsAndOutputs({0}, {1});
   std::vector<int> nodes_to_partition = {};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph;
-  expected_subgraph.type = Subgraph::kTfNonPartition;
+  NodeSubset expected_subgraph;
+  expected_subgraph.type = NodeSubset::kTfNonPartition;
   expected_subgraph.nodes = {0};
   expected_subgraph.input_tensors = {0};
   expected_subgraph.output_tensors = {1};
@@ -136,12 +137,12 @@ TEST(PartitionTest, Nodes1PartitionNodes0Inputs0) {
   graph.AddTensors(1);
   graph.AddNode({}, {0});
   graph.SetInputsAndOutputs({}, {0});
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   std::vector<int> nodes_to_partition = {0};
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph;
-  expected_subgraph.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph;
+  expected_subgraph.type = NodeSubset::kTfPartition;
   expected_subgraph.nodes = {0};
   expected_subgraph.input_tensors = {};
   expected_subgraph.output_tensors = {0};
@@ -157,11 +158,11 @@ TEST(PartitionTest, Nodes1PartitionNodes1) {
   graph.AddNode({0}, {1});
   graph.SetInputsAndOutputs({0}, {1});
   std::vector<int> nodes_to_partition = {0};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph;
-  expected_subgraph.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph;
+  expected_subgraph.type = NodeSubset::kTfPartition;
   expected_subgraph.nodes = {0};
   expected_subgraph.input_tensors = {0};
   expected_subgraph.output_tensors = {1};
@@ -180,16 +181,16 @@ TEST(PartitionTest, Nodes2PartitionNodes1) {
   graph.AddNode({1}, {2});
   graph.SetInputsAndOutputs({0}, {2});
   std::vector<int> nodes_to_partition = {1};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph0;
-  expected_subgraph0.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph0;
+  expected_subgraph0.type = NodeSubset::kTfPartition;
   expected_subgraph0.nodes = {0};
   expected_subgraph0.input_tensors = {0};
   expected_subgraph0.output_tensors = {1};
-  Subgraph expected_subgraph1;
-  expected_subgraph1.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph1;
+  expected_subgraph1.type = NodeSubset::kTfPartition;
   expected_subgraph1.nodes = {1};
   expected_subgraph1.input_tensors = {1};
   expected_subgraph1.output_tensors = {2};
@@ -208,11 +209,11 @@ TEST(PartitionTest, Nodes2PartitionNodes2) {
   graph.AddNode({1}, {2});
   graph.SetInputsAndOutputs({0}, {2});
   std::vector<int> nodes_to_partition = {0, 1};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph0;
-  expected_subgraph0.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph0;
+  expected_subgraph0.type = NodeSubset::kTfPartition;
   expected_subgraph0.nodes = {0, 1};
   expected_subgraph0.input_tensors = {0};
   expected_subgraph0.output_tensors = {2};
@@ -239,21 +240,21 @@ TEST(PartitionTest, Nodes3PartitionNodes2) {
   graph.AddNode({1, 2}, {3});
   graph.SetInputsAndOutputs({0}, {3});
   std::vector<int> nodes_to_partition = {0, 2};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph0;
-  expected_subgraph0.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph0;
+  expected_subgraph0.type = NodeSubset::kTfPartition;
   expected_subgraph0.nodes = {0};
   expected_subgraph0.input_tensors = {0};
   expected_subgraph0.output_tensors = {1};
-  Subgraph expected_subgraph1;
-  expected_subgraph1.type = Subgraph::kTfNonPartition;
+  NodeSubset expected_subgraph1;
+  expected_subgraph1.type = NodeSubset::kTfNonPartition;
   expected_subgraph1.nodes = {1};
   expected_subgraph1.input_tensors = {1};
   expected_subgraph1.output_tensors = {2};
-  Subgraph expected_subgraph2;
-  expected_subgraph2.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph2;
+  expected_subgraph2.type = NodeSubset::kTfPartition;
   expected_subgraph2.nodes = {2};
   expected_subgraph2.input_tensors = {1, 2};
   expected_subgraph2.output_tensors = {3};
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index 9d5d2772bec5fbb7b85217cb1af1a4dab3796b5d..326aff5ce486feea2b46b2458555c3a42df0c8f4 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <cstdint>
 #include <cstring>
 
-#include "tensorflow/lite/arena_planner.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
@@ -32,110 +31,14 @@ limitations under the License.
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
-namespace {
-
-TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
-                           const TfLiteRegistration& registration,
-                           int node_index, const char* message) {
-  context->ReportError(
-      context, "Node number %d (%s) %s.\n", node_index,
-      registration.custom_name
-          ? registration.custom_name
-          : EnumNameBuiltinOperator(
-                static_cast<BuiltinOperator>(registration.builtin_code)),
-      message);
-  return kTfLiteError;
-}
-
-// Stub method which returns kTfLiteError when the function is forbidden.
-// We're registrating this function to several different function to save
-// compiled binary size. Please note the restrictions:
-// * The type of first parameter have to be `TfLiteContext*`.
-// * All paramteters must be trivailly destructible. (E.g. No C++ class)
-TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
-  context->ReportError(context,
-                       "The function is forbidden if not calling in delegate.");
-  return kTfLiteError;
-}
-
-// Set the ForbiddenContextFunction to a compatible function pointer.
-template <typename FunctionType>
-void SetForbiddenContextFunction(FunctionType* func) {
-  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
-}
-
-// Returns true if at least one tensor in the given list is kTfLiteDynamic.
-template <typename TensorIntArray>
-bool HasDynamicTensorImpl(const TfLiteContext& context,
-                          const TensorIntArray& int_array) {
-  for (int i : int_array) {
-    const TfLiteTensor& tensor = context.tensors[i];
-    if (tensor.allocation_type == kTfLiteDynamic) {
-      return true;
-    }
-  }
-  return false;
-}
-
-}  // namespace
-
-// A trivial implementation of GraphInfo around the Interpreter.
-// NOTE: this interpreter info represents the subset of the
-// graph that is executed according to execution plan. Thus,
-// the indices are execution plan indices rather than raw node
-// indices.
-class InterpreterInfo : public GraphInfo {
- public:
-  explicit InterpreterInfo(Interpreter* interpreter)
-      : interpreter_(interpreter) {}
-
-  size_t num_tensors() const override { return interpreter_->tensors_size(); }
-  TfLiteTensor* tensor(size_t index) override {
-    return interpreter_->tensor(index);
-  }
-  size_t num_nodes() const override {
-    return interpreter_->execution_plan().size();
-  }
-  const TfLiteNode& node(size_t index) const override {
-    int node_index = interpreter_->execution_plan()[index];
-    return interpreter_->node_and_registration(node_index)->first;
-  }
-  const std::vector<int>& inputs() const override {
-    return interpreter_->inputs();
-  }
-  const std::vector<int>& outputs() const override {
-    return interpreter_->outputs();
-  }
-  const std::vector<int>& variables() const override {
-    return interpreter_->variables();
-  }
-
- public:
-  Interpreter* interpreter_;
-};
 
 Interpreter::Interpreter(ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
-  context_.impl_ = static_cast<void*>(this);
-  context_.ResizeTensor = ResizeTensor;
-  context_.ReportError = ReportError;
-  context_.AddTensors = AddTensors;
-  context_.tensors = nullptr;
-  context_.tensors_size = 0;
-  context_.allow_fp32_relax_to_fp16 = false;
-  context_.recommended_num_threads = -1;
-  context_.GetExternalContext = GetExternalContext;
-  context_.SetExternalContext = SetExternalContext;
-
-  // Invalid to call these these except from TfLiteDelegate
-  SwitchToKernelContext();
+  subgraphs_.emplace_back(new Subgraph(error_reporter_, external_contexts_));
+  context_ = primary_subgraph().context();
 
   // Reserve some space for the tensors to avoid excessive resizing.
-  tensors_.reserve(kTensorsReservedCapacity);
-  nodes_and_registration_.reserve(kTensorsReservedCapacity);
-  next_execution_plan_index_to_prepare_ = 0;
-
   for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
     external_contexts_[i] = nullptr;
   }
@@ -143,669 +46,75 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   UseNNAPI(false);
 }
 
-Interpreter::~Interpreter() {
-  for (auto& nodeAndReg : nodes_and_registration_) {
-    TfLiteNode& node = nodeAndReg.first;
-    TfLiteIntArrayFree(node.inputs);
-    TfLiteIntArrayFree(node.outputs);
-    TfLiteIntArrayFree(node.temporaries);
-    if (node.builtin_data) free(node.builtin_data);
-    OpFree(nodeAndReg.second, node.user_data);
-    node.builtin_data = nullptr;
-  }
-
-  for (int i = 0; i < context_.tensors_size; i++) {
-    TfLiteTensor* tensor = &context_.tensors[i];
-    if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
-        tensor->delegate->FreeBufferHandle != nullptr) {
-      tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
-                                         &tensor->buffer_handle);
-    }
-    TfLiteTensorFree(tensor);
-  }
-}
-
-TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
-    TfLiteContext* context, TfLiteRegistration registration,
-    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->ReplaceSubgraphsWithDelegateKernels(registration, nodes_to_replace,
-                                            delegate);
-}
-
-namespace {
-
-// Copy a std::vector<int> to an existing TfLiteIntArray.
-// This is a low-level data manipulation function, and it's caller's
-// responsibility to ensure TfLiteIntArray has enough size.
-void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
-                                TfLiteIntArray* arr) {
-  arr->size = vec.size();
-  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
-}
-
-// This function allocates a continuous memory space that contains a
-// TfLiteDelegateParams followed by a several TfLiteIntArray.
-// When calling `free` at TfLiteDelegateParams*, all the allocated space
-// will be freed together.
-//
-// +-----------------------------------+
-// | TfLiteDelegateParams              |
-// | TfLiteDelegate* delegate;         |
-// | TfLiteIntArray* nodes_to_replace; |--\
-// | TfLiteIntArray* input_tensors;    |--+--\
-// | TfLiteIntArray* output_tensors;   |--+--+--\
-// +-----------------------------------+  |  |  |
-// | TfLiteIntArray (variable size)    |<-/  |  |
-// +-----------------------------------+     |  |
-// | TfLiteIntArray (variable size)    |<----/  |
-// +-----------------------------------+        |
-// | TfLiteIntArray (variable size)    |<-------/
-// +-----------------------------------+
-TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
-                                           const Subgraph& subgraph) {
-  // Step 1: Calculate the allocation size.
-  int allocation_size = sizeof(TfLiteDelegateParams);
-
-  int nodes_to_replace_size =
-      TfLiteIntArrayGetSizeInBytes(subgraph.nodes.size());
-  allocation_size += nodes_to_replace_size;
-
-  int input_tensors_size =
-      TfLiteIntArrayGetSizeInBytes(subgraph.input_tensors.size());
-  allocation_size += input_tensors_size;
-
-  int output_tensors_size =
-      TfLiteIntArrayGetSizeInBytes(subgraph.output_tensors.size());
-  allocation_size += output_tensors_size;
-
-  // Step 2: Allocate the memory.
-  // Use `char*` for conveniently step through the allocated space by bytes.
-  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
-
-  // Step 3: Fill all data structures structures.
-  TfLiteDelegateParams* params =
-      reinterpret_cast<TfLiteDelegateParams*>(allocation);
-  params->delegate = delegate;
-  allocation += sizeof(TfLiteDelegateParams);
-
-  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(subgraph.nodes, params->nodes_to_replace);
-  allocation += nodes_to_replace_size;
-
-  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(subgraph.input_tensors, params->input_tensors);
-  allocation += input_tensors_size;
-
-  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(subgraph.output_tensors, params->output_tensors);
-  allocation += output_tensors_size;
-
-  return params;
-}
-
-}  // namespace
-
-TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
-    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
-    TfLiteDelegate* delegate) {
-  // Annotate the registration as DELEGATE op.
-  registration.builtin_code = BuiltinOperator_DELEGATE;
-
-  // Analyze the graph to find all independent subgraphs that are either
-  // fully not-this-delegate or this-delegate computation.
-  InterpreterInfo info(this);
-  std::vector<Subgraph> subgraphs;
-  PartitionGraphIntoIndependentSubgraphs(&info, nodes_to_replace, &subgraphs);
-
-  execution_plan_.clear();
-  for (auto& subgraph : subgraphs) {
-    // Subgraphs calimed by the delegate should have a "macro" op created, the
-    // other subgraphs (kTfNonPartition) just have their nodes added back to
-    // the execution plan.
-    switch (subgraph.type) {
-      case Subgraph::kTfNonPartition:
-        for (auto it = subgraph.nodes.begin(); it != subgraph.nodes.end();
-             ++it) {
-          execution_plan_.push_back(*it);
-        }
-        break;
-      case Subgraph::kTfPartition: {
-        int node_index;
-
-        TfLiteDelegateParams* params = CreateDelegateParams(delegate, subgraph);
-        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
-            subgraph.input_tensors, subgraph.output_tensors, nullptr, 0, params,
-            &registration, &node_index));
-
-        // Initialize the output tensors's delegate-related fields.
-        for (int tensor_index : subgraph.output_tensors) {
-          TfLiteTensor* tensor = &tensors_[tensor_index];
-          TF_LITE_ENSURE(&context_, tensor->delegate == nullptr ||
-                                        tensor->delegate == delegate);
-          tensor->delegate = delegate;
-        }
-
-        // Associate the node with the delegate.
-        TfLiteNode* node = &nodes_and_registration_[node_index].first;
-        node->delegate = delegate;
-      } break;
-      case Subgraph::kTfUnexplored:
-        return kTfLiteError;
-        break;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteExternalContext* Interpreter::GetExternalContext(
-    TfLiteExternalContextType type) {
-  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
-    return external_contexts_[type];
-  }
-  return nullptr;
-}
-
-TfLiteExternalContext* Interpreter::GetExternalContext(
-    struct TfLiteContext* context, TfLiteExternalContextType type) {
-  return static_cast<Interpreter*>(context->impl_)->GetExternalContext(type);
-}
+Interpreter::~Interpreter() {}
 
 void Interpreter::SetExternalContext(TfLiteExternalContextType type,
                                      TfLiteExternalContext* ctx) {
-  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
-    external_contexts_[type] = ctx;
-  }
-}
-
-void Interpreter::SetExternalContext(struct TfLiteContext* context,
-                                     TfLiteExternalContextType type,
-                                     TfLiteExternalContext* ctx) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->SetExternalContext(type, ctx);
-}
-
-// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
-// this memory and it is only guaranteed to exist during the invocation of the
-// delegate prepare.
-TfLiteStatus Interpreter::GetExecutionPlan(TfLiteIntArray** execution_plan) {
-  // TODO(aselle): Do not make a copy here
-  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
-  *execution_plan = plan_cache_.get();
-  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
-                "TfLiteIntArray and execution_plan do not contain same type.");
-  std::memcpy(plan_cache_->data, execution_plan_.data(),
-              sizeof(plan_cache_->data[0]) * execution_plan_.size());
-  return kTfLiteOk;
-}
-
-// WARNING: This is an experimental interface that is subject to change.
-// Entry point for C node plugin API to get the execution plan
-TfLiteStatus Interpreter::GetExecutionPlan(struct TfLiteContext* context,
-                                           TfLiteIntArray** execution_plan) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->GetExecutionPlan(execution_plan);
+  primary_subgraph().SetExternalContext(type, ctx);
 }
 
 TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
-  TF_LITE_ENSURE_OK(&context_,
-                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
-  inputs_ = std::move(inputs);
-  return kTfLiteOk;
+  return primary_subgraph().SetInputs(inputs);
 }
 
 TfLiteStatus Interpreter::SetOutputs(std::vector<int> outputs) {
-  TF_LITE_ENSURE_OK(
-      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
-  outputs_ = std::move(outputs);
-  return kTfLiteOk;
+  return primary_subgraph().SetOutputs(outputs);
 }
 
 TfLiteStatus Interpreter::SetVariables(std::vector<int> variables) {
-  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("variables", variables.data(),
-                                                  variables.size()));
-  variables_ = std::move(variables);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
-                                             const int* indices, int length) {
-  // Making sure kOptionalTensor is not re-defined to something other than -1.
-  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
-
-  for (int i = 0; i < length; i++) {
-    int index = indices[i];
-    // Continue if index == kOptionalTensor before additional comparisons below,
-    // size_t(-1) is always >= context_tensors_size.
-    if (index == kOptionalTensor) {
-      continue;
-    }
-    if (index < 0 || static_cast<size_t>(index) >= context_.tensors_size) {
-      ReportError(&context_, "Invalid tensor index %d in %s\n", index, label);
-      consistent_ = false;
-      return kTfLiteError;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
-                                        size_t dims_size, size_t* bytes) {
-  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
-  // MultiplyWithoutOverflow.
-  TF_LITE_ENSURE(&context_, bytes != nullptr);
-  size_t count = 1;
-  for (int k = 0; k < dims_size; k++) count *= dims[k];
-  switch (type) {
-    case kTfLiteFloat32:
-      *bytes = sizeof(float) * count;
-      break;
-    case kTfLiteInt16:
-      *bytes = sizeof(int16_t) * count;
-      break;
-    case kTfLiteInt32:
-      *bytes = sizeof(int32_t) * count;
-      break;
-    case kTfLiteUInt8:
-      *bytes = sizeof(uint8_t) * count;
-      break;
-    case kTfLiteInt64:
-      *bytes = sizeof(int64_t) * count;
-      break;
-    case kTfLiteBool:
-      *bytes = sizeof(bool) * count;
-      break;
-    case kTfLiteComplex64:
-      *bytes = sizeof(std::complex<float>) * count;
-      break;
-    default:
-      ReportError(&context_,
-                  "Only float32, int16, int32, int64, uint8, bool, complex64 "
-                  "supported currently.");
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
+  return primary_subgraph().SetVariables(variables);
 }
 
 TfLiteStatus Interpreter::AllocateTensors() {
-  if (!consistent_) {
-    ReportError(&context_, "AllocateTensors() called on inconsistent model.");
-    return kTfLiteError;
-  }
-
-  // Explicit (re)allocation is necessary if nodes have been changed or tensors
-  // have been resized. For inputs marked as dynamic, we can't short-circuit the
-  // allocation as the client may have done the resize manually.
-  if (state_ != kStateUninvokable && !HasDynamicTensorImpl(context_, inputs_)) {
-    return kTfLiteOk;
-  }
-
-  next_execution_plan_index_to_prepare_ = 0;
-  if (memory_planner_) {
-    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
-  }
-
-  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-
-  state_ = kStateInvokable;
-
-  // Reset the variable tensors to zero after (re)allocating the tensors.
-  // Developers shouldn't rely on the side effect of this function to reset
-  // variable tesnsors. They should call `ResetVariableTensors` directly
-  // instead.
-  ResetVariableTensors();
-
-  return kTfLiteOk;
-}
-
-// TODO(ycling): Support non-zero default values.
-TfLiteStatus Interpreter::ResetVariableTensors() {
-  for (auto& tensor : tensors_) {
-    if (!tensor.is_variable) {
-      continue;
-    }
-
-    // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be
-    // allocated after the initial `PrepareOpsAndTensors()` is called.
-    TF_LITE_ENSURE_EQ(&context_, tensor.allocation_type,
-                      kTfLiteArenaRwPersistent);
-    TF_LITE_ENSURE(&context_, tensor.data.raw != nullptr);
-
-    memset(tensor.data.raw, 0, tensor.bytes);
-  }
-  return kTfLiteOk;
+  return primary_subgraph().AllocateTensors();
 }
 
 void Interpreter::ReserveNodes(int count) {
-  nodes_and_registration_.reserve(count);
+  primary_subgraph().nodes_and_registration().reserve(count);
 }
 
 TfLiteStatus Interpreter::AddNodeWithParameters(
     const std::vector<int>& inputs, const std::vector<int>& outputs,
     const char* init_data, size_t init_data_size, void* builtin_data,
     const TfLiteRegistration* registration, int* node_index) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(&context_,
-                "AddNodeWithParameters is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-  state_ = kStateUninvokable;
-
-  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
-                                                              free);
-
-  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("node inputs", inputs.data(),
-                                                  inputs.size()));
-  TF_LITE_ENSURE_OK(
-      &context_,
-      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
-
-  int new_node_index = nodes_and_registration_.size();
-  if (node_index) *node_index = new_node_index;
-  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
-  auto& node_and_reg = nodes_and_registration_.back();
-  TfLiteNode& node = node_and_reg.first;
-  if (node.inputs) TfLiteIntArrayFree(node.inputs);
-  if (node.outputs) TfLiteIntArrayFree(node.outputs);
-  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
-
-  // NOTE, here we are not using move semantics yet, since our internal
-  // representation isn't std::vector, but in the future we would like to avoid
-  // copies, so we want the interface to take r-value references now.
-  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
-  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
-  node.temporaries = TfLiteIntArrayCreate(0);
-  if (init_data) {
-    node.user_data = OpInit(*registration, init_data, init_data_size);
-  } else {
-    node.user_data =
-        OpInit(*registration,
-               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
-  }
-
-  node.builtin_data = builtin_data_deleter.release();
-  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
-  // properly for nodes generated by ReplaceSubgraphsWithDelegateKernels.
-
-  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
-    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
-    // `Operator` table is passed in.
-    node.custom_initial_data = init_data;
-    node.custom_initial_data_size = init_data_size;
-  } else {
-    node.custom_initial_data = nullptr;
-    node.custom_initial_data_size = 0;
-  }
-
-  node.delegate = nullptr;
-  node_and_reg.second = *registration;
-  execution_plan_.push_back(new_node_index);
-  return kTfLiteOk;
+  return primary_subgraph().AddNodeWithParameters(inputs, outputs, init_data,
+                                                  init_data_size, builtin_data,
+                                                  registration, node_index);
 }
 
 TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
                                             const std::vector<int>& dims) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(&context_,
-                "ResizeInputTensor is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-
-  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
-  // checks by casting to unsigned for efficiency. Profile before doing this.
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  TfLiteTensor* tensor = &context_.tensors[tensor_index];
-
-  // Short-circuit the state change if the dimensions don't change, avoiding
-  // unnecessary (re)allocations.
-  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
-    return kTfLiteOk;
-  }
-
-  state_ = kStateUninvokable;
-  return ResizeTensorImpl(tensor, ConvertVectorToTfLiteIntArray(dims));
-}
-
-bool HasDynamicTensor(const TfLiteContext& context,
-                      const TfLiteIntArray* int_array) {
-  return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
-}
-
-TfLiteStatus Interpreter::PrepareOpsStartingAt(
-    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
-  for (int execution_plan_index = first_execution_plan_index;
-       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
-    int node_index = execution_plan_[execution_plan_index];
-    TfLiteNode& node = nodes_and_registration_[node_index].first;
-    const TfLiteRegistration& registration =
-        nodes_and_registration_[node_index].second;
-    EnsureTensorsVectorCapacity();
-    if (OpPrepare(registration, &node) == kTfLiteError) {
-      return ReportOpError(&context_, node, registration, node_index,
-                           "failed to prepare");
-    }
-
-    *last_execution_plan_index_prepared = execution_plan_index;
-
-    // Discontinue if the node has dynamic outputs. Note that we don't
-    // stop for dynamic temporary tensors since they won't affect the
-    // sizes of other tensors in the graph.
-    if (HasDynamicTensor(context_, node.outputs)) {
-      break;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::PrepareOpsAndTensors() {
-  if (!memory_planner_) {
-    memory_planner_.reset(new ArenaPlanner(
-        &context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)),
-        /*preserve_inputs=*/true, /*preserve_intermediates*/ false));
-    memory_planner_->PlanAllocations();
-  }
-
-  int last_exec_plan_index_prepared = 0;
-
-  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
-      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
-  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
-      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
-
-  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
-  return kTfLiteOk;
+  return primary_subgraph().ResizeInputTensor(tensor_index, dims);
 }
 
 TfLiteStatus Interpreter::Invoke() {
-  if (!consistent_) {
-    ReportError(&context_, "Invoke called on model that is not consistent.");
-    return kTfLiteError;
-  }
-  if (state_ == kStateUninvokable) {
-    ReportError(&context_, "Invoke called on model that is not ready.");
-    return kTfLiteError;
-  }
-
-  TfLiteStatus status = kTfLiteOk;
-  if (nnapi_delegate_) {
-    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
-      TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
-      return kTfLiteOk;
-    } else {
-      // TODO(aselle): In the future, we would like this to be an
-      // automatic tflite CPU fallback.
-      ReportError(&context_,
-                  "NNAPI was requested, but dependent sized tensors "
-                  "being used.\n");
-      return kTfLiteError;
-    }
-  }
-
-  // Invocations are always done in node order.
-  // Note that calling Invoke repeatedly will cause the original memory plan to
-  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
-  // called.
-  // TODO(b/71913981): we should force recalculation in the presence of dynamic
-  // tensors, because they may have new value which in turn may affect shapes
-  // and allocations.
-  for (int execution_plan_index = 0;
-       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
-    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
-      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-      TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >=
-                                    execution_plan_index);
-    }
-    int node_index = execution_plan_[execution_plan_index];
-    TfLiteNode& node = nodes_and_registration_[node_index].first;
-    const TfLiteRegistration& registration =
-        nodes_and_registration_[node_index].second;
-    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
-
-    // TODO(ycling): This is an extra loop through inputs to check if the data
-    // need to be copied from Delegate buffer to raw memory, which is often not
-    // needed. We may want to cache this in prepare to know if this needs to be
-    // done for a node or not.
-    for (int i = 0; i < node.inputs->size; ++i) {
-      int tensor_index = node.inputs->data[i];
-      if (tensor_index == kOptionalTensor) {
-        continue;
-      }
-      TfLiteTensor* tensor = &tensors_[tensor_index];
-      if (tensor->delegate && tensor->delegate != node.delegate &&
-          tensor->data_is_stale) {
-        EnsureTensorDataIsReadable(tensor_index);
-      }
-    }
-
-    EnsureTensorsVectorCapacity();
-    tensor_resized_since_op_invoke_ = false;
-    if (OpInvoke(registration, &node) == kTfLiteError) {
-      status = ReportOpError(&context_, node, registration, node_index,
-                             "failed to invoke");
-    }
-
-    // Force execution prep for downstream ops if the latest op triggered the
-    // resize of a dynamic tensor.
-    if (tensor_resized_since_op_invoke_ &&
-        HasDynamicTensor(context_, node.outputs)) {
-      next_execution_plan_index_to_prepare_ = execution_plan_index + 1;
-    }
-  }
+  TfLiteStatus status = primary_subgraph().Invoke();
 
   if (!allow_buffer_handle_output_) {
-    for (int tensor_index : outputs_) {
-      EnsureTensorDataIsReadable(tensor_index);
+    for (int tensor_index : outputs()) {
+      primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
     }
   }
 
   return status;
 }
 
-TfLiteStatus Interpreter::ResizeTensor(TfLiteContext* context,
-                                       TfLiteTensor* tensor,
-                                       TfLiteIntArray* new_size) {
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function ResizeTensorImpl
-  // (this function is static).
-  return static_cast<Interpreter*>(context->impl_)
-      ->ResizeTensorImpl(tensor, new_size);
-}
-
-void Interpreter::ReportErrorImpl(const char* format, va_list args) {
-  error_reporter_->Report(format, args);
-}
-
-void Interpreter::ReportError(TfLiteContext* context, const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  auto* f = static_cast<Interpreter*>(context->impl_);
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function ReportErrorImpl
-  // (this function is static).
-  f->ReportErrorImpl(format, args);
-  va_end(args);
-}
-
 TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
                                      int* first_new_tensor_index) {
-  int base_index = tensors_.size();
-  if (first_new_tensor_index) *first_new_tensor_index = base_index;
-  tensors_.resize(tensors_.size() + tensors_to_add);
-  for (int i = base_index; i < tensors_.size(); i++) {
-    memset(&tensors_[i], 0, sizeof(tensors_[i]));
-    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
-  }
-  context_.tensors = tensors_.data();
-  context_.tensors_size = tensors_.size();
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add,
-                                     int* first_new_tensor_index) {
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function AddTensors
-  // (this function is static).
-  return static_cast<Interpreter*>(context->impl_)
-      ->AddTensors(tensors_to_add, first_new_tensor_index);
-}
-
-TfLiteStatus Interpreter::GetNodeAndRegistration(
-    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
-  TF_LITE_ENSURE(&context_, node_index < nodes_size() && node_index >= 0);
-  TF_LITE_ENSURE(&context_, node != nullptr && registration != nullptr);
-  *node = &nodes_and_registration_[node_index].first;
-  *registration = &nodes_and_registration_[node_index].second;
-  return kTfLiteOk;
+  return primary_subgraph().AddTensors(tensors_to_add, first_new_tensor_index);
 }
 
-TfLiteStatus Interpreter::GetNodeAndRegistration(
-    struct TfLiteContext* context, int node_index, TfLiteNode** node,
-    TfLiteRegistration** registration) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->GetNodeAndRegistration(node_index, node, registration);
+TfLiteStatus Interpreter::ResetVariableTensors() {
+  return primary_subgraph().ResetVariableTensors();
 }
 
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(
-        &context_,
-        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  // For most tensors we know exactly how much memory is necessary so we can
-  // ensure the buffer is large enough. However, we need to skip string tensors
-  // because their sizes change with the contents of the individual strings.
-  if (type != kTfLiteString) {
-    size_t required_bytes;
-    TF_LITE_ENSURE_OK(&context_,
-                      BytesRequired(type, dims, rank, &required_bytes));
-    TF_LITE_ENSURE_EQ(&context_, required_bytes, bytes);
-  }
-
-  TfLiteTensor& tensor = context_.tensors[tensor_index];
-  if (type == tensor.type &&
-      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
-    // Fast path which does not invalidate the invokable property.
-    TfLiteTensorDataFree(&tensor);
-    tensor.data.raw = const_cast<char*>(buffer);
-    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
-    tensor.params = quantization;
-    tensor.allocation_type = kTfLiteMmapRo;
-    tensor.allocation = allocation;
-  } else {
-    state_ = kStateUninvokable;
-    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                      quantization, const_cast<char*>(buffer), bytes,
-                      kTfLiteMmapRo, allocation, false, &tensor);
-  }
-  return kTfLiteOk;
+  return primary_subgraph().SetTensorParametersReadOnly(
+      tensor_index, type, name, rank, dims, quantization, buffer, bytes,
+      allocation);
 }
 
 // Set description of inputs/outputs/data/fptrs for node `node_index`.
@@ -815,187 +124,52 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(
-        &context_,
-        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  size_t required_bytes = 0;
-  if (type != kTfLiteString) {
-    // These types will be allocated in our arena so we need to record how
-    // many bytes we will need based on the dimensions. String tensors are
-    // allocated dynamically and we can't know ahead of time how much space
-    // they will require.
-    TF_LITE_ENSURE_OK(&context_,
-                      BytesRequired(type, dims, rank, &required_bytes));
-  }
-
-  TfLiteAllocationType allocation_type = kTfLiteArenaRw;
-  if (type == kTfLiteString) {
-    if (is_variable) {
-      // We don't have a real use case for string variable tensor.
-      ReportError(&context_, "String variable tensor isn't supported.");
-      return kTfLiteError;
-    }
-    allocation_type = kTfLiteDynamic;
-  } else if (is_variable) {
-    allocation_type = kTfLiteArenaRwPersistent;
-  }
-
-  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                    quantization,
-                    /*buffer=*/nullptr, required_bytes, allocation_type,
-                    nullptr, is_variable, &context_.tensors[tensor_index]);
-  return kTfLiteOk;
+  return primary_subgraph().SetTensorParametersReadWrite(
+      tensor_index, type, name, rank, dims, quantization, is_variable);
 }
 
 TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
-  for (int node_index : new_plan) {
-    TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size());
-  }
-  execution_plan_ = new_plan;
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
-                                           TfLiteIntArray* new_size) {
-  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
-  if (tensor->allocation_type == kTfLiteArenaRw ||
-      tensor->allocation_type == kTfLiteDynamic ||
-      tensor->allocation_type == kTfLiteArenaRwPersistent) {
-    tensor_resized_since_op_invoke_ |=
-        TfLiteIntArrayEqual(tensor->dims, new_size) == 0;
-    if (tensor->type != kTfLiteString) {
-      size_t bytesRequired;
-      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
-                                          new_size->size, &bytesRequired);
-      if (status != kTfLiteOk) {
-        TfLiteIntArrayFree(new_size);
-        return kTfLiteError;
-      }
-
-      // Realloc space for kTfLiteDynamic tensors.
-      TfLiteTensorRealloc(bytesRequired, tensor);
-      tensor->bytes = bytesRequired;
-    }
-    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
-    tensor->dims = new_size;
-
-    if (tensor->allocation_type != kTfLiteDynamic) {
-      tensor->data.raw = nullptr;
-    }
-  } else {
-    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
-    // of fixed size.
-    TfLiteIntArrayFree(new_size);
-    ReportError(&context_, "Attempting to resize a fixed-size tensor.");
-    return kTfLiteError;
-  }
-  return kTfLiteOk;
+  return primary_subgraph().SetExecutionPlan(new_plan);
 }
 
-void Interpreter::UseNNAPI(bool enable) {
-  // TODO(aselle): This is a workaround for finding if NNAPI exists.
-  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
-  // prefixed.
-  if (!NNAPIDelegate::IsSupported()) enable = false;
-  if (!enable) {
-    nnapi_delegate_.reset();
-  } else if (!nnapi_delegate_) {
-    nnapi_delegate_.reset(new NNAPIDelegate);
-  }
-}
+void Interpreter::UseNNAPI(bool enable) { primary_subgraph().UseNNAPI(enable); }
 
 void Interpreter::SetNumThreads(int num_threads) {
-  context_.recommended_num_threads = num_threads;
+  for (auto& subgraph : subgraphs_) {
+    subgraph->context()->recommended_num_threads = num_threads;
+  }
 
   for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
     auto* c = external_contexts_[i];
     if (c && c->Refresh) {
-      c->Refresh(&context_);
+      c->Refresh(context_);
     }
   }
 }
 
-void Interpreter::SwitchToDelegateContext() {
-  context_.GetNodeAndRegistration = GetNodeAndRegistration;
-  context_.ReplaceSubgraphsWithDelegateKernels =
-      ReplaceSubgraphsWithDelegateKernels;
-  context_.GetExecutionPlan = GetExecutionPlan;
-}
-
-void Interpreter::SwitchToKernelContext() {
-  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
-  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
-  SetForbiddenContextFunction(&context_.GetExecutionPlan);
-}
-
-TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
-                                                  bool allow_dynamic_tensors) {
-  if (!allow_dynamic_tensors) {
-    int last_execution_plan_index_prepared;
-    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
-                                     0, &last_execution_plan_index_prepared));
-
-    bool has_dynamic_tensors = true;
-    // Dynamic tensors exist if not all nodes can be prepared.
-    if (last_execution_plan_index_prepared + 1 == execution_plan_.size()) {
-      // If all the nodes can be prepared, check if the last node has dynamic
-      // tensors.
-      int node_index = execution_plan_[last_execution_plan_index_prepared];
-      TfLiteNode& node = nodes_and_registration_[node_index].first;
-      if (!HasDynamicTensor(context_, node.outputs)) {
-        has_dynamic_tensors = false;
-      }
-    }
-    if (has_dynamic_tensors) {
-      ReportError(
-          &context_,
-          "Attempting to use a delegate that only supports static-sized "
-          "tensors with a graph that has dynamic-sized tensors.");
-      return kTfLiteError;
-    }
-  }
-
-  // TODO(aselle): Consider if it is worth storing pointers to delegates.
-  // Setup additional context interface.
-  SwitchToDelegateContext();
-
-  TfLiteStatus status = delegate->Prepare(&context_, delegate);
-
-  // Remove additional context info.
-  SwitchToKernelContext();
-
-  TF_LITE_ENSURE_OK(&context_, status);
-
-  if (!allow_dynamic_tensors) {
-    // Reset the state to force tensor/op reallocation.
-    state_ = kStateUninvokable;
-    TF_LITE_ENSURE_OK(&context_, AllocateTensors());
-    TF_LITE_ENSURE_EQ(&context_, state_, kStateInvokable);
-    // After using a delegate which doesn't support dynamic tensors, make the
-    // entire graph immutable.
-    state_ = kStateInvokableAndImmutable;
+void Interpreter::SetAllowFp16PrecisionForFp32(bool allow) {
+  for (auto& subgraph : subgraphs_) {
+    subgraph->context()->allow_fp32_relax_to_fp16 = allow;
   }
+}
 
-  return status;
+TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  return primary_subgraph().ModifyGraphWithDelegate(delegate);
 }
 
 TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
                                           TfLiteBufferHandle buffer_handle,
                                           TfLiteDelegate* delegate) {
-  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-  TfLiteTensor* tensor = &tensors_[tensor_index];
+  TF_LITE_ENSURE(context_, tensor_index < tensors_size());
+  std::vector<TfLiteTensor>& tensors = primary_subgraph().tensors();
+  TfLiteTensor* tensor = &tensors[tensor_index];
 
-  TF_LITE_ENSURE(&context_,
+  TF_LITE_ENSURE(context_,
                  tensor->delegate == nullptr || tensor->delegate == delegate);
   tensor->delegate = delegate;
   if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
-    TF_LITE_ENSURE(&context_, tensor->delegate->FreeBufferHandle != nullptr);
-    tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
+    TF_LITE_ENSURE(context_, tensor->delegate->FreeBufferHandle != nullptr);
+    tensor->delegate->FreeBufferHandle(context_, tensor->delegate,
                                        &tensor->buffer_handle);
   }
   tensor->buffer_handle = buffer_handle;
@@ -1006,8 +180,9 @@ TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
 TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
                                           TfLiteBufferHandle* buffer_handle,
                                           TfLiteDelegate** delegate) {
-  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-  TfLiteTensor* tensor = &tensors_[tensor_index];
+  TF_LITE_ENSURE(context_, tensor_index < tensors_size());
+  std::vector<TfLiteTensor>& tensors = primary_subgraph().tensors();
+  TfLiteTensor* tensor = &tensors[tensor_index];
 
   *delegate = tensor->delegate;
   *buffer_handle = tensor->buffer_handle;
@@ -1015,4 +190,12 @@ TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
   return kTfLiteOk;
 }
 
+void Interpreter::SetProfiler(profiling::Profiler* profiler) {
+  for (auto& subgraph : subgraphs_) subgraph->SetProfiler(profiler);
+}
+
+profiling::Profiler* Interpreter::GetProfiler() {
+  return primary_subgraph().GetProfiler();
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
index 6bb47726c01e69102035c64d5186b9e317420259..405cf640b941c78dd3d0f99133304ae90f6cf2c0 100644
--- a/tensorflow/lite/interpreter.h
+++ b/tensorflow/lite/interpreter.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/subgraph.h"
 #include "tensorflow/lite/memory_planner.h"
 #include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/stderr_reporter.h"
@@ -57,6 +58,10 @@ constexpr TfLiteType typeToTfLiteType<unsigned char>() {
   return kTfLiteUInt8;
 }
 template <>
+constexpr TfLiteType typeToTfLiteType<int8_t>() {
+  return kTfLiteInt8;
+}
+template <>
 constexpr TfLiteType typeToTfLiteType<bool>() {
   return kTfLiteBool;
 }
@@ -69,9 +74,6 @@ constexpr TfLiteType typeToTfLiteType<string>() {
   return kTfLiteString;
 }
 
-// Forward declare since NNAPIDelegate uses Interpreter.
-class NNAPIDelegate;
-
 // An interpreter for a graph of nodes that input and output from tensors.
 // Each node of the graph processes a set of input tensors and produces a
 // set of output Tensors. All inputs/output tensors are referenced by index.
@@ -100,12 +102,6 @@ class NNAPIDelegate;
 // foo.Invoke();
 //
 
-struct TfLiteIntArrayDeleter {
-  void operator()(TfLiteIntArray* a) {
-    if (a) TfLiteIntArrayFree(a);
-  }
-};
-
 class Interpreter {
  public:
   // Instantiate an interpreter. All errors associated with reading and
@@ -117,6 +113,7 @@ class Interpreter {
 
   ~Interpreter();
 
+  // Interpreters are not copyable as they have non-trivial memory semantics.
   Interpreter(const Interpreter&) = delete;
   Interpreter& operator=(const Interpreter&) = delete;
 
@@ -197,34 +194,40 @@ class Interpreter {
   // Functions to access tensor data
 
   // Read only access to list of inputs.
-  const std::vector<int>& inputs() const { return inputs_; }
+  const std::vector<int>& inputs() const { return primary_subgraph().inputs(); }
 
   // Return the name of a given input. The given index must be between 0 and
   // inputs().size().
   const char* GetInputName(int index) const {
-    return context_.tensors[inputs_[index]].name;
+    return context_->tensors[inputs()[index]].name;
   }
 
   // Read only access to list of outputs.
-  const std::vector<int>& outputs() const { return outputs_; }
+  const std::vector<int>& outputs() const {
+    return primary_subgraph().outputs();
+  }
 
   // Read only access to list of variable tensors.
-  const std::vector<int>& variables() const { return variables_; }
+  const std::vector<int>& variables() const {
+    return primary_subgraph().variables();
+  }
 
   // Return the name of a given output. The given index must be between 0 and
   // outputs().size().
   const char* GetOutputName(int index) const {
-    return context_.tensors[outputs_[index]].name;
+    return context_->tensors[outputs()[index]].name;
   }
 
   // Return the number of tensors in the model.
-  size_t tensors_size() const { return context_.tensors_size; }
+  size_t tensors_size() const { return context_->tensors_size; }
 
   // Return the number of ops in the model.
-  size_t nodes_size() const { return nodes_and_registration_.size(); }
+  size_t nodes_size() const { return primary_subgraph().nodes_size(); }
 
   // WARNING: Experimental interface, subject to change
-  const std::vector<int>& execution_plan() const { return execution_plan_; }
+  const std::vector<int>& execution_plan() const {
+    return primary_subgraph().execution_plan();
+  }
 
   // WARNING: Experimental interface, subject to change
   // Overrides execution plan. This bounds checks indices sent in.
@@ -234,27 +237,18 @@ class Interpreter {
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   TfLiteTensor* tensor(int tensor_index) {
-    if (tensor_index < 0 ||
-        static_cast<size_t>(tensor_index) >= context_.tensors_size)
-      return nullptr;
-    return &context_.tensors[tensor_index];
+    return primary_subgraph().tensor(tensor_index);
   }
 
   // Get an immutable tensor data structure.
   const TfLiteTensor* tensor(int tensor_index) const {
-    if (tensor_index < 0 ||
-        static_cast<size_t>(tensor_index) >= context_.tensors_size)
-      return nullptr;
-    return &context_.tensors[tensor_index];
+    return primary_subgraph().tensor(tensor_index);
   }
 
   // Get a pointer to an operation and registration data structure if in bounds.
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
       int node_index) const {
-    if (node_index < 0 ||
-        static_cast<size_t>(node_index) >= nodes_and_registration_.size())
-      return nullptr;
-    return &nodes_and_registration_[node_index];
+    return primary_subgraph().node_and_registration(node_index);
   }
 
   // Perform a checked cast to the appropriate tensor type (mutable pointer
@@ -285,28 +279,28 @@ class Interpreter {
   // index must be between 0 and inputs().size().
   template <class T>
   T* typed_input_tensor(int index) {
-    return typed_tensor<T>(inputs_[index]);
+    return typed_tensor<T>(inputs()[index]);
   }
 
   // Return an immutable pointer into the data of a given input tensor. The
   // given index must be between 0 and inputs().size().
   template <class T>
   const T* typed_input_tensor(int index) const {
-    return typed_tensor<T>(inputs_[index]);
+    return typed_tensor<T>(inputs()[index]);
   }
 
   // Return a mutable pointer into the data of a given output tensor. The given
   // index must be between 0 and outputs().size().
   template <class T>
   T* typed_output_tensor(int index) {
-    return typed_tensor<T>(outputs_[index]);
+    return typed_tensor<T>(outputs()[index]);
   }
 
   // Return an immutable pointer into the data of a given output tensor. The
   // given index must be between 0 and outputs().size().
   template <class T>
   const T* typed_output_tensor(int index) const {
-    return typed_tensor<T>(outputs_[index]);
+    return typed_tensor<T>(outputs()[index]);
   }
 
   // Change the dimensionality of a given tensor. Note, this is only acceptable
@@ -321,7 +315,6 @@ class Interpreter {
   // Update allocations for all tensors. This will redim dependent tensors using
   // the input tensor dimensionality as given. This is relatively expensive.
   // If you know that your sizes are not changing, you need not call this.
-
   // Returns status of success or failure.
   TfLiteStatus AllocateTensors();
 
@@ -342,14 +335,12 @@ class Interpreter {
   // Allow float16 precision for FP32 calculation when possible.
   // default: not allow.
   // WARNING: This is an experimental API and subject to change.
-  void SetAllowFp16PrecisionForFp32(bool allow) {
-    context_.allow_fp32_relax_to_fp16 = allow;
-  }
+  void SetAllowFp16PrecisionForFp32(bool allow);
 
   // Get the half precision flag.
   // WARNING: This is an experimental API and subject to change.
   bool GetAllowFp16PrecisionForFp32() const {
-    return context_.allow_fp32_relax_to_fp16;
+    return context_->allow_fp32_relax_to_fp16;
   }
 
   // Owning handle to a TfLiteDelegate instance.
@@ -360,25 +351,13 @@ class Interpreter {
   // parts of the graph themselves. After this is called, the graph may
   // contain new nodes that replace 1 more nodes.
   // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate,
-                                       bool allow_dynamic_tensors = false);
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
 
   // Ensure the data in `tensor.data` is readable. In case delegate is used,
   // it might require to copy the data from delegate buffer to raw memory.
   // WARNING: This is an experimental API and subject to change.
   TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
-    TfLiteTensor* t = tensor(tensor_index);
-    TF_LITE_ENSURE(&context_, t != nullptr);
-    if (t->data_is_stale) {
-      TF_LITE_ENSURE(&context_, t->delegate != nullptr);
-      TF_LITE_ENSURE(&context_, t->buffer_handle != kTfLiteNullBufferHandle);
-      // This can be null if the delegate doesn't use its own buffer.
-      TF_LITE_ENSURE(&context_, t->delegate->CopyFromBufferHandle != nullptr);
-      t->delegate->CopyFromBufferHandle(
-          &context_, t->delegate, t->buffer_handle, t->data.raw, t->bytes);
-      t->data_is_stale = false;
-    }
-    return kTfLiteOk;
+    return primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
   }
 
   // Set the delegate buffer handle to a tensor. It can be called in the
@@ -401,9 +380,9 @@ class Interpreter {
                                TfLiteBufferHandle* buffer_handle,
                                TfLiteDelegate** delegate);
 
-  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+  void SetProfiler(profiling::Profiler* profiler);
 
-  profiling::Profiler* GetProfiler() { return profiler_; }
+  profiling::Profiler* GetProfiler();
 
   // The default capacity of `tensors_` vector.
   static constexpr int kTensorsReservedCapacity = 128;
@@ -436,7 +415,7 @@ class Interpreter {
   const char* OpProfilingString(const TfLiteRegistration& op_reg,
                                 const TfLiteNode* node) const {
     if (op_reg.profiling_string == nullptr) return nullptr;
-    return op_reg.profiling_string(&context_, node);
+    return op_reg.profiling_string(context_, node);
   }
 
   // Set the value of an external context.
@@ -447,132 +426,14 @@ class Interpreter {
   friend class InterpreterBuilder;
   friend class InterpreterTest;
 
-  // Prevent 'context_' from accessing functions that are only available to
-  // delegated kernels.
-  void SwitchToKernelContext();
-
-  // Add delegate-only functions to 'context_'.
-  void SwitchToDelegateContext();
-
-  // Give 'op_reg' a chance to initialize itself using the contents of
-  // 'buffer'.
-  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
-               size_t length) {
-    if (op_reg.init == nullptr) return nullptr;
-    return op_reg.init(&context_, buffer, length);
-  }
-
-  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
-  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
-    if (op_reg.free == nullptr) return;
-    if (buffer) {
-      op_reg.free(&context_, buffer);
-    }
-  }
-
-  // Prepare the given 'node' for execution.
-  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
-    if (op_reg.prepare == nullptr) return kTfLiteOk;
-    return op_reg.prepare(&context_, node);
+  Subgraph& primary_subgraph() {
+    return *subgraphs_.front();  // Safe as subgraphs_ always has 1 entry.
   }
 
-  // Invoke the operator represented by 'node'.
-  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
-    if (op_reg.invoke == nullptr) return kTfLiteError;
-    return op_reg.invoke(&context_, node);
+  const Subgraph& primary_subgraph() const {
+    return *subgraphs_.front();  // Safe as subgraphs_ always has 1 entry.
   }
 
-  // Call OpPrepare() for as many ops as possible, allocating memory for their
-  // tensors. If an op containing dynamic tensors is found, preparation will be
-  // postponed until this function is called again. This allows the interpreter
-  // to wait until Invoke() to resolve the sizes of dynamic tensors.
-  TfLiteStatus PrepareOpsAndTensors();
-
-  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
-  // dynamic tensors is found or all ops have been prepared. Fill
-  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
-  // the last in the graph.
-  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
-                                    int* last_execution_plan_index_prepared);
-
-  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
-  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
-  // `context_` whenever this std::vector is reallocated. Currently this
-  // only happens in `AddTensors()`.
-  std::vector<TfLiteTensor> tensors_;
-
-  // Check if an array of tensor indices are valid with respect to the Tensor
-  // array.
-  // NOTE: this changes consistent_ to be false if indices are out of bounds.
-  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
-                                  int length);
-
-  // Compute the number of bytes required to represent a tensor with dimensions
-  // specified by the array dims (of length dims_size). Returns the status code
-  // and bytes.
-  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
-                             size_t* bytes);
-
-  // Request an tensor be resized implementation. If the given tensor is of
-  // type kTfLiteDynamic it will also be allocated new memory.
-  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
-
-  // Report a detailed error string (will be printed to stderr).
-  // TODO(aselle): allow user of class to provide alternative destinations.
-  void ReportErrorImpl(const char* format, va_list args);
-
-  // Entry point for C node plugin API to request an tensor be resized.
-  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
-                                   TfLiteIntArray* new_size);
-  // Entry point for C node plugin API to report an error.
-  static void ReportError(TfLiteContext* context, const char* format, ...);
-
-  // Entry point for C node plugin API to add new tensors.
-  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
-                                 int* first_new_tensor_index);
-
-  // WARNING: This is an experimental API and subject to change.
-  // Entry point for C API ReplaceSubgraphsWithDelegateKernels
-  static TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
-      TfLiteContext* context, TfLiteRegistration registration,
-      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
-
-  // Update the execution graph to replace some of the nodes with stub
-  // nodes. Specifically any node index that has `nodes[index]==1` will be
-  // slated for replacement with a delegate kernel specified by registration.
-  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
-      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
-      TfLiteDelegate* delegate);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Gets the internal pointer to a TensorFlow lite node by node_index.
-  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
-                                      TfLiteRegistration** registration);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get a node by index.
-  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
-                                             int node_index, TfLiteNode** node,
-                                             TfLiteRegistration** registration);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Gets an TfLiteIntArray* representing the execution plan. The interpreter
-  // owns this memory and it is only guaranteed to exist during the invocation
-  // of the delegate prepare.
-  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get the execution plan.
-  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
-                                       TfLiteIntArray** execution_plan);
-
-  // Retrieve an existing external context by type.
-  TfLiteExternalContext* GetExternalContext(TfLiteExternalContextType type);
-  static TfLiteExternalContext* GetExternalContext(
-      struct TfLiteContext* context, TfLiteExternalContextType type);
-
   // Set the value of an external context.
   static void SetExternalContext(struct TfLiteContext* context,
                                  TfLiteExternalContextType type,
@@ -581,114 +442,35 @@ class Interpreter {
   // Variant of the public ModifyGraphWithDelegate method that additionally
   // Assumes ownership of the provided delegate.
   // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegatePtr delegate,
-                                       bool allow_dynamic_tensors = false) {
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegatePtr delegate) {
     // Note that we retain ownership of the delegate even if graph modification
     // fails, as delegate use will be in an indeterminate state at that point.
     owned_delegates_.push_back(std::move(delegate));
-    return ModifyGraphWithDelegate(owned_delegates_.back().get(),
-                                   allow_dynamic_tensors);
-  }
-
-  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
-  // capacity. Calling this function may invalidate existing pointers to
-  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
-  // more tensors won't invalidate the pointer to existing tensors.
-  void EnsureTensorsVectorCapacity() {
-    const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom;
-    if (required_capacity > tensors_.capacity()) {
-      tensors_.reserve(required_capacity);
-      context_.tensors = tensors_.data();
-    }
+    return ModifyGraphWithDelegate(owned_delegates_.back().get());
   }
 
-  // The state of the Interpreter.
-  enum State {
-    // The interpreter isn't ready to be invoked.
-    // `AllocateTensor` need to be called to enter an invokable state.
-    kStateUninvokable = 0,
-    // The interpreter is ready to be invoked.
-    kStateInvokable,
-    // The interpreter is ready to be invoked, and graph can't be further
-    // modified. The interpreter will enter this state when calling
-    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
-    kStateInvokableAndImmutable,
-  };
-  State state_ = kStateUninvokable;
-
   // A pure C data structure used to communicate with the pure C plugin
   // interface. To avoid copying tensor metadata, this is also the definitive
   // structure to store tensors.
-  TfLiteContext context_;
-
-  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
-  // function pointers to actual implementation.
-  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
-      nodes_and_registration_;
-
-  // Whether the model is consistent. That is to say if the inputs and outputs
-  // of every node and the global inputs and outputs are valid indexes into
-  // the tensor array.
-  bool consistent_ = true;
-
-  // Array of indices representing the tensors that are inputs to the
-  // interpreter.
-  std::vector<int> inputs_;
-
-  // Array of indices representing the tensors that are outputs to the
-  // interpreter.
-  std::vector<int> outputs_;
-
-  // Array of indices representing the tensors that are variable tensors.
-  std::vector<int> variables_;
+  // This is the primary subgraph context.
+  TfLiteContext* context_;
 
   // The error reporter delegate that tflite will forward queries errors to.
   ErrorReporter* error_reporter_;
 
-  // Index of the next node to prepare.
-  // During Invoke(), Interpreter will allocate input tensors first, which are
-  // known to be fixed size. Then it will allocate outputs from nodes as many
-  // as possible. When there is a node that produces dynamic sized tensor.
-  // Interpreter will stop allocating tensors, set the value of next allocate
-  // node id, and execute the node to generate the output tensor before continue
-  // to allocate successors. This process repeats until all nodes are executed.
-  // NOTE: this relies on the order of nodes that is in topological order.
-  int next_execution_plan_index_to_prepare_;
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // This is a list of node indices (to index into nodes_and_registration).
-  // This represents a valid topological sort (dependency ordered) execution
-  // plan. In particular, it is valid for this ordering to contain only a
-  // subset of the node indices.
-  std::vector<int> execution_plan_;
-
-  // In the future, we'd like a TfLiteIntArray compatible representation.
-  // TODO(aselle): replace execution_plan_ with this.
-  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
-
-  // Whether to delegate to NN API
-  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
-
   // List of delegates that have been installed and are owned by this
   // interpreter instance. Useful if client delegate ownership is burdensome.
   // WARNING: This is an experimental API and subject to change.
   // TODO(b/116667551): Use TfLiteExternalContext for storing state.
   std::vector<TfLiteDelegatePtr> owned_delegates_;
 
-  std::unique_ptr<MemoryPlanner> memory_planner_;
-
   bool allow_buffer_handle_output_ = false;
 
-  // Tracking bit for whether a tensor was resized in the course of an op
-  // invocation. This is a useful hint to ensure that dynamic tensor outputs
-  // trigger downstream reallocation after op invocation.
-  bool tensor_resized_since_op_invoke_ = false;
-
-  // Profiler for this interpreter instance.
-  profiling::Profiler* profiler_ = nullptr;
-
   // List of active external contexts.
   TfLiteExternalContext* external_contexts_[kTfLiteMaxExternalContexts];
+
+  // Subgraphs
+  std::vector<std::unique_ptr<Subgraph>> subgraphs_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index a37603b695cb30a7f1058ae77ab2214a7d49c2e3..2e0dc77dcd4f09184f0af996d864832bc334a93f 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -38,7 +38,7 @@ class InterpreterTest : public ::testing::Test {
   }
 
  protected:
-  TfLiteContext* GetInterpreterContext() { return &interpreter_.context_; }
+  TfLiteContext* GetInterpreterContext() { return interpreter_.context_; }
 
   Interpreter interpreter_;
 };
@@ -698,7 +698,7 @@ TEST(BasicInterpreter, TestUnsupportedDelegateFunctions) {
                                                   nullptr};
       TfLiteIntArray nodes_to_replace;
       nodes_to_replace.size = 0;
-      EXPECT_EQ(context->ReplaceSubgraphsWithDelegateKernels(
+      EXPECT_EQ(context->ReplaceNodeSubsetsWithDelegateKernels(
                     context, delegate_registration, &nodes_to_replace, nullptr),
                 kTfLiteError);
     }
@@ -1085,7 +1085,7 @@ class TestDelegate : public ::testing::Test {
           TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0);
         }
 
-        context->ReplaceSubgraphsWithDelegateKernels(
+        context->ReplaceNodeSubsetsWithDelegateKernels(
             context, FakeFusedRegistration(), nodes_to_separate, delegate);
         TfLiteIntArrayFree(nodes_to_separate);
         return kTfLiteOk;
@@ -1109,6 +1109,7 @@ class TestDelegate : public ::testing::Test {
              TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; };
       // Store type-punned data SimpleDelegate structure.
       delegate_.data_ = reinterpret_cast<void*>(this);
+      delegate_.flags = kTfLiteDelegateFlagsNone;
     }
 
     static TfLiteRegistration FakeFusedRegistration() {
@@ -1210,7 +1211,7 @@ TEST_F(TestDelegate, SetInvalidHandleToTensor) {
   interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
   TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
-  interpreter_->ModifyGraphWithDelegate(delegate, true);
+  interpreter_->ModifyGraphWithDelegate(delegate);
 
   SimpleDelegate another_simple_delegate({0, 1, 2});
 
@@ -1264,10 +1265,11 @@ class TestDelegateWithDynamicTensors : public ::testing::Test {
       TfLiteIntArray* execution_plan;
       TF_LITE_ENSURE_STATUS(
           context->GetExecutionPlan(context, &execution_plan));
-      context->ReplaceSubgraphsWithDelegateKernels(
+      context->ReplaceNodeSubsetsWithDelegateKernels(
           context, DelegateRegistration(), execution_plan, delegate);
       return kTfLiteOk;
     };
+    delegate_.flags = kTfLiteDelegateFlagsNone;
   }
 
   static TfLiteRegistration DynamicCopyOpRegistration() {
@@ -1296,7 +1298,7 @@ class TestDelegateWithDynamicTensors : public ::testing::Test {
 };
 
 TEST_F(TestDelegateWithDynamicTensors, DisallowDynamicTensors) {
-  interpreter_->ModifyGraphWithDelegate(&delegate_, false);
+  interpreter_->ModifyGraphWithDelegate(&delegate_);
 
   ASSERT_EQ(interpreter_->execution_plan().size(), 1);
   // The interpreter should not call delegate's `Prepare` when dynamic tensors
@@ -1305,7 +1307,8 @@ TEST_F(TestDelegateWithDynamicTensors, DisallowDynamicTensors) {
 }
 
 TEST_F(TestDelegateWithDynamicTensors, AllowDynamicTensors) {
-  interpreter_->ModifyGraphWithDelegate(&delegate_, true);
+  delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors;
+  interpreter_->ModifyGraphWithDelegate(&delegate_);
 
   ASSERT_EQ(interpreter_->execution_plan().size(), 1);
   // The node should be replaced because dynamic tensors are allowed. Therefore
@@ -1317,6 +1320,7 @@ TEST(TestDelegateOwnership, ProperlyDisposed) {
   struct TfLiteInterpreterOwnedDelegate : public TfLiteDelegate {
     TfLiteInterpreterOwnedDelegate(bool* destroyed, bool* prepared)
         : destroyed(destroyed), prepared(prepared) {
+      flags = kTfLiteDelegateFlagsNone;
       Prepare = [](TfLiteContext*, TfLiteDelegate* delegate) -> TfLiteStatus {
         *static_cast<TfLiteInterpreterOwnedDelegate*>(delegate)->prepared =
             true;
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index e6f47a9773ae4e651fcd10968f2b065a18eaabf4..adf7bc9087878ad84824844139058c140d7084f8 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -90,7 +90,6 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
     test_class = "org.tensorflow.lite.TensorFlowLiteTest",
     deps = [
         ":tensorflowlitelib",
@@ -104,7 +103,6 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
     test_class = "org.tensorflow.lite.DataTypeTest",
     deps = [
         ":tensorflowlitelib",
@@ -122,11 +120,11 @@ java_test(
         "src/testdata/int32.bin",
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
+        "src/testdata/quantized.bin",
         "src/testdata/uint8.bin",
         "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
     deps = [
         ":tensorflowlitelib",
@@ -142,11 +140,10 @@ java_test(
     srcs = ["src/test/java/org/tensorflow/lite/InterpreterTest.java"],
     data = [
         "src/testdata/add.bin",
-        "src/testdata/mobilenet.tflite.bin",
+        "//tensorflow/lite:testdata/multi_add.bin",
         "//tensorflow/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
     test_class = "org.tensorflow.lite.InterpreterTest",
     visibility = ["//visibility:private"],
     deps = [
@@ -165,7 +162,6 @@ java_test(
         "//tensorflow/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
     test_class = "org.tensorflow.lite.InterpreterFlexTest",
     visibility = ["//visibility:private"],
     deps = [
@@ -183,7 +179,6 @@ java_test(
         "src/testdata/add.bin",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
     test_class = "org.tensorflow.lite.TensorTest",
     deps = [
         ":tensorflowlitelib",
diff --git a/tensorflow/lite/java/ovic/README.md b/tensorflow/lite/java/ovic/README.md
index 9e3ceb7e18e260e0dd40da131301d1c222fd1e09..368c486f4f1ddd021e0bcfcdf9d82034ba5db82b 100644
--- a/tensorflow/lite/java/ovic/README.md
+++ b/tensorflow/lite/java/ovic/README.md
@@ -97,9 +97,17 @@ filegroup(
     ...
 ```
 
-* Modify `OvicClassifierTest.java` and `OvicDetectorTest.java` to test your model.
+* For classification models, modify `OvicClassifierTest.java`:
+  * change `TEST_IMAGE_PATH` to `my_test_image.jpg`.
 
-Change `TEST_IMAGE_PATH` to `my_test_image.jpg`. Change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize).
+  * change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize).
+
+  * change `TEST_IMAGE_GROUNDTRUTH` (ImageNet class ID) to be consistent with your test image.
+
+* For detection models, modify `OvicDetectorTest.java`:
+  * change `TEST_IMAGE_PATH` to `my_test_image.jpg`.
+  * change `MODEL_PATH` to `my_model.lite`.
+  * change `GROUNDTRUTH` (COCO class ID) to be consistent with your test image.
 
 Now you can run the bazel tests to catch any runtime issues with the submission.
 
@@ -115,12 +123,17 @@ Make sure that you have followed instructions in [Test your submissions](#test-y
 
 Modify `tensorflow/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java`:
 
-* Add your model to the benchmarker apk by changing `MODEL_PATH` and `TEST_IMAGE_PATH` below to your submission and test image.
+* Add your model to the benchmarker apk by changing `modelPath` and `testImagePath` to your submission and test image.
 
 ```
-  private static final String TEST_IMAGE_PATH = "my_test_image.jpg";
-  private static final String MODEL_PATH = "my_model.lite";
+  if (benchmarkClassification) {
+    ...
+    testImagePath = "my_test_image.jpg";
+    modelPath = "my_model.lite";
+  } else {  // Benchmarking detection.
+  ...
 ```
+If you are adding a detection model, simply modify `modelPath` and `testImagePath` in the else block above.
 
 * Adjust the benchmark parameters when needed:
 
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 64a27739a84a7cba3abdb52fb4286f7971aac7c5..1952db0267bb7b26f24d819a69f9f312caf776ac 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -283,7 +283,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   /** Gets the number of output tensors. */
   int getOutputTensorCount() {
-    return inputTensors.length;
+    return outputTensors.length;
   }
 
   /**
diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 100eb81510d7374c03c6a225a4a919a30c94aa58..c7389c581100acbef3b53c215b4449753cfd2a68 100644
--- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -465,9 +465,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_applyDelegate(
   TfLiteDelegate* delegate = convertLongToDelegate(env, delegate_handle);
   if (delegate == nullptr) return;
 
-  TfLiteStatus status =
-      interpreter->ModifyGraphWithDelegate(delegate,
-                                           /* allow_dynamic_tensors= */ true);
+  TfLiteStatus status = interpreter->ModifyGraphWithDelegate(delegate);
   if (status != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
                    "Internal error: Failed to apply delegate: %s",
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
index b22399a4a47dcf0b625be5d0c86e6fe07f920144..21c431a82bf0f3ddb1684a224dffaa7bbf6d004e 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
@@ -18,6 +18,8 @@ package org.tensorflow.lite;
 import static com.google.common.truth.Truth.assertThat;
 
 import java.io.File;
+import java.util.HashMap;
+import java.util.Map;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -38,9 +40,17 @@ public final class InterpreterFlexTest {
     try (Interpreter interpreter = new Interpreter(FLEX_MODEL_FILE)) {
       assertThat(interpreter.getInputTensorCount()).isEqualTo(4);
       assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
-      assertThat(interpreter.getOutputTensorCount()).isEqualTo(4);
+      assertThat(interpreter.getInputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getInputTensor(2).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getInputTensor(3).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getOutputTensorCount()).isEqualTo(2);
       assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
-      interpreter.run(new float[1], new float[1]);
+      assertThat(interpreter.getOutputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+      Object[] inputs = new Object[] {new float[1], new float[1], new float[1], new float[1]};
+      Map<Integer, Object> outputs = new HashMap<>();
+      outputs.put(0, new float[1]);
+      outputs.put(1, new float[1]);
+      interpreter.runForMultipleInputsOutputs(inputs, outputs);
     }
   }
 
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..b69bfa076e226850f3de305ab4f0a4e03a302764
--- /dev/null
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import java.io.File;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.Interpreter} agains a MobileNet model. */
+@RunWith(JUnit4.class)
+public final class InterpreterMobileNetTest {
+
+  private static final File MOBILENET_MODEL_FILE =
+      new File("tensorflow/lite/java/src/testdata/mobilenet.tflite.bin");
+
+  @Test
+  public void testMobilenetRun() {
+    // Create a gray image.
+    float[][][][] img = new float[1][224][224][3];
+    for (int i = 0; i < 224; ++i) {
+      for (int j = 0; j < 224; ++j) {
+        img[0][i][j][0] = 0.5f;
+        img[0][i][j][1] = 0.5f;
+        img[0][i][j][2] = 0.5f;
+      }
+    }
+
+    // Allocate memory to receive the output values.
+    float[][] labels = new float[1][1001];
+
+    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    interpreter.run(img, labels);
+    assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
+    assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
+    interpreter.close();
+
+    assertThat(labels[0])
+        .usingExactEquality()
+        .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY});
+  }
+}
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index 612229d172736a6e80a585d4c987b4e0691f9197..e635515de8cfdc2b4ed283adc8fc64803816258e 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -40,8 +40,8 @@ public final class InterpreterTest {
   private static final File MODEL_FILE =
       new File("tensorflow/lite/java/src/testdata/add.bin");
 
-  private static final File MOBILENET_MODEL_FILE =
-      new File("tensorflow/lite/java/src/testdata/mobilenet.tflite.bin");
+  private static final File MULTIPLE_INPUTS_MODEL_FILE =
+      new File("tensorflow/lite/testdata/multi_add.bin");
 
   private static final File FLEX_MODEL_FILE =
       new File("tensorflow/lite/testdata/multi_add_flex.bin");
@@ -167,20 +167,29 @@ public final class InterpreterTest {
 
   @Test
   public void testRunForMultipleInputsOutputs() {
-    Interpreter interpreter = new Interpreter(MODEL_FILE);
-    float[] oneD = {1.23f, 6.54f, 7.81f};
-    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
-    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
-    float[][][][] fourD = {threeD, threeD};
-    Object[] inputs = {fourD};
-    float[][][][] parsedOutputs = new float[2][8][8][3];
+    Interpreter interpreter = new Interpreter(MULTIPLE_INPUTS_MODEL_FILE);
+    assertThat(interpreter.getInputTensorCount()).isEqualTo(4);
+    assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getInputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getInputTensor(2).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getInputTensor(3).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getOutputTensorCount()).isEqualTo(2);
+    assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getOutputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+
+    float[] input0 = {1.23f};
+    float[] input1 = {2.43f};
+    Object[] inputs = {input0, input1, input0, input1};
+    float[] parsedOutput0 = new float[1];
+    float[] parsedOutput1 = new float[1];
     Map<Integer, Object> outputs = new HashMap<>();
-    outputs.put(0, parsedOutputs);
+    outputs.put(0, parsedOutput0);
+    outputs.put(1, parsedOutput1);
     interpreter.runForMultipleInputsOutputs(inputs, outputs);
-    float[] outputOneD = parsedOutputs[0][0][0];
-    float[] expected = {3.69f, 19.62f, 23.43f};
-    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
-    interpreter.close();
+    float[] expected0 = {4.89f};
+    float[] expected1 = {6.09f};
+    assertThat(parsedOutput0).usingTolerance(0.1f).containsExactly(expected0).inOrder();
+    assertThat(parsedOutput1).usingTolerance(0.1f).containsExactly(expected1).inOrder();
   }
 
   @Test
@@ -214,32 +223,6 @@ public final class InterpreterTest {
     }
   }
 
-  @Test
-  public void testMobilenetRun() {
-    // Create a gray image.
-    float[][][][] img = new float[1][224][224][3];
-    for (int i = 0; i < 224; ++i) {
-      for (int j = 0; j < 224; ++j) {
-        img[0][i][j][0] = 0.5f;
-        img[0][i][j][1] = 0.5f;
-        img[0][i][j][2] = 0.5f;
-      }
-    }
-
-    // Allocate memory to receive the output values.
-    float[][] labels = new float[1][1001];
-
-    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
-    interpreter.run(img, labels);
-    assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
-    assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
-    interpreter.close();
-
-    assertThat(labels[0])
-        .usingExactEquality()
-        .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY});
-  }
-
   @Test
   public void testRunWithWrongInputType() {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
@@ -286,7 +269,7 @@ public final class InterpreterTest {
 
   @Test
   public void testGetInputIndex() {
-    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
     try {
       interpreter.getInputIndex("WrongInputName");
       fail();
@@ -303,7 +286,7 @@ public final class InterpreterTest {
 
   @Test
   public void testGetOutputIndex() {
-    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
     try {
       interpreter.getOutputIndex("WrongOutputName");
       fail();
@@ -312,9 +295,9 @@ public final class InterpreterTest {
           .hasMessageThat()
           .contains(
               "'WrongOutputName' is not a valid name for any output. Names of outputs and their"
-                  + " indexes are {MobilenetV1/Predictions/Softmax=0}");
+                  + " indexes are {output=0}");
     }
-    int index = interpreter.getOutputIndex("MobilenetV1/Predictions/Softmax");
+    int index = interpreter.getOutputIndex("output");
     assertThat(index).isEqualTo(0);
   }
 
diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD
index 27fc95f1f7f8d28c6e5eb23e63bac2498bccbde3..4d3e82b1ac14990be13aaba1d917e26dcc00b961 100644
--- a/tensorflow/lite/java/src/test/native/BUILD
+++ b/tensorflow/lite/java/src/test/native/BUILD
@@ -21,6 +21,10 @@ cc_library(
             "//tensorflow/lite/java/src/main/native:jni_md.h",
         ],
     }),
+    includes = select({
+        "//tensorflow:android": [],
+        "//conditions:default": ["../../main/native/."],
+    }),
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
diff --git a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
index 954b1144f2d950d9ed27604da6cabdead0f714c6..1a0072a7c67b418975625aefff3a4dd84b4e6bf9 100644
--- a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
+++ b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
@@ -25,6 +25,8 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
     JNIEnv* env, jclass clazz) {
   // A simple op which outputs a vector of length 1 with the value [7].
   static TfLiteRegistration registration = {
+      .init = nullptr,
+      .free = nullptr,
       .prepare =
           [](TfLiteContext* context, TfLiteNode* node) {
             TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
@@ -38,19 +40,30 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
             TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
             output->data.f[0] = 7.0f;
             return kTfLiteOk;
-          }};
+          },
+      .profiling_string = nullptr,
+      .builtin_code = 0,
+      .custom_name = "",
+      .version = 1,
+  };
   // A simple delegate which replaces all ops with a single op that outputs a
   // vector of length 1 with the value [7].
   static TfLiteDelegate delegate = {
+      .data_ = nullptr,
       .Prepare = [](TfLiteContext* context,
                     TfLiteDelegate* delegate) -> TfLiteStatus {
         TfLiteIntArray* execution_plan;
         TF_LITE_ENSURE_STATUS(
             context->GetExecutionPlan(context, &execution_plan));
-        context->ReplaceSubgraphsWithDelegateKernels(context, registration,
-                                                     execution_plan, delegate);
+        context->ReplaceNodeSubsetsWithDelegateKernels(
+            context, registration, execution_plan, delegate);
         return kTfLiteOk;
-      }};
+      },
+      .CopyFromBufferHandle = nullptr,
+      .CopyToBufferHandle = nullptr,
+      .FreeBufferHandle = nullptr,
+      .flags = kTfLiteDelegateFlagsAllowDynamicTensors,
+  };
   return reinterpret_cast<jlong>(&delegate);
 }
 
@@ -59,10 +72,14 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForInvalidDelegate(
     JNIEnv* env, jclass clazz) {
   // A simple delegate that fails during preparation.
   static TfLiteDelegate delegate = {
-      .Prepare = [](TfLiteContext* context,
-                    TfLiteDelegate* delegate) -> TfLiteStatus {
-        return kTfLiteError;
-      }};
+      .data_ = nullptr,
+      .Prepare = [](TfLiteContext* context, TfLiteDelegate* delegate)
+          -> TfLiteStatus { return kTfLiteError; },
+      .CopyFromBufferHandle = nullptr,
+      .CopyToBufferHandle = nullptr,
+      .FreeBufferHandle = nullptr,
+      .flags = kTfLiteDelegateFlagsNone,
+  };
   return reinterpret_cast<jlong>(&delegate);
 }
 
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 010ba834661f7df7856cd7d2eebe396ba8746987..0bf4f01ac385eedafd38331c06f422016554d7e2 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -219,6 +219,7 @@ cc_library(
         "sparse_output_fully_connected.cc",
         "sparse_to_dense.cc",
         "split.cc",
+        "squared_difference.cc",
         "squeeze.cc",
         "strided_slice.cc",
         "sub.cc",
@@ -1379,6 +1380,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "squared_difference_test",
+    size = "small",
+    srcs = ["squared_difference_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 9c525d964077eb7007a27a004786961e67ea21dd..82072bccb243b240cecbb5e9377e18c18e18d782 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -45,6 +45,11 @@ struct LogSoftmaxOpData : public OpData {
   int32_t reverse_scaling_right_shift = 0;
 };
 
+struct PreluOpData : public OpData {
+  int32_t output_multiplier = 0;
+  int output_shift = 0;
+};
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -57,6 +62,10 @@ void* LogSoftmaxInit(TfLiteContext* context, const char* buffer,
   return new LogSoftmaxOpData;
 }
 
+void* PreluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  return new PreluOpData;
+}
+
 void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
@@ -65,6 +74,10 @@ void LogSoftmaxFree(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<LogSoftmaxOpData*>(buffer);
 }
 
+void PreluFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<PreluOpData*>(buffer);
+}
+
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -253,13 +266,18 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   const TfLiteTensor* alpha = GetInput(context, node, 1);
+  PreluOpData* data = reinterpret_cast<PreluOpData*>(node->user_data);
 
-  // Currently only Float32 is supported
-  // TODO(ycling): Support other data types.
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, alpha->type);
   output->type = input->type;
 
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+    double real_multiplier =
+        input->params.scale * alpha->params.scale / output->params.scale;
+    QuantizeMultiplierSmallerThanOneExp(
+        real_multiplier, &data->output_multiplier, &data->output_shift);
+  }
+
   // PRelu (parameteric Relu) shares the same alpha value on "shared axis".
   // This means it's always required to "broadcast" alpha values in PRelu.
   TfLiteIntArray* output_size = nullptr;
@@ -288,8 +306,8 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -309,8 +327,8 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -328,8 +346,8 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -367,8 +385,8 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -407,9 +425,8 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
-      return kTfLiteError;
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
   }
   return kTfLiteOk;
 }
@@ -604,8 +621,8 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
     }
     default:
       context->ReportError(
-          context, "Only float32 and uint8_t supported currently, got %d.",
-          input->type);
+          context, "Only float32 and uint8_t supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -636,8 +653,8 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently., got %d",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently., got %s",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -651,16 +668,57 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   const TfLiteTensor* alpha = GetInput(context, node, 1);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  if (input->type != kTfLiteFloat32) {
-    context->ReportError(context, "Only float32 supported currently, got %d.",
-                         input->type);
-    return kTfLiteError;
+  const PreluOpData* data = reinterpret_cast<PreluOpData*>(node->user_data);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+          GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(alpha), GetTensorData<float>(alpha),
+          GetTensorShape(output), GetTensorData<float>(output),
+          ApplyPrelu<float>);
+      return kTfLiteOk;
+    } break;
+    case kTfLiteUInt8: {
+      PreluParams op_params;
+      op_params.input_offset = -input->params.zero_point;
+      op_params.alpha_offset = -alpha->params.zero_point;
+      op_params.output_offset = output->params.zero_point;
+      op_params.output_multiplier = data->output_multiplier;
+      op_params.output_shift = data->output_shift;
+      reference_ops::BroadcastPrelu4DSlow(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(alpha), GetTensorData<uint8_t>(alpha),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context,
+                           "Only float32, uint8 supported currently, got %d.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  const auto* params =
+      reinterpret_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+
+  LeakyReluParams op_params;
+  op_params.alpha = params->alpha;
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      optimized_ops::LeakyRelu(
+          op_params, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(output), GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
   }
-  reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
-      GetTensorShape(input), GetTensorData<float>(input), GetTensorShape(alpha),
-      GetTensorData<float>(alpha), GetTensorShape(output),
-      GetTensorData<float>(output), ApplyPrelu<float>);
-  return kTfLiteOk;
 }
 
 }  // namespace activations
@@ -715,12 +773,19 @@ TfLiteRegistration* Register_LOG_SOFTMAX() {
 }
 
 TfLiteRegistration* Register_PRELU() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+  static TfLiteRegistration r = {activations::PreluInit, activations::PreluFree,
                                  activations::PreluPrepare,
                                  activations::PreluEval};
   return &r;
 }
 
+TfLiteRegistration* Register_LEAKY_RELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::LeakyReluEval};
+  return &r;
+}
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index fff4121dc0c265d9dc3fe50521683b0be4ab4f94..1de3dbc44f89065489063676dc07cf2fe530c30e 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -563,15 +563,29 @@ TEST(QuantizedActivationsOpTest, LogSoftmax) {
               ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
-class PReluOpModel : public SingleOpModel {
+// A base class of PRelu op model. It provides the constructor for
+// FloatPReluOpModel and QuantizedPReluOpModel.
+class BasePReluOpModel : public SingleOpModel {
  public:
-  PReluOpModel(const TensorData& input, const TensorData& alpha) {
+  BasePReluOpModel(const TensorData& input, const TensorData& alpha) {
     input_ = AddInput(input);
     alpha_ = AddInput(alpha);
-    output_ = AddOutput(input);
+    output_ = AddOutput({input.type, input.shape, input.min, input.max});
     SetBuiltinOp(BuiltinOperator_PRELU, BuiltinOptions_NONE, 0);
     BuildInterpreter({GetShape(input_), GetShape(alpha_)});
   }
+
+ protected:
+  int input_;
+  int alpha_;
+  int output_;
+};
+
+// The FloatPReluOpModel class handles float input and output.
+class FloatPReluOpModel : public BasePReluOpModel {
+ public:
+  using BasePReluOpModel::BasePReluOpModel;
+
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
@@ -579,16 +593,35 @@ class PReluOpModel : public SingleOpModel {
     PopulateTensor(alpha_, data);
   }
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
 
- protected:
-  int input_;
-  int alpha_;
-  int output_;
+// The QuantizedPReluOpModel class handles quantized input and output.
+class QuantizedPReluOpModel : public BasePReluOpModel {
+ public:
+  using BasePReluOpModel::BasePReluOpModel;
+
+  template <typename T>
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(input_, data);
+  }
+  template <typename T>
+  void SetAlpha(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(alpha_, data);
+  }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  template <typename T>
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
+  }
 };
 
 TEST(FloatActivationsOpTest, PRelu) {
-  PReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
-                 {TensorType_FLOAT32, {1, 1, 3}});
+  FloatPReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
+                      {TensorType_FLOAT32, {1, 1, 3}});
 
   m.SetInput({
       0.0f, 0.0f, 0.0f,     // Row 1, Column 1
@@ -606,6 +639,69 @@ TEST(FloatActivationsOpTest, PRelu) {
                              }));
 }
 
+TEST(QuantizedActivationsOpTest, PRelu) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedPReluOpModel m({TensorType_UINT8, {1, 2, 2, 3}, kMin, kMax},
+                          {TensorType_UINT8, {1, 1, 3}, kMin, kMax});
+  m.SetInput<uint8_t>({
+      0.0f, 0.0f, 0.0f,        // Row 1, Column 1
+      0.5f, 0.5f, 0.5f,        // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,     // Row 2, Column 1
+      -0.25f, -0.25f, -0.25f,  // Row 1, Column 2
+  });
+  m.SetAlpha<uint8_t>({0.0f, 0.5f, -0.5f});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0f, 0.0f, 0.0f,       // Row 1, Column 1
+                      0.5f, 0.5f, 0.5f,       // Row 1, Column 2
+                      0.0f, -0.5f, 0.5f,      // Row 2, Column 1
+                      0.0f, -0.125f, 0.125f,  // Row 1, Column 2
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({
+                                          128, 128, 128,  // Row 1, Column 1
+                                          192, 192, 192,  // Row 1, Column 2
+                                          128, 64, 192,   // Row 2, Column 1
+                                          128, 112, 144,  // Row 1, Column 2
+                                      }));
+}
+
+class LeakyReluOpModel : public SingleOpModel {
+ public:
+  LeakyReluOpModel(const TensorData& input, float alpha) {
+    input_ = AddInput(input);
+    output_ = AddOutput(input);
+    SetBuiltinOp(BuiltinOperator_LEAKY_RELU, BuiltinOptions_LeakyReluOptions,
+                 CreateLeakyReluOptions(builder_, alpha).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+TEST(FloatActivationsOpTest, LeakyRelu) {
+  LeakyReluOpModel m({TensorType_FLOAT32, {2, 3}}, 0.5f);
+
+  m.SetInput({
+      0.0f, 1.0f, 3.0f,    // Row 1
+      1.0f, -1.0f, -2.0f,  // Row 2
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0f, 1.0f, 3.0f,    // Row 1
+                                 1.0f, -0.5f, -1.0f,  // Row 2
+                             }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index f4bfd8d32481789a1ea706ec7c7fe8de06e9f5bc..32a7c100ce53101063d81345bcb052e680e64a28 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -220,30 +220,38 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
   if (output->type == kTfLiteUInt8) {
+    tflite::ArithmeticParams op_params;
+    op_params.left_shift = data->left_shift;
+    op_params.input1_offset = data->input1_offset;
+    op_params.input1_multiplier = data->input1_multiplier;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_offset = data->input2_offset;
+    op_params.input2_multiplier = data->input2_multiplier;
+    op_params.input2_shift = data->input2_shift;
+    op_params.output_offset = data->output_offset;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
+        GetTensorShape(input1), GetTensorShape(input2), &op_params);
 #define TF_LITE_ADD(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.left_shift = data->left_shift;                             \
-  op_params.input1_offset = data->input1_offset;                       \
-  op_params.input1_multiplier = data->input1_multiplier;               \
-  op_params.input1_shift = data->input1_shift;                         \
-  op_params.input2_offset = data->input2_offset;                       \
-  op_params.input2_multiplier = data->input2_multiplier;               \
-  op_params.input2_shift = data->input2_shift;                         \
-  op_params.output_offset = data->output_offset;                       \
-  op_params.output_multiplier = data->output_multiplier;               \
-  op_params.output_shift = data->output_shift;                         \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
                GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-    // The quantized version of Add doesn't support activations, so we
-    // always use BroadcastAdd.
+               GetTensorData<uint8_t>(output));
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+      } else {
+        TF_LITE_ADD(reference_ops, Add);
+      }
     } else {
-      TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow);
+      if (need_broadcast) {
+        TF_LITE_ADD(optimized_ops, BroadcastAddFivefold);
+      } else {
+        TF_LITE_ADD(optimized_ops, Add);
+      }
     }
 #undef TF_LITE_ADD
   } else if (output->type == kTfLiteInt16) {
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index 3c278c1f9e10977e4605d36acf029a806c86dadf..ab10c959a4d6b234cb6ae0810174e8f1c48898d1 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -455,7 +455,7 @@ TEST(ComparisonsTest, LessEqualQuantized) {
 TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -473,7 +473,7 @@ TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
 TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -491,7 +491,7 @@ TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
 TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -509,7 +509,7 @@ TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
 TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -527,7 +527,7 @@ TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
 TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -545,7 +545,7 @@ TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
 TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
diff --git a/tensorflow/lite/kernels/detection_postprocess_test.cc b/tensorflow/lite/kernels/detection_postprocess_test.cc
index d7ffaf1d82b542ab591fa0637f0657f050dfdc04..a1c061a3cad4407ec965b67387f006b5e0be4ec7 100644
--- a/tensorflow/lite/kernels/detection_postprocess_test.cc
+++ b/tensorflow/lite/kernels/detection_postprocess_test.cc
@@ -194,7 +194,7 @@ TEST(DetectionPostprocessOpTest, QuantizedTest) {
       {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
       {TensorType_FLOAT32, {}});
   // six boxes in center-size encoding
-  std::vector<std::initializer_list<float>> inputs1 = {{
+  std::vector<std::vector<float>> inputs1 = {{
       0.0, 0.0,  0.0, 0.0,  // box #1
       0.0, 1.0,  0.0, 0.0,  // box #2
       0.0, -1.0, 0.0, 0.0,  // box #3
@@ -204,12 +204,12 @@ TEST(DetectionPostprocessOpTest, QuantizedTest) {
   }};
   m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
   // class scores - two classes with background
-  std::vector<std::initializer_list<float>> inputs2 = {
-      {0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., .5, .4, 0., .3,
-       .2}};
+  std::vector<std::vector<float>> inputs2 = {{0., .9, .8, 0., .75, .72, 0., .6,
+                                              .5, 0., .93, .95, 0., .5, .4, 0.,
+                                              .3, .2}};
   m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
   // six anchors in center-size encoding
-  std::vector<std::initializer_list<float>> inputs3 = {{
+  std::vector<std::vector<float>> inputs3 = {{
       0.5, 0.5,   1.0, 1.0,  // anchor #1
       0.5, 0.5,   1.0, 1.0,  // anchor #2
       0.5, 0.5,   1.0, 1.0,  // anchor #3
@@ -405,7 +405,7 @@ TEST(DetectionPostprocessOpTest, QuantizedTestFastNMS) {
       {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
       {TensorType_FLOAT32, {}}, false);
   // six boxes in center-size encoding
-  std::vector<std::initializer_list<float>> inputs1 = {{
+  std::vector<std::vector<float>> inputs1 = {{
       0.0, 0.0,  0.0, 0.0,  // box #1
       0.0, 1.0,  0.0, 0.0,  // box #2
       0.0, -1.0, 0.0, 0.0,  // box #3
@@ -415,12 +415,12 @@ TEST(DetectionPostprocessOpTest, QuantizedTestFastNMS) {
   }};
   m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
   // class scores - two classes with background
-  std::vector<std::initializer_list<float>> inputs2 = {
-      {0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., .5, .4, 0., .3,
-       .2}};
+  std::vector<std::vector<float>> inputs2 = {{0., .9, .8, 0., .75, .72, 0., .6,
+                                              .5, 0., .93, .95, 0., .5, .4, 0.,
+                                              .3, .2}};
   m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
   // six anchors in center-size encoding
-  std::vector<std::initializer_list<float>> inputs3 = {{
+  std::vector<std::vector<float>> inputs3 = {{
       0.5, 0.5,   1.0, 1.0,  // anchor #1
       0.5, 0.5,   1.0, 1.0,  // anchor #2
       0.5, 0.5,   1.0, 1.0,  // anchor #3
@@ -517,7 +517,7 @@ TEST(DetectionPostprocessOpTest, QuantizedTestRegularNMS) {
       {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
       {TensorType_FLOAT32, {}}, true);
   // six boxes in center-size encoding
-  std::vector<std::initializer_list<float>> inputs1 = {{
+  std::vector<std::vector<float>> inputs1 = {{
       0.0, 0.0,  0.0, 0.0,  // box #1
       0.0, 1.0,  0.0, 0.0,  // box #2
       0.0, -1.0, 0.0, 0.0,  // box #3
@@ -527,12 +527,12 @@ TEST(DetectionPostprocessOpTest, QuantizedTestRegularNMS) {
   }};
   m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
   // class scores - two classes with background
-  std::vector<std::initializer_list<float>> inputs2 = {
-      {0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., .5, .4, 0., .3,
-       .2}};
+  std::vector<std::vector<float>> inputs2 = {{0., .9, .8, 0., .75, .72, 0., .6,
+                                              .5, 0., .93, .95, 0., .5, .4, 0.,
+                                              .3, .2}};
   m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
   // six anchors in center-size encoding
-  std::vector<std::initializer_list<float>> inputs3 = {{
+  std::vector<std::vector<float>> inputs3 = {{
       0.5, 0.5,   1.0, 1.0,  // anchor #1
       0.5, 0.5,   1.0, 1.0,  // anchor #2
       0.5, 0.5,   1.0, 1.0,  // anchor #3
diff --git a/tensorflow/lite/kernels/floor_mod.cc b/tensorflow/lite/kernels/floor_mod.cc
index beddac2174e372cb9125f4a0e6750426b2fc9a5a..878716a5b4a97be62aa3f966b03bd90194e75aae 100644
--- a/tensorflow/lite/kernels/floor_mod.cc
+++ b/tensorflow/lite/kernels/floor_mod.cc
@@ -81,7 +81,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
 
   const TfLiteType type = input1->type;
-  if (type != kTfLiteInt32 && type != kTfLiteFloat32) {
+  if (type != kTfLiteInt32 && type != kTfLiteFloat32 && type != kTfLiteInt64) {
     context->ReportError(context, "Type '%s' is not supported by floor_mod.",
                          TfLiteTypeGetName(type));
     return kTfLiteError;
@@ -107,7 +107,7 @@ TfLiteStatus EvalImpl(TfLiteContext* context, bool requires_broadcast,
                       TfLiteTensor* output) {
   const T* denominator_data = GetTensorData<T>(input2);
 
-  if (input2->type == kTfLiteInt32) {
+  if (input2->type == kTfLiteInt32 || input2->type == kTfLiteInt64) {
     // Validate the denominator only for integer.
     const int num_elements = NumElements(input2);
     for (int i = 0; i < num_elements; ++i) {
@@ -144,6 +144,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return EvalImpl<int32_t>(context, data->requires_broadcast, input1,
                                input2, output);
     }
+    case kTfLiteInt64: {
+      return EvalImpl<int64_t>(context, data->requires_broadcast, input1,
+                               input2, output);
+    }
     case kTfLiteFloat32: {
       return EvalImpl<float>(context, data->requires_broadcast, input1, input2,
                              output);
diff --git a/tensorflow/lite/kernels/floor_mod_test.cc b/tensorflow/lite/kernels/floor_mod_test.cc
index 9d75f5ce2e3ef82dc5f11084af83c11eb188f77b..9d78673f320d8f797012dfc63220a99b091a6419 100644
--- a/tensorflow/lite/kernels/floor_mod_test.cc
+++ b/tensorflow/lite/kernels/floor_mod_test.cc
@@ -80,6 +80,17 @@ TEST(FloorModModel, BroadcastFloorMod) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(-2, 0, -2, -2));
 }
 
+TEST(FloorModModel, Int64WithBroadcast) {
+  FloorModModel<int64_t> model({TensorType_INT64, {1, 2, 2, 1}},
+                               {TensorType_INT64, {1}}, {TensorType_INT64, {}});
+  model.PopulateTensor<int64_t>(model.input1(), {10, -9, -11, (1LL << 34) + 9});
+  model.PopulateTensor<int64_t>(model.input2(), {-(1LL << 33)});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(-8589934582, -9, -11, -8589934583));
+}
+
 TEST(FloorModModel, FloatSimple) {
   FloorModModel<float> model({TensorType_FLOAT32, {1, 2, 2, 1}},
                              {TensorType_FLOAT32, {1, 2, 2, 1}},
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index 63cca1cf5427f9c328b68868a4cfbef3fec08bf9..a1eecb284ab647e8b7fc7b18dfd8ad82aedeece3 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -117,7 +117,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
   TfLiteType data_type = input->type;
-  if (data_type != kTfLiteFloat32) {
+  if (data_type != kTfLiteFloat32 && data_type != kTfLiteInt32) {
     double real_multiplier = 0.0;
     TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
         context, input, filter, bias, output, &real_multiplier));
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
index 195a6d2b81b6b7a247f6a6f2086edfe4632a8163..61884d6a12c3e150d910244108a357dd34fe8783 100644
--- a/tensorflow/lite/kernels/gather.cc
+++ b/tensorflow/lite/kernels/gather.cc
@@ -38,18 +38,28 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  // Only INT32 positions are supported.
-  TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32);
+
+  switch (positions->type) {
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Positions of type '%s' are not supported by gather.",
+          TfLiteTypeGetName(positions->type));
+      return kTfLiteError;
+  }
+
   // Assign to output the input type.
   output->type = input->type;
+
   // Check conditions for different types.
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
-    case kTfLiteInt32: {
-      // Fully supported by reference_ops::Gather.
-    } break;
-
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
     case kTfLiteString: {
       // Only 1D input is supported.
       TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
@@ -82,51 +92,83 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, output, output_shape);
 }
 
+template <typename InputT, typename PositionsT>
+TfLiteStatus Gather(const TfLiteGatherParams& params, const TfLiteTensor* input,
+                    const TfLiteTensor* positions, TfLiteTensor* output) {
+  tflite::GatherParams op_params;
+  op_params.axis = params.axis;
+  optimized_ops::Gather(op_params, GetTensorShape(input),
+                        GetTensorData<InputT>(input), GetTensorShape(positions),
+                        GetTensorData<PositionsT>(positions),
+                        GetTensorShape(output), GetTensorData<InputT>(output));
+  return kTfLiteOk;
+}
+
+template <typename PositionT>
+TfLiteStatus GatherStrings(TfLiteContext* context, const TfLiteTensor* input,
+                           const TfLiteTensor* positions,
+                           TfLiteTensor* output) {
+  // TODO(mgubin): Currently support only for 1D output tensors.
+  DynamicBuffer buffer;
+  const PositionT* indexes = GetTensorData<PositionT>(positions);
+  const PositionT num_strings = GetStringCount(input);
+  for (int i = 0; i < positions->dims->data[0]; ++i) {
+    const PositionT pos = indexes[i];
+    TF_LITE_ENSURE(context, pos < num_strings);
+    const auto string_ref = GetString(input, pos);
+    buffer.AddString(string_ref.str, string_ref.len);
+  }
+  buffer.WriteToTensor(output);
+  return kTfLiteOk;
+}
+
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto* params =
       reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  const int input_rank = NumDimensions(input);
-#define TF_LITE_GATHER(data_type, index_type)                              \
-  {                                                                        \
-    tflite::GatherParams op_params;                                        \
-    op_params.input_rank = input_rank;                                     \
-    op_params.axis = params->axis;                                         \
-    optimized_ops::Gather(                                                 \
-        op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
-        GetTensorShape(positions), GetTensorData<index_type>(positions),   \
-        GetTensorShape(output), GetTensorData<data_type>(output));         \
+
+  if (positions->type == kTfLiteInt32) {
+    switch (input->type) {
+      case kTfLiteFloat32:
+        return Gather<float, int32_t>(*params, input, positions, output);
+      case kTfLiteUInt8:
+        return Gather<uint8_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt32:
+        return Gather<int32_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt64:
+        return Gather<int64_t, int32_t>(*params, input, positions, output);
+      case kTfLiteString:
+        return GatherStrings<int32_t>(context, input, positions, output);
+      default:
+        context->ReportError(context, "Type '%s' is not supported by gather.",
+                             TfLiteTypeGetName(input->type));
+        return kTfLiteError;
+    }
   }
-  switch (input->type) {
-    case kTfLiteFloat32:
-      TF_LITE_GATHER(float, int32_t);
-      break;
-    case kTfLiteUInt8:
-      TF_LITE_GATHER(uint8_t, int32_t);
-      break;
-    case kTfLiteInt32:
-      TF_LITE_GATHER(int32_t, int32_t);
-      break;
-    case kTfLiteString: {
-      // TODO(mgubin): Currently support only for 1D output tensors.
-      DynamicBuffer buffer;
-      const int32* indexes = positions->data.i32;
-      const int num_strings = GetStringCount(input);
-      for (int i = 0; i < positions->dims->data[0]; ++i) {
-        const int pos = indexes[i];
-        TF_LITE_ENSURE(context, pos < num_strings);
-        const auto string_ref = GetString(input, pos);
-        buffer.AddString(string_ref.str, string_ref.len);
-      }
-      buffer.WriteToTensor(output);
-    } break;
-    default:
-      return kTfLiteError;
+  if (positions->type == kTfLiteInt64) {
+    switch (input->type) {
+      case kTfLiteFloat32:
+        return Gather<float, int64_t>(*params, input, positions, output);
+      case kTfLiteUInt8:
+        return Gather<uint8_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt32:
+        return Gather<int32_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt64:
+        return Gather<int64_t, int64_t>(*params, input, positions, output);
+      case kTfLiteString:
+        return GatherStrings<int64_t>(context, input, positions, output);
+      default:
+        context->ReportError(context, "Type '%s' is not supported by gather.",
+                             TfLiteTypeGetName(input->type));
+        return kTfLiteError;
+    }
   }
-#undef TF_LITE_GATHER
-  return kTfLiteOk;
+  context->ReportError(context,
+                       "Positions of type '%s' are not supported by gather.",
+                       TfLiteTypeGetName(positions->type));
+  return kTfLiteError;
 }
 }  // namespace gather
 
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 58460f847fa06218b7b73d87e33092a3e98eaccf..7b5f84348903a3cc436f1bd6cf32b3175b2f5815 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -26,37 +26,36 @@ using ::testing::ElementsAreArray;
 
 class GatherOpModel : public SingleOpModel {
  public:
-  GatherOpModel(std::initializer_list<int> input_shape, TensorType input_type,
-                std::initializer_list<int> positions_shape, int axis = 0) {
-    input_ = AddInput(input_type);
-    positions_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(input_type);
+  GatherOpModel(const TensorData& input, const TensorData& positions,
+                int axis = 0) {
+    input_ = AddInput(input);
+    positions_ = AddInput(positions);
+    output_ = AddOutput(input.type);
     SetBuiltinOp(BuiltinOperator_GATHER, BuiltinOptions_GatherOptions,
                  CreateGatherOptions(builder_, axis).Union());
-    BuildInterpreter({input_shape, positions_shape});
+    BuildInterpreter({GetShape(input_), GetShape(positions_)});
   }
 
-  void SetInputFloat(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
 
-  void SetInputUint8(std::initializer_list<uint8_t> data) {
-    PopulateTensor<uint8_t>(input_, data);
-  }
-
-  void SetInput(std::initializer_list<string> data) {
+  void SetStringInput(std::initializer_list<string> data) {
     PopulateStringTensor(input_, data);
   }
 
-  void SetPositions(std::initializer_list<int> data) {
-    PopulateTensor<int>(positions_, data);
+  template <typename T>
+  void SetPositions(std::initializer_list<T> data) {
+    PopulateTensor<T>(positions_, data);
   }
 
-  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
-  std::vector<uint8_t> GetOutputUint8() {
-    return ExtractVector<uint8_t>(output_);
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
   }
-  std::vector<string> GetOutputString() {
+
+  std::vector<string> GetStringOutput() {
     return ExtractVector<string>(output_);
   }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
@@ -68,99 +67,171 @@ class GatherOpModel : public SingleOpModel {
 };
 
 TEST(GatherOpTest, Shuffle) {
-  GatherOpModel m({2, 2}, TensorType_FLOAT32, {2});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({1, 0});
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({1, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear({0.7, 0.8, -2, 0.2})));
 }
 
 TEST(GatherOpTest, Test0DIndex) {
-  GatherOpModel m({2, 2}, TensorType_FLOAT32, {});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({1});
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
 }
 
 TEST(GatherOpTest, Test0DIndexWith0DResult) {
   // 0D tensor is special case in current TFLite. Test it once to make sure
   // existing workarounds are fine with it.
-  GatherOpModel m({3}, TensorType_FLOAT32, {});
-  m.SetInputFloat({1.0, 2.0, 3.0});
-  m.SetPositions({1});
+  GatherOpModel m({TensorType_FLOAT32, {3}}, {TensorType_INT32, {}});
+  m.SetInput<float>({1.0, 2.0, 3.0});
+  m.SetPositions<int32_t>({1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0})));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({2.0})));
   EXPECT_TRUE(m.GetOutputShape().empty());
 }
 
 TEST(GatherOpTest, Test2DIndexWith2DResult) {
-  GatherOpModel m({3}, TensorType_FLOAT32, {1, 2});
-  m.SetInputFloat({1.0, 2.0, 3.0});
-  m.SetPositions({1, 0});
+  GatherOpModel m({TensorType_FLOAT32, {3}}, {TensorType_INT32, {1, 2}});
+  m.SetInput<float>({1.0, 2.0, 3.0});
+  m.SetPositions<int32_t>({1, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0, 1.0})));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({2.0, 1.0})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
 }
 
 TEST(FloatGatherOpTest, Duplicate) {
-  GatherOpModel m({1, 2, 2}, TensorType_FLOAT32, {2});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({0, 0});
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({0, 0});
   m.Invoke();
   EXPECT_THAT(
-      m.GetOutputFloat(),
+      m.GetOutput<float>(),
       ElementsAreArray(ArrayFloatNear({-2, 0.2, 0.7, 0.8, -2, 0.2, 0.7, 0.8})));
 }
 
 TEST(FloatGatherOpTest, Slice) {
-  GatherOpModel m({4, 1}, TensorType_FLOAT32, {2});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({1, 3});
+  GatherOpModel m({TensorType_FLOAT32, {4, 1}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({1, 3});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.2, 0.8})));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({0.2, 0.8})));
 }
 
 TEST(FloatGatherOpTest, Axis1) {
   const int axis = 1;
-  GatherOpModel m({1, 2, 3}, TensorType_FLOAT32, {2}, axis);
-  m.SetInputFloat({1, 2, 3, 4, 5, 6});
-  m.SetPositions({1, 0});
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 3}}, {TensorType_INT32, {2}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6});
+  m.SetPositions<int32_t>({1, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear({4, 5, 6, 1, 2, 3})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3}));
 }
 
 TEST(FloatGatherOpTest, LastAxis) {
   const int axis = -1;
-  GatherOpModel m({1, 2, 3}, TensorType_FLOAT32, {2}, axis);
-  m.SetInputFloat({1, 2, 3, 4, 5, 6});
-  m.SetPositions({2, 0});
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 3}}, {TensorType_INT32, {2}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6});
+  m.SetPositions<int32_t>({2, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear({3, 1, 6, 4})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2}));
 }
 
-TEST(Uint8tGatherOpTest, Shuffle) {
-  GatherOpModel m({2, 2}, TensorType_UINT8, {2});
-  m.SetInputUint8({133, 134, 14, 15});
-  m.SetPositions({1, 0});
+TEST(TypesGatherOpTest, Float32Int32) {
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({13.3, -13.4, -1.4, 1.5});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.4, 1.5, 13.3, -13.4}));
+}
+
+TEST(TypesGatherOpTest, Float32Int64) {
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<float>({13.3, -13.4, -1.4, 1.5});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.4, 1.5, 13.3, -13.4}));
+}
+
+TEST(TypesGatherOpTest, Int32Int32) {
+  GatherOpModel m({TensorType_INT32, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int32_t>({-1330, 1340, 140, -150});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(),
+              ElementsAreArray({140, -150, -1330, 1340}));
+}
+
+TEST(TypesGatherOpTest, Int32Int64) {
+  GatherOpModel m({TensorType_INT32, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int32_t>({-1330, 1340, 140, -150});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(),
+              ElementsAreArray({140, -150, -1330, 1340}));
+}
+
+TEST(TypesGatherOpTest, Uint8Int32) {
+  GatherOpModel m({TensorType_UINT8, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<uint8_t>({133, 134, 14, 15});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
+}
+
+TEST(TypesGatherOpTest, Uint8Int64) {
+  GatherOpModel m({TensorType_UINT8, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<uint8_t>({133, 134, 14, 15});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
+}
+
+TEST(TypesGatherOpTest, Int64Int32) {
+  GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({14LL, 15LL, -(1LL << 34), 134LL}));
+}
+
+TEST(TypesGatherOpTest, Int64Int64) {
+  GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
+  m.SetPositions<int64_t>({1LL, 0LL});
   m.Invoke();
 
-  EXPECT_THAT(m.GetOutputUint8(), ElementsAreArray({14, 15, 133, 134}));
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({14LL, 15LL, -(1LL << 34), 134LL}));
 }
 
 TEST(GatherOpTest, SimpleString) {
-  GatherOpModel m({3}, TensorType_STRING, {2});
-  m.SetInput({"A", "B", "C"});
-  m.SetPositions({0, 2});
+  GatherOpModel m({TensorType_STRING, {3}}, {TensorType_INT32, {2}});
+  m.SetStringInput({"A", "B", "C"});
+  m.SetPositions<int32_t>({0, 2});
   m.Invoke();
   ASSERT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutputString(), ElementsAreArray({"A", "C"}));
+  EXPECT_THAT(m.GetStringOutput(), ElementsAreArray({"A", "C"}));
 }
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 32f61e02807d6ff6d22c77d2d0c119fe6ec28826..6d9690ea460bd86ef481d7c82e0f8770969e35d4 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -591,6 +591,7 @@ cc_test(
 cc_test(
     name = "depthwiseconv_quantized_test",
     srcs = ["depthwiseconv_quantized_test.cc"],
+    shard_count = 2,
     tags = [
         "no_oss",
         "tflite_not_portable_ios",
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index e31f47d2cea16cb17aca6685f4236c234243e873..fdb72037f84e4cea9018516ef70eb8c8fa039082 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include <arm_neon.h>
 #endif
 
-#if defined __GNUC__ && defined __SSE4_1__
+#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON
 #define USE_NEON
 
 #define OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
index f71ddbf3220db5d7d80afe198d0c13edc4a5448e..6461a5e5426f9eaffb0fadb2b7e5b2f3e2848254 100644
--- a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -12,25 +12,55 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
 
-#define EIGEN_USE_CUSTOM_THREAD_POOL
-#define EIGEN_USE_THREADS
+// This is essentially unsupported/CXX11/Eigen/Tensor.h
+// TODO(petewarden) - move this to a common location in Eigen itself.
 
 // clang-format off
 
-#include <stdint.h>
 
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+
+
+#include "Eigen/Core"
+
+#if defined(EIGEN_USE_SYCL)
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+#include <cmath>
 #include <cstddef>
 #include <cstring>
-#include <cmath>
+
+
+
+
+
+#ifdef _WIN32
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#include <windows.h>
+#else
+#include <stdint.h>
+#include <unistd.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
 #include <random>
-#include <atomic>
-#include <condition_variable>  // NOLINT(build/c++11)
-#include <mutex>  // NOLINT(build/c++11)
-#include <thread>  // NOLINT(build/c++11)
-#include <functional>
+#endif
 
 #ifdef _WIN32
 #include <windows.h>
@@ -40,58 +70,53 @@ limitations under the License.
 #include <time.h>
 #endif
 
+// #if defined(EIGEN_USE_LIBXSMM)
+// #include "libxsmm.h"
+// #endif
 
-// Because some programs may link Eigen in through other frameworks with
-// different flags, we can run into multiple definition issues if we don't have
-// a private namespace for our versions. This is a nasty hack, but a similar
-// approach is used elsewhere to handle the problem, so it should be stable.
-#define Eigen EigenForTFLite
+#ifdef EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+#endif
 
-#include "Eigen/src/Core/util/StaticAssert.h"
-#include "unsupported/Eigen/CXX11/Core"
-#include "unsupported/Eigen/SpecialFunctions"
 
 #include "Eigen/src/Core/util/DisableStupidWarnings.h"
 
-#include "Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/SpecialFunctions"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h"
+
 
-// Beware: the order of the include matters to some compilers. For example
-// TensorIndexList.h should be included before TensorDimensions.h in order to
-// use index lists to encode tensor dimensions when compiling with llvm.
-// We're defining this ourselves rather than using the Eigen Tensor header file
-// so that we can alter the macro definition of TENSOR_CONTRACTION_DISPATCH to
-// reduce binary size.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStats.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
-
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+
 #undef TENSOR_CONTRACTION_DISPATCH
 #define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS)    \
   if (this->m_lhs_inner_dim_contiguous &&                       \
@@ -102,8 +127,9 @@ limitations under the License.
     eigen_assert(false && "Unsupported contraction formats");   \
   }
 
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
@@ -125,19 +151,18 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
-
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
 
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
index 5e83b7b846e33b6c463edb93aade48b315b84dfe..f5576fbff7005d359b3766a3708f45f487744ff4 100644
--- a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -94,7 +94,7 @@ typedef unsigned __int64 uint64_t;
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
@@ -106,10 +106,11 @@ typedef unsigned __int64 uint64_t;
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
@@ -128,7 +129,7 @@ typedef unsigned __int64 uint64_t;
 
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 6f7031b36d25e82383400333ae35b67c96b56b5b..e2329c79c78180b9126025686ac86cb6cb9117ae 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -65,11 +65,13 @@ using reference_ops::Greater;
 using reference_ops::GreaterEqual;
 using reference_ops::GreaterEqualWithScaling;
 using reference_ops::GreaterWithScaling;
+using reference_ops::LeakyRelu;
 using reference_ops::Less;
 using reference_ops::LessEqual;
 using reference_ops::LessEqualWithScaling;
 using reference_ops::LessWithScaling;
 using reference_ops::Mean;
+using reference_ops::ProcessBroadcastShapes;
 using reference_ops::RankOneSelect;
 using reference_ops::Relu1;
 using reference_ops::Relu6;
@@ -3151,12 +3153,12 @@ inline void LstmCell(
   // Combined memory state and final output calculation
   gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
   output_state_map =
-      input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      input_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
           new_input_sm.tanh() +
-      forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      forget_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
           prev_state_map;
   output_activ_map =
-      output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      output_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
       output_state_map.tanh();
 }
 
@@ -4291,7 +4293,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   using FixedPointScaledDiff =
       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
@@ -4367,7 +4368,7 @@ inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
   auto input_map = MapAsVector(input_data, input_shape);
   auto output_map = MapAsVector(output_data, output_shape);
   output_map.array() =
-      input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>());
+      input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
 }
 
 // Convenience version that allows, for example, generated-code calls to be
diff --git a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
index c92f28c79efed056927eb6474db5eef4b1f9d419..380fc8f98ebbdd90bb68144a46903640734bff08 100644
--- a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -802,7 +802,6 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
                    const Dims<4>& coords_dims, T* output_data,
                    const Dims<4>& output_dims) {
   tflite::GatherParams op_params;
-  op_params.input_rank = input_rank;
   op_params.axis = 4 - input_rank;
 
   Gather(op_params, DimsToShape(input_dims), input_data,
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index b1fefbef04c87eff8c8efee6f3fb599945908802..920f154049e0e8b7e36c30c8f8d404ad802f0f4d 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -100,6 +100,98 @@ gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
 
 namespace reference_ops {
 
+// Return true for broadcast case, false otherwise.
+inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
+                                   const RuntimeShape& shape1,
+                                   tflite::ArithmeticParams* params) {
+  const int dims_count =
+      std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
+
+  params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+  RuntimeShape scalar_shape(dims_count, 1);
+
+  auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
+  auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);
+
+  // Check for "exact" match, implicitly accepting any scalar shapes.
+  if (extended_shape0 == extended_shape1) {
+    params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
+    return false;
+  }
+
+  for (int i = dims_count - 1; i >= 0; --i) {
+    if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
+      continue;
+    } else if (extended_shape0.Dims(i) == 1) {
+      params->broadcast_category =
+          BroadcastableOpCategory::kFirstInputBroadcastsFast;
+      break;
+    } else if (extended_shape1.Dims(i) == 1) {
+      params->broadcast_category =
+          BroadcastableOpCategory::kSecondInputBroadcastsFast;
+      break;
+    } else {
+      params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+      break;
+    }
+  }
+
+  if (params->broadcast_category !=
+          BroadcastableOpCategory::kFirstInputBroadcastsFast &&
+      params->broadcast_category !=
+          BroadcastableOpCategory::kSecondInputBroadcastsFast) {
+    return false;
+  }
+
+  // From this point it is assumed contractually that corresponding dimensions
+  // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
+  const bool swap_inputs = params->broadcast_category ==
+                           BroadcastableOpCategory::kSecondInputBroadcastsFast;
+  const RuntimeShape* shape_a =
+      swap_inputs ? &extended_shape1 : &extended_shape0;
+  const RuntimeShape* shape_b =
+      swap_inputs ? &extended_shape0 : &extended_shape1;
+
+  int i = dims_count - 1;
+  params->broadcast_shape[0] = 1;
+  params->broadcast_shape[1] = 1;
+  params->broadcast_shape[2] = 1;
+  params->broadcast_shape[3] = 1;
+  params->broadcast_shape[4] = 1;
+  // y_0 is greedy: include dims if both or neither equal 1: in other words,
+  // test for equality rather than (shape_a->Dims(i) != 1).
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[4] *= shape_b->Dims(i);
+    --i;
+  }
+  // Here either input_a or input_b has dim of 1 (if i >= 0).  If it is input_b
+  // that has the unit dimension, the next two loops are not entered.
+  while (i >= 0 && shape_a->Dims(i) == 1) {
+    params->broadcast_shape[3] *= shape_b->Dims(i);
+    --i;
+  }
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[2] *= shape_a->Dims(i);
+    --i;
+  }
+  // Here either input_a or input_b has dim of 1 (if i >= 0).
+  while (i >= 0 && shape_b->Dims(i) == 1) {
+    params->broadcast_shape[1] *= shape_a->Dims(i);
+    --i;
+  }
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[0] *= shape_b->Dims(i);
+    --i;
+  }
+
+  // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
+  // loop.
+  if (i >= 0) {
+    params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+  }
+  return true;
+}
+
 template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
@@ -466,6 +558,19 @@ inline void ReluX(const tflite::ActivationParams& params,
   }
 }
 
+inline void LeakyRelu(const tflite::LeakyReluParams& params,
+                      const RuntimeShape& input_shape, const float* input_data,
+                      const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("LeakyRelu (not fused)");
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    // Note that this implementation matches that of TensorFlow, and corresponds
+    // to the traditional LeakyRelu equation only for alpha <= 1.
+    output_data[i] = std::max(val, val * params.alpha);
+  }
+}
+
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                             const RuntimeShape& input_shape,
                             const float* input_data,
@@ -2631,7 +2736,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   using FixedPointScaledDiff =
       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
@@ -2936,10 +3040,10 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
-template <typename T>
+template <typename T, typename CoordsT = int32>
 inline void Gather(const tflite::GatherParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
-                   const RuntimeShape& coords_shape, const int32* coords_data,
+                   const RuntimeShape& coords_shape, const CoordsT* coords_data,
                    const RuntimeShape& output_shape, T* output_data) {
   int axis = op_params.axis;
   if (axis < 0) {
@@ -4462,6 +4566,53 @@ inline void ResizeNearestNeighbor(
   }
 }
 
+inline void BroadcastPrelu4DSlow(const PreluParams& params,
+                                 const RuntimeShape& input_shape,
+                                 const uint8* input_data,
+                                 const RuntimeShape& alpha_shape,
+                                 const uint8* alpha_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          int output_index = Offset(extended_output_shape, b, y, x, c);
+          int input_index = SubscriptToIndex(desc1, b, y, x, c);
+          const int32 input_value =
+              params.input_offset + input_data[input_index];
+          if (input_value >= 0) {
+            output_data[output_index] = input_data[input_index];
+          } else {
+            auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
+            const int32 alpha_value =
+                params.alpha_offset + alpha_data[alpha_index];
+            const int32 unclamped_output =
+                params.output_offset +
+                MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                    input_value * alpha_value, params.output_multiplier,
+                    params.output_shift);
+            const int32 quantized_min = std::numeric_limits<uint8_t>::min();
+            const int32 quantized_max = std::numeric_limits<uint8_t>::max();
+            const int32 clamped_output = std::min(
+                quantized_max, std::max(quantized_min, unclamped_output));
+            output_data[output_index] = static_cast<uint8>(clamped_output);
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h
index d24dca9bfbbee78498f797713dac8b67a232923a..b4822d57019b508f7e3d53403ff427f461ed263f 100644
--- a/tensorflow/lite/kernels/internal/tensor_ctypes.h
+++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h
@@ -66,6 +66,11 @@ inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.uint8 : nullptr;
 }
 
+template <>
+inline const int8_t* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.int8 : nullptr;
+}
+
 template <>
 inline const int16_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.i16 : nullptr;
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index 04b95ddc63d7a651ff57417e476320f6d3a53ffb..859ec8c68252538e3cf6d06ce7864f62d2a236dc 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -852,7 +852,6 @@ struct FullyConnectedParams {
 };
 
 struct GatherParams {
-  int16 input_rank;
   int16 axis;
 };
 
@@ -905,6 +904,14 @@ struct PadParams {
   ResizingCategory resizing_category;
 };
 
+struct PreluParams {
+  int32 input_offset;
+  int32 alpha_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+};
+
 struct PoolParams {
   FusedActivationFunctionType activation;
   PaddingType padding_type;
@@ -1007,6 +1014,10 @@ struct UnpackParams {
   int16 axis;
 };
 
+struct LeakyReluParams {
+  float alpha;
+};
+
 template <typename P>
 inline void SetActivationParams(float min, float max, P* params) {
   params->float_activation_min = min;
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index 2ac3fbe4f5d7d608eb9c9cd453913e6875b01505..e39890e3320eb4d1e2dcd0c8256bb96631e75011 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -117,6 +117,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
   int64_t dims1 = NumDimensions(input1);
   int64_t dims2 = NumDimensions(input2);
   int64_t out_dims = std::max(dims1, dims2);
+  if (NumElements(input1) == 0) {
+    *output_shape = TfLiteIntArrayCopy(input1->dims);
+    return kTfLiteOk;
+  }
   std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
       TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
   for (int i = 0; i < out_dims; ++i) {
diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc
index b405dee47ef01e4c84b1b6081faf917c9ed1328f..01039a705438af2a92a68b01c2146daf69c46250 100644
--- a/tensorflow/lite/kernels/mul.cc
+++ b/tensorflow/lite/kernels/mul.cc
@@ -153,26 +153,34 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            const TfLiteTensor* input2, TfLiteTensor* output) {
   if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 &&
       output->type == kTfLiteUInt8) {
+    tflite::ArithmeticParams op_params;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+    op_params.input1_offset = -input1->params.zero_point;
+    op_params.input2_offset = -input2->params.zero_point;
+    op_params.output_offset = output->params.zero_point;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
+        GetTensorShape(input1), GetTensorShape(input2), &op_params);
 #define TF_LITE_MUL(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
-  op_params.input1_offset = -input1->params.zero_point;                \
-  op_params.input2_offset = -input2->params.zero_point;                \
-  op_params.output_offset = output->params.zero_point;                 \
-  op_params.output_multiplier = data->output_multiplier;               \
-  op_params.output_shift = data->output_shift;                         \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
                GetTensorData<uint8_t>(input2), GetTensorShape(output), \
                GetTensorData<uint8_t>(output))
 
-    // The quantized version of Mul doesn't support activations, so we
-    // always use BroadcastMul.
     if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+      if (need_broadcast) {
+        TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+      } else {
+        TF_LITE_MUL(reference_ops, Mul);
+      }
     } else {
-      TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow);
+      if (need_broadcast) {
+        TF_LITE_MUL(optimized_ops, BroadcastMulFivefold);
+      } else {
+        TF_LITE_MUL(optimized_ops, Mul);
+      }
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index 24fabccde09fab6c200b51c8bbc8a9e000df5e0b..479495c875dac5d4e827864548c6b4a188e284ee 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -40,7 +40,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
   if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 &&
-      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16) {
+      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16 &&
+      input0->type != kTfLiteInt64) {
     context->ReportError(context, "Type '%s' is not supported by pack.",
                          TfLiteTypeGetName(input0->type));
     return kTfLiteError;
@@ -110,6 +111,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
       break;
     }
+    case kTfLiteInt64: {
+      PackImpl<int64_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
     default: {
       context->ReportError(context, "Type '%s' is not supported by pack.",
                            TfLiteTypeGetName(output->type));
diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
index a47e9ff40d079ba13c99891f10b372a5f0841daa..4f58debc5c872ea640ed97cd51884a39b412ff2f 100644
--- a/tensorflow/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -113,6 +113,40 @@ TEST(PackOpTest, Int32MultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+// int64 tests.
+TEST(PackOpTest, Int64ThreeInputs) {
+  PackOpModel<int64_t> model({TensorType_INT64, {2}}, 0, 3);
+  model.SetInput(0, {1LL << 33, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, -(1LL << 34)});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1LL << 33, 4LL, 2LL, 5LL, 3LL, -(1LL << 34)}));
+}
+
+TEST(PackOpTest, Int64ThreeInputsDifferentAxis) {
+  PackOpModel<int64_t> model({TensorType_INT64, {2}}, 1, 3);
+  model.SetInput(0, {1LL << 33, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, -(1LL << 34)});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1LL << 33, 2LL, 3LL, 4LL, 5LL, -(1LL << 34)}));
+}
+
+TEST(PackOpTest, Int64MultilDimensions) {
+  PackOpModel<int64_t> model({TensorType_INT64, {2, 3}}, 1, 2);
+  model.SetInput(0, {1LL << 33, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, -(1LL << 34), 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1LL << 33, 2LL, 3LL, 7LL, 8LL, -(1LL << 34),
+                                4LL, 5LL, 6LL, 10LL, 11LL, 12LL}));
+}
+
 // uint8
 TEST(PackOpTest, Uint8ThreeInputs) {
   PackOpModel<uint8_t> model({TensorType_UINT8, {2}}, 0, 3);
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index c6834537671034b5736c232486dd5eecfea75033..f4aa5cc4388033933913a3164e527c94744a0434 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -123,6 +123,8 @@ TfLiteRegistration* Register_SQUARE();
 TfLiteRegistration* Register_ZEROS_LIKE();
 TfLiteRegistration* Register_FLOOR_MOD();
 TfLiteRegistration* Register_RANGE();
+TfLiteRegistration* Register_LEAKY_RELU();
+TfLiteRegistration* Register_SQUARED_DIFFERENCE();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -256,6 +258,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
   AddBuiltin(BuiltinOperator_FLOOR_MOD, Register_FLOOR_MOD());
   AddBuiltin(BuiltinOperator_RANGE, Register_RANGE());
+  AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU());
+  AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/kernels/register.h b/tensorflow/lite/kernels/register.h
index eb5ce667d4c9ebcc8e392d06b92736ea41432bd6..059c9d165ee8a81096cce3885fc940f5977d7342 100644
--- a/tensorflow/lite/kernels/register.h
+++ b/tensorflow/lite/kernels/register.h
@@ -15,7 +15,6 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_REGISTER_H_
 #define TENSORFLOW_LITE_KERNELS_REGISTER_H_
 
-#include <unordered_map>
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/mutable_op_resolver.h"
diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59b53a6287dbbc863a61875be82090c1b9c6d442
--- /dev/null
+++ b/tensorflow/lite/kernels/squared_difference.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace squared_difference {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  bool requires_broadcast;
+};
+
+template <typename T>
+T SquaredDifference(T input1, T input2) {
+  const T difference = input1 - input2;
+  return difference * difference;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+void EvalSquaredDifference(TfLiteContext* context, TfLiteNode* node,
+                           const OpData* data, const TfLiteTensor* input1,
+                           const TfLiteTensor* input2, TfLiteTensor* output) {
+  if (data->requires_broadcast) {
+    reference_ops::BroadcastBinaryFunction4DSlow<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), SquaredDifference<T>);
+  } else {
+    reference_ops::BinaryFunction<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), SquaredDifference<T>);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalSquaredDifference<float>(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt32) {
+    EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
+  } else {
+    context->ReportError(context,
+                         "SquaredDifference only supports FLOAT32, INT32 and "
+                         "quantized UINT8 now, got %d.",
+                         output->type);
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace squared_difference
+
+TfLiteRegistration* Register_SQUARED_DIFFERENCE() {
+  static TfLiteRegistration r = {
+      squared_difference::Init, squared_difference::Free,
+      squared_difference::Prepare, squared_difference::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/squared_difference_test.cc b/tensorflow/lite/kernels/squared_difference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32bcab3b87f5f0cf5ad47724cc06c98f1a561e4a
--- /dev/null
+++ b/tensorflow/lite/kernels/squared_difference_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSquaredDifferenceOpModel : public SingleOpModel {
+ public:
+  BaseSquaredDifferenceOpModel(const TensorData& input1,
+                               const TensorData& input2,
+                               const TensorData& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_SQUARED_DIFFERENCE,
+                 BuiltinOptions_SquaredDifferenceOptions,
+                 CreateSquaredDifferenceOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatSquaredDifferenceOpModel : public BaseSquaredDifferenceOpModel {
+ public:
+  using BaseSquaredDifferenceOpModel::BaseSquaredDifferenceOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class IntegerSquaredDifferenceOpModel : public BaseSquaredDifferenceOpModel {
+ public:
+  using BaseSquaredDifferenceOpModel::BaseSquaredDifferenceOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_SameShape) {
+  FloatSquaredDifferenceOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                  {TensorType_FLOAT32, {1, 2, 2, 1}},
+                                  {TensorType_FLOAT32, {}});
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.5, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({0.49, 0.0, 0.09, 0.09})));
+}
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_VariousInputShapes) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSquaredDifferenceOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                                    {TensorType_FLOAT32, test_shapes[i]},
+                                    {TensorType_FLOAT32, {}});
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.3, 0.8, 1.1, -2.0});
+    m.PopulateTensor<float>(m.input2(), {1.0, 0.2, 0.6, 0.4, -1.0, -0.0});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({9.0, 0.0, 0.09, 0.16, 4.41, 4.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_WithBroadcast) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSquaredDifferenceOpModel m(
+        {TensorType_FLOAT32, test_shapes[i]},
+        {TensorType_FLOAT32, {}},  // always a scalar
+        {TensorType_FLOAT32, {}});
+    m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, 0.5, 0.8, 0.11, 1.1});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({0.09, 0.01, 0.16, 0.49, 0.0001, 1.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_SameShape) {
+  IntegerSquaredDifferenceOpModel m({TensorType_INT32, {1, 2, 2, 1}},
+                                    {TensorType_INT32, {1, 2, 2, 1}},
+                                    {TensorType_INT32, {}});
+  m.PopulateTensor<int32_t>(m.input1(), {-2, 2, -15, 8});
+  m.PopulateTensor<int32_t>(m.input2(), {5, -2, -3, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({49, 16, 144, 9}));
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_VariousInputShapes) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    IntegerSquaredDifferenceOpModel m({TensorType_INT32, test_shapes[i]},
+                                      {TensorType_INT32, test_shapes[i]},
+                                      {TensorType_INT32, {}});
+    m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 3, 8, 11, -20});
+    m.PopulateTensor<int32_t>(m.input2(), {1, 2, 6, 5, -5, -20});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({441, 0, 9, 9, 256, 0}))
+        << "With shape number " << i;
+  }
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_WithBroadcast) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    IntegerSquaredDifferenceOpModel m(
+        {TensorType_INT32, test_shapes[i]},
+        {TensorType_INT32, {}},  // always a scalar
+        {TensorType_INT32, {}});
+    m.PopulateTensor<int32_t>(m.input1(), {-20, 10, 7, 3, 1, 13});
+    m.PopulateTensor<int32_t>(m.input2(), {3});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({529, 49, 16, 0, 4, 100}))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
index f0b9447ff61ced57638e182a6487d89d41a7edb4..41503300ab599fbfcfee425c41033dd3bc10d2ea 100644
--- a/tensorflow/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -99,7 +99,7 @@ TEST(FloatSubOpModel, ActivationRELU_N1_TO_1) {
 }
 
 TEST(FloatSubOpModel, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -116,7 +116,7 @@ TEST(FloatSubOpModel, VariousInputShapes) {
 }
 
 TEST(FloatSubOpModel, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -153,7 +153,7 @@ TEST(IntegerSubOpModel, ActivationRELU_N1_TO_1) {
 }
 
 TEST(IntegerSubOpModel, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerSubOpModel m({TensorType_INT32, test_shapes[i]},
@@ -168,7 +168,7 @@ TEST(IntegerSubOpModel, VariousInputShapes) {
 }
 
 TEST(IntegerSubOpModel, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerSubOpModel m({TensorType_INT32, test_shapes[i]},
@@ -185,14 +185,13 @@ TEST(IntegerSubOpModel, WithBroadcast) {
 
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
-  std::vector<std::initializer_list<float>> inputs1 = {
+  std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
-  std::vector<std::initializer_list<float>> inputs2 = {
+  std::vector<std::vector<float>> inputs2 = {
       {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.2}, {0.6, 0.4, -0.18, 0.5}};
-  std::vector<std::initializer_list<float>> results = {
-      {-0.5, -0.2, 0.0, 0.3},
-      {-0.8, -0.2, -0.1, 0.9},
-      {-0.61, -0.2, 0.88, -0.2}};
+  std::vector<std::vector<float>> results = {{-0.5, -0.2, 0.0, 0.3},
+                                             {-0.8, -0.2, -0.1, 0.9},
+                                             {-0.61, -0.2, 0.88, -0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -209,12 +208,12 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
 
 TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
-  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
-                                                       {-0.8, 0.2, 0.7, 0.5}};
-  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
-                                                       {0.6, 0.4, -0.8, 0.3}};
-  std::vector<std::initializer_list<float>> results = {{-1.0, -0.2, 0.0, 1.0},
-                                                       {-1.0, -0.2, 1.0, 0.2}};
+  std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                             {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                             {-1.0, -0.2, 1.0, 0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -231,7 +230,7 @@ TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
 
 TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
@@ -250,7 +249,7 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
 
 TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 43a5137a941d5062fbbc5d89724face9bd0976d9..a49d6c2cae2b0e3423c69e2894405979e57d870b 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -307,6 +307,7 @@ class SingleOpModel {
 
     if (is_quantized) {
       if (t.min != 0 || t.max != 0) {
+        // TODO(b/119422369): Handle signed int8 here.
         if (t.type == TensorType_UINT8) {
           std::tie(t.scale, t.zero_point) =
               QuantizationParams<uint8_t>(t.min, t.max);
diff --git a/tensorflow/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc
index 6d13f9e92f9bd4d0e189a47b0b78f417b26c5121..1b7479747431ad903f64f3011045266c415dc6c5 100644
--- a/tensorflow/lite/kernels/tile.cc
+++ b/tensorflow/lite/kernels/tile.cc
@@ -182,6 +182,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt64:
       Tile<int64_t>(*(input->dims), input, multipliers, output);
       break;
+    case kTfLiteBool:
+      Tile<bool>(*(input->dims), input, multipliers, output);
+      break;
     default:
       context->ReportError(context, "Type '%s' is not supported by tile.",
                            TfLiteTypeGetName(output->type));
diff --git a/tensorflow/lite/kernels/tile_test.cc b/tensorflow/lite/kernels/tile_test.cc
index d12a7c19a367bf655c4b7158b4960eb4ef3afb58..a88ff66f0754549c96077d6edf655039caf62e34 100644
--- a/tensorflow/lite/kernels/tile_test.cc
+++ b/tensorflow/lite/kernels/tile_test.cc
@@ -34,34 +34,18 @@ class TileOpModel : public SingleOpModel {
     BuildInterpreter({input_shape, {static_cast<int>(input_shape.size())}});
   }
 
-  void SetInputFloat(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
-  }
-
-  void SetInputUInt8(std::initializer_list<uint8_t> data) {
-    PopulateTensor<uint8_t>(input_, data);
-  }
-
-  void SetInputInt32(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(input_, data);
-  }
-
-  void SetInputInt64(std::initializer_list<int64_t> data) {
-    PopulateTensor<int64_t>(input_, data);
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
 
   void SetMultipliers(std::initializer_list<int32_t> data) {
     PopulateTensor<int32_t>(multipliers_, data);
   }
 
-  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
-
-  std::vector<uint8_t> GetOutputUInt8() { return ExtractVector<uint8_t>(output_); }
-
-  std::vector<int32_t> GetOutputInt32() { return ExtractVector<int32_t>(output_); }
-
-  std::vector<int64_t> GetOutputInt64() {
-    return ExtractVector<int64_t>(output_);
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
   }
 
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
@@ -74,16 +58,16 @@ class TileOpModel : public SingleOpModel {
 
 TEST(TileTest, Float32Vector) {
   TileOpModel m({3}, TensorType_FLOAT32, TensorType_INT32);
-  m.SetInputFloat({1.f, 2.f, 3.f});
+  m.SetInput<float>({1.f, 2.f, 3.f});
   m.SetMultipliers({2});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray({1.f, 2.f, 3.f, 1.f, 2.f, 3.f}));
 }
 
 TEST(TileTest, Float32Matrix) {
   TileOpModel m({2, 3}, TensorType_FLOAT32, TensorType_INT32);
-  m.SetInputFloat({
+  m.SetInput<float>({
       11.f,
       12.f,
       13.f,
@@ -93,26 +77,26 @@ TEST(TileTest, Float32Matrix) {
   });
   m.SetMultipliers({2, 1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray({
-                                      11.f,
-                                      12.f,
-                                      13.f,
-                                      21.f,
-                                      22.f,
-                                      23.f,
-                                      11.f,
-                                      12.f,
-                                      13.f,
-                                      21.f,
-                                      22.f,
-                                      23.f,
-                                  }));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({
+                                        11.f,
+                                        12.f,
+                                        13.f,
+                                        21.f,
+                                        22.f,
+                                        23.f,
+                                        11.f,
+                                        12.f,
+                                        13.f,
+                                        21.f,
+                                        22.f,
+                                        23.f,
+                                    }));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
 }
 
 TEST(TileTest, Float32HighDimension) {
   TileOpModel m({1, 2, 3}, TensorType_FLOAT32, TensorType_INT32);
-  m.SetInputFloat({
+  m.SetInput<float>({
       11.f,
       12.f,
       13.f,
@@ -123,7 +107,7 @@ TEST(TileTest, Float32HighDimension) {
   m.SetMultipliers({2, 3, 1});
   m.Invoke();
   EXPECT_THAT(
-      m.GetOutputFloat(),
+      m.GetOutput<float>(),
       ElementsAreArray({11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
                         21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f,
                         11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
@@ -133,7 +117,7 @@ TEST(TileTest, Float32HighDimension) {
 
 TEST(TileTest, Uint8Matrix) {
   TileOpModel m({2, 3}, TensorType_UINT8, TensorType_INT32);
-  m.SetInputUInt8({
+  m.SetInput<uint8_t>({
       11,
       12,
       13,
@@ -143,26 +127,26 @@ TEST(TileTest, Uint8Matrix) {
   });
   m.SetMultipliers({2, 1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputUInt8(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
 }
 
 TEST(TileTest, Int32Matrix) {
   TileOpModel m({2, 3}, TensorType_INT32, TensorType_INT32);
-  m.SetInputInt32({
+  m.SetInput<int32_t>({
       11,
       12,
       13,
@@ -172,26 +156,39 @@ TEST(TileTest, Int32Matrix) {
   });
   m.SetMultipliers({2, 1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputInt32(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, BooleanMatrix) {
+  TileOpModel m({2, 3}, TensorType_BOOL, TensorType_INT32);
+  m.SetInput<bool>({true, false, false, true, true, false});
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<bool>(),
+              ElementsAreArray({
+                  true, false, false, true, true, false,  // first tiletrue,
+                  true, false, false, true, true, false   // second tile
+              }));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
 }
 
 TEST(TileTest, Int64Matrix) {
   TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT32);
-  m.SetInputInt64({
+  m.SetInput<int64_t>({
       11,
       12,
       13,
@@ -201,26 +198,26 @@ TEST(TileTest, Int64Matrix) {
   });
   m.SetMultipliers({2, 1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
+  EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
 }
 
 TEST(TileTest, Int64Matrix64Multipliers) {
   TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT64);
-  m.SetInputInt64({
+  m.SetInput<int64_t>({
       11,
       12,
       13,
@@ -230,20 +227,20 @@ TEST(TileTest, Int64Matrix64Multipliers) {
   });
   m.SetMultipliers({2, 1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
+  EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
 }
 }  // namespace
diff --git a/tensorflow/lite/lib_package/create_ios_frameworks.sh b/tensorflow/lite/lib_package/create_ios_frameworks.sh
index fa466ed5bc7ad31f371a7f1b67754d446b45063a..7901655b7c6926a38dc30009a8b95185fdc2d8cc 100755
--- a/tensorflow/lite/lib_package/create_ios_frameworks.sh
+++ b/tensorflow/lite/lib_package/create_ios_frameworks.sh
@@ -30,7 +30,7 @@ echo "Creating target Headers directories"
 mkdir -p $FW_DIR_TFLITE_HDRS
 
 echo "Headers, populating: TensorFlow Lite"
-cd $TFLITE_DIR/../../..
+cd $TFLITE_DIR/../..
 
 find tensorflow/lite -name '*.h' \
     -not -path 'tensorflow/lite/tools/*' \
@@ -51,10 +51,10 @@ cd $FW_DIR_TFLITE_HDRS
 tar xf tmp.tar
 rm -f tmp.tar
 
-cd $TFLITE_DIR/../../..
+cd $TFLITE_DIR/../..
 echo "Generate master LICENSE file and copy to target"
 bazel build //tensorflow/tools/lib_package:clicenses_generate
-cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
+cp $TFLITE_DIR/../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
    $FW_DIR_TFLITE
 
 echo "Copying static libraries"
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index 39709404358c2c9e4a187657064ed26aaff52e29..5ac0532afeffc0801a207c385be9816fa459b416 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -404,8 +404,7 @@ TfLiteStatus InterpreterBuilder::ApplyDelegates(Interpreter* interpreter) {
   }
 
   if (auto flex_delegate = AcquireFlexDelegate()) {
-    return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate),
-                                                /*allow_dynamic_tensors=*/true);
+    return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate));
   }
 
   return kTfLiteOk;
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index 950bdb39425f89e8870ca7f2146641912073a2e0..58288a8dd474f44a5760b4ebb9c15e7840222c5f 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -140,13 +140,13 @@ NNAPIDelegate::~NNAPIDelegate() {
   // ANeuralNetworksShutdown();
 }
 
-// Adds the tensors of the interpreter to the NN API model.
-TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter,
+// Adds the tensors of the subgraph to the NN API model.
+TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
                                ANeuralNetworksModel* nn_model,
                                uint32_t* no_of_operands_added,
                                std::vector<int64_t>* nnapi_ids) {
   uint32_t next_id = 0;
-  for (size_t i = 0; i < interpreter->tensors_size(); i++) {
+  for (size_t i = 0; i < subgraph->tensors_size(); i++) {
     // Skip temporaries and RNN back-edges.
     if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
 
@@ -156,7 +156,7 @@ TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter,
     // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
     float scale = 0.0f;
     int32_t zeroPoint = 0;
-    TfLiteTensor* tensor = interpreter->tensor(i);
+    TfLiteTensor* tensor = subgraph->tensor(i);
     switch (tensor->type) {
       case kTfLiteNoType:
         // Tensors added during initialization of Ops don't have a type yet and
@@ -240,12 +240,12 @@ void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
 // Adds the operations and their parameters to the NN API model.
 // 'next-id' is the operand ID of the next operand of the model.
 TfLiteStatus AddOpsAndParams(
-    tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model,
+    tflite::Subgraph* subgraph, ANeuralNetworksModel* nn_model,
     uint32_t next_id, std::vector<int>* model_state_inputs,
     std::vector<int>* model_state_outputs,
     const std::vector<int64_t>& tensor_id_to_nnapi_id) {
-  for (size_t i = 0; i < interpreter->nodes_size(); i++) {
-    const auto* node_and_registration = interpreter->node_and_registration(i);
+  for (size_t i = 0; i < subgraph->nodes_size(); i++) {
+    const auto* node_and_registration = subgraph->node_and_registration(i);
     const TfLiteNode& node = node_and_registration->first;
     const TfLiteRegistration& registration = node_and_registration->second;
     tflite::BuiltinOperator builtin =
@@ -291,9 +291,9 @@ TfLiteStatus AddOpsAndParams(
     // For each state_out tensor, a corresponding state_in operand needs to be
     // created for NNAPI.
     auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &next_id, &augmented_inputs,
-         &model_state_inputs, &model_state_outputs](int tensor_id) {
-          const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
+        [subgraph, &nn_model, &next_id, &augmented_inputs, &model_state_inputs,
+         &model_state_outputs](int tensor_id) {
+          const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
           ANeuralNetworksOperandType operand_type{
               ANEURALNETWORKS_TENSOR_FLOAT32,
               static_cast<uint32_t>(tensor->dims->size),
@@ -388,11 +388,11 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
+    auto add_lstm_scratch_tensor_float32 = [subgraph, &node, &nn_model,
                                             &next_id, &augmented_outputs]() {
       if (node.temporaries->size == 0) return;
       int scratch_buffer_index = node.temporaries->data[0];
-      const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
+      const TfLiteTensor* tensor = subgraph->tensor(scratch_buffer_index);
       ANeuralNetworksOperandType operand_type{
           ANEURALNETWORKS_TENSOR_FLOAT32,
           static_cast<uint32_t>(tensor->dims->size),
@@ -584,7 +584,7 @@ TfLiteStatus AddOpsAndParams(
         // The permutation input tensor value dictates the output dimensions.
         // TODO(b/110888333): Support dynamically-sized tensors in delegates.
         if ((node.inputs->size > 1) &&
-            (interpreter->tensor(node.inputs->data[1])->allocation_type !=
+            (subgraph->tensor(node.inputs->data[1])->allocation_type !=
              kTfLiteMmapRo)) {
           logError("NNAPI does not yet support dynamic tensors.");
           return kTfLiteError;
@@ -601,14 +601,13 @@ TfLiteStatus AddOpsAndParams(
           return kTfLiteError;
         }
         if ((node.inputs->size > 0) &&
-            (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+            (subgraph->tensor(node.inputs->data[0])->dims->size != 4)) {
           logError("NNAPI only supports input rank 4 for L2Normalization");
           return kTfLiteError;
         }
         break;
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
-        if (interpreter->tensor(node.outputs->data[0])->type !=
-            kTfLiteFloat32) {
+        if (subgraph->tensor(node.outputs->data[0])->type != kTfLiteFloat32) {
           logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
                    builtin);
           return kTfLiteError;
@@ -682,6 +681,9 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_FILL:
       case tflite::BuiltinOperator_FLOOR_MOD:
       case tflite::BuiltinOperator_RANGE:
+      case tflite::BuiltinOperator_LEAKY_RELU:
+      case tflite::BuiltinOperator_SQUARED_DIFFERENCE:
+      case tflite::BuiltinOperator_MIRROR_PAD:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
@@ -706,7 +708,7 @@ TfLiteStatus AddOpsAndParams(
   return kTfLiteOk;
 }
 
-TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   if (nn_model_ && nn_compiled_model_) return model_status_;
 
   // TODO(aselle): This is not correct. need to handle resize invalidation.
@@ -718,7 +720,7 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
     // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
     // kOperandIdNotSet. addTensorOperands will replace those with the
     // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
-    std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(),
+    std::vector<int64_t> tensor_id_to_nnapi_id(subgraph->tensors_size(),
                                                kOperandNotNeeded);
     auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
                                                        size_t count) {
@@ -729,35 +731,31 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
         }
       }
     };
-    for (size_t i = 0; i < interpreter->nodes_size(); i++) {
-      const auto* node_and_registration = interpreter->node_and_registration(i);
+    for (size_t i = 0; i < subgraph->nodes_size(); i++) {
+      const auto* node_and_registration = subgraph->node_and_registration(i);
       const TfLiteNode& node = node_and_registration->first;
       set_ids_to_not_set(node.inputs->data, node.inputs->size);
       set_ids_to_not_set(node.outputs->data, node.outputs->size);
     }
-    set_ids_to_not_set(interpreter->inputs().data(),
-                       interpreter->inputs().size());
-    set_ids_to_not_set(interpreter->outputs().data(),
-                       interpreter->outputs().size());
+    set_ids_to_not_set(subgraph->inputs().data(), subgraph->inputs().size());
+    set_ids_to_not_set(subgraph->outputs().data(), subgraph->outputs().size());
 
     uint32_t next_id = 0;
     RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
-        interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id));
+        subgraph, nn_model_, &next_id, &tensor_id_to_nnapi_id));
     RETURN_ERROR_IF_TFLITE_FAILED(
-        AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
+        AddOpsAndParams(subgraph, nn_model_, next_id, &model_states_inputs_,
                         &model_states_outputs_, tensor_id_to_nnapi_id));
 
     std::vector<uint32_t> augmented_inputs;
-    MapAndAddTensorIds(interpreter->inputs().data(),
-                       interpreter->inputs().size(), &augmented_inputs,
-                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(subgraph->inputs().data(), subgraph->inputs().size(),
+                       &augmented_inputs, tensor_id_to_nnapi_id);
     augmented_inputs.insert(augmented_inputs.end(),
                             model_states_inputs_.begin(),
                             model_states_inputs_.end());
     std::vector<uint32_t> augmented_outputs;
-    MapAndAddTensorIds(interpreter->outputs().data(),
-                       interpreter->outputs().size(), &augmented_outputs,
-                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(subgraph->outputs().data(), subgraph->outputs().size(),
+                       &augmented_outputs, tensor_id_to_nnapi_id);
     MapAndAddTensorIds(model_states_outputs_.data(),
                        model_states_outputs_.size(), &augmented_outputs,
                        tensor_id_to_nnapi_id);
@@ -770,7 +768,7 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
 
     if (GetAndroidSdkVersionCached() >= 28) {
       CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-          nn_model_, interpreter->GetAllowFp16PrecisionForFp32()));
+          nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
     }
     CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
   }
@@ -781,9 +779,9 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
   return kTfLiteOk;
 }
 
-TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   if (!nn_model_) {
-    model_status_ = BuildGraph(interpreter);
+    model_status_ = BuildGraph(subgraph);
     if (model_status_ != kTfLiteOk) {
       logError("Failed to build graph for NNAPI");
     }
@@ -796,19 +794,19 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
   CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
 
   // Currently perform deep copy of input buffer
-  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
-    int input = interpreter->inputs()[i];
+  for (size_t i = 0; i < subgraph->inputs().size(); i++) {
+    int input = subgraph->inputs()[i];
     // TODO(aselle): Is this what we want or do we want input instead?
     // TODO(aselle): This should be called setInputValue maybe to be cons.
-    TfLiteTensor* tensor = interpreter->tensor(input);
+    TfLiteTensor* tensor = subgraph->tensor(input);
     CHECK_NN(ANeuralNetworksExecution_setInput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
   // Tell nn api where to place final data.
-  for (size_t i = 0; i < interpreter->outputs().size(); i++) {
-    int output = interpreter->outputs()[i];
-    TfLiteTensor* tensor = interpreter->tensor(output);
+  for (size_t i = 0; i < subgraph->outputs().size(); i++) {
+    int output = subgraph->outputs()[i];
+    TfLiteTensor* tensor = subgraph->tensor(output);
     CHECK_NN(ANeuralNetworksExecution_setOutput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
@@ -817,16 +815,16 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
   // current invocation.
   for (size_t i = 0; i < model_states_outputs_.size(); i++) {
     int state_tensor_idx = model_states_outputs_[i];
-    TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
+    TfLiteTensor* tensor = subgraph->tensor(state_tensor_idx);
     // Here we are using a deep copy for state_in tensors so that we are not
     // reading and writing into the same buffer during a invocation.
     // TODO(miaowang): using double shared buffer to minimize the copies.
     CHECK_NN(ANeuralNetworksExecution_setInput(
-        execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
+        execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
     // Tell NNAPI where to output the state_out.
     CHECK_NN(ANeuralNetworksExecution_setOutput(
-        execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
+        execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
   }
 
@@ -839,9 +837,9 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
 
 #if 0
   printf("From the NN API:\n");
-  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  TfLiteTensor* tensor = subgraph->tensor(subgraph->outputs()[0]);
   if (float* data =
-          interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+          subgraph->typed_tensor<float>(subgraph->outputs()[0])) {
     size_t num = tensor->bytes / sizeof(float);
     for (float* p = data; p < data + num; p++) {
       printf(" %f", *p);
diff --git a/tensorflow/lite/nnapi_delegate.h b/tensorflow/lite/nnapi_delegate.h
index 63b408c1416ed1c2126cbdb5c376cb3dbb10f789..b4f8e4ecf3935c41346c78647e631651dbcccb3e 100644
--- a/tensorflow/lite/nnapi_delegate.h
+++ b/tensorflow/lite/nnapi_delegate.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/subgraph.h"
 #include "tensorflow/lite/interpreter.h"
 
 class ANeuralNetworksModel;
@@ -50,10 +51,10 @@ class NNAPIDelegate {
   ~NNAPIDelegate();
 
   // Convert a tflite graph to NNAPI
-  TfLiteStatus BuildGraph(Interpreter* interpreter);
+  TfLiteStatus BuildGraph(Subgraph* subgraph);
 
   // Run
-  TfLiteStatus Invoke(Interpreter* interpreter);
+  TfLiteStatus Invoke(Subgraph* subgraph);
 
   // Whether the current platform supports NNAPI delegation.
   static bool IsSupported();
diff --git a/tensorflow/lite/nnapi_delegate_disabled.cc b/tensorflow/lite/nnapi_delegate_disabled.cc
index 44dc21f1b6c2b3e4eb2c31fb19046fca90440428..a8f2c0bfe386f1339c17e34a199cf929c43ecc33 100644
--- a/tensorflow/lite/nnapi_delegate_disabled.cc
+++ b/tensorflow/lite/nnapi_delegate_disabled.cc
@@ -35,13 +35,11 @@ NNAPIDelegate::~NNAPIDelegate() {
 #undef UNUSED_MEMBER
 }
 
-TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   return kTfLiteError;
 }
 
-TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
-  return kTfLiteError;
-}
+TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) { return kTfLiteError; }
 
 bool NNAPIDelegate::IsSupported() { return false; }
 
diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc
index 020d1d8de5ff0edb0f48dd28a3acfc93e00bed03..1113bf01b175d93d849dbd51abf2f6c677f450d4 100644
--- a/tensorflow/lite/optional_debug_tools.cc
+++ b/tensorflow/lite/optional_debug_tools.cc
@@ -44,6 +44,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteInt32";
     case kTfLiteUInt8:
       return "kTfLiteUInt8";
+    case kTfLiteInt8:
+      return "kTfLiteInt8";
     case kTfLiteInt64:
       return "kTfLiteInt64";
     case kTfLiteString:
@@ -83,26 +85,27 @@ void PrintInterpreterState(Interpreter* interpreter) {
   printf("Outputs:");
   PrintIntVector(interpreter->outputs());
   printf("\n");
-  for (int tensor_index = 0; tensor_index < interpreter->tensors_size();
+  for (size_t tensor_index = 0; tensor_index < interpreter->tensors_size();
        tensor_index++) {
-    TfLiteTensor* tensor = interpreter->tensor(tensor_index);
-    printf("Tensor %3d %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
+    TfLiteTensor* tensor = interpreter->tensor(static_cast<int>(tensor_index));
+    printf("Tensor %3zu %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
            tensor->name, TensorTypeName(tensor->type),
            AllocTypeName(tensor->allocation_type), tensor->bytes,
            (static_cast<float>(tensor->bytes) / (1 << 20)));
     PrintTfLiteIntVector(tensor->dims);
   }
   printf("\n");
-  for (int node_index = 0; node_index < interpreter->nodes_size();
+  for (size_t node_index = 0; node_index < interpreter->nodes_size();
        node_index++) {
     const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg =
-        interpreter->node_and_registration(node_index);
+        interpreter->node_and_registration(static_cast<int>(node_index));
     const TfLiteNode& node = node_and_reg->first;
     const TfLiteRegistration& reg = node_and_reg->second;
     if (reg.custom_name != nullptr) {
-      printf("Node %3d Operator Custom Name %s\n", node_index, reg.custom_name);
+      printf("Node %3zu Operator Custom Name %s\n", node_index,
+             reg.custom_name);
     } else {
-      printf("Node %3d Operator Builtin Code %3d\n", node_index,
+      printf("Node %3zu Operator Builtin Code %3d\n", node_index,
              reg.builtin_code);
     }
     printf("  Inputs:");
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 14b9def4dbebb3836736c491197d0e768ce0f06d..9991fb2a7335ddd9c916c35a4378ab3dcfb643bf 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -116,12 +116,12 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
   # TODO(aselle): When toco does not use fatal errors for failure, we can
   # switch this on.
   if not _toco_from_proto_bin:
-    model_str = _toco_python.TocoConvert(model_flags_str, toco_flags_str,
-                                         input_data_str)
-    if not model_str:
-      raise ConverterError(
-          "TOCO returned an empty string. See console for more info.")
-    return model_str
+    try:
+      model_str = _toco_python.TocoConvert(model_flags_str, toco_flags_str,
+                                           input_data_str)
+      return model_str
+    except Exception as e:
+      raise ConverterError("TOCO failed: %s" % e)
 
   # Windows and TemporaryFile are not that useful together,
   # since you cannot have two readers/writers. So we have to
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index e71752fe6318e8518a8e67d1bb006661b4bdd880..d14af439ec0ab600ea260da17ef0041cca25d629 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -124,6 +124,8 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
       return NPY_INT16;
     case kTfLiteUInt8:
       return NPY_UINT8;
+    case kTfLiteInt8:
+      return NPY_INT8;
     case kTfLiteInt64:
       return NPY_INT64;
     case kTfLiteString:
@@ -150,6 +152,8 @@ TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
       return kTfLiteInt16;
     case NPY_UINT8:
       return kTfLiteUInt8;
+    case NPY_INT8:
+      return kTfLiteInt8;
     case NPY_INT64:
       return kTfLiteInt64;
     case NPY_BOOL:
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 07b16f28121310934672e6c8d68def88945e8f07..652871d01378822904e6acb363d75381df4c4410 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -39,16 +39,34 @@ enum TensorType : byte {
   BOOL = 6,
   INT16 = 7,
   COMPLEX64 = 8,
+  INT8 = 9,
 }
 
-// Parameters for converting a quantized tensor back to float. Given a
-// quantized value q, the corresponding float value f should be:
-//   f = scale * (q - zero_point)
+// Custom quantization parameters for experimenting with new quantization
+// techniques.
+table CustomQuantization {
+  custom:[byte];
+}
+
+// Represents a specific quantization technique's parameters.
+union QuantizationDetails {
+  CustomQuantization,
+}
+
+// Parameters for converting a quantized tensor back to float.
 table QuantizationParameters {
+  // These four parameters are the asymmetric linear quantization parameters.
+  // Given a quantized value q, the corresponding float value f should be:
+  //   f = scale * (q - zero_point)
+  // For other quantization types, the QuantizationDetails below is used.
   min:[float];  // For importing back into tensorflow.
   max:[float];  // For importing back into tensorflow.
   scale:[float];  // For dequantizing the tensor's values.
   zero_point:[long];
+
+  // If this is not none, the quantization parameters above are ignored and the
+  // value of the QuantizationDetails union below should be used.
+  details:QuantizationDetails;
 }
 
 table Tensor {
@@ -182,6 +200,9 @@ enum BuiltinOperator : byte {
   FLOOR_MOD = 95,
   RANGE = 96,
   RESIZE_NEAREST_NEIGHBOR = 97,
+  LEAKY_RELU = 98,
+  SQUARED_DIFFERENCE = 99,
+  MIRROR_PAD = 100,
 }
 
 // Options for the builtin operators.
@@ -260,6 +281,9 @@ union BuiltinOptions {
   FloorModOptions,
   RangeOptions,
   ResizeNearestNeighborOptions,
+  LeakyReluOptions,
+  SquaredDifferenceOptions,
+  MirrorPadOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -640,6 +664,24 @@ table FloorModOptions {
 table RangeOptions {
 }
 
+table LeakyReluOptions {
+  alpha:float;
+}
+
+table SquaredDifferenceOptions {
+}
+
+enum MirrorPadMode : byte {
+  // Doesn't include borders.
+  REFLECT = 0,
+  // Includes borders.
+  SYMMETRIC = 1,
+}
+
+table MirrorPadOptions {
+  mode:MirrorPadMode;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index 479c0d658bac29aab64b7965fc8fad74ccc6efad..1464c75613c72ab4564d33570353cbdb4e965513 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -22,6 +22,9 @@ limitations under the License.
 
 namespace tflite {
 
+struct CustomQuantization;
+struct CustomQuantizationT;
+
 struct QuantizationParameters;
 struct QuantizationParametersT;
 
@@ -250,6 +253,15 @@ struct FloorModOptionsT;
 struct RangeOptions;
 struct RangeOptionsT;
 
+struct LeakyReluOptions;
+struct LeakyReluOptionsT;
+
+struct SquaredDifferenceOptions;
+struct SquaredDifferenceOptionsT;
+
+struct MirrorPadOptions;
+struct MirrorPadOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -275,11 +287,12 @@ enum TensorType {
   TensorType_BOOL = 6,
   TensorType_INT16 = 7,
   TensorType_COMPLEX64 = 8,
+  TensorType_INT8 = 9,
   TensorType_MIN = TensorType_FLOAT32,
-  TensorType_MAX = TensorType_COMPLEX64
+  TensorType_MAX = TensorType_INT8
 };
 
-inline const TensorType (&EnumValuesTensorType())[9] {
+inline const TensorType (&EnumValuesTensorType())[10] {
   static const TensorType values[] = {
     TensorType_FLOAT32,
     TensorType_FLOAT16,
@@ -289,7 +302,8 @@ inline const TensorType (&EnumValuesTensorType())[9] {
     TensorType_STRING,
     TensorType_BOOL,
     TensorType_INT16,
-    TensorType_COMPLEX64
+    TensorType_COMPLEX64,
+    TensorType_INT8
   };
   return values;
 }
@@ -305,6 +319,7 @@ inline const char * const *EnumNamesTensorType() {
     "BOOL",
     "INT16",
     "COMPLEX64",
+    "INT8",
     nullptr
   };
   return names;
@@ -315,6 +330,87 @@ inline const char *EnumNameTensorType(TensorType e) {
   return EnumNamesTensorType()[index];
 }
 
+enum QuantizationDetails {
+  QuantizationDetails_NONE = 0,
+  QuantizationDetails_CustomQuantization = 1,
+  QuantizationDetails_MIN = QuantizationDetails_NONE,
+  QuantizationDetails_MAX = QuantizationDetails_CustomQuantization
+};
+
+inline const QuantizationDetails (&EnumValuesQuantizationDetails())[2] {
+  static const QuantizationDetails values[] = {
+    QuantizationDetails_NONE,
+    QuantizationDetails_CustomQuantization
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesQuantizationDetails() {
+  static const char * const names[] = {
+    "NONE",
+    "CustomQuantization",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameQuantizationDetails(QuantizationDetails e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesQuantizationDetails()[index];
+}
+
+template<typename T> struct QuantizationDetailsTraits {
+  static const QuantizationDetails enum_value = QuantizationDetails_NONE;
+};
+
+template<> struct QuantizationDetailsTraits<CustomQuantization> {
+  static const QuantizationDetails enum_value = QuantizationDetails_CustomQuantization;
+};
+
+struct QuantizationDetailsUnion {
+  QuantizationDetails type;
+  void *value;
+
+  QuantizationDetailsUnion() : type(QuantizationDetails_NONE), value(nullptr) {}
+  QuantizationDetailsUnion(QuantizationDetailsUnion&& u) FLATBUFFERS_NOEXCEPT :
+    type(QuantizationDetails_NONE), value(nullptr)
+    { std::swap(type, u.type); std::swap(value, u.value); }
+  QuantizationDetailsUnion(const QuantizationDetailsUnion &) FLATBUFFERS_NOEXCEPT;
+  QuantizationDetailsUnion &operator=(const QuantizationDetailsUnion &u) FLATBUFFERS_NOEXCEPT
+    { QuantizationDetailsUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
+  QuantizationDetailsUnion &operator=(QuantizationDetailsUnion &&u) FLATBUFFERS_NOEXCEPT
+    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  ~QuantizationDetailsUnion() { Reset(); }
+
+  void Reset();
+
+#ifndef FLATBUFFERS_CPP98_STL
+  template <typename T>
+  void Set(T&& val) {
+    Reset();
+    type = QuantizationDetailsTraits<typename T::TableType>::enum_value;
+    if (type != QuantizationDetails_NONE) {
+      value = new T(std::forward<T>(val));
+    }
+  }
+#endif  // FLATBUFFERS_CPP98_STL
+
+  static void *UnPack(const void *obj, QuantizationDetails type, const flatbuffers::resolver_function_t *resolver);
+  flatbuffers::Offset<void> Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+
+  CustomQuantizationT *AsCustomQuantization() {
+    return type == QuantizationDetails_CustomQuantization ?
+      reinterpret_cast<CustomQuantizationT *>(value) : nullptr;
+  }
+  const CustomQuantizationT *AsCustomQuantization() const {
+    return type == QuantizationDetails_CustomQuantization ?
+      reinterpret_cast<const CustomQuantizationT *>(value) : nullptr;
+  }
+};
+
+bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type);
+bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
 enum BuiltinOperator {
   BuiltinOperator_ADD = 0,
   BuiltinOperator_AVERAGE_POOL_2D = 1,
@@ -413,11 +509,14 @@ enum BuiltinOperator {
   BuiltinOperator_FLOOR_MOD = 95,
   BuiltinOperator_RANGE = 96,
   BuiltinOperator_RESIZE_NEAREST_NEIGHBOR = 97,
+  BuiltinOperator_LEAKY_RELU = 98,
+  BuiltinOperator_SQUARED_DIFFERENCE = 99,
+  BuiltinOperator_MIRROR_PAD = 100,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_RESIZE_NEAREST_NEIGHBOR
+  BuiltinOperator_MAX = BuiltinOperator_MIRROR_PAD
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[97] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[100] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -515,7 +614,10 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[97] {
     BuiltinOperator_FILL,
     BuiltinOperator_FLOOR_MOD,
     BuiltinOperator_RANGE,
-    BuiltinOperator_RESIZE_NEAREST_NEIGHBOR
+    BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+    BuiltinOperator_LEAKY_RELU,
+    BuiltinOperator_SQUARED_DIFFERENCE,
+    BuiltinOperator_MIRROR_PAD
   };
   return values;
 }
@@ -620,6 +722,9 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "FLOOR_MOD",
     "RANGE",
     "RESIZE_NEAREST_NEIGHBOR",
+    "LEAKY_RELU",
+    "SQUARED_DIFFERENCE",
+    "MIRROR_PAD",
     nullptr
   };
   return names;
@@ -706,11 +811,14 @@ enum BuiltinOptions {
   BuiltinOptions_FloorModOptions = 72,
   BuiltinOptions_RangeOptions = 73,
   BuiltinOptions_ResizeNearestNeighborOptions = 74,
+  BuiltinOptions_LeakyReluOptions = 75,
+  BuiltinOptions_SquaredDifferenceOptions = 76,
+  BuiltinOptions_MirrorPadOptions = 77,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_ResizeNearestNeighborOptions
+  BuiltinOptions_MAX = BuiltinOptions_MirrorPadOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[75] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[78] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -786,7 +894,10 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[75] {
     BuiltinOptions_UnidirectionalSequenceLSTMOptions,
     BuiltinOptions_FloorModOptions,
     BuiltinOptions_RangeOptions,
-    BuiltinOptions_ResizeNearestNeighborOptions
+    BuiltinOptions_ResizeNearestNeighborOptions,
+    BuiltinOptions_LeakyReluOptions,
+    BuiltinOptions_SquaredDifferenceOptions,
+    BuiltinOptions_MirrorPadOptions
   };
   return values;
 }
@@ -868,6 +979,9 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "FloorModOptions",
     "RangeOptions",
     "ResizeNearestNeighborOptions",
+    "LeakyReluOptions",
+    "SquaredDifferenceOptions",
+    "MirrorPadOptions",
     nullptr
   };
   return names;
@@ -1178,6 +1292,18 @@ template<> struct BuiltinOptionsTraits<ResizeNearestNeighborOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ResizeNearestNeighborOptions;
 };
 
+template<> struct BuiltinOptionsTraits<LeakyReluOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LeakyReluOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SquaredDifferenceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SquaredDifferenceOptions;
+};
+
+template<> struct BuiltinOptionsTraits<MirrorPadOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MirrorPadOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1801,6 +1927,30 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_ResizeNearestNeighborOptions ?
       reinterpret_cast<const ResizeNearestNeighborOptionsT *>(value) : nullptr;
   }
+  LeakyReluOptionsT *AsLeakyReluOptions() {
+    return type == BuiltinOptions_LeakyReluOptions ?
+      reinterpret_cast<LeakyReluOptionsT *>(value) : nullptr;
+  }
+  const LeakyReluOptionsT *AsLeakyReluOptions() const {
+    return type == BuiltinOptions_LeakyReluOptions ?
+      reinterpret_cast<const LeakyReluOptionsT *>(value) : nullptr;
+  }
+  SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() {
+    return type == BuiltinOptions_SquaredDifferenceOptions ?
+      reinterpret_cast<SquaredDifferenceOptionsT *>(value) : nullptr;
+  }
+  const SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() const {
+    return type == BuiltinOptions_SquaredDifferenceOptions ?
+      reinterpret_cast<const SquaredDifferenceOptionsT *>(value) : nullptr;
+  }
+  MirrorPadOptionsT *AsMirrorPadOptions() {
+    return type == BuiltinOptions_MirrorPadOptions ?
+      reinterpret_cast<MirrorPadOptionsT *>(value) : nullptr;
+  }
+  const MirrorPadOptionsT *AsMirrorPadOptions() const {
+    return type == BuiltinOptions_MirrorPadOptions ?
+      reinterpret_cast<const MirrorPadOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -1998,6 +2148,35 @@ inline const char *EnumNameCombinerType(CombinerType e) {
   return EnumNamesCombinerType()[index];
 }
 
+enum MirrorPadMode {
+  MirrorPadMode_REFLECT = 0,
+  MirrorPadMode_SYMMETRIC = 1,
+  MirrorPadMode_MIN = MirrorPadMode_REFLECT,
+  MirrorPadMode_MAX = MirrorPadMode_SYMMETRIC
+};
+
+inline const MirrorPadMode (&EnumValuesMirrorPadMode())[2] {
+  static const MirrorPadMode values[] = {
+    MirrorPadMode_REFLECT,
+    MirrorPadMode_SYMMETRIC
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesMirrorPadMode() {
+  static const char * const names[] = {
+    "REFLECT",
+    "SYMMETRIC",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameMirrorPadMode(MirrorPadMode e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesMirrorPadMode()[index];
+}
+
 enum CustomOptionsFormat {
   CustomOptionsFormat_FLEXBUFFERS = 0,
   CustomOptionsFormat_MIN = CustomOptionsFormat_FLEXBUFFERS,
@@ -2024,12 +2203,75 @@ inline const char *EnumNameCustomOptionsFormat(CustomOptionsFormat e) {
   return EnumNamesCustomOptionsFormat()[index];
 }
 
+struct CustomQuantizationT : public flatbuffers::NativeTable {
+  typedef CustomQuantization TableType;
+  std::vector<int8_t> custom;
+  CustomQuantizationT() {
+  }
+};
+
+struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CustomQuantizationT NativeTableType;
+  enum {
+    VT_CUSTOM = 4
+  };
+  const flatbuffers::Vector<int8_t> *custom() const {
+    return GetPointer<const flatbuffers::Vector<int8_t> *>(VT_CUSTOM);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_CUSTOM) &&
+           verifier.VerifyVector(custom()) &&
+           verifier.EndTable();
+  }
+  CustomQuantizationT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CustomQuantizationT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CustomQuantization> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CustomQuantizationBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_custom(flatbuffers::Offset<flatbuffers::Vector<int8_t>> custom) {
+    fbb_.AddOffset(CustomQuantization::VT_CUSTOM, custom);
+  }
+  explicit CustomQuantizationBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CustomQuantizationBuilder &operator=(const CustomQuantizationBuilder &);
+  flatbuffers::Offset<CustomQuantization> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CustomQuantization>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int8_t>> custom = 0) {
+  CustomQuantizationBuilder builder_(_fbb);
+  builder_.add_custom(custom);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantizationDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int8_t> *custom = nullptr) {
+  return tflite::CreateCustomQuantization(
+      _fbb,
+      custom ? _fbb.CreateVector<int8_t>(*custom) : 0);
+}
+
+flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct QuantizationParametersT : public flatbuffers::NativeTable {
   typedef QuantizationParameters TableType;
   std::vector<float> min;
   std::vector<float> max;
   std::vector<float> scale;
   std::vector<int64_t> zero_point;
+  QuantizationDetailsUnion details;
   QuantizationParametersT() {
   }
 };
@@ -2040,7 +2282,9 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
     VT_MIN = 4,
     VT_MAX = 6,
     VT_SCALE = 8,
-    VT_ZERO_POINT = 10
+    VT_ZERO_POINT = 10,
+    VT_DETAILS_TYPE = 12,
+    VT_DETAILS = 14
   };
   const flatbuffers::Vector<float> *min() const {
     return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
@@ -2054,6 +2298,16 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   const flatbuffers::Vector<int64_t> *zero_point() const {
     return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_ZERO_POINT);
   }
+  QuantizationDetails details_type() const {
+    return static_cast<QuantizationDetails>(GetField<uint8_t>(VT_DETAILS_TYPE, 0));
+  }
+  const void *details() const {
+    return GetPointer<const void *>(VT_DETAILS);
+  }
+  template<typename T> const T *details_as() const;
+  const CustomQuantization *details_as_CustomQuantization() const {
+    return details_type() == QuantizationDetails_CustomQuantization ? static_cast<const CustomQuantization *>(details()) : nullptr;
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyOffset(verifier, VT_MIN) &&
@@ -2064,6 +2318,9 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
            verifier.VerifyVector(scale()) &&
            VerifyOffset(verifier, VT_ZERO_POINT) &&
            verifier.VerifyVector(zero_point()) &&
+           VerifyField<uint8_t>(verifier, VT_DETAILS_TYPE) &&
+           VerifyOffset(verifier, VT_DETAILS) &&
+           VerifyQuantizationDetails(verifier, details(), details_type()) &&
            verifier.EndTable();
   }
   QuantizationParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2071,6 +2328,10 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   static flatbuffers::Offset<QuantizationParameters> Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
+template<> inline const CustomQuantization *QuantizationParameters::details_as<CustomQuantization>() const {
+  return details_as_CustomQuantization();
+}
+
 struct QuantizationParametersBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -2086,6 +2347,12 @@ struct QuantizationParametersBuilder {
   void add_zero_point(flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
     fbb_.AddOffset(QuantizationParameters::VT_ZERO_POINT, zero_point);
   }
+  void add_details_type(QuantizationDetails details_type) {
+    fbb_.AddElement<uint8_t>(QuantizationParameters::VT_DETAILS_TYPE, static_cast<uint8_t>(details_type), 0);
+  }
+  void add_details(flatbuffers::Offset<void> details) {
+    fbb_.AddOffset(QuantizationParameters::VT_DETAILS, details);
+  }
   explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2103,12 +2370,16 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
     flatbuffers::Offset<flatbuffers::Vector<float>> min = 0,
     flatbuffers::Offset<flatbuffers::Vector<float>> max = 0,
     flatbuffers::Offset<flatbuffers::Vector<float>> scale = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0,
+    QuantizationDetails details_type = QuantizationDetails_NONE,
+    flatbuffers::Offset<void> details = 0) {
   QuantizationParametersBuilder builder_(_fbb);
+  builder_.add_details(details);
   builder_.add_zero_point(zero_point);
   builder_.add_scale(scale);
   builder_.add_max(max);
   builder_.add_min(min);
+  builder_.add_details_type(details_type);
   return builder_.Finish();
 }
 
@@ -2117,13 +2388,17 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD
     const std::vector<float> *min = nullptr,
     const std::vector<float> *max = nullptr,
     const std::vector<float> *scale = nullptr,
-    const std::vector<int64_t> *zero_point = nullptr) {
+    const std::vector<int64_t> *zero_point = nullptr,
+    QuantizationDetails details_type = QuantizationDetails_NONE,
+    flatbuffers::Offset<void> details = 0) {
   return tflite::CreateQuantizationParameters(
       _fbb,
       min ? _fbb.CreateVector<float>(*min) : 0,
       max ? _fbb.CreateVector<float>(*max) : 0,
       scale ? _fbb.CreateVector<float>(*scale) : 0,
-      zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0);
+      zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0,
+      details_type,
+      details);
 }
 
 flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -6450,6 +6725,154 @@ inline flatbuffers::Offset<RangeOptions> CreateRangeOptions(
 
 flatbuffers::Offset<RangeOptions> CreateRangeOptions(flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct LeakyReluOptionsT : public flatbuffers::NativeTable {
+  typedef LeakyReluOptions TableType;
+  float alpha;
+  LeakyReluOptionsT()
+      : alpha(0.0f) {
+  }
+};
+
+struct LeakyReluOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LeakyReluOptionsT NativeTableType;
+  enum {
+    VT_ALPHA = 4
+  };
+  float alpha() const {
+    return GetField<float>(VT_ALPHA, 0.0f);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<float>(verifier, VT_ALPHA) &&
+           verifier.EndTable();
+  }
+  LeakyReluOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LeakyReluOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LeakyReluOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LeakyReluOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_alpha(float alpha) {
+    fbb_.AddElement<float>(LeakyReluOptions::VT_ALPHA, alpha, 0.0f);
+  }
+  explicit LeakyReluOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LeakyReluOptionsBuilder &operator=(const LeakyReluOptionsBuilder &);
+  flatbuffers::Offset<LeakyReluOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LeakyReluOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    float alpha = 0.0f) {
+  LeakyReluOptionsBuilder builder_(_fbb);
+  builder_.add_alpha(alpha);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SquaredDifferenceOptionsT : public flatbuffers::NativeTable {
+  typedef SquaredDifferenceOptions TableType;
+  SquaredDifferenceOptionsT() {
+  }
+};
+
+struct SquaredDifferenceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SquaredDifferenceOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  SquaredDifferenceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SquaredDifferenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SquaredDifferenceOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SquaredDifferenceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit SquaredDifferenceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SquaredDifferenceOptionsBuilder &operator=(const SquaredDifferenceOptionsBuilder &);
+  flatbuffers::Offset<SquaredDifferenceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SquaredDifferenceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  SquaredDifferenceOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct MirrorPadOptionsT : public flatbuffers::NativeTable {
+  typedef MirrorPadOptions TableType;
+  MirrorPadMode mode;
+  MirrorPadOptionsT()
+      : mode(MirrorPadMode_REFLECT) {
+  }
+};
+
+struct MirrorPadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MirrorPadOptionsT NativeTableType;
+  enum {
+    VT_MODE = 4
+  };
+  MirrorPadMode mode() const {
+    return static_cast<MirrorPadMode>(GetField<int8_t>(VT_MODE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_MODE) &&
+           verifier.EndTable();
+  }
+  MirrorPadOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MirrorPadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MirrorPadOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MirrorPadOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_mode(MirrorPadMode mode) {
+    fbb_.AddElement<int8_t>(MirrorPadOptions::VT_MODE, static_cast<int8_t>(mode), 0);
+  }
+  explicit MirrorPadOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MirrorPadOptionsBuilder &operator=(const MirrorPadOptionsBuilder &);
+  flatbuffers::Offset<MirrorPadOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MirrorPadOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    MirrorPadMode mode = MirrorPadMode_REFLECT) {
+  MirrorPadOptionsBuilder builder_(_fbb);
+  builder_.add_mode(mode);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -6805,6 +7228,15 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const ResizeNearestNeighborOptions *builtin_options_as_ResizeNearestNeighborOptions() const {
     return builtin_options_type() == BuiltinOptions_ResizeNearestNeighborOptions ? static_cast<const ResizeNearestNeighborOptions *>(builtin_options()) : nullptr;
   }
+  const LeakyReluOptions *builtin_options_as_LeakyReluOptions() const {
+    return builtin_options_type() == BuiltinOptions_LeakyReluOptions ? static_cast<const LeakyReluOptions *>(builtin_options()) : nullptr;
+  }
+  const SquaredDifferenceOptions *builtin_options_as_SquaredDifferenceOptions() const {
+    return builtin_options_type() == BuiltinOptions_SquaredDifferenceOptions ? static_cast<const SquaredDifferenceOptions *>(builtin_options()) : nullptr;
+  }
+  const MirrorPadOptions *builtin_options_as_MirrorPadOptions() const {
+    return builtin_options_type() == BuiltinOptions_MirrorPadOptions ? static_cast<const MirrorPadOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7132,6 +7564,18 @@ template<> inline const ResizeNearestNeighborOptions *Operator::builtin_options_
   return builtin_options_as_ResizeNearestNeighborOptions();
 }
 
+template<> inline const LeakyReluOptions *Operator::builtin_options_as<LeakyReluOptions>() const {
+  return builtin_options_as_LeakyReluOptions();
+}
+
+template<> inline const SquaredDifferenceOptions *Operator::builtin_options_as<SquaredDifferenceOptions>() const {
+  return builtin_options_as_SquaredDifferenceOptions();
+}
+
+template<> inline const MirrorPadOptions *Operator::builtin_options_as<MirrorPadOptions>() const {
+  return builtin_options_as_MirrorPadOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -7534,6 +7978,32 @@ inline flatbuffers::Offset<Model> CreateModelDirect(
 
 flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+inline CustomQuantizationT *CustomQuantization::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CustomQuantizationT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CustomQuantization::UnPackTo(CustomQuantizationT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = custom(); if (_e) { _o->custom.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->custom[_i] = _e->Get(_i); } } };
+}
+
+inline flatbuffers::Offset<CustomQuantization> CustomQuantization::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCustomQuantization(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CustomQuantizationT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _custom = _o->custom.size() ? _fbb.CreateVector(_o->custom) : 0;
+  return tflite::CreateCustomQuantization(
+      _fbb,
+      _custom);
+}
+
 inline QuantizationParametersT *QuantizationParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new QuantizationParametersT();
   UnPackTo(_o, _resolver);
@@ -7547,6 +8017,8 @@ inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const
   { auto _e = max(); if (_e) { _o->max.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->max[_i] = _e->Get(_i); } } };
   { auto _e = scale(); if (_e) { _o->scale.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scale[_i] = _e->Get(_i); } } };
   { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } };
+  { auto _e = details_type(); _o->details.type = _e; };
+  { auto _e = details(); if (_e) _o->details.value = QuantizationDetailsUnion::UnPack(_e, details_type(), _resolver); };
 }
 
 inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -7561,12 +8033,16 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
   auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0;
   auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0;
   auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
+  auto _details_type = _o->details.type;
+  auto _details = _o->details.Pack(_fbb);
   return tflite::CreateQuantizationParameters(
       _fbb,
       _min,
       _max,
       _scale,
-      _zero_point);
+      _zero_point,
+      _details_type,
+      _details);
 }
 
 inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -9591,6 +10067,81 @@ inline flatbuffers::Offset<RangeOptions> CreateRangeOptions(flatbuffers::FlatBuf
       _fbb);
 }
 
+inline LeakyReluOptionsT *LeakyReluOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LeakyReluOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LeakyReluOptions::UnPackTo(LeakyReluOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = alpha(); _o->alpha = _e; };
+}
+
+inline flatbuffers::Offset<LeakyReluOptions> LeakyReluOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLeakyReluOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LeakyReluOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _alpha = _o->alpha;
+  return tflite::CreateLeakyReluOptions(
+      _fbb,
+      _alpha);
+}
+
+inline SquaredDifferenceOptionsT *SquaredDifferenceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SquaredDifferenceOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SquaredDifferenceOptions::UnPackTo(SquaredDifferenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> SquaredDifferenceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSquaredDifferenceOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SquaredDifferenceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateSquaredDifferenceOptions(
+      _fbb);
+}
+
+inline MirrorPadOptionsT *MirrorPadOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MirrorPadOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void MirrorPadOptions::UnPackTo(MirrorPadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = mode(); _o->mode = _e; };
+}
+
+inline flatbuffers::Offset<MirrorPadOptions> MirrorPadOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMirrorPadOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MirrorPadOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _mode = _o->mode;
+  return tflite::CreateMirrorPadOptions(
+      _fbb,
+      _mode);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -9775,6 +10326,75 @@ inline flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_f
       _metadata_buffer);
 }
 
+inline bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type) {
+  switch (type) {
+    case QuantizationDetails_NONE: {
+      return true;
+    }
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<const CustomQuantization *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default: return false;
+  }
+}
+
+inline bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+  if (!values || !types) return !values && !types;
+  if (values->size() != types->size()) return false;
+  for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+    if (!VerifyQuantizationDetails(
+        verifier,  values->Get(i), types->GetEnum<QuantizationDetails>(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void *QuantizationDetailsUnion::UnPack(const void *obj, QuantizationDetails type, const flatbuffers::resolver_function_t *resolver) {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<const CustomQuantization *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    default: return nullptr;
+  }
+}
+
+inline flatbuffers::Offset<void> QuantizationDetailsUnion::Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher) const {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<const CustomQuantizationT *>(value);
+      return CreateCustomQuantization(_fbb, ptr, _rehasher).Union();
+    }
+    default: return 0;
+  }
+}
+
+inline QuantizationDetailsUnion::QuantizationDetailsUnion(const QuantizationDetailsUnion &u) FLATBUFFERS_NOEXCEPT : type(u.type), value(nullptr) {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      value = new CustomQuantizationT(*reinterpret_cast<CustomQuantizationT *>(u.value));
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+inline void QuantizationDetailsUnion::Reset() {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<CustomQuantizationT *>(value);
+      delete ptr;
+      break;
+    }
+    default: break;
+  }
+  value = nullptr;
+  type = QuantizationDetails_NONE;
+}
+
 inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type) {
   switch (type) {
     case BuiltinOptions_NONE: {
@@ -10076,6 +10696,18 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const ResizeNearestNeighborOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -10390,6 +11022,18 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const ResizeNearestNeighborOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -10692,6 +11336,18 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const ResizeNearestNeighborOptionsT *>(value);
       return CreateResizeNearestNeighborOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptionsT *>(value);
+      return CreateLeakyReluOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptionsT *>(value);
+      return CreateSquaredDifferenceOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptionsT *>(value);
+      return CreateMirrorPadOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -10994,6 +11650,18 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new ResizeNearestNeighborOptionsT(*reinterpret_cast<ResizeNearestNeighborOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      value = new LeakyReluOptionsT(*reinterpret_cast<LeakyReluOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      value = new SquaredDifferenceOptionsT(*reinterpret_cast<SquaredDifferenceOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      value = new MirrorPadOptionsT(*reinterpret_cast<MirrorPadOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -11371,6 +12039,21 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<LeakyReluOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<SquaredDifferenceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<MirrorPadOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index df448e8a880f680b34b0f87d214badea158ab57e..22ffed43cc0e08ac45a9a07077450d2642ba7f26 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -162,8 +162,10 @@ cc_library(
         ":test_runner",
         "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
         "//tensorflow/lite/delegates/flex:delegate",
         "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -248,8 +250,9 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
+        "//tensorflow/lite:string_util",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -263,6 +266,8 @@ cc_test(
     ],
     deps = [
         ":tf_driver",
+        "//tensorflow/lite:string_util",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index cdd0107221a10afbe18d4ff2adf7ad122716900f..b7e549cc5c119d73fde45251a88e9ea0b03330b1 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -370,7 +370,8 @@ def make_zip_of_tests(zip_path,
                       make_graph,
                       make_test_inputs,
                       extra_toco_options=ExtraTocoOptions(),
-                      use_frozen_graph=False):
+                      use_frozen_graph=False,
+                      expected_tf_success=None):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -390,6 +391,8 @@ def make_zip_of_tests(zip_path,
       `output_tensors` and returns tuple `(input_values, output_values)`.
     extra_toco_options: Additional toco options.
     use_frozen_graph: Whether or not freeze graph before toco converter.
+    expected_tf_success: Number of times tensorflow is supposed to succeed in
+      executing the input graphs. `None` means "unknown".
 
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
@@ -550,6 +553,11 @@ def make_zip_of_tests(zip_path,
                    " and %d TOCO converted graphs (%.1f%%"), zip_path,
                   total_conversions, tf_success, toco_success, percent)
 
+  if expected_tf_success is not None and tf_success != expected_tf_success:
+    raise RuntimeError(
+        "Expected TF to succeed %d times, but that happened %d times" %
+        (expected_tf_success, tf_success))
+
   if not FLAGS.ignore_toco_errors and toco_errors > 0:
     raise RuntimeError(
         "Found %d errors while generating toco models" % toco_errors)
@@ -747,6 +755,34 @@ def make_prelu_tests(zip_path):
       use_frozen_graph=True)
 
 
+def make_leaky_relu_tests(zip_path):
+  """Make a set of tests to do LeakyRelu."""
+
+  test_parameters = [
+      {
+          "input_shape": [[], [1], [5], [1, 10, 10, 3], [3, 3, 3, 3]],
+          "alpha": [0.1, 1.0, 2.0, -0.1, -1.0, -2.0],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.leaky_relu(input_tensor, alpha=parameters["alpha"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
@@ -780,33 +816,45 @@ def make_constant_tests(zip_path):
 def make_binary_op_tests(zip_path, binary_operator):
   """Make a set of tests to do binary ops with and without broadcast."""
 
-  # These parameters are split because we don't support broadcasting.
-  test_parameters = [{
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[1, 3, 4, 3]],
-      "input_shape_2": [[1, 3, 4, 3]],
-      "activation": [True]
-  }, {
-      "dtype": [tf.float32],
-      "input_shape_1": [[5]],
-      "input_shape_2": [[5]],
-      "activation": [False, True]
-  }, {
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[1, 3, 4, 3]],
-      "input_shape_2": [[3]],
-      "activation": [True, False]
-  }, {
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[3]],
-      "input_shape_2": [[1, 3, 4, 3]],
-      "activation": [True, False]
-  }, {
-      "dtype": [tf.float32],
-      "input_shape_1": [[]],
-      "input_shape_2": [[]],
-      "activation": [False]
-  }]
+  test_parameters = [
+      # Avoid creating all combinations to keep the test size small.
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape_1": [[1, 3, 4, 3]],
+          "input_shape_2": [[1, 3, 4, 3]],
+          "activation": [True],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape_1": [[5]],
+          "input_shape_2": [[5]],
+          "activation": [False, True],
+      },
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "input_shape_1": [[1, 3, 4, 3]],
+          "input_shape_2": [[3]],
+          "activation": [True, False],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape_1": [[3]],
+          "input_shape_2": [[1, 3, 4, 3]],
+          "activation": [True, False],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape_1": [[]],
+          "input_shape_2": [[]],
+          "activation": [False],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape_1": [[0]],
+          "input_shape_2": [[1]],
+          "activation": [False],
+      }
+  ]
 
   def build_graph(parameters):
     """Builds the graph given the current parameters."""
@@ -881,7 +929,13 @@ def make_reduce_tests(reduce_op,
     }, {
         "input_dtype": [tf.float32],
         "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
-        "axis": [None],
+        "axis": [[]],   # shape is: [0]
+        "const_axis": [False],
+        "keepdims": [True, False],
+    }, {
+        "input_dtype": [tf.float32],
+        "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+        "axis": [None],  # shape is: []
         "const_axis": [True],
         "keepdims": [True, False],
     }]
@@ -1123,6 +1177,10 @@ def make_floor_mod_tests(zip_path):
   make_binary_op_tests(zip_path, tf.floormod)
 
 
+def make_squared_difference_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.squared_difference)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -1130,9 +1188,9 @@ def make_gather_tests(zip_path):
       # TODO(mgubin): add string tests when they are supported by Toco.
       # TODO(mgubin): add tests for Nd indices when they are supported by
       # TfLite.
-      "params_dtype": [tf.float32, tf.int32],
+      "params_dtype": [tf.float32, tf.int32, tf.int64],
       "params_shape": [[10], [1, 2, 20]],
-      "indices_dtype": [tf.int32],
+      "indices_dtype": [tf.int32, tf.int64],
       "indices_shape": [[3], [5]],
       "axis": [-1, 0, 1],
   }]
@@ -1160,7 +1218,13 @@ def make_gather_tests(zip_path):
     return [params, indices], sess.run(
         outputs, feed_dict=dict(zip(inputs, [params, indices])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  # Note that TF can't execute with index=1 and params_shape=[10].
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=60)
 
 
 def make_global_batch_norm_tests(zip_path):
@@ -2456,6 +2520,32 @@ def make_strided_slice_1d_exhaustive_tests(zip_path):
   _make_strided_slice_tests(zip_path, test_parameters)
 
 
+def make_strided_slice_buggy_tests(zip_path):
+  """Make a set of tests to show strided_slice yields incorrect results."""
+
+  test_parameters = [{
+      "unused_iteration_counter": [1],
+  }]
+
+  def build_graph(parameters):
+    """Build the strided_slice op testing graph."""
+    del parameters
+    input_values = tf.placeholder(dtype=tf.float32, shape=[4, 2])
+    data = tf.constant([[0, 1, 2, 3],
+                        [4, 5, 6, 7],
+                        [8, 9, 10, 11],
+                        [12, 13, 14, 15]], tf.float32)
+    return [input_values], [input_values + data[:, :2]]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    del parameters
+    input_values = np.zeros([4, 2], dtype=np.float32)
+    return [input_values], sess.run(
+        outputs, feed_dict={inputs[0]: input_values})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_lstm_tests(zip_path):
   """Make a set of tests to do basic Lstm cell."""
 
@@ -3109,7 +3199,7 @@ def make_transpose_conv_tests(zip_path):
 def make_tile_tests(zip_path):
   """Make a set of tests to do tile."""
   test_parameters = [{
-      "input_dtype": [tf.float32, tf.int32],
+      "input_dtype": [tf.float32, tf.int32, tf.bool],
       "input_shape": [[3, 2, 1], [2, 2, 2]],
       "multiplier_dtype": [tf.int32, tf.int64],
       "multiplier_shape": [[3]]
@@ -3131,8 +3221,10 @@ def make_tile_tests(zip_path):
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
-    multipliers_value = create_tensor_data(parameters["multiplier_dtype"],
-                                           parameters["multiplier_shape"])
+    multipliers_value = create_tensor_data(
+        parameters["multiplier_dtype"],
+        parameters["multiplier_shape"],
+        min_value=0)
     return [input_value, multipliers_value], sess.run(
         outputs,
         feed_dict={
@@ -3237,12 +3329,30 @@ def make_sparse_to_dense_tests(zip_path):
 def make_pack_tests(zip_path):
   """Make a set of tests to do stack."""
 
-  test_parameters = [{
-      "base_shape": [[3, 4, 3], [3, 4], [5]],
-      "num_tensors": [1, 2, 3, 4, 5, 6],
-      "axis": [0, 1, 2, 3],
-      "additional_shape": [1, 2, 3],
-  }]
+  test_parameters = [
+      # Avoid creating all combinations to keep the test size small.
+      {
+          "dtype": [tf.float32],
+          "base_shape": [[3, 4, 3], [3, 4], [5]],
+          "num_tensors": [1, 2, 3, 4, 5, 6],
+          "axis": [0, 1, 2, 3],
+          "additional_shape": [1, 2, 3],
+      },
+      {
+          "dtype": [tf.int32],
+          "base_shape": [[3, 4, 3], [3, 4], [5]],
+          "num_tensors": [6],
+          "axis": [0, 1, 2, 3],
+          "additional_shape": [1, 2, 3],
+      },
+      {
+          "dtype": [tf.int64],
+          "base_shape": [[3, 4, 3], [3, 4], [5]],
+          "num_tensors": [5],
+          "axis": [0, 1, 2, 3],
+          "additional_shape": [1, 2, 3],
+      }
+  ]
 
   def get_shape(parameters):
     """Return a tweaked version of 'base_shape'."""
@@ -3256,7 +3366,9 @@ def make_pack_tests(zip_path):
     all_tensors = []
     for n in range(0, parameters["num_tensors"]):
       input_tensor = tf.placeholder(
-          dtype=tf.float32, name=("input%d" % n), shape=get_shape(parameters))
+          dtype=parameters["dtype"],
+          name=("input%d" % n),
+          shape=get_shape(parameters))
       all_tensors.append(input_tensor)
     out = tf.stack(all_tensors, parameters["axis"])
     return all_tensors, [out]
@@ -3384,6 +3496,32 @@ def make_logical_xor_tests(zip_path):
   return _make_logical_tests(tf.logical_xor)(zip_path)
 
 
+def make_unroll_batch_matmul_tests(zip_path):
+  """Make a set of tests to test unroll_batch_matmul."""
+
+  test_parameters = [{"dtype": [tf.float32], "shape": [[(2, 2, 3), (2, 3, 2)]]}]
+
+  def build_graph(parameters):
+    """Build the batch_matmul op testing graph."""
+    input_tensor1 = tf.placeholder(
+        dtype=parameters["dtype"], shape=parameters["shape"][0])
+    input_tensor2 = tf.placeholder(
+        dtype=parameters["dtype"], shape=parameters["shape"][1])
+    # Should be unrolled and replaced with fully_connected ops in the end.
+    out = tf.matmul(input_tensor1, input_tensor2)
+    return [input_tensor1, input_tensor2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(
+        parameters["dtype"], shape=parameters["shape"][0])
+    input_value2 = create_tensor_data(
+        parameters["dtype"], shape=parameters["shape"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index 49f7b527bb7587f793c338d429897a0c32226e37..91a4851fb0251d4e5f7e8fcd7146ee43fdfe99f2 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -94,6 +94,17 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/div.*activation=True.*dtype=tf\.int32)", "112968789"},
     {R"(^\/floor_div.*activation=True.*dtype=tf\.int32)", "112968789"},
     {R"(^\/floor_mod.*activation=True.*dtype=tf\.int32)", "112968789"},
+    {R"(^\/floor_mod.*activation=True.*dtype=tf\.int64)", "112968789"},
+
+    {R"(^\/sub.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/div.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/mul.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/add.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/squared_difference.*dtype=tf\.int64)", "119126484"},
+
+    // Strided Slice chooses the wrong dimension.
+    {R"(^\/strided_slice_buggy)", "119786029"},
 };
 
 // Allows test data to be unarchived into a temporary directory and makes
diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
index ce8ef0b19600213d9785ad6a0a8400fddb3f8edd..804e328d9da248859e806bd070de26a8f5aa37b4 100644
--- a/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.lite.python import convert_saved_model as _convert_saved_model
 from tensorflow.lite.python import lite as _lite
-from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
 from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
@@ -166,18 +166,20 @@ def evaluate_keras_model(filename):
   return lambda input_data: [keras_model.predict(input_data)]
 
 
-# TODO(nupurgarg): Make this function a parameter to test_frozen_graph (and
-# related functions) in order to make it easy to use different data generators.
-def compare_models_random_data(tflite_model, tf_eval_func, tolerance=5):
-  """Compares TensorFlow and TFLite models with random data.
+def compare_models(tflite_model, tf_eval_func, input_data=None, tolerance=5):
+  """Compares TensorFlow and TFLite models.
+
+  Unless the input data is provided, the models are compared with random data.
 
   Args:
     tflite_model: Serialized TensorFlow Lite model.
     tf_eval_func: Lambda function that takes in input data and outputs the
       results of the TensorFlow model ([np.ndarray data] : [np.ndarray result]).
+    input_data: np.ndarray to pass into models during inference. (default None)
     tolerance: Decimal place to check accuracy to. (default 5)
   """
-  input_data = _generate_random_input_data(tflite_model)
+  if input_data is None:
+    input_data = _generate_random_input_data(tflite_model)
   tf_results = tf_eval_func(input_data)
   tflite_results = _evaluate_tflite_model(tflite_model, input_data)
   for tf_result, tflite_result in zip(tf_results, tflite_results):
@@ -253,6 +255,7 @@ def test_frozen_graph(filename,
                       input_arrays,
                       output_arrays,
                       input_shapes=None,
+                      input_data=None,
                       **kwargs):
   """Validates the TensorFlow frozen graph converts to a TFLite model.
 
@@ -267,6 +270,7 @@ def test_frozen_graph(filename,
       integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
       Automatically determined when input shapes is None (e.g., {"foo" : None}).
         (default None)
+    input_data: np.ndarray to pass into models during inference. (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
   converter = _lite.TFLiteConverter.from_frozen_graph(
@@ -274,13 +278,14 @@ def test_frozen_graph(filename,
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_frozen_graph(filename, input_arrays, output_arrays)
-  compare_models_random_data(tflite_model, tf_eval_func)
+  compare_models(tflite_model, tf_eval_func, input_data=input_data)
 
 
 def test_saved_model(directory,
                      input_shapes=None,
                      tag_set=None,
                      signature_key=None,
+                     input_data=None,
                      **kwargs):
   """Validates the TensorFlow SavedModel converts to a TFLite model.
 
@@ -296,6 +301,7 @@ def test_saved_model(directory,
     tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
       analyze. All tags in the tag set must be present.
     signature_key: Key identifying SignatureDef containing inputs and outputs.
+    input_data: np.ndarray to pass into models during inference. (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
   converter = _lite.TFLiteConverter.from_saved_model(
@@ -306,10 +312,14 @@ def test_saved_model(directory,
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_saved_model(directory, tag_set, signature_key)
-  compare_models_random_data(tflite_model, tf_eval_func)
+  compare_models(tflite_model, tf_eval_func, input_data=input_data)
 
 
-def test_keras_model(filename, input_arrays=None, input_shapes=None, **kwargs):
+def test_keras_model(filename,
+                     input_arrays=None,
+                     input_shapes=None,
+                     input_data=None,
+                     **kwargs):
   """Validates the tf.keras model converts to a TFLite model.
 
   Converts the tf.keras model to TFLite and checks the accuracy of the model on
@@ -322,6 +332,7 @@ def test_keras_model(filename, input_arrays=None, input_shapes=None, **kwargs):
       integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
       Automatically determined when input shapes is None (e.g., {"foo" : None}).
         (default None)
+    input_data: np.ndarray to pass into models during inference. (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
   converter = _lite.TFLiteConverter.from_keras_model_file(
@@ -329,4 +340,4 @@ def test_keras_model(filename, input_arrays=None, input_shapes=None, **kwargs):
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_keras_model(filename)
-  compare_models_random_data(tflite_model, tf_eval_func)
+  compare_models(tflite_model, tf_eval_func, input_data=input_data)
diff --git a/tensorflow/lite/testing/model_coverage/testdata/grace_hopper.jpg b/tensorflow/lite/testing/model_coverage/testdata/grace_hopper.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d2a427810f679db537236c5430873a81a62ef412
Binary files /dev/null and b/tensorflow/lite/testing/model_coverage/testdata/grace_hopper.jpg differ
diff --git a/tensorflow/lite/testing/test_runner.h b/tensorflow/lite/testing/test_runner.h
index 303155b072bc3a607a83fbe5fa4323b6d485cc8f..7cda8b5ec1366f962080e0198dbb7f7f0856707e 100644
--- a/tensorflow/lite/testing/test_runner.h
+++ b/tensorflow/lite/testing/test_runner.h
@@ -54,12 +54,12 @@ class TestRunner {
 
   // Define the contents of the given input tensor. The given 'id' is
   // guaranteed to be one of the ids returned by GetInputs().
-  virtual void SetInput(int id, const string& csv_values) = 0;
+  virtual void SetInput(int id, const string& values_as_string) = 0;
 
   // Define what should be expected for an output tensor after Invoke() runs.
   // The given 'id' is guaranteed to be one of the ids returned by
   // GetOutputs().
-  virtual void SetExpectation(int id, const string& csv_values) = 0;
+  virtual void SetExpectation(int id, const string& values_as_string) = 0;
 
   // Run the model.
   virtual void Invoke() = 0;
diff --git a/tensorflow/lite/testing/tf_driver.cc b/tensorflow/lite/testing/tf_driver.cc
index 36c556ba0495093341c641b845471196b4a13530..ffd76e8dc7eeb46404838ba29789ad5f446de2bf 100644
--- a/tensorflow/lite/testing/tf_driver.cc
+++ b/tensorflow/lite/testing/tf_driver.cc
@@ -17,9 +17,11 @@ limitations under the License.
 #include <fstream>
 #include <iostream>
 
+#include "absl/strings/escaping.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/join.h"
 #include "tensorflow/lite/testing/split.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tflite {
 namespace testing {
@@ -34,13 +36,38 @@ tensorflow::Tensor CreateTensor(const tensorflow::DataType type,
 }
 
 template <typename T>
-void FillTensorWithData(tensorflow::Tensor* tensor, const string& csv_values) {
-  auto data = tensor->flat<T>();
+int FillTensorWithData(tensorflow::Tensor* tensor,
+                       const string& values_as_string) {
+  const auto& values = testing::Split<T>(values_as_string, ",");
+
+  if (values.size() == tensor->NumElements()) {
+    auto data = tensor->flat<T>();
+    for (int i = 0; i < values.size(); i++) {
+      data(i) = values[i];
+    }
+  }
 
-  const auto& values = testing::Split<T>(csv_values, ",");
-  for (int i = 0; i < values.size(); i++) {
-    data(i) = values[i];
+  return values.size();
+}
+
+// Assumes 'values_as_string' is a hex string that gets converted into a
+// TF Lite DynamicBuffer. Strings are then extracted and copied into the
+// TensorFlow tensor.
+int FillTensorWithTfLiteHexString(tensorflow::Tensor* tensor,
+                                  const string& values_as_string) {
+  string s = absl::HexStringToBytes(values_as_string);
+
+  int num_strings = values_as_string.empty() ? 0 : GetStringCount(s.data());
+
+  if (num_strings == tensor->NumElements()) {
+    auto data = tensor->flat<string>();
+    for (size_t i = 0; i < num_strings; ++i) {
+      auto ref = GetString(s.data(), i);
+      data(i).assign(ref.str, ref.len);
+    }
   }
+
+  return num_strings;
 }
 
 template <typename T>
@@ -57,6 +84,22 @@ string TensorDataToCsvString(const tensorflow::Tensor& tensor) {
   return Join(data.data(), data.size(), ",");
 }
 
+string TensorDataToTfLiteHexString(const tensorflow::Tensor& tensor) {
+  DynamicBuffer dynamic_buffer;
+
+  auto data = tensor.flat<string>();
+  for (int i = 0; i < tensor.NumElements(); ++i) {
+    dynamic_buffer.AddString(data(i).data(), data(i).size());
+  }
+
+  char* char_buffer = nullptr;
+  size_t size = dynamic_buffer.WriteToBuffer(&char_buffer);
+  string s = absl::BytesToHexString({char_buffer, size});
+  free(char_buffer);
+
+  return s;
+}
+
 }  // namespace
 
 TfDriver::TfDriver(const std::vector<string>& input_layer,
@@ -107,28 +150,44 @@ void TfDriver::LoadModel(const string& bin_file_path) {
   }
 }
 
-void TfDriver::SetInput(int id, const string& csv_values) {
-  if (!IsValid()) return;
-
-  auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
-  switch (input_types_[id]) {
-    case tensorflow::DT_FLOAT: {
-      FillTensorWithData<float>(&tensor, csv_values);
+void TfDriver::SetInput(const string& values_as_string,
+                        tensorflow::Tensor* tensor) {
+  int num_values_available = 0;
+  switch (tensor->dtype()) {
+    case tensorflow::DT_FLOAT:
+      num_values_available =
+          FillTensorWithData<float>(tensor, values_as_string);
       break;
-    }
-    case tensorflow::DT_INT32: {
-      FillTensorWithData<int32_t>(&tensor, csv_values);
+    case tensorflow::DT_INT32:
+      num_values_available =
+          FillTensorWithData<int32_t>(tensor, values_as_string);
       break;
-    }
-    case tensorflow::DT_UINT8: {
-      FillTensorWithData<uint8_t>(&tensor, csv_values);
+    case tensorflow::DT_UINT8:
+      num_values_available =
+          FillTensorWithData<uint8_t>(tensor, values_as_string);
+      break;
+    case tensorflow::DT_STRING:
+      num_values_available =
+          FillTensorWithTfLiteHexString(tensor, values_as_string);
       break;
-    }
     default:
-      fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              tensorflow::DataType_Name(tensor->dtype()),
+                              " in SetInput"));
       return;
   }
+
+  if (tensor->NumElements() != num_values_available) {
+    Invalidate(absl::StrCat("Needed ", tensor->NumElements(),
+                            " values for input tensor, but was given ",
+                            num_values_available, " instead."));
+  }
+}
+
+void TfDriver::SetInput(int id, const string& values_as_string) {
+  if (!IsValid()) return;
+  auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
+  SetInput(values_as_string, &tensor);
   input_tensors_[input_names_[id]] = tensor;
 }
 
@@ -145,43 +204,54 @@ void TfDriver::ResetTensor(int id) {
       break;
     }
     default:
-      fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ", input_types_[id],
+                              tensorflow::DataType_Name(input_types_[id]),
+                              " in ResetInput"));
       return;
   }
 }
 
-void TfDriver::ReshapeTensor(int id, const string& csv_values) {
-  input_shapes_[id] = Split<int64_t>(csv_values, ",");
+void TfDriver::ReshapeTensor(int id, const string& values_as_string) {
+  input_shapes_[id] = Split<int64_t>(values_as_string, ",");
   input_tensors_[input_names_[id]] =
       CreateTensor(input_types_[id], input_shapes_[id]);
   ResetTensor(id);
 }
 
-string TfDriver::ReadOutput(int id) {
-  if (!IsValid()) return "";
-  switch (output_tensors_[id].dtype()) {
+string TfDriver::ReadOutput(const tensorflow::Tensor& tensor) {
+  switch (tensor.dtype()) {
     case tensorflow::DT_FLOAT:
-      return TensorDataToCsvString<float>(output_tensors_[id]);
+      return TensorDataToCsvString<float>(tensor);
     case tensorflow::DT_INT32:
-      return TensorDataToCsvString<int32_t>(output_tensors_[id]);
+      return TensorDataToCsvString<int32_t>(tensor);
+    case tensorflow::DT_INT64:
+      return TensorDataToCsvString<tensorflow::int64>(tensor);
     case tensorflow::DT_UINT8:
-      return TensorDataToCsvString<uint8_t>(output_tensors_[id]);
+      return TensorDataToCsvString<uint8_t>(tensor);
+    case tensorflow::DT_STRING:
+      return TensorDataToTfLiteHexString(tensor);
+    case tensorflow::DT_BOOL:
+      return TensorDataToCsvString<bool>(tensor);
     default:
-      fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              tensorflow::DataType_Name(tensor.dtype()),
+                              " in ReadOutput"));
       return "";
   }
 }
 
+string TfDriver::ReadOutput(int id) {
+  if (!IsValid()) return "";
+  return ReadOutput(output_tensors_[id]);
+}
+
 void TfDriver::Invoke() {
   if (!IsValid()) return;
   auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
                               output_names_, {}, &output_tensors_);
   if (!status.ok()) {
-    Invalidate(
-        "Failed to run input data on graph. Make sure the correct value is "
-        "defined for the input and output arrays.");
+    Invalidate(absl::StrCat("TensorFlow failed to run graph:",
+                            status.error_message()));
   }
 }
 
diff --git a/tensorflow/lite/testing/tf_driver.h b/tensorflow/lite/testing/tf_driver.h
index f10689cb58c1753ac8743da00f2ba8322a6270a7..46b18980b95fd109fbfe17c0c221cf1bf02dbac6 100644
--- a/tensorflow/lite/testing/tf_driver.h
+++ b/tensorflow/lite/testing/tf_driver.h
@@ -39,23 +39,27 @@ class TfDriver : public TestRunner {
   ~TfDriver() override {}
 
   void LoadModel(const string& bin_file_path) override;
-  void SetInput(int id, const string& csv_values) override;
+  void SetInput(int id, const string& values_as_string) override;
   void Invoke() override;
   string ReadOutput(int id) override;
 
   const std::vector<int>& GetInputs() override { return input_ids_; }
   const std::vector<int>& GetOutputs() override { return output_ids_; }
-  void ReshapeTensor(int id, const string& csv_values) override;
+  void ReshapeTensor(int id, const string& values_as_string) override;
   // Note: ResetTensor only works for input tensor.
   void ResetTensor(int id) override;
 
   // no-op. SetInput will overwrite existing data .
   void AllocateTensors() override {}
   // no-op. Tf driver is not supposed to check the results.
-  void SetExpectation(int id, const string& csv_values) override {}
+  void SetExpectation(int id, const string& values_as_string) override {}
   // tf driver is not supposed to check the results.
   bool CheckResults() override { return false; }
 
+ protected:
+  void SetInput(const string& values_as_string, tensorflow::Tensor*);
+  string ReadOutput(const tensorflow::Tensor& tensor);
+
  private:
   std::unique_ptr<tensorflow::Session> session_;
   std::vector<int> input_ids_;
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index d178ccf1e3f7d824cb8f887441be466be5345307..4381fe4c19dc2240ed2335495276e3a5dab91022 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/strings/escaping.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace testing {
@@ -23,6 +25,68 @@ namespace {
 
 using ::testing::ElementsAre;
 
+class TestDriver : public TfDriver {
+ public:
+  // No need for a full TfDriver. We just want to test the read/write methods.
+  TestDriver() : TfDriver({}, {}, {}, {}) {}
+  string WriteAndReadBack(tensorflow::DataType type,
+                          const std::vector<int64_t>& shape,
+                          const string& values) {
+    tensorflow::Tensor t = {
+        type,
+        tensorflow::TensorShape{tensorflow::gtl::ArraySlice<tensorflow::int64>{
+            reinterpret_cast<const tensorflow::int64*>(shape.data()),
+            shape.size()}}};
+    SetInput(values, &t);
+    return ReadOutput(t);
+  }
+};
+
+TEST(TfDriverTest, ReadingAndWrintingValues) {
+  TestDriver driver;
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_FLOAT, {1, 2, 2},
+                                    "0.10,0.20,0.30,0.40"),
+            "0.1,0.2,0.3,0.4");
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_INT32, {1, 2, 2},
+                                    "10,40,100,-100"),
+            "10,40,100,-100");
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_UINT8, {1, 2, 2},
+                                    "48,49,121, 122"),
+            "0,1,y,z");
+}
+
+TEST(TfDriverTest, ReadingAndWrintingValuesStrings) {
+  TestDriver driver;
+
+  auto set_buffer = [](const std::vector<string>& values, string* buffer) {
+    DynamicBuffer dynamic_buffer;
+    for (const string& s : values) {
+      dynamic_buffer.AddString(s.data(), s.size());
+    }
+
+    char* char_b = nullptr;
+    int size = dynamic_buffer.WriteToBuffer(&char_b);
+    *buffer = absl::BytesToHexString(absl::string_view(char_b, size));
+    free(char_b);
+  };
+
+  string buffer;
+
+  set_buffer({"", "", "", ""}, &buffer);
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_STRING, {1, 2, 2}, buffer),
+            buffer);
+
+  // Note that if we pass the empty string we get the "empty" buffer (where all
+  // the strings are empty).
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_STRING, {1, 2, 2}, ""),
+            buffer);
+
+  set_buffer({"AB", "ABC", "X", "YZ"}, &buffer);
+
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_STRING, {1, 2, 2}, buffer),
+            buffer);
+}
+
 TEST(TfDriverTest, SimpleTest) {
   std::unique_ptr<TfDriver> runner(
       new TfDriver({"a", "b", "c", "d"}, {"float", "float", "float", "float"},
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index ada5d74912bc844ac11d3127d6c9b1107c229cb7..3a0febb780c331178a36fdfa72ba5d59c260a331 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -16,8 +16,10 @@ limitations under the License.
 
 #include <iostream>
 
+#include "absl/strings/escaping.h"
 #include "tensorflow/lite/builtin_op_data.h"
 #include "tensorflow/lite/delegates/flex/delegate.h"
+#include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/split.h"
 
 namespace tflite {
@@ -105,6 +107,7 @@ class TfLiteDriver::Expectation {
     if (tensor_size != num_elements_) {
       std::cerr << "Expected a tensor with " << num_elements_
                 << " elements, got " << tensor_size << std::endl;
+      std::cerr << "while checking tensor " << tensor.name << std::endl;
       return false;
     }
 
@@ -143,7 +146,11 @@ TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
   }
 }
 
-TfLiteDriver::~TfLiteDriver() {}
+TfLiteDriver::~TfLiteDriver() {
+  for (TfLiteTensor* t : tensors_to_deallocate_) {
+    free(t->data.raw);
+  }
+}
 
 void TfLiteDriver::AllocateTensors() {
   if (must_allocate_tensors_) {
@@ -173,9 +180,7 @@ void TfLiteDriver::LoadModel(const string& bin_file_path) {
   interpreter_->UseNNAPI(use_nnapi_);
 
   if (delegate_) {
-    if (interpreter_->ModifyGraphWithDelegate(delegate_.get(),
-                                              /*allow_dynamic_tensors=*/true) !=
-        kTfLiteOk) {
+    if (interpreter_->ModifyGraphWithDelegate(delegate_.get()) != kTfLiteOk) {
       Invalidate("Unable to the build graph using the delegate");
       return;
     }
@@ -234,6 +239,17 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       SetTensorData(values, &tensor->data);
       break;
     }
+    case kTfLiteString: {
+      string s = absl::HexStringToBytes(csv_values);
+
+      tensor->data.raw = reinterpret_cast<char*>(malloc(s.size()));
+      tensor->bytes = s.size();
+      memcpy(tensor->data.raw, s.data(), s.size());
+
+      // We must remember to free the memory we allocated above.
+      tensors_to_deallocate_.push_back(tensor);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index 785baf0f004f3344393373ee148957d9a0c3820f..d8b40565bacd181df9f3ed114d76e6c003e645e5 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -59,6 +59,7 @@ class TfLiteDriver : public TestRunner {
   std::unique_ptr<Interpreter> interpreter_;
   std::map<int, std::unique_ptr<Expectation>> expected_output_;
   bool must_allocate_tensors_ = true;
+  std::vector<TfLiteTensor*> tensors_to_deallocate_;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 14302874441c4af49909dd3ba5b3bee78c421c45..82aa1f557efec04a7af3ef5e4d8b2ceb51f42a62 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -395,6 +395,28 @@ tf_cc_test(
 
 # :toco is the main public command-line tool exposing the functionality
 # of the :toco_tooling library.
+cc_library(
+    name = "toco_convert",
+    srcs = ["toco_convert.cc"],
+    hdrs = ["toco_convert.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
 tf_cc_binary(
     name = "toco",
     srcs = ["toco.cc"],
@@ -404,6 +426,7 @@ tf_cc_binary(
         ":model_cmdline_flags",
         ":model_flags_proto_cc",
         ":toco_cmdline_flags",
+        ":toco_convert",
         ":toco_flags_proto_cc",
         ":toco_port",
         ":toco_tooling",
@@ -416,6 +439,29 @@ tf_cc_binary(
     ],
 )
 
+tf_cc_test(
+    name = "toco_convert_test",
+    srcs = ["toco_convert_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_convert",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_googletest//:gtest_main",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
 tf_cc_test(
     name = "toco_port_test",
     srcs = ["toco_port_test.cc"],
diff --git a/tensorflow/lite/toco/README.md b/tensorflow/lite/toco/README.md
index 91f6f618a376ff4df7c51dfd285152229f4757cc..bd8f8282f0fdbe21700921d0c0a5de533f4383a7 100644
--- a/tensorflow/lite/toco/README.md
+++ b/tensorflow/lite/toco/README.md
@@ -8,9 +8,9 @@ the usage documentation.
 
 Usage information is given in these documents:
 
-*   [Command-line glossary](g3doc/cmdline_reference.md)
-*   [Command-line examples](g3doc/cmdline_examples.md)
-*   [Python API examples](g3doc/python_api.md)
+*   [Command-line glossary](../g3doc/convert/cmdline_reference.md)
+*   [Command-line examples](../g3doc/convert/cmdline_examples.md)
+*   [Python API examples](../g3doc/convert/python_api.md)
 
 ## Where the converter fits in the TensorFlow landscape
 
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index 1752745aaee987e1ef029523ce12d05a4a80cdce..bdc3a5b0fb453ded74859feb4550a886409eb447 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -48,7 +48,8 @@ using tensorflow::TensorProto;
 namespace toco {
 namespace {
 
-tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
+tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type,
+                                           const string& error_location) {
   switch (data_type) {
     case ArrayDataType::kBool:
       return tensorflow::DT_BOOL;
@@ -66,14 +67,21 @@ tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
       return tensorflow::DT_COMPLEX64;
     default:
     case ArrayDataType::kNone:
-      LOG(FATAL) << "Unsupported data type: " << static_cast<int>(data_type);
+      LOG(FATAL) << "Unsupported data type '" << ArrayDataTypeName(data_type)
+                 << "' in " << error_location;
       return tensorflow::DT_INVALID;
   }
 }
 
+tensorflow::DataType GetTensorFlowDataTypeForOp(ArrayDataType data_type,
+                                                const string& op_name) {
+  return GetTensorFlowDataType(data_type, "op '" + op_name + "'");
+}
+
 tensorflow::DataType GetTensorFlowDataType(const Model& model,
                                            const string& array_name) {
-  return GetTensorFlowDataType(model.GetArray(array_name).data_type);
+  return GetTensorFlowDataType(model.GetArray(array_name).data_type,
+                               "array '" + array_name + "'");
 }
 
 // TensorFlow sometimes forbids what it calls "legacy scalars",
@@ -1285,7 +1293,7 @@ void ConvertRangeOperator(const Model& model, const RangeOperator& src_op,
   *range_op->add_input() = src_op.inputs[1];
   *range_op->add_input() = src_op.inputs[2];
   (*range_op->mutable_attr())["Tidx"].set_type(
-      GetTensorFlowDataType(src_op.dtype));
+      GetTensorFlowDataTypeForOp(src_op.dtype, /*op_name=*/src_op.outputs[0]));
 }
 
 void ConvertPackOperator(const Model& model, const PackOperator& src_op,
@@ -1298,7 +1306,8 @@ void ConvertPackOperator(const Model& model, const PackOperator& src_op,
   }
   (*pack_op->mutable_attr())["axis"].set_i(src_op.axis);
   (*pack_op->mutable_attr())["N"].set_i(src_op.inputs.size());
-  (*pack_op->mutable_attr())["T"].set_type(GetTensorFlowDataType(src_op.dtype));
+  (*pack_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataTypeForOp(src_op.dtype, src_op.outputs[0]));
 }
 
 void ConvertFillOperator(const Model& model, const FillOperator& src_op,
@@ -1887,7 +1896,7 @@ void ConvertRandomUniformOperator(const Model& model,
       GetTensorFlowDataType(model, src_op.inputs[0]);
   (*new_op->mutable_attr())["T"].set_type(shape_type);
   (*new_op->mutable_attr())["dtype"].set_type(
-      GetTensorFlowDataType(src_op.dtype));
+      GetTensorFlowDataTypeForOp(src_op.dtype, src_op.outputs[0]));
   (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
   (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
 }
diff --git a/tensorflow/lite/toco/g3doc/README.md b/tensorflow/lite/toco/g3doc/README.md
index 2153b6cc6360a7a0e0375600c83b0c0945d3b326..e1be8fab3ad39e682a942490bf4af2674c3cf9b3 100644
--- a/tensorflow/lite/toco/g3doc/README.md
+++ b/tensorflow/lite/toco/g3doc/README.md
@@ -1,3 +1,3 @@
 # TOCO
 
-These files have moved to [../../g3doc/tflite_convert](../../g3doc/tflite_convert)
+These files have moved to [../../g3doc/convert](../../g3doc/convert)
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 78ea54e452b9dd163aa75349162493a2abe72707..664424860edfd3b86b6f5da43320e2ce82a6d8af 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1714,6 +1714,7 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
     case OperatorType::kPRelu:
+    case OperatorType::kLeakyRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
     case OperatorType::kLog:
@@ -1759,6 +1760,7 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
     case OperatorType::kEqual:
     case OperatorType::kNotEqual:
     case OperatorType::kPow:
+    case OperatorType::kSquaredDifference:
       ProcessSimpleBinaryOperator(model, op);
       break;
     case OperatorType::kAddN:
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index e28b7288f0102a6b03dff61c3e1b6aeb3dd1adbe..1146078c301fd1b880c99da23e5be8223efe31e3 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -64,7 +64,8 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
          type == OperatorType::kShape || type == OperatorType::kExpandDims ||
          type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
-         type == OperatorType::kResizeNearestNeighbor;
+         type == OperatorType::kResizeNearestNeighbor ||
+         type == OperatorType::kPRelu;
 }
 
 // The quantized op allows output arrays of type float using
@@ -360,7 +361,7 @@ bool ChooseQuantizationForOperatorOutput(
       op.type == OperatorType::kSpaceToDepth ||
       op.type == OperatorType::kReshape || op.type == OperatorType::kSplit ||
       op.type == OperatorType::kRelu || op.type == OperatorType::kRelu1 ||
-      op.type == OperatorType::kRelu6) {
+      op.type == OperatorType::kRelu6 || op.type == OperatorType::kPRelu) {
     int data_input_index = 0;
     if (op.type == OperatorType::kSplit) {
       data_input_index = 1;
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
index ea5d33009b4b602abf4de8b310c456f142737c7d..9ceba45e93fee10c820f2b0ba01a5948be0787b6 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
@@ -35,6 +35,11 @@ bool ResolveAttributes(Model* model, T* op) {
 
   const Array& indices_array = model->GetArray(op->inputs[1]);
   if (!indices_array.has_shape()) return false;
+
+  // It is ok for indices_array to have a shape for an empty tensor. In that
+  // case, we don't bother setting 'axis'.
+  if (indices_array.buffer->Length() == 0) return false;
+
   op->axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
   return true;
 }
diff --git a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index d59954fc740ed91ce041948ff76a029ea294017b..41a735394d714b65a4c9fc309927e34a7f610431 100644
--- a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -117,7 +117,8 @@ namespace toco {
     auto* slice_b_op = new SliceOperator;
     slice_b_op->inputs = {
         batch_op->inputs[1],
-        CreateInt32Array(model, batch_name + "/slice_b/slice/begin", {0, 0, 0}),
+        CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
+                         {batch, 0, 0}),
         CreateInt32Array(
             model, batch_name + "/slice_b/slice/size",
             {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index 86d55f3e15d85cd93cf796a880c64a78ff3bbc10..4c3a0717e7452cd35455169701c87d201c125cd6 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -199,23 +199,35 @@ tensorflow::Status ImportShape(
         input_dims,
     int* input_flat_size, Shape* shape) {
   std::vector<int> input_dims_only_sizes;
+  bool zero_sized_shape = false;
   for (auto& d : input_dims) {
-    if (d.size() == 0) {
-      // Some TensorFlow shapes contain a 0 dim, effectively making
-      // them of flat size 0 even though they have other nonzero dims.
-      // This breaks our invariant, that array dims can't be 0.
-      // For now, tweaking this to record a 0-D shape instead.
-      shape->mutable_dims()->clear();
-      if (input_flat_size != nullptr) *input_flat_size = 0;
-      return tensorflow::Status::OK();
-    }
     // TensorFlow's shapes use int64s, while TOCO uses ints.
     if (d.size() > std::numeric_limits<int>::max()) {
       return tensorflow::errors::InvalidArgument("Shape element overflows");
     }
-
+    if (d.size() == 0) {
+      zero_sized_shape = true;
+    }
     input_dims_only_sizes.push_back(d.size());
   }
+
+  // Note that up to this point we were OK with the input shape containing
+  // elements valued -1 or 0, which are perfectly legal in tensorflow. However
+  // our CheckValidShapeDimensions() insists on them being >= 1, with the
+  // exception of the "scalar" shape [0]. The main issue with zero-values shape
+  // elements is that the corresponding arrays don't contain any data and the
+  // allocation code gets a bit confused. It seems that the code expects an
+  // empty shape for zero-sized shapes, so we will do just that, except for the
+  // [0] case.
+  // TODO(b/119325030): In order to correctly import the "scalar" shapes the
+  // following test must include "&& input_dims_only_sizes.size() > 1", but
+  // that seems to slow everything down a lot.
+  if (zero_sized_shape) {
+    shape->mutable_dims()->clear();
+    if (input_flat_size != nullptr) *input_flat_size = 0;
+    return tensorflow::Status::OK();
+  }
+
   *shape->mutable_dims() = input_dims_only_sizes;
 
   if (input_flat_size == nullptr) return tensorflow::Status::OK();
@@ -1122,28 +1134,53 @@ tensorflow::Status ConvertConcatOperator(
   return tensorflow::Status::OK();
 }
 
+static constexpr int kAnyNumInputs = -1;
+
+enum FlexSupport { kFlexOk, kFlexNotOk };
+
 // This method supports simple operators without additional attributes.
-template <typename Op>
-tensorflow::Status ConvertSimpleOperator(
+// Converts a simple operator that takes no attributes. The list of inputs is
+// taken from the given NodeDef, and its number must match NumInputs, unless
+// kAnyNumInputs is passed in. If kFlexOk is passed in the resulting operator
+// will be eligible for being exported as a flex op.
+template <typename Op, int NumInputs, FlexSupport flex>
+tensorflow::Status ConvertSimpleOperatorGeneric(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
+  if (NumInputs != kAnyNumInputs) {
+    TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, NumInputs));
+  }
   auto* op = new Op;
   const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+
+  if (flex == kFlexOk) {
+    RetainTensorFlowNodeDef(node, op);
+  }
+
   model->operators.emplace_back(op);
   return tensorflow::Status::OK();
 }
 
-// This method supports simple operators without additional attributes.
-template <typename Op, unsigned int NumInputs>
+// Convert a simple operator which is not valid as a flex op.
+template <typename Op, int NumInputs = kAnyNumInputs>
 tensorflow::Status ConvertSimpleOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, NumInputs));
-  return ConvertSimpleOperator<Op>(node, tf_import_flags, model);
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexNotOk>(
+      node, tf_import_flags, model);
+}
+
+// Convert a simple operator which is valid as a flex op.
+template <typename Op, int NumInputs = kAnyNumInputs>
+tensorflow::Status ConvertSimpleOperatorFlexOk(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexOk>(
+      node, tf_import_flags, model);
 }
 
 void GetOutputNamesFromNodeDef(const NodeDef& node,
@@ -2183,6 +2220,21 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertLeakyReluOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "LeakyRelu");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  const auto& input_name = node.input(0);
+  auto* op = new LeakyReluOperator;
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  op->alpha = GetFloatAttr(node, "alpha");
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 namespace internal {
@@ -2196,6 +2248,7 @@ ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
   return std::unordered_map<std::string, ConverterType>({
       // We need to let TCO convert Placeholder information into
       // array data, so that the data types are correct.
+      {"LegacyFedInput", ConvertPlaceholderOperator},
       {"Placeholder", ConvertPlaceholderOperator},
   });
 }
@@ -2203,7 +2256,7 @@ ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
       {"Add", ConvertSimpleOperator<AddOperator, 2>},
-      {"AddN", ConvertSimpleOperator<AddNOperator>},
+      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator>},
       {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
@@ -2245,6 +2298,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
        ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>},
       {"Identity", ConvertIdentityOperator},
       {"LRN", ConvertLRNOperator},
+      {"LeakyRelu", ConvertLeakyReluOperator},
       {"LegacyFedInput", ConvertPlaceholderOperator},
       {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
       {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
@@ -2297,6 +2351,8 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Split", ConvertSplitOperator},
       {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1>},
       {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1>},
+      {"SquaredDifference",
+       ConvertSimpleOperator<SquaredDifferenceOperator, 2>},
       {"Squeeze", ConvertSqueezeOperator},
       {"StopGradient", ConvertIdentityOperator},
       {"StridedSlice", ConvertStridedSliceOperator},
diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc
index 30aa725f1db8748db30f11693800d964f37e6107..0be358b1f7be2cc632322558eda3da86d16688af 100644
--- a/tensorflow/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/lite/toco/import_tensorflow_test.cc
@@ -90,96 +90,73 @@ NodeDef BuildNode(
   return node;
 }
 
-class ShapeImportTest : public ::testing::TestWithParam<tensorflow::DataType> {
- protected:
-  ShapeImportTest() {}
-
-  void BuildConstNode(std::initializer_list<int64_t> shape,
-                      tensorflow::DataType dtype, int64_t num_elements,
-                      NodeDef* node) {
-    node->set_op("Const");
-    node->set_name("Node1");
-
-    // An attribute describing the type of this const node.
-    AttrValue dtype_attr;
-    SetAttrValue(dtype, &dtype_attr);
-    (*node->mutable_attr())["dtype"] = dtype_attr;
-
-    // An attribute describing the content of this const node.
-    tensorflow::TensorProto t;
-    t.set_dtype(dtype);
-    auto* s = t.mutable_tensor_shape();
-    for (auto d : shape) {
-      s->add_dim()->set_size(d);
-    }
-
-    // TODO(ahentz): also need to test via tensor_content()
-    switch (dtype) {
-      case DT_FLOAT:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_float_val(i / 10000.0);
-        }
-        break;
-      case DT_INT32:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_int_val(i % std::numeric_limits<int>::max());
-        }
-        break;
-      case DT_QUINT8:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_int_val(i % std::numeric_limits<uint8_t>::max());
-        }
-        break;
-      case DT_INT64:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_int64_val(i);
-        }
-        break;
-      case DT_STRING:
-        break;
-      case DT_BOOL:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_bool_val(i % 2);
-        }
-        break;
-      case DT_COMPLEX64:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_scomplex_val(i / 10000.0);
-          t.add_scomplex_val(-i / 10000.0);
-        }
-        break;
-      default:
-        break;
-    }
-
-    AttrValue value_attr;
-    SetAttrValue(t, &value_attr);
-    (*node->mutable_attr())["value"] = value_attr;
+namespace {
+void BuildConstNode(std::initializer_list<int64_t> shape,
+                    tensorflow::DataType dtype, int64_t num_elements,
+                    NodeDef* node) {
+  node->set_op("Const");
+  node->set_name("Node1");
+
+  // An attribute describing the type of this const node.
+  AttrValue dtype_attr;
+  SetAttrValue(dtype, &dtype_attr);
+  (*node->mutable_attr())["dtype"] = dtype_attr;
+
+  // An attribute describing the content of this const node.
+  tensorflow::TensorProto t;
+  t.set_dtype(dtype);
+  auto* s = t.mutable_tensor_shape();
+  for (auto d : shape) {
+    s->add_dim()->set_size(d);
   }
-};
-
-class TypeImportTest : public ::testing::TestWithParam<
-                           std::pair<tensorflow::DataType, ArrayDataType>> {
- protected:
-  TypeImportTest() {}
 
-  void BuildUnaryNode(const std::string& op_name, tensorflow::DataType dtype,
-                      NodeDef* node) {
-    node->set_op(op_name);
-    node->set_name("Node1");
-
-    node->add_input();
-    node->set_input(0, "Node0");
-
-    AttrValue dtype_attr;
-    SetAttrValue(dtype, &dtype_attr);
-    (*node->mutable_attr())["T"] = dtype_attr;
+  // TODO(ahentz): also need to test via tensor_content()
+  switch (dtype) {
+    case DT_FLOAT:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_float_val(i / 10000.0);
+      }
+      break;
+    case DT_INT32:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_int_val(i % std::numeric_limits<int>::max());
+      }
+      break;
+    case DT_QUINT8:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_int_val(i % std::numeric_limits<uint8_t>::max());
+      }
+      break;
+    case DT_INT64:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_int64_val(i);
+      }
+      break;
+    case DT_STRING:
+      break;
+    case DT_BOOL:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_bool_val(i % 2);
+      }
+      break;
+    case DT_COMPLEX64:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_scomplex_val(i / 10000.0);
+        t.add_scomplex_val(-i / 10000.0);
+      }
+      break;
+    default:
+      break;
   }
-};
 
-std::vector<tensorflow::DataType> TestTypes() {
-  return {DT_FLOAT, DT_INT32, DT_INT64, DT_BOOL, DT_QUINT8, DT_COMPLEX64};
+  AttrValue value_attr;
+  SetAttrValue(t, &value_attr);
+  (*node->mutable_attr())["value"] = value_attr;
 }
+}  //  namespace
+
+class ShapeImportTest : public ::testing::TestWithParam<tensorflow::DataType> {
+};
 
 TEST_P(ShapeImportTest, ShapeElementIsNegative) {
   NodeDef node;
@@ -190,8 +167,33 @@ TEST_P(ShapeImportTest, ShapeElementIsNegative) {
       "Tensor shape should not include negative values\n\t (while processing "
       "node 'Node1')");
 }
-INSTANTIATE_TEST_CASE_P(ShapeElementIsNegative, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
+
+TEST_P(ShapeImportTest, ShapeElementIsZero) {
+  NodeDef node;
+  // Const nodes with zero-sized, non-scalar shapes are still not importable.
+  BuildConstNode({1, 0, 10}, GetParam(), 0, &node);
+
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+
+  const auto& array = model.GetArray("Node1");
+  EXPECT_THAT(array.shape().dims(), ::testing::ElementsAre());
+}
+
+// Note how this is subtly different thant ShapeElementIsZero above, where toco
+// removes all shape information after import.
+TEST_P(ShapeImportTest, ShapeIsOneDimZero) {
+  NodeDef node;
+  BuildConstNode({0}, GetParam(), 0, &node);
+
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+
+  const auto& array = model.GetArray("Node1");
+  // We would like to have [0] shapes actually import correctly, but
+  // for some reason that slows everything down.
+  EXPECT_THAT(array.shape().dims(), ::testing::ElementsAre());
+}
 
 TEST_P(ShapeImportTest, ShapeElementTooLarge) {
   NodeDef node;
@@ -200,8 +202,6 @@ TEST_P(ShapeImportTest, ShapeElementTooLarge) {
   EXPECT_EQ(status.error_message(),
             "Shape element overflows\n\t (while processing node 'Node1')");
 }
-INSTANTIATE_TEST_CASE_P(ShapeElementTooLarge, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
 
 TEST_P(ShapeImportTest, ShapeTooLarge) {
   NodeDef node;
@@ -210,8 +210,6 @@ TEST_P(ShapeImportTest, ShapeTooLarge) {
   EXPECT_EQ(status.error_message(),
             "Tensor shape is too large\n\t (while processing node 'Node1')");
 }
-INSTANTIATE_TEST_CASE_P(ShapeTooLarge, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
 
 TEST_P(ShapeImportTest, ValidShapeButZeroElements) {
   NodeDef node;
@@ -223,10 +221,15 @@ TEST_P(ShapeImportTest, ValidShapeButZeroElements) {
                   "dimensions .8. for this .* tensor\n\t .while processing "
                   "node 'Node1'."));
 }
-INSTANTIATE_TEST_CASE_P(ValidShapeButZeroElements, ShapeImportTest,
+
+std::vector<tensorflow::DataType> TestTypes() {
+  return {DT_FLOAT, DT_INT32, DT_INT64, DT_BOOL, DT_QUINT8, DT_COMPLEX64};
+}
+
+INSTANTIATE_TEST_CASE_P(ShapeImportTest, ShapeImportTest,
                         ::testing::ValuesIn(TestTypes()));
 
-TEST_P(ShapeImportTest, Complex64ConstNode) {
+TEST(ImportTest, Complex64ConstNode) {
   NodeDef node;
   BuildConstNode({1, 2, 3}, DT_COMPLEX64, 6, &node);
   Model model;
@@ -241,8 +244,6 @@ TEST_P(ShapeImportTest, Complex64ConstNode) {
     i++;
   }
 }
-INSTANTIATE_TEST_CASE_P(Complex64ConstNode, ShapeImportTest,
-                        ::testing::ValuesIn({DT_COMPLEX64}));
 
 std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
   return {{DT_FLOAT, ArrayDataType::kFloat},
@@ -250,6 +251,25 @@ std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
           {DT_INT64, ArrayDataType::kInt64}};
 }
 
+class TypeImportTest : public ::testing::TestWithParam<
+                           std::pair<tensorflow::DataType, ArrayDataType>> {
+ protected:
+  TypeImportTest() {}
+
+  void BuildUnaryNode(const std::string& op_name, tensorflow::DataType dtype,
+                      NodeDef* node) {
+    node->set_op(op_name);
+    node->set_name("Node1");
+
+    node->add_input();
+    node->set_input(0, "Node0");
+
+    AttrValue dtype_attr;
+    SetAttrValue(dtype, &dtype_attr);
+    (*node->mutable_attr())["T"] = dtype_attr;
+  }
+};
+
 TEST_P(TypeImportTest, BasicTypeInference) {
   NodeDef node;
   BuildUnaryNode("Atan", GetParam().first, &node);
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index f85e1c287879e636a56ef10bf0f75a781d252ae9..92be42f47caa6ff951dc7fb811846017b6bc4f92 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -123,6 +123,7 @@ enum class OperatorType : uint8 {
   kSplit,
   kSqrt,
   kSquare,
+  kSquaredDifference,
   kSum,
   kSwitch,
   kTile,
@@ -152,7 +153,8 @@ enum class OperatorType : uint8 {
   kCTCBeamSearchDecoder,
   kUnpack,
   kZerosLike,
-  kResizeNearestNeighbor
+  kResizeNearestNeighbor,
+  kLeakyRelu
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -699,6 +701,19 @@ struct PReluOperator : Operator {
   PReluOperator() : Operator(OperatorType::kPRelu) {}
 };
 
+// LeakyRelu
+//   x -> max(x, alpha * x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: LeakyRelu
+struct LeakyReluOperator : Operator {
+  LeakyReluOperator() : Operator(OperatorType::kLeakyRelu) {}
+
+  float alpha = 0.2f;  // 0.2 matches the default value for the TF op attribute.
+};
+
 // Element-wise Logistic operator:
 //   x -> Logistic(x) = 1 / (1 + exp(-x))
 //
@@ -1289,6 +1304,17 @@ struct TensorFlowSquareOperator : Operator {
   TensorFlowSquareOperator() : Operator(OperatorType::kSquare) {}
 };
 
+// Element-wise squared difference ((x-y)*(x-y)) operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: SquaredDifference
+struct SquaredDifferenceOperator : Operator {
+  SquaredDifferenceOperator() : Operator(OperatorType::kSquaredDifference) {}
+};
+
 // Transposes a tensor.
 //
 // By default, this operation performs a regular matrix transpose on 2-D input
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index 489c21295ef8fc805eb6d587a6d84fd36c5ac3ed..f17ce900ebac65fbece6a314f6c3ad4121266375 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -332,6 +332,11 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     std::set<int32_t>* variable_tensor_indices, const ExportParams& params) {
   variable_tensor_indices->clear();
 
+  auto is_tflite_builtin = [](const BaseOperator* op) {
+    const auto& tflite_builtins = GetBuiltinOpsMap();
+    return (op && tflite_builtins.find(op->name()) != tflite_builtins.end());
+  };
+
   // The operators are in execution order, so we just follow tf.mini order.
   std::vector<Offset<Operator>> op_vector;
   for (const auto& op : model.operators) {
@@ -360,7 +365,19 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     auto options = Options::Custom(0);
 
     std::vector<bool> mutating_input_variables;
-    if (tflite_op) {
+
+    // It is conceivable that an op is exportable via Serialize() but does not
+    // have a corresponding TFLITE builtin. In that case, when flex mode is
+    // enabled we should export it as a flex op, not as a native.
+    bool export_as_flex_op = !is_tflite_builtin(tflite_op) &&
+                             key.is_flex_op() &&
+                             !op->tensorflow_node_def.empty();
+    if (export_as_flex_op) {
+      auto fbb = WriteFlexOpOptions(op->tensorflow_node_def);
+      if (fbb) {
+        options = Options::Custom(builder->CreateVector(fbb->GetBuffer()));
+      }
+    } else if (tflite_op) {
       options = tflite_op->Serialize(*op, builder);
       mutating_input_variables = tflite_op->GetMutatingInputVariables(*op);
 
@@ -373,12 +390,13 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
           variable_tensor_indices->insert(variable_tensor_index);
         }
       }
-    } else if (key.is_flex_op() && !op->tensorflow_node_def.empty()) {
-      auto fbb = WriteFlexOpOptions(op->tensorflow_node_def);
-      if (fbb) {
-        options = Options::Custom(builder->CreateVector(fbb->GetBuffer()));
-      }
+    } else {
+      // We don't know much about this op. It doesn't have a serializer and
+      // it is not supposed to be exported as a flex op. We will treat it as
+      // a regular custom op: we will still create an operator for it, but it
+      // will not have any 'options'.
     }
+
     // The only supported CustomOptionFormat is FLEXBUFFERS now.
     op_vector.push_back(CreateOperator(
         *builder, op_index, builder->CreateVector(inputs),
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index b6c67772acadccd36ddc37766424d13e86858ffb..b371296784a34e081ae9bc5c1497348d9eb925ba 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -46,6 +46,18 @@ class ExportTest : public ::testing::Test {
         input_model_.operators.emplace_back(new AddOperator);
       } else if (name == "Sub") {
         input_model_.operators.emplace_back(new SubOperator);
+      } else if (name == "Assert") {
+        auto* op = new TensorFlowAssertOperator;
+
+        // Even though assert is known to TOCO, it doesn't have a tflite
+        // serializer, so it has to be exported as a custom op. If we attach a
+        // NodeDef to it, however, it will be exported as a flex op instead.
+        ::tensorflow::NodeDef node_def;
+        node_def.set_name("Assert");
+        node_def.set_op("Assert");
+        node_def.SerializeToString(&op->tensorflow_node_def);
+
+        input_model_.operators.emplace_back(op);
       } else {
         auto* op = new TensorFlowUnsupportedOperator;
         op->tensorflow_op = name;
@@ -232,37 +244,38 @@ class OpSetsTest : public ExportTest {
 TEST_F(OpSetsTest, BuiltinsOnly) {
   // --target_op_set=TFLITE_BUILTINS
   SetAllowedOpSets({kTfLiteBuiltins});
-  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold"}),
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
               ElementsAre());
   EXPECT_THAT(ImportExport({"Add"}), ElementsAre("builtin:ADD"));
 
   // --target_op_set=TFLITE_BUILTINS --allow_custom_ops
   SetAllowedOpSets({kTfLiteBuiltins, kCustomOps});
-  EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "UnrollAndFold"}),
-      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:UnrollAndFold"));
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
+              ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:Assert",
+                          "custom:UnrollAndFold"));
 }
 
 TEST_F(OpSetsTest, TfSelectOnly) {
   // --target_op_set=SELECT_TF_OPS
   SetAllowedOpSets({kSelectTfOps});
-  EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "RandomUniform", "UnrollAndFold"}),
-      ElementsAre());
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "RandomUniform",
+                            "UnrollAndFold", "Assert"}),
+              ElementsAre());
   EXPECT_THAT(ImportExport({"Add"}), ElementsAre("custom:FlexAdd"));
 
   // --target_op_set=SELECT_TF_OPS --allow_custom_ops
   SetAllowedOpSets({kSelectTfOps, kCustomOps});
   EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "RandomUniform", "UnrollAndFold"}),
-      ElementsAre("custom:AdjustHue", "custom:FlexAdd",
+      ImportExport(
+          {"Add", "AdjustHue", "RandomUniform", "UnrollAndFold", "Assert"}),
+      ElementsAre("custom:AdjustHue", "custom:FlexAdd", "custom:FlexAssert",
                   "custom:FlexRandomUniform", "custom:UnrollAndFold"));
 }
 
 TEST_F(OpSetsTest, BuiltinsAndTfSelect) {
   // --target_op_set=TFLITE_BUILTINS,SELECT_TF_OPS
   SetAllowedOpSets({kTfLiteBuiltins, kSelectTfOps});
-  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold"}),
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
               ElementsAre());
   EXPECT_THAT(ImportExport({"Add", "RandomUniform"}),
               ElementsAre("builtin:ADD", "custom:FlexRandomUniform"));
@@ -270,9 +283,10 @@ TEST_F(OpSetsTest, BuiltinsAndTfSelect) {
   // --target_op_set=TFLITE_BUILTINS,SELECT_TF_OPS --allow_custom_ops
   SetAllowedOpSets({kTfLiteBuiltins, kSelectTfOps, kCustomOps});
   EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "RandomUniform", "UnrollAndFold"}),
-      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:FlexRandomUniform",
-                  "custom:UnrollAndFold"));
+      ImportExport(
+          {"Add", "AdjustHue", "RandomUniform", "UnrollAndFold", "Assert"}),
+      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:FlexAssert",
+                  "custom:FlexRandomUniform", "custom:UnrollAndFold"));
 }
 
 // This test is based on a hypothetical scenario that dilation is supported
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 015029e1cbd57ca0cd34b8acaea063c8077da52e..83325f1f797f6ee223dfbcec6d198dd4be27ee00 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -1218,6 +1218,43 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class LeakyRelu
+    : public BuiltinOperator<LeakyReluOperator, ::tflite::LeakyReluOptions,
+                             ::tflite::BuiltinOptions_LeakyReluOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateLeakyReluOptions(*builder, op.alpha);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->alpha = options.alpha();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class SquaredDifference
+    : public BuiltinOperator<
+          SquaredDifferenceOperator, ::tflite::SquaredDifferenceOptions,
+          ::tflite::BuiltinOptions_SquaredDifferenceOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSquaredDifferenceOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
     const string& tensorflow_node_def) {
   auto fbb = absl::make_unique<flexbuffers::Builder>();
@@ -1516,6 +1553,11 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                    OperatorType::kOneHot));
   ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
                                    OperatorType::kUnpack));
+  ops.push_back(MakeUnique<LeakyRelu>(::tflite::BuiltinOperator_LEAKY_RELU,
+                                      OperatorType::kLeakyRelu));
+  ops.push_back(MakeUnique<SquaredDifference>(
+      ::tflite::BuiltinOperator_SQUARED_DIFFERENCE,
+      OperatorType::kSquaredDifference));
 
   // Custom Operators.
   ops.push_back(
@@ -1526,11 +1568,11 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                                   OperatorType::kUnsupported,
                                                   enable_select_tf_ops));
 
-  // There operators are supported by Toco, but not by TF Lite, and has no
-  // attributes.
-  ops.push_back(
-      MakeUnique<SimpleOperator<AddNOperator>>("ADDN", OperatorType::kAddN));
-  // Simple Operators.
+  // SimpleOperator was designed to export CUSTOM TF Lite ops, but has since
+  // been modified to also export builtins. As TOCO evolved we added warnings
+  // when custom ops are exported but SimpleOperator bypasses thoses. To
+  // prevent user confusion we are settling on using SimpleOperator only for
+  // builtins.
   ops.push_back(MakeUnique<SimpleOperator<DequantizeOperator>>(
       "DEQUANTIZE", OperatorType::kDequantize));
   ops.push_back(
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 8a776cbf0be57d906408afcd7d7d7a687a0d4c17..16514760de31f4c80a738f64dd48129e0273144d 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -517,6 +517,21 @@ TEST_F(OperatorTest, BuiltinUnpack) {
   EXPECT_EQ(op.axis, output_toco_op->axis);
 }
 
+TEST_F(OperatorTest, BuiltinLeakyRelu) {
+  LeakyReluOperator op;
+  op.alpha = 3;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("LEAKY_RELU", OperatorType::kLeakyRelu), op);
+  EXPECT_EQ(op.alpha, output_toco_op->alpha);
+}
+
+TEST_F(OperatorTest, BuiltinSquaredDifference) {
+  SquaredDifferenceOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SQUARED_DIFFERENCE", OperatorType::kSquaredDifference), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+}
+
 TEST_F(OperatorTest, CustomCTCBeamSearchDecoder) {
   CTCBeamSearchDecoderOperator op;
   op.beam_width = 3;
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
index 221e9b8e34e2b86f7c7610796022864d4153f2ab..039a918af16019292214f982326fba3eb5695c62 100644
--- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -55,6 +55,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "AssignSub",
           "AudioSpectrogram",
           "AvgPool",
+          "AvgPool3D",
           "AvgPoolGrad",
           "BatchMatMul",
           "BatchNormWithGlobalNormalization",
@@ -78,6 +79,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "Conv2D",
           "Conv2DBackpropFilter",
           "Conv2DBackpropInput",
+          "Conv3D",
           "Cos",
           "Cosh",
           "CropAndResize",
@@ -168,6 +170,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "Max",
           "Maximum",
           "MaxPool",
+          "MaxPool3D",
           "MaxPoolGrad",
           "MaxPoolGradGrad",
           "MaxPoolGradGradV2",
@@ -184,6 +187,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "MirrorPad",
           "MirrorPadGrad",
           "Mul",
+          "Multinomial",
           "Neg",
           "NextIteration",
           "NonMaxSuppression",
diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc
index 9740015850a05cdbc2ad2e97c508012e1678d998..4a3d6a5848751f4c1d526153bd6f6d08a9f882af 100644
--- a/tensorflow/lite/toco/toco.cc
+++ b/tensorflow/lite/toco/toco.cc
@@ -16,87 +16,9 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#include "absl/strings/string_view.h"
-#include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/model_cmdline_flags.h"
-#include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/toco_cmdline_flags.h"
-#include "tensorflow/lite/toco/toco_flags.pb.h"
-#include "tensorflow/lite/toco/toco_port.h"
-#include "tensorflow/lite/toco/toco_tooling.h"
-#include "tensorflow/lite/toco/toco_types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-namespace {
-
-// Checks the permissions of the output file to ensure it is writeable.
-void CheckOutputFilePermissions(const Arg<string>& output_file) {
-  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
-  QCHECK(port::file::Writable(output_file.value()).ok())
-      << "Specified output_file is not writable: " << output_file.value()
-      << ".\n";
-}
-
-// Checks the permissions of the frozen model file.
-void CheckFrozenModelPermissions(const Arg<string>& input_file) {
-  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
-  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
-      << "Specified input_file does not exist: " << input_file.value() << ".\n";
-  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
-      << "Specified input_file exists, but is not readable: "
-      << input_file.value() << ".\n";
-}
-
-// Reads the contents of the GraphDef from either the frozen graph file or the
-// SavedModel directory. If it reads the SavedModel directory, it updates the
-// ModelFlags and TocoFlags accordingly.
-void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
-                   const ParsedModelFlags& parsed_model_flags,
-                   TocoFlags* toco_flags, ModelFlags* model_flags,
-                   string* graph_def_contents) {
-  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
-
-  // Ensure savedmodel_directory is not set.
-  QCHECK(!parsed_toco_flags.savedmodel_directory.specified())
-      << "Use `tensorflow/lite/python/tflite_convert` script with "
-      << "SavedModel directories.\n";
-
-  // Checks the input file permissions and reads the contents.
-  CheckFrozenModelPermissions(parsed_toco_flags.input_file);
-  CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
-                                graph_def_contents, port::file::Defaults())
-            .ok());
-}
-
-tensorflow::Status ToolMain(const ParsedTocoFlags& parsed_toco_flags,
-                            const ParsedModelFlags& parsed_model_flags) {
-  ModelFlags model_flags;
-  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
-
-  TocoFlags toco_flags;
-  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
-
-  string graph_def_contents;
-  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
-                &model_flags, &graph_def_contents);
-  CheckOutputFilePermissions(parsed_toco_flags.output_file);
-
-  std::unique_ptr<Model> model =
-      Import(toco_flags, model_flags, graph_def_contents);
-  Transform(toco_flags, model.get());
-  string output_file_contents;
-  TF_RETURN_IF_ERROR(Export(toco_flags, *model, toco_flags.allow_custom_ops(),
-                            &output_file_contents));
-  TF_RETURN_IF_ERROR(
-      port::file::SetContents(parsed_toco_flags.output_file.value(),
-                              output_file_contents, port::file::Defaults()));
-  return tensorflow::Status();
-}
-
-}  // namespace
-}  // namespace toco
+#include "tensorflow/lite/toco/toco_convert.h"
 
 int main(int argc, char** argv) {
   toco::string msg;
@@ -126,6 +48,6 @@ int main(int argc, char** argv) {
     return 1;
   }
   toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
-  auto status = toco::ToolMain(parsed_toco_flags, parsed_model_flags);
+  auto status = toco::Convert(parsed_toco_flags, parsed_model_flags);
   return status.ok() ? 0 : -1;
 }
diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28e7b10ecd056815c8ca6d7a74f324a18d307451
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_tooling.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+
+// Checks the permissions of the output file to ensure it is writeable.
+void CheckOutputFilePermissions(const Arg<string>& output_file) {
+  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
+  QCHECK(port::file::Writable(output_file.value()).ok())
+      << "Specified output_file is not writable: " << output_file.value()
+      << ".\n";
+}
+
+// Checks the permissions of the frozen model file.
+void CheckFrozenModelPermissions(const Arg<string>& input_file) {
+  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
+  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file does not exist: " << input_file.value() << ".\n";
+  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file exists, but is not readable: "
+      << input_file.value() << ".\n";
+}
+
+// Reads the contents of the GraphDef from either the frozen graph file or the
+// SavedModel directory. If it reads the SavedModel directory, it updates the
+// ModelFlags and TocoFlags accordingly.
+void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags,
+                   string* graph_def_contents) {
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
+
+  // Ensure savedmodel_directory is not set.
+  QCHECK(!parsed_toco_flags.savedmodel_directory.specified())
+      << "Use `tensorflow/lite/python/tflite_convert` script with "
+      << "SavedModel directories.\n";
+
+  // Checks the input file permissions and reads the contents.
+  CheckFrozenModelPermissions(parsed_toco_flags.input_file);
+  CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
+                                graph_def_contents, port::file::Defaults())
+            .ok());
+}
+}  // namespace
+
+tensorflow::Status Convert(const string& graph_def_contents,
+                           const TocoFlags& toco_flags,
+                           const ModelFlags& model_flags,
+                           string* output_file_contents) {
+  std::unique_ptr<Model> model =
+      Import(toco_flags, model_flags, graph_def_contents);
+  Transform(toco_flags, model.get());
+  return Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+                output_file_contents);
+}
+
+tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags) {
+  ModelFlags model_flags;
+  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
+
+  TocoFlags toco_flags;
+  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
+
+  string graph_def_contents;
+  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
+                &model_flags, &graph_def_contents);
+  CheckOutputFilePermissions(parsed_toco_flags.output_file);
+
+  string output_file_contents;
+  TF_RETURN_IF_ERROR(Convert(graph_def_contents, toco_flags, model_flags,
+                             &output_file_contents));
+
+  TF_RETURN_IF_ERROR(
+      port::file::SetContents(parsed_toco_flags.output_file.value(),
+                              output_file_contents, port::file::Defaults()));
+  return tensorflow::Status();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/toco_convert.h b/tensorflow/lite/toco/toco_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebbd336d3f50ae63a106387eadb5888c00ed9064
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+
+namespace toco {
+
+tensorflow::Status Convert(const string& graph_def_contents,
+                           const TocoFlags& toco_flags,
+                           const ModelFlags& model_flags,
+                           string* output_file_contents);
+
+tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags);
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3c440db94396def2f8cfd40242642767d11a63a
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert_test.cc
@@ -0,0 +1,173 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/toco_convert.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+namespace {
+
+TEST(TocoTest, MissingInputFile) {
+  ParsedTocoFlags toco_flags;
+  ParsedModelFlags model_flags;
+  EXPECT_DEATH(Convert(toco_flags, model_flags).ok(),
+               "Missing required flag --input_file");
+}
+
+TEST(TocoTest, BadInputFormat) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Unhandled input_format='FILE_FORMAT_UNKNOWN'");
+}
+
+TEST(TocoTest, MissingOuputArrays) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "This model does not define output arrays, so a --output_arrays "
+               "flag must be given on the command-line");
+}
+
+TEST(TocoTest, BadOutputArray) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  model_flags.add_output_arrays("output1");
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Specified output array .output1. is not produced by any op "
+               "in this graph. Is it a typo. To silence this message, pass "
+               "this flag:  allow_nonexistent_arrays");
+}
+
+TEST(TocoTest, BadOutputFormat) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "output1"
+      input: "input1"
+      input: "input2"
+      op: "Sub"
+      attr { key: "T" value { type: DT_FLOAT } }
+    }
+  )GraphDef";
+
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Unhandled output_format='FILE_FORMAT_UNKNOWN'");
+}
+
+TEST(TocoTest, SimpleFloatModel) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  toco_flags.set_output_format(TENSORFLOW_GRAPHDEF);
+
+  // Inputs are automatically selected (but that might not be a good idea).
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "input1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "input2"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "output1"
+      input: "input1"
+      input: "input2"
+      op: "Sub"
+      attr { key: "T" value { type: DT_FLOAT } }
+    }
+  )GraphDef";
+
+  string output;
+  EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok());
+  EXPECT_TRUE(!output.empty());
+}
+
+TEST(TocoTest, TransientStringTensors) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+
+  // We need to do a couple of things to trigger the transient array
+  // initialization code: output format must support memory planning, and the
+  // input array must have a shape.
+  toco_flags.set_output_format(TFLITE);
+
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "input1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_STRING } }
+      attr { key: "shape" value { shape { dim { size:1 }}}}
+    }
+    node {
+      name: "indices1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "intermediate1"
+      op: "Gather"
+      input: "input1"
+      input: "indices1"
+      attr { key: "Tparams" value { type: DT_STRING } }
+      attr { key: "Tindices" value { type: DT_INT64 } }
+    }
+    node {
+      name: "output1"
+      op: "Gather"
+      input: "intermediate1"
+      input: "indices2"
+      attr { key: "Tparams" value { type: DT_STRING } }
+      attr { key: "Tindices" value { type: DT_INT64 } }
+    }
+  )GraphDef";
+
+  string output;
+
+  EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok());
+  EXPECT_TRUE(!output.empty());
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/lite/toco/toco_port.cc b/tensorflow/lite/toco/toco_port.cc
index 0881065a23f12295975e9ba7fb0c0a4f8917f5b0..fb8c1b8337f1e509ed9c9ee2522e63e84d143927 100644
--- a/tensorflow/lite/toco/toco_port.cc
+++ b/tensorflow/lite/toco/toco_port.cc
@@ -66,8 +66,9 @@ namespace file {
 // Conversion to our wrapper Status.
 tensorflow::Status ToStatus(const ::util::Status& uts) {
   if (!uts.ok()) {
-    return tensorflow::Status(tensorflow::errors::Code(uts.error_code()),
-                              uts.error_message());
+    return tensorflow::Status(
+        tensorflow::errors::Code(::util::RetrieveErrorCode(uts)),
+        uts.error_message());
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index 5f96e833fbf4000f1796ca8efbb62fa960ad9544..d8b111d03792721eb0d4d60f122cfe5c5cc7d3de 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -210,7 +210,8 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
       CheckInvariants(*model);
       break;
     default:
-      LOG(FATAL) << "Unhandled input_format";
+      LOG(FATAL) << "Unhandled input_format='"
+                 << FileFormat_Name(toco_flags.input_format()) << "'";
   }
 
   LogDump(kLogLevelModelChanged, "AT IMPORT", *model);
@@ -424,7 +425,8 @@ tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
       DumpGraphviz(model, output_file_contents);
       break;
     default:
-      LOG(FATAL) << "Unhandled output_format";
+      LOG(FATAL) << "Unhandled output_format='"
+                 << FileFormat_Name(toco_flags.output_format()) << "'";
   }
   return tensorflow::Status();
 }
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index e33f7c8452f88d8402f51ef1bc55a8dfe95631ec..084169548e2c61a68f03acca341e7d080106685e 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -411,6 +411,8 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ZerosLike)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceLstm)
     HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
+    HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
+    HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -439,6 +441,7 @@ bool OperatorSupportsFusedActivation(OperatorType type) {
     case OperatorType::kMaxPool:
     case OperatorType::kMul:
     case OperatorType::kSub:
+    case OperatorType::kSquaredDifference:
       return true;
     default:
       return false;
@@ -1767,6 +1770,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
   if (!array->has_shape()) {
     return false;
   }
+
+  // The size of string tensors is rarely known ahead of time, so all transient
+  // tensors of this type will need to be dynamically allocated.
+  if (array->final_data_type == ArrayDataType::kString ||
+      array->data_type == ArrayDataType::kString) {
+    return false;
+  }
+
   return true;
 }
 
@@ -2207,6 +2218,8 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
       return ArrayDataType::kFloat;
     case QUANTIZED_UINT8:
       return ArrayDataType::kUint8;
+    case INT8:
+      return ArrayDataType::kInt8;
     case QUANTIZED_INT16:
       return ArrayDataType::kInt16;
     case INT32:
diff --git a/tensorflow/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h
index 92ce82632f96859ce4f425dac64c4e2b812f156e..53131824b532853afc1660354de92da40db0da86 100644
--- a/tensorflow/lite/toco/tooling_util.h
+++ b/tensorflow/lite/toco/tooling_util.h
@@ -338,8 +338,9 @@ tensorflow::Status NumElements(const std::vector<T>& shape, U* num_elements) {
       return tensorflow::errors::InvalidArgument(
           "Tensor shape should not include negative values");
     }
-    if (static_cast<uint64_t>(dim) >
-        std::numeric_limits<U>::max() / *num_elements) {
+    if (*num_elements != 0 &&
+        static_cast<uint64_t>(dim) >
+            std::numeric_limits<U>::max() / *num_elements) {
       *num_elements = 0;
       return tensorflow::errors::InvalidArgument("Tensor shape is too large");
     }
diff --git a/tensorflow/lite/toco/tooling_util_test.cc b/tensorflow/lite/toco/tooling_util_test.cc
index e3826cb8fde69f05c2f3d8daa64e534639fe5fbb..6f1c9c563ada01891b67094caa93cfd1847cdf6b 100644
--- a/tensorflow/lite/toco/tooling_util_test.cc
+++ b/tensorflow/lite/toco/tooling_util_test.cc
@@ -109,6 +109,10 @@ TEST(NumElementsTest, Int) {
   EXPECT_TRUE(status.ok());
   EXPECT_EQ(count, 2146435072);
 
+  status = NumElements(std::vector<int>{1024, 0, 2048}, &count);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(count, 0);
+
   status = NumElements(std::vector<int>{1, 2, -3}, &count);
   EXPECT_EQ(status.error_message(), kNegativeValuesMessage);
 
diff --git a/tensorflow/lite/toco/types.proto b/tensorflow/lite/toco/types.proto
index 12f711fd8a3c7cbed103bcf43206966e3c5f72b9..fa911b8a4c80d96790fa16e34dbc3f114b522e45 100644
--- a/tensorflow/lite/toco/types.proto
+++ b/tensorflow/lite/toco/types.proto
@@ -43,4 +43,7 @@ enum IODataType {
 
   // Complex64, not quantized
   COMPLEX64 = 8;
+
+  // Int8, quantized based on QuantizationParameters in schema.
+  INT8 = 9;
 }
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index 7646bdcca9f2641acb9ecbc183e79dabe13b57d9..583046ad73d67ba9fba76570299fc1331aef07e4 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -112,6 +112,7 @@ cc_library(
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/profiling:profile_summarizer",
+        "@gemmlowp",
     ],
 )
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index e063a144b66b1c963e6184ec825e82ec61264dad..7768b75f769c12d6603154a35fe650b550542faf 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -29,6 +29,10 @@ limitations under the License.
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/tools/benchmark/logging.h"
 
+#ifdef GEMMLOWP_PROFILING
+#include "third_party/gemmlowp/profiling/profiler.h"
+#endif
+
 #ifdef TFLITE_CUSTOM_OPS_HEADER
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 #endif
@@ -62,6 +66,21 @@ void ProfilingListener::OnSingleRunEnd() {
   summarizer_.ProcessProfiles(profile_events, *interpreter_);
 }
 
+void GemmlowpProfilingListener::OnBenchmarkStart(
+    const BenchmarkParams& params) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::RegisterCurrentThreadForProfiling();
+  gemmlowp::StartProfiling();
+#endif
+}
+
+void GemmlowpProfilingListener::OnBenchmarkEnd(
+    const BenchmarkResults& results) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::FinishProfiling();
+#endif
+}
+
 namespace {
 
 std::vector<std::string> Split(const std::string& str, const char delim) {
@@ -162,7 +181,9 @@ bool PopulateInputLayerInfo(
   return true;
 }
 
-BenchmarkParams GetDefaultParams() {
+}  // namespace
+
+BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
   BenchmarkParams default_params = BenchmarkModel::DefaultParams();
   default_params.AddParam("graph", BenchmarkParam::Create<std::string>(""));
   default_params.AddParam("input_layer",
@@ -173,16 +194,13 @@ BenchmarkParams GetDefaultParams() {
   return default_params;
 }
 
-}  // namespace
-
 BenchmarkTfLiteModel::BenchmarkTfLiteModel()
-    : BenchmarkModel(GetDefaultParams()) {
-  AddListener(&profiling_listener_);
-}
+    : BenchmarkTfLiteModel(DefaultParams()) {}
 
 BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
     : BenchmarkModel(std::move(params)) {
   AddListener(&profiling_listener_);
+  AddListener(&gemmlowp_profiling_listener_);
 }
 
 std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
@@ -301,6 +319,7 @@ void BenchmarkTfLiteModel::Init() {
   bool use_nnapi = params_.Get<bool>("use_nnapi");
 
   interpreter->UseNNAPI(use_nnapi);
+  ApplyDelegates();
 
   auto interpreter_inputs = interpreter->inputs();
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 8ad3a5dbe50980e7815843e1ef5744249d596552..83599e644d1f41f70fd96f3a73f9155d6e62deef 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -27,7 +27,7 @@ limitations under the License.
 namespace tflite {
 namespace benchmark {
 
-// Dumps profiling events if profiling is enabled
+// Dumps profiling events if profiling is enabled.
 class ProfilingListener : public BenchmarkListener {
  public:
   explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
@@ -47,11 +47,21 @@ class ProfilingListener : public BenchmarkListener {
   bool has_profiles_;
 };
 
+// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
+class GemmlowpProfilingListener : public BenchmarkListener {
+ public:
+  virtual ~GemmlowpProfilingListener() {}
+
+  void OnBenchmarkStart(const BenchmarkParams& params) override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+};
+
 // Benchmarks a TFLite model by running tflite interpreter.
 class BenchmarkTfLiteModel : public BenchmarkModel {
  public:
   BenchmarkTfLiteModel();
-  BenchmarkTfLiteModel(BenchmarkParams params);
+  explicit BenchmarkTfLiteModel(BenchmarkParams params);
   virtual ~BenchmarkTfLiteModel() {}
 
   std::vector<Flag> GetFlags() override;
@@ -67,13 +77,19 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   };
 
  protected:
+  static BenchmarkParams DefaultParams();
   void PrepareInputsAndOutputs() override;
 
- private:
+  // Allows installation of custom delegates during initialization
+  virtual void ApplyDelegates() {}
+
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
+
+ private:
   std::vector<InputLayerInfo> inputs;
   ProfilingListener profiling_listener_;
+  GemmlowpProfilingListener gemmlowp_profiling_listener_;
 };
 
 }  // namespace benchmark
diff --git a/tensorflow/lite/tools/benchmark/command_line_flags.cc b/tensorflow/lite/tools/benchmark/command_line_flags.cc
index 002fa7dea3e9c01dec96e074d0db6c869cc0d791..2fad780dc8680b5ac6c1d2d77739e495116aa990 100644
--- a/tensorflow/lite/tools/benchmark/command_line_flags.cc
+++ b/tensorflow/lite/tools/benchmark/command_line_flags.cc
@@ -59,11 +59,12 @@ bool ParseFlag(const std::string& flag_value,
 
 bool ParseBoolFlag(const std::string& flag_value,
                    const std::function<void(const bool&)>& hook) {
-  if (flag_value != "true" && flag_value != "false") {
+  if (flag_value != "true" && flag_value != "false" && flag_value != "0" &&
+      flag_value != "1") {
     return false;
   }
 
-  hook(flag_value == "true");
+  hook(flag_value == "true" || flag_value == "1");
   return true;
 }
 }  // namespace
diff --git a/tensorflow/lite/tools/benchmark/command_line_flags_test.cc b/tensorflow/lite/tools/benchmark/command_line_flags_test.cc
index 36eb75b06a84e828b20c271c52fff4ad97cdf6cd..afdf2793bf9db61636941e8415934e312d58ed07 100644
--- a/tensorflow/lite/tools/benchmark/command_line_flags_test.cc
+++ b/tensorflow/lite/tools/benchmark/command_line_flags_test.cc
@@ -27,13 +27,17 @@ TEST(CommandLineFlagsTest, BasicUsage) {
   bool some_switch = false;
   std::string some_name = "something_a";
   float some_float = -23.23f;
+  bool some_bool = false;
+  bool some_numeric_bool = true;
   const char* argv_strings[] = {"program_name",
                                 "--some_int32=20",
                                 "--some_int64=214748364700",
                                 "--some_switch=true",
                                 "--some_name=somethingelse",
-                                "--some_float=42.0"};
-  int argc = 6;
+                                "--some_float=42.0",
+                                "--some_bool=true",
+                                "--some_numeric_bool=0"};
+  int argc = 8;
   bool parsed_ok = Flags::Parse(
       &argc, reinterpret_cast<const char**>(argv_strings),
       {
@@ -42,6 +46,9 @@ TEST(CommandLineFlagsTest, BasicUsage) {
           Flag::CreateFlag("some_switch", &some_switch, "some switch"),
           Flag::CreateFlag("some_name", &some_name, "some name"),
           Flag::CreateFlag("some_float", &some_float, "some float"),
+          Flag::CreateFlag("some_bool", &some_bool, "some bool"),
+          Flag::CreateFlag("some_numeric_bool", &some_numeric_bool,
+                           "some numeric bool"),
       });
 
   EXPECT_EQ(true, parsed_ok);
@@ -50,6 +57,8 @@ TEST(CommandLineFlagsTest, BasicUsage) {
   EXPECT_EQ(true, some_switch);
   EXPECT_EQ("somethingelse", some_name);
   EXPECT_NEAR(42.0f, some_float, 1e-5f);
+  EXPECT_TRUE(some_bool);
+  EXPECT_FALSE(some_numeric_bool);
   EXPECT_EQ(argc, 1);
 }
 
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index 8f123558545723a603646523aad5dd47cb620e46..28374f13b784c5bdd5f823e7b24f0d446afba450 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -208,6 +208,9 @@ $(BENCHMARK_BINARY) : $(BENCHMARK_LIB)
 
 benchmark: $(BENCHMARK_BINARY)
 
+libdir:
+	@echo $(LIBDIR)
+
 # Gets rid of all generated files.
 clean:
 	rm -rf $(MAKEFILE_DIR)/gen
diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
index 477883b49095b2017770e076abacb31e9e97c8f7..6e0d262827f0944918580d073f082d20e0e1803b 100755
--- a/tensorflow/lite/tools/make/build_ios_universal_lib.sh
+++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
@@ -17,7 +17,7 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../.."
+cd "$SCRIPT_DIR/../../../.."
 
 # Build library for supported architectures and packs them in a fat binary.
 make_library() {
diff --git a/tensorflow/lite/tools/make/build_rpi_lib.sh b/tensorflow/lite/tools/make/build_rpi_lib.sh
index d4047bb0eb5071432d4628fef1cbebbe7e023f8c..1521bb39332bd44ecc6cf0f6a2910c7f0711a123 100755
--- a/tensorflow/lite/tools/make/build_rpi_lib.sh
+++ b/tensorflow/lite/tools/make/build_rpi_lib.sh
@@ -17,6 +17,6 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../.."
+cd "$SCRIPT_DIR/../../../.."
 
 CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv7l
diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh
index aa5495329b10577447f5d3ade8faf157021dde3a..fa3d5d3d3b6657ff327dd6ec34bd65823da13cd2 100755
--- a/tensorflow/lite/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/tools/make/download_dependencies.sh
@@ -17,7 +17,7 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../.."
+cd "$SCRIPT_DIR/../../../.."
 
 DOWNLOADS_DIR=tensorflow/lite/tools/make/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc
index a13774f7130b932a65f162d4b54bfe5325475ed5..de3c0b03237c1c85d1cfbeafc2ce8db4faf70ff6 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights.cc
@@ -182,8 +182,7 @@ std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
     TensorT* tensor = subgraph->tensors[tensor_idx].get();
     // TODO(suharshs): Support shared weights, i.e. If two tensors share the
     // same weight array, things may break. (i.e. SSD object detection)
-    if (!eval_hybrid &&
-        CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
+    if (CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " that is shared between multiple multiple operations.";
       continue;
diff --git a/tensorflow/lite/tools/pip_package/MANIFEST.in b/tensorflow/lite/tools/pip_package/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..bb574e63a372da96841efbc70b8e213a943213c6
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include * *.py
diff --git a/tensorflow/lite/tools/pip_package/README.md b/tensorflow/lite/tools/pip_package/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8190782c39fcb910749fb466b7075dd628cdd554
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/README.md
@@ -0,0 +1,33 @@
+# Building TensorFlow Lite Standalone Pip
+
+Many users would like to deploy TensorFlow lite interpreter and use it from
+Python without requiring the rest of TensorFlow.
+
+## Steps
+
+To build a binary wheel run this script:
+```
+sudo apt install swig libjpeg-dev zlib1g-dev python3-dev python3-numpy
+sh tensorflow/lite/tools/pip_package/build_pip_package.sh
+```
+That will print out some output and a .whl file. You can then install that
+```
+pip install --upgrade <wheel>
+```
+
+Note, unlike tensorflow this will be installed to a tflite_runtime namespace.
+You can then use the Tensorflow Lite interpreter as.
+```
+import tflite_runtime as tflr
+interpreter = tflr.lite.Interpreter(model_path="foo.tflite")
+```
+
+This currently works to build on Linux machines including Raspberry Pi. In
+the future, cross compilation to smaller SOCs like Raspberry Pi from
+bigger host will be supported.
+
+## Caveats
+
+* You cannot use TensorFlow Select ops, only TensorFlow Lite builtins.
+* Currently custom ops and delegates cannot be registered.
+
diff --git a/tensorflow/lite/tools/pip_package/build_pip_package.sh b/tensorflow/lite/tools/pip_package/build_pip_package.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2887ce84712aa75168bd2b5ae77240f25deddf57
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/build_pip_package.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+# Find where this script lives and then the Tensorflow root.
+MY_DIRECTORY=`dirname $0`
+export TENSORFLOW_SRC_ROOT=`realpath $MY_DIRECTORY/../../../..`
+
+export TENSORFLOW_VERSION=`grep "_VERSION = " $TENSORFLOW_SRC_ROOT/tensorflow/tools/pip_package/setup.py  | cut -d'=' -f 2 | sed "s/[ '-]//g"`;
+
+
+# Build a pip build tree.
+BUILD_ROOT=/tmp/tflite_pip
+rm -rf $BUILD_ROOT
+mkdir -p $BUILD_ROOT/tflite_runtime/lite
+mkdir -p $BUILD_ROOT/tflite_runtime/lite/python
+
+# Build an importable module tree
+cat > $BUILD_ROOT/tflite_runtime/__init__.py <<EOF;
+import tflite_runtime.lite.interpreter
+EOF
+
+cat > $BUILD_ROOT/tflite_runtime/lite/__init__.py <<EOF;
+from interpreter import Interpreter as Interpreter
+EOF
+
+cat > $BUILD_ROOT/tflite_runtime/lite/python/__init__.py <<EOF;
+# Python module for TensorFlow Lite
+EOF
+
+# Copy necessary source files
+TFLITE_ROOT=$TENSORFLOW_SRC_ROOT/tensorflow/lite
+cp -r  $TFLITE_ROOT/python/interpreter_wrapper $BUILD_ROOT
+cp $TFLITE_ROOT/python/interpreter.py $BUILD_ROOT/tflite_runtime/lite/
+cp $TFLITE_ROOT/tools/pip_package/setup.py $BUILD_ROOT
+cp $TFLITE_ROOT/tools/pip_package/MANIFEST.in $BUILD_ROOT
+
+# Build the Pip
+cd $BUILD_ROOT
+python setup.py bdist_wheel
diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..64d62ee1f2d5d0cc1fa1d1804c637f8220937128
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Lite is for mobile and embedded devices.
+
+TensorFlow Lite is the official solution for running machine learning models on
+mobile and embedded devices. It enables on-device machine learning inference
+with low latency and a small binary size on Android, iOS, and other operating
+systems.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import multiprocessing
+import os
+import subprocess
+
+from distutils.command.build_ext import build_ext
+import numpy
+
+from setuptools import Extension
+from setuptools import find_packages
+from setuptools import setup
+from setuptools.command.build_py import build_py
+PACKAGE_NAME = 'tflite-runtime'
+PACKAGE_VERSION = os.environ['TENSORFLOW_VERSION']
+DOCLINES = __doc__.split('\n')
+PACKAGE = 'tflite_runtime.lite.python'
+TENSORFLOW_DIR = os.environ['TENSORFLOW_SRC_ROOT']
+
+# Setup cross compiling
+TARGET = (
+    os.environ['TENSORFLOW_TARGET'] if 'TENSORFLOW_TARGET' in os.environ
+    else None)
+if TARGET == 'rpi':
+  os.environ['CXX'] = 'arm-linux-gnueabihf-g++'
+  os.environ['CC'] = 'arm-linux-gnueabihf-g++'
+MAKE_CROSS_OPTIONS = ['TARGET=%s' % TARGET]  if TARGET else []
+
+RELATIVE_MAKE_DIR = os.path.join('tensorflow', 'lite', 'tools', 'make')
+MAKE_DIR = os.path.join(TENSORFLOW_DIR, RELATIVE_MAKE_DIR)
+DOWNLOADS_DIR = os.path.join(MAKE_DIR, 'downloads')
+RELATIVE_MAKEFILE_PATH = os.path.join(RELATIVE_MAKE_DIR, 'Makefile')
+DOWNLOAD_SCRIPT_PATH = os.path.join(MAKE_DIR, 'download_dependencies.sh')
+
+
+def make_args(target='', quiet=True):
+  """Construct make command line."""
+  args = (['make', 'SHELL=/bin/bash', '-C', TENSORFLOW_DIR]
+          + MAKE_CROSS_OPTIONS +
+          ['-f', RELATIVE_MAKEFILE_PATH, '-j',
+           str(multiprocessing.cpu_count())])
+  if quiet:
+    args.append('--quiet')
+  if target:
+    args.append(target)
+  return args
+
+
+def make_output(target):
+  """Invoke make on the target and return output."""
+  return subprocess.check_output(make_args(target)).decode('utf-8').strip()
+
+
+def make():
+  """Invoke make to build tflite C++ sources.
+
+  Build dependencies:
+     apt-get install swig libjpeg-dev zlib1g-dev python3-dev python3-nump
+  """
+  subprocess.check_call(make_args(quiet=False))
+
+
+def download_dependencies():
+  """Download build dependencies if haven't done yet."""
+  if not os.path.isdir(DOWNLOADS_DIR) or not os.listdir(DOWNLOADS_DIR):
+    subprocess.check_call(DOWNLOAD_SCRIPT_PATH)
+
+
+class CustomBuildExt(build_ext, object):
+
+  def run(self):
+    download_dependencies()
+    make()
+
+    return super(CustomBuildExt, self).run()
+
+
+class CustomBuildPy(build_py, object):
+
+  def run(self):
+    self.run_command('build_ext')
+    return super(CustomBuildPy, self).run()
+
+
+LIB_TFLITE = 'tensorflow-lite'
+LIB_TFLITE_DIR = make_output('libdir')
+
+ext = Extension(
+    name='%s._interpreter_wrapper' % PACKAGE,
+    language='c++',
+    sources=['interpreter_wrapper/interpreter_wrapper.i',
+             'interpreter_wrapper/interpreter_wrapper.cc'],
+    swig_opts=['-c++',
+               '-I%s' % TENSORFLOW_DIR,
+               '-module', 'interpreter_wrapper',
+               '-outdir', '.'],
+    extra_compile_args=['-std=c++11'],
+    include_dirs=[TENSORFLOW_DIR,
+                  os.path.join(TENSORFLOW_DIR, 'tensorflow', 'lite', 'tools',
+                               'pip_package'),
+                  numpy.get_include(),
+                  os.path.join(DOWNLOADS_DIR, 'flatbuffers', 'include'),
+                  os.path.join(DOWNLOADS_DIR, 'absl')],
+    libraries=[LIB_TFLITE],
+    library_dirs=[LIB_TFLITE_DIR])
+
+
+setup(
+    name=PACKAGE_NAME,
+    version=PACKAGE_VERSION,
+    description=DOCLINES[0],
+    long_description='\n'.join(DOCLINES[2:]),
+    url='https://www.tensorflow.org/lite/',
+    author='Google Inc.',
+    author_email='opensource@google.com',
+    license='Apache 2.0',
+    include_package_data=True,
+    keywords='tflite tensorflow tensor machine learning',
+    packages=find_packages(exclude=[]),
+    ext_modules=[ext],
+    package_dir={PACKAGE: '.'},
+    cmdclass={
+        'build_ext': CustomBuildExt,
+        'build_py': CustomBuildPy,
+    }
+)
diff --git a/tensorflow/lite/util.h b/tensorflow/lite/util.h
index 64a5b52e2f982bb4f7c4802f9b5b79a6edc0325e..dbb87528d06b6719a29b364711a7c62c273fdb34 100644
--- a/tensorflow/lite/util.h
+++ b/tensorflow/lite/util.h
@@ -52,6 +52,12 @@ bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
 
 size_t CombineHashes(std::initializer_list<size_t> hashes);
 
+struct TfLiteIntArrayDeleter {
+  void operator()(TfLiteIntArray* a) {
+    if (a) TfLiteIntArrayFree(a);
+  }
+};
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_UTIL_H_
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 0d06c49f7c77830a68448625ab5bdac7391f6067..3def23bc444992546d4a8a0dc8b9a57b86081d7e 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -20,6 +20,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+exports_files(["platform/base.i"])
+
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
@@ -81,6 +83,7 @@ py_library(
         "//tensorflow/python/tools:__pkg__",
         "//tensorflow/python/tools/api/generator:__pkg__",
         "//tensorflow/tools/api/tests:__pkg__",
+        "//tensorflow/tools/compatibility/update:__pkg__",
     ],
     deps = [
         ":array_ops",
@@ -121,7 +124,6 @@ py_library(
         ":session_ops",
         ":sets",
         ":sparse_ops",
-        ":spectral_ops",
         ":spectral_ops_test_util",
         ":standard_ops",
         ":state_ops",
@@ -129,6 +131,7 @@ py_library(
         ":subscribe",
         ":summary",
         ":tensor_array_ops",
+        ":tensor_forest_ops",
         ":test_ops",  # TODO: Break testing code out into separate rule.
         ":tf_cluster",
         ":tf_item",
@@ -142,12 +145,15 @@ py_library(
         "//tensorflow/python/compat",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute:estimator_training",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/ops/parallel_for",
+        "//tensorflow/python/ops/ragged",
+        "//tensorflow/python/ops/signal",
         "//tensorflow/python/profiler",
         "//tensorflow/python/saved_model",
         "//tensorflow/python/tools:component_api_helper",
@@ -848,7 +854,6 @@ py_library(
     deps = [
         ":c_api_util",
         ":control_flow_util",
-        ":cpp_shape_inference_proto_py",
         ":device",
         ":dtypes",
         ":error_interpolation",
@@ -874,6 +879,8 @@ py_library(
     deps = [
         ":auto_control_deps",
         ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
         "//tensorflow/python/autograph",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:graph_only_ops",
@@ -888,6 +895,8 @@ py_library(
     deps = [
         ":control_flow_ops",
         ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
         ":util",
     ],
 )
@@ -988,6 +997,7 @@ py_library(
         ":common_shapes",
         ":dtypes",
         ":tensor_shape",
+        ":util",
         "//third_party/py/numpy",
     ],
 )
@@ -1046,6 +1056,7 @@ py_library(
         ":random_seed",
         ":resource_variable_ops",
         ":session",
+        ":tensor_array_ops",
         ":training",
         ":util",
         ":variables",
@@ -1536,6 +1547,7 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -1611,6 +1623,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "tensor_forest_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "summary_ops_gen",
     visibility = ["//tensorflow:__subpackages__"],
@@ -1830,6 +1850,7 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "spectral_ops_gen",
+    visibility = ["//tensorflow/python/ops/signal:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1851,6 +1872,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/contrib/text:__pkg__",
         "//learning/brain/contrib/text/python/ragged:__pkg__",
+        "//tensorflow/python/ops/ragged:__pkg__",
     ],
 )
 
@@ -1859,6 +1881,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/contrib/text:__pkg__",
         "//learning/brain/contrib/text/python/ragged:__pkg__",
+        "//tensorflow/python/ops/ragged:__pkg__",
     ],
 )
 
@@ -1867,6 +1890,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/contrib/text:__pkg__",
         "//learning/brain/contrib/text/python/ragged:__pkg__",
+        "//tensorflow/python/ops/ragged:__pkg__",
     ],
 )
 
@@ -1931,6 +1955,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "tensor_forest_ops",
+    srcs = ["ops/tensor_forest_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework",
+        ":ops",
+        ":tensor_forest_ops_gen",
+        ":training",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -2083,7 +2120,9 @@ py_library(
     srcs = ["ops/control_flow_util_v2.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "framework_ops",
+        ":control_flow_util",
+        ":framework_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
     ],
@@ -2108,7 +2147,6 @@ py_library(
         ":graph_to_function_def",
         ":pywrap_tensorflow",
         ":util",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:function",
     ],
 )
@@ -2127,14 +2165,15 @@ py_library(
         ":control_flow_util_v2",
         ":dtypes",
         ":framework_ops",
+        ":framework_test_lib",
         ":function_def_to_graph",
         ":functional_ops_gen",
         ":gradients_impl",
         ":list_ops",
         ":tensor_array_ops",
         ":tensor_shape",
+        ":tensor_util",
         ":util",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:function",
     ],
 )
@@ -2255,7 +2294,6 @@ py_library(
         ":platform",
         ":random_grad",
         ":resource_variable_ops",
-        ":spectral_grad",
         ":tensor_array_ops",
         ":tensor_util",
         ":unconnected_gradients",
@@ -2497,7 +2535,6 @@ py_library(
         ":nn_ops_gen",
         ":sparse_ops_gen",
         ":sparse_tensor",
-        ":spectral_ops_gen",
         ":state_ops",
         ":state_ops_gen",
         ":tensor_shape",
@@ -2807,29 +2844,29 @@ py_test(
 )
 
 py_library(
-    name = "spectral_grad",
-    srcs = ["ops/spectral_grad.py"],
+    name = "sort_ops",
+    srcs = ["ops/sort_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
         ":framework",
-        ":framework_for_generated_wrappers",
         ":math_ops",
-        ":spectral_ops",
+        ":nn_ops",
         "//third_party/py/numpy",
     ],
 )
 
-py_library(
-    name = "spectral_ops",
-    srcs = ["ops/spectral_ops.py"],
+py_test(
+    name = "sort_ops_test",
+    srcs = ["ops/sort_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
-        ":dtypes",
-        ":framework_ops",
-        ":math_ops",
-        ":spectral_ops_gen",
+        ":client_testlib",
+        ":framework",
+        ":random_ops",
+        ":sort_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -2947,10 +2984,10 @@ py_library(
         ":random_ops",
         ":script_ops",
         ":session_ops",
+        ":sort_ops",
         ":sparse_grad",
         ":sparse_ops",
         ":special_math_ops",
-        ":spectral_grad",
         ":state_grad",
         ":state_ops",
         ":stateless_random_ops",
@@ -2961,6 +2998,7 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
+        "//tensorflow/python/eager:wrap_function",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
     ],
@@ -3000,18 +3038,6 @@ py_library(
     ],
 )
 
-py_library(
-    name = "summary_ops",
-    srcs = ["ops/summary_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework",
-        ":framework_for_generated_wrappers",
-        ":logging_ops_gen",
-        ":summary_op_util",
-    ],
-)
-
 py_library(
     name = "summary_ops_v2",
     srcs = ["ops/summary_ops_v2.py"],
@@ -3067,13 +3093,16 @@ py_library(
     deps = [
         ":array_ops",
         ":constant_op",
+        ":control_flow_ops_gen",
         ":data_flow_ops_gen",
         ":dtypes",
         ":errors",
         ":framework_ops",
+        ":list_ops",
         ":math_ops",
         ":tensor_shape",
         ":tensor_util",
+        ":tf2",
         ":tf_should_use",
         "//tensorflow/python/eager:context",
     ],
@@ -3424,6 +3453,8 @@ py_library(
             # file):
             "training/basic_session_run_hooks.py",
             "training/checkpoint_management.py",
+            "training/distribute.py",
+            "training/distribution_strategy_context.py",
             "training/saveable_object.py",
             "training/saver.py",
             "training/session_run_hook.py",
@@ -3441,6 +3472,7 @@ py_library(
         ":control_flow_ops",
         ":data_flow_ops",
         ":device",
+        ":distribute",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -3473,6 +3505,7 @@ py_library(
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
@@ -3578,8 +3611,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
+        ":constant_op",
         ":control_flow_ops",
         ":device_util",
+        ":dtypes",
         ":framework_ops",
         ":platform",
         ":resource_variable_ops",
@@ -3587,7 +3622,9 @@ py_library(
         ":util",
         ":variable_scope",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/ops/losses",
+        "//tensorflow/tools/docs:doc_controls",
     ],
 )
 
@@ -3598,7 +3635,9 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":client_testlib",
+        ":constant_op",
         ":distribute",
+        ":dtypes",
         ":variable_scope",
     ],
 )
@@ -4004,7 +4043,6 @@ tf_py_wrap_cc(
         "platform/stacktrace_handler.i",
         "pywrap_tfe.i",
         "training/quantize_training.i",
-        "training/server_lib.i",
         "util/kernel_registry.i",
         "util/port.i",
         "util/py_checkpoint_reader.i",
@@ -4900,7 +4938,7 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -5012,7 +5050,6 @@ py_library(
     deps = [
         ":client",
         ":constant_op",
-        ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
         ":lib",
@@ -5021,12 +5058,10 @@ py_library(
         ":protos_all_py",
         ":pywrap_tensorflow",
         ":summary_op_util",
-        ":summary_ops",
         ":summary_ops_gen",
         ":summary_ops_v2",
         ":util",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
@@ -5037,7 +5072,6 @@ py_tests(
     srcs = [
         "summary/plugin_asset_test.py",
         "summary/summary_test.py",
-        "summary/text_summary_test.py",
         "summary/writer/writer_test.py",
     ],
     additional_deps = [
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index a2ab63bb48799d5b93882bb87ab40b02dbb96621..3b462c7de82f05b6aac2d4e46013c8044ffdcf03 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -87,10 +87,10 @@ from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sets
-from tensorflow.python.ops import spectral_ops as spectral
 from tensorflow.python.ops.distributions import distributions
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.ops.signal import signal
 from tensorflow.python.profiler import profiler
 from tensorflow.python.saved_model import saved_model
 from tensorflow.python.summary import summary
@@ -124,6 +124,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # Eager execution
 from tensorflow.python.eager.context import executing_eagerly
+from tensorflow.python.eager.def_function import function
 from tensorflow.python.framework.ops import enable_eager_execution
 
 # Necessary for the symbols in this module to be taken into account by
@@ -161,7 +162,7 @@ tf_export('Summary', 'summary.Summary')(Summary)
 tf_export('summary.SummaryDescription')(SummaryDescription)
 tf_export('SummaryMetadata')(SummaryMetadata)
 tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
-tf_export('TensorInfo')(TensorInfo)
+tf_export(v1=['TensorInfo'])(TensorInfo)
 # pylint: enable=undefined-variable
 
 # Special dunders that we choose to export:
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index fd9e60bea75fcdb1196a8a595e6908f1bbc979fa..7252e0d9bf92e430e224fe00d9a9a5ff4254b46f 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -26,6 +26,7 @@ from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core.converter import ConversionOptions
 from tensorflow.python.autograph.core.converter import Feature
+from tensorflow.python.autograph.core.converter import Verbosity
 from tensorflow.python.autograph.core.errors import GraphConstructionError
 from tensorflow.python.autograph.core.errors import improved_errors
 from tensorflow.python.autograph.core.errors import TfRuntimeError
@@ -58,6 +59,7 @@ _allowed_symbols = [
     'improved_errors',
     'GraphConstructionError',
     'TfRuntimeError',
+    'Verbosity',
     # Python language "extensions"
     'set_element_type',
     'set_loop_options',
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index 916c736fb4bb3099901e9125b37bb54c7050cecc..892f90e350ccf4e73d60ed1f49f70dee2b3610fd 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -113,7 +113,7 @@ class CallTreesTest(converter_testing.TestCase):
     with self.compiled(node, ns) as result:
       with self.cached_session() as sess:
         result_tensor = result.test_fn(constant_op.constant(1))
-        self.assertEquals(sess.run(result_tensor), 3)
+        self.assertEquals(self.evaluate(result_tensor), 3)
 
   def test_call_to_decorated_function(self):
 
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index a7596be2913652038a917d723a5798034854bd9b..5853e044c532d24c3327f06da790f85fddcd5700 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -131,14 +131,18 @@ class ControlFlowTransformer(converter.Base):
     created_in_body = body_scope.modified & returned_from_cond - defined_in
     created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in
 
-    if created_in_body != created_in_orelse:
+    basic_created_in_body = tuple(
+        s for s in created_in_body if not s.is_composite())
+    basic_created_in_orelse = tuple(
+        s for s in created_in_orelse if not s.is_composite())
+    if basic_created_in_body != basic_created_in_orelse:
       raise ValueError(
           'if statement may not initialize all variables: the true branch'
           ' creates %s, while the false branch creates %s. Make sure all'
           ' these variables are initialized either in both'
           ' branches or before the if statement.' %
-          (self._fmt_symbols(created_in_body),
-           self._fmt_symbols(created_in_orelse)))
+          (self._fmt_symbols(basic_created_in_body),
+           self._fmt_symbols(basic_created_in_orelse)))
 
     # Alias the closure variables inside the conditional functions, to allow
     # the functions access to the respective variables.
@@ -160,6 +164,10 @@ class ControlFlowTransformer(converter.Base):
     node_body = ast_util.rename_symbols(node.body, alias_body_map)
     node_orelse = ast_util.rename_symbols(node.orelse, alias_orelse_map)
 
+    cond_var_name = self.ctx.namer.new_symbol('cond', body_scope.referenced)
+    body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced)
+    orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced)
+
     returned_from_cond = tuple(returned_from_cond)
     if returned_from_cond:
       if len(returned_from_cond) == 1:
@@ -181,13 +189,14 @@ class ControlFlowTransformer(converter.Base):
       # actually has some return value as well.
       cond_results = None
       # TODO(mdan): This doesn't belong here; it's specific to the operator.
-      returned_from_body = (templates.replace_as_expression('tf.constant(1)'),)
-      returned_from_orelse = (
-          templates.replace_as_expression('tf.constant(1)'),)
-
-    body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced)
-    orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced)
-
+      returned_from_body = (templates.replace_as_expression(
+          'ag__.match_staging_level(1, cond_var_name)',
+          cond_var_name=cond_var_name),)
+      returned_from_orelse = (templates.replace_as_expression(
+          'ag__.match_staging_level(1, cond_var_name)',
+          cond_var_name=cond_var_name),)
+
+    cond_assign = self.create_assignment(cond_var_name, node.test)
     body_def = self._create_cond_branch(
         body_name,
         aliased_orig_names=aliased_body_orig_names,
@@ -200,10 +209,10 @@ class ControlFlowTransformer(converter.Base):
         aliased_new_names=aliased_orelse_new_names,
         body=node_orelse,
         returns=returned_from_orelse)
-    cond_expr = self._create_cond_expr(cond_results, node.test, body_name,
+    cond_expr = self._create_cond_expr(cond_results, cond_var_name, body_name,
                                        orelse_name)
 
-    return body_def + orelse_def + cond_expr
+    return cond_assign + body_def + orelse_def + cond_expr
 
   def _get_loop_state(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
diff --git a/tensorflow/python/autograph/converters/error_handlers_test.py b/tensorflow/python/autograph/converters/error_handlers_test.py
index 29597e1da3e89eacb724371a59ae5636887679be..1f6c5a682172b54dfd6c1c47f2ac94396db11d43 100644
--- a/tensorflow/python/autograph/converters/error_handlers_test.py
+++ b/tensorflow/python/autograph/converters/error_handlers_test.py
@@ -20,12 +20,10 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.python.autograph.converters import control_flow
 from tensorflow.python.autograph.converters import error_handlers
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.core import errors
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import random_ops
+from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.platform import test
 
 
@@ -44,29 +42,12 @@ class ErrorHandlersTest(converter_testing.TestCase):
   def test_no_origin_annotation(self):
 
     def test_fn(x):
-      a = 0
-      if x:
-        a = random_ops.random_normal((2, 3), mean=0.0, dtype=dtypes.int32)
-      else:
-        a = 0
-      return a
+      return x + 1
 
-    node, ctx = self.prepare(test_fn, {
-        'random_ops': random_ops,
-        'dtypes': dtypes
-    })
-    # To simulate a function without origin info we use the control flow
-    # converter which adds a function that lacks origin info so we will not have
-    # a wrapping try/except that reraises the NotImplementedError as a
-    # GraphConstructionError.
-    node = control_flow.transform(node, ctx)
+    node, ctx = self.prepare(test_fn, {})
+    anno.delanno(node, anno.Basic.ORIGIN)
     node = error_handlers.transform(node, ctx)
-    # TODO(b/111562364): remove run_cond from traceback.
-    test_fn_try_body = node.body[0].body
-    true_fn_body = test_fn_try_body[1].body
-    false_fn_body = test_fn_try_body[2].body
-    self.assertNotIn(gast.Try, true_fn_body)
-    self.assertNotIn(gast.Try, false_fn_body)
+    self.assertIsInstance(node.body[0], gast.Return)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/lists_test.py b/tensorflow/python/autograph/converters/lists_test.py
index f6da845fcc3f19106073deaa094c0479063c02e7..8c8135acefb030e9dfd57e9103feb27f134e732b 100644
--- a/tensorflow/python/autograph/converters/lists_test.py
+++ b/tensorflow/python/autograph/converters/lists_test.py
@@ -68,7 +68,7 @@ class ListTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2, 3])
+        self.assertAllEqual(self.evaluate(r), [1, 2, 3])
 
   def test_list_pop(self):
 
@@ -91,8 +91,8 @@ class ListTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         ts, tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2])
-        self.assertAllEqual(sess.run(ts), 3)
+        self.assertAllEqual(self.evaluate(r), [1, 2])
+        self.assertAllEqual(self.evaluate(ts), 3)
 
   def test_double_list_pop(self):
 
diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py
index 910c470f9786c1d5a293becbbdfbd9544549b293..98e29ec8e1b27061371f0328402d8cb45a0f69e7 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards.py
@@ -122,11 +122,12 @@ class SideEffectGuardTransformer(converter.Base):
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
       args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE)
+      live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
       # NOTE: We can't guard object attributes because they may not be writable.
       # In addition, avoid renaming well-known names.
       # TODO(mdan): Move these names into config.
-      unguarded_names = (qual_names.QN('self'), qual_names.QN('tf'))
-      guarded_args = tuple(s for s in args_scope.read
+      unguarded_names = (qual_names.QN('self'), qual_names.QN('ag__'))
+      guarded_args = tuple(s for s in live_out
                            if not s.is_composite() and s not in unguarded_names)
 
       # TODO(mdan): Include all arguments which depended on guarded_args too.
diff --git a/tensorflow/python/autograph/converters/side_effect_guards_test.py b/tensorflow/python/autograph/converters/side_effect_guards_test.py
index cef3199169c387194a95df72c26f353ad8f58873..e72b5eac324b4cb70d56c2388425b3a4e9b6312a 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards_test.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards_test.py
@@ -48,12 +48,12 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
+        self.evaluate(v.initializer)
         sess.run(result.test_fn(v))
         # TODO(mdan): Add support for this use case.
         # Right now the variable `a` is not conditioned on the `assign` because
         # there's no way to add control dependencies to a variable object.
-        self.assertEqual(2, sess.run(v))
+        self.assertEqual(2, self.evaluate(v))
 
   def test_side_effect_on_used_variable(self):
 
@@ -69,11 +69,11 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
+        self.evaluate(v.initializer)
         sess.run(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
         # Right now it's 3 or 4 based on whether the read is synchronized.
-        self.assertEqual(3, sess.run(v))
+        self.assertEqual(3, self.evaluate(v))
 
   def test_side_effect_on_tensor(self):
 
@@ -109,10 +109,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign_add) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
+        self.evaluate(v.initializer)
         sess.run(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(4, sess.run(v))
+        self.assertEqual(4, self.evaluate(v))
 
   def test_multiline_nested_block(self):
 
@@ -130,10 +130,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
+        self.evaluate(v.initializer)
         sess.run(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(3, sess.run(v))
+        self.assertEqual(3, self.evaluate(v))
 
   def test_multiline_block_unsafe(self):
 
@@ -153,10 +153,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
                        state_ops.assign_add) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
+        self.evaluate(v.initializer)
         sess.run(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(4, sess.run(v))
+        self.assertEqual(4, self.evaluate(v))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py
index e190a7cfe8492bef5985f128cf553a0fc17b3b96..bd049afdfcef4c839bcb3d9ba5444d885c3061cc 100644
--- a/tensorflow/python/autograph/converters/slices_test.py
+++ b/tensorflow/python/autograph/converters/slices_test.py
@@ -49,7 +49,7 @@ class SliceTest(converter_testing.TestCase):
         tl = list_ops.tensor_list_from_tensor(
             [1, 2], element_shape=constant_op.constant([], dtype=dtypes.int32))
         y = result.test_fn(tl)
-        self.assertEqual(2, sess.run(y))
+        self.assertEqual(2, self.evaluate(y))
 
   def test_index_access_multiple_definitions(self):
 
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index 59b9ebb591865b56c65ec8bb94203012fb9e150c..49e24895a2b6ec31e83e44b4ef89d463b0157c97 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -64,6 +64,7 @@ from __future__ import division
 from __future__ import print_function
 
 from enum import Enum
+from enum import IntEnum
 
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import naming
@@ -89,6 +90,17 @@ from tensorflow.python.autograph.pyct.static_analysis import type_info
 # TODO(mdan): Add a test specific to this converter.
 
 
+class Verbosity(IntEnum):
+  """Different levels of verbosity for printing errors.
+
+  Attributes:
+   * BRIEF: No logging, minimal error messages.
+   * VERBOSE: Detailed logging of generated code, detailed error messages.
+ """
+  BRIEF = 0
+  VERBOSE = 1
+
+
 class Feature(Enum):
   """Constants to use when selecting AutoGraph features."""
 
@@ -97,9 +109,15 @@ class Feature(Enum):
   AUTO_CONTROL_DEPS = (
       'Insert of control dependencies in the generated code.')
   DECORATORS = (
-      'Allow decorators in local functions. Note that special decorators, '
+      'Allow decorators in local functions. Note that special decorators,'
       ' like ag.convert or tf.function are allowed regardless of this toggle.')
+  ERROR_REWRITING = (
+      'Rewrite errors that occur in the generated code to indicate the source'
+      ' code to which the failing code corresponds.')
   LISTS = 'Convert list idioms, like initializers, slices, append, etc.'
+  NAME_SCOPES = (
+      'Insert name scopes that name ops according to context, like the'
+      ' function they were defined in.')
 
   def __repr__(self):
     return self.name
@@ -111,7 +129,7 @@ class ConversionOptions(object):
   Attributes:
     recursive: bool, whether to recursively convert any user functions or
       classes that the converted function may use.
-    verbose: bool, whether to log the converted code.
+    verbose: Verbosity, the level of verbosity to use.
     strip_decorators: Tuple[Callable], contains decorators that should be in
       excluded from the compiled output. By default, when converting a function
       before the decorators are applied, the compiled output will include those
@@ -126,7 +144,7 @@ class ConversionOptions(object):
 
   def __init__(self,
                recursive=False,
-               verbose=False,
+               verbose=Verbosity.VERBOSE,
                strip_decorators=None,
                force_conversion=False,
                internal_convert_user_code=True,
@@ -197,7 +215,7 @@ class ConversionOptions(object):
         constructor_name=parser.parse_expression(
             as_qualified_name(ConversionOptions)),
         recursive_val=parser.parse_expression(str(self.recursive)),
-        verbose_val=parser.parse_expression(str(self.verbose)),
+        verbose_val=parser.parse_expression(str(int(self.verbose))),
         strip_decorators_val=list_of_names(self.strip_decorators),
         force_conversion_val=parser.parse_expression(
             str(self.force_conversion)),
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index 0326103933fc6a904847ab0ebbe1c1b7575c4c17..7b0608d03fccbb45651ad63e36e4377f7d6a1dd3 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -30,6 +30,7 @@ from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
@@ -103,6 +104,7 @@ class TestCase(test.TestCase):
       fake_ag = self.make_fake_mod('fake_ag', converted_call,
                                    converter.ConversionOptions)
       fake_ag.__dict__.update(operators.__dict__)
+      fake_ag.__dict__.update(special_functions.__dict__)
       fake_ag.__dict__['utils'] = utils
       fake_ag.__dict__['rewrite_graph_construction_error'] = (
           errors.rewrite_graph_construction_error)
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index aecc9e33caaed9e336fedc6fcc5a02cc176c7861..43fcbcfc0302a6472bf3bd153212ba7222083016 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.pyct import qual_names
+from tensorflow.python.util import tf_inspect
 
 
 class Namer(object):
@@ -76,6 +77,10 @@ class Namer(object):
     if not self.recursive:
       return None, False
 
+    if (live_entity is not None and tf_inspect.isfunction(live_entity) and
+        live_entity.__name__ == '<lambda>'):
+      return None, False
+
     if owner_type is not None and owner_type not in self.partial_types:
       # Members are not renamed when part of an entire converted class.
       return None, False
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index e0e07c6d5f52a4866f58d7ad30e797d5ec1340fb..69674b2be3c9e9356349d0670df0548b47be34c0 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -47,7 +47,9 @@ from tensorflow.python.util import tf_inspect
 
 
 # TODO(mdan): This should behave like to_graph (e.g. convert statically).
-def convert(recursive=False, verbose=False):
+# TODO(znado): Make an alias so can write Verbosity directly without needing
+# to write converter.
+def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
   """Decorator that compiles a function to use TensorFlow ops.
 
   The decorator is dynamic - it recompiles the target whenever the decorated
@@ -58,7 +60,7 @@ def convert(recursive=False, verbose=False):
   Args:
     recursive: bool, whether to recursively convert any functions or classes
       that the converted function may use.
-    verbose: bool, whether to output the compiled code in the logs.
+    verbose: converter.Verbosity, the level of verbosity.
 
   Returns:
     Callable, a decorator that converts the given function into an equivalent
@@ -92,8 +94,7 @@ def convert(recursive=False, verbose=False):
 class RunMode(Enum):
   """Specifies the way a converted function or method should be executed in TF.
 
-  The enum values have the following semantics:
-
+  Attributes:
    * GRAPH: Call this function directly, as-is. This is suitable for functions
        that were already designed for TF graphs and contain ops.
    * PY_FUNC: Wrap this function into a py_func op. This is suitable for code
@@ -153,7 +154,7 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
 # TODO(mdan): Move to a private, undocumented module.
 def converted_call(f, owner, options, *args, **kwargs):
   """Compiles a function call inline. For internal use only."""
-  if options.verbose:
+  if options.verbose >= converter.Verbosity.VERBOSE:
     logging.info('Converted call: {}; owner: {}'.format(f, owner))
 
   if owner is not None:
@@ -170,13 +171,22 @@ def converted_call(f, owner, options, *args, **kwargs):
 
     f = getattr(owner, f)
 
+  if inspect_utils.isbuiltin(f):
+    return py_builtins.overload_of(f)(*args, **kwargs)
+
   # TODO(mdan): This needs cleanup.
   # In particular, we may want to avoid renaming functions altogether.
   if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
-    return f(*args, **kwargs)
 
-  if inspect_utils.isbuiltin(f):
-    return py_builtins.overload_of(f)(*args, **kwargs)
+    # Args typically include `self`, as required by the conversion process.
+    # When conversion is skipped, `self` is not necessary, because the
+    # original bound method is being executed. This code removes it.
+    if tf_inspect.ismethod(f) and args:
+      f_class = inspect_utils.getmethodclass(f)
+      if args[0] is f_class:
+        args = args[1:]
+
+    return f(*args, **kwargs)
 
   # internal_convert_user_code is for example turned off when issuing a dynamic
   # call conversion from generated code while in nonrecursive mode. In that
@@ -191,6 +201,7 @@ def converted_call(f, owner, options, *args, **kwargs):
     arg_map_target = f
     f_class = inspect_utils.getmethodclass(f)
 
+    # TODO(b/119246461): This may be more elegantly handled using __get__?
     if f_class is not None:
       # If this is a method call, it may or may not include self.
       #
@@ -203,7 +214,13 @@ def converted_call(f, owner, options, *args, **kwargs):
       if owner is not None and (not args or args[0] is not owner):
         effective_args = (owner,) + args
       else:
-        effective_args = args
+        # When the owner is not specified, use the result of
+        # inspect_utils.getmethodclass.
+        # TODO(b/119246461): Make sure an owner is always specified.
+        if not args or args[0] is not f_class:
+          effective_args = (f_class,) + args
+        else:
+          effective_args = (f_class,) + args[1:]
       partial_types = (f_class,)
     else:
       effective_args = args
@@ -254,28 +271,30 @@ def converted_call(f, owner, options, *args, **kwargs):
       optional_features=options.optional_features)
 
   result = converted_f(*effective_args, **kwargs)
-  # When converting a function, we write a tmp file and import it as a module.
-  # This leaks the module's closure. Once we've executed the converted_f module
-  # and there is no more code left to be executed, we can clean up the module.
-
-  # TODO(mdan): Look into workarounds that don't suffer from refcount leaks.
-  # Possibly attach the closure as a regular closure cell, instead of relying on
-  # module globals.
-
-  # If there are callables in the result, they will fail to find their closure
-  # when called, so only delete module if all returned types are not callable.
-  flat_results = nest.flatten(result)
-  if all(map(_is_not_callable, flat_results)):
+
+  # The converted function's closure is simply inserted into the function's
+  # module __dict__. Since modules are permanently cached, that results in
+  # leaking the entire closure.
+  # Normally, it's not safe to delete the module because that may release said
+  # closure as well. However, in the case of converted_call we are certain the
+  # function will not be executed again, so the closure should no longer be
+  # needed so long as the function doesn't return any executable code.
+  # TODO(mdan): Attach the closure properly, using cells.
+  if all(map(_is_not_callable, nest.flatten(result))):
     del sys.modules[converted_f.__module__]
 
   return result
 
 
 def _is_not_callable(obj):
-  # TODO(brianklee): What happens if obj is a tensor wrapping a py_func?
-  return (isinstance(obj,
-                     (int, float, complex, str, bool, np.ndarray, np.generic))
-          or tensor_util.is_tensor(obj))
+  # TODO(brianklee): Handle case when obj is a tensor dependent on a py_func.
+  if isinstance(obj, (int, float, complex, str, bool)):
+    return True
+  if isinstance(obj, (np.ndarray, np.generic)):
+    return True
+  if tensor_util.is_tensor(obj):
+    return True
+  return False
 
 
 # TODO(mdan): Rename: to_ops?
@@ -283,7 +302,7 @@ def _is_not_callable(obj):
 # TODO(mdan): Remove partial_types.
 def to_graph(e,
              recursive=True,
-             verbose=False,
+             verbose=converter.Verbosity.VERBOSE,
              arg_values=None,
              arg_types=None,
              partial_types=None,
@@ -301,7 +320,7 @@ def to_graph(e,
     e: Union[Callable, Type], the Python entity to convert.
     recursive: bool, whether to recursively convert any functions that the
       converted function may call.
-    verbose: bool, whether to output the compiled code in the logs.
+    verbose: converter.Verbosity, the level of printing verbosity to use.
     arg_values: Optional[Dict[Text, Any]], value hints for symbols including
       function arguments.
     arg_types: Optional[Dict[Text, Type]], type hints for symbols including
@@ -356,6 +375,11 @@ def to_graph(e,
   if tf_inspect.isfunction(e):
     compiled.__defaults__ = e.__defaults__
 
+  if hasattr(compiled, '__globals__'):
+    # Remove self to avoid circular references. This will probably only work
+    # so long as the function is not reentrant.
+    del compiled.__globals__[name]
+
   # Need this so the source_mapping attribute is available for the context
   # manager to access for runtime errors.
   #
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index 276fb8748fe4638a7e54e253276e58c718a7707e..44cb99d657f8c98bdb06b10ae6a8eba621196d65 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -28,6 +28,9 @@ from tensorflow.python.autograph.impl import api
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
 
@@ -60,7 +63,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_does_not_recurse(self):
 
@@ -80,7 +83,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_calls_unconverted_graph(self):
 
@@ -101,7 +104,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_calls_unconverted_py_func(self):
 
@@ -127,7 +130,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_calls_decorated(self):
 
@@ -150,7 +153,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_preserves_argspec(self):
 
@@ -189,7 +192,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
     x = api.converted_call(range, None, converter.ConversionOptions(), 3)
@@ -205,7 +208,7 @@ class ApiTest(test.TestCase):
     with self.cached_session() as sess:
       x = api.converted_call(test_fn, None, converter.ConversionOptions(),
                              constant_op.constant(-1))
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_explicit_owner(self):
     # TODO(mdan): Implement.
@@ -231,7 +234,7 @@ class ApiTest(test.TestCase):
       tc = TestClass(constant_op.constant(-1))
       x = api.converted_call(tc.test_method, None,
                              converter.ConversionOptions(), tc)
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_by_class(self):
 
@@ -249,7 +252,7 @@ class ApiTest(test.TestCase):
       tc = TestClass(constant_op.constant(-1))
       x = api.converted_call(TestClass.test_method, None,
                              converter.ConversionOptions(), tc)
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
 
@@ -266,7 +269,7 @@ class ApiTest(test.TestCase):
     with self.cached_session() as sess:
       tc = TestClass(constant_op.constant(-1))
       x = api.converted_call(tc, None, converter.ConversionOptions())
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_constructor(self):
 
@@ -285,7 +288,7 @@ class ApiTest(test.TestCase):
                               constant_op.constant(-1))
       # tc is now a converted object.
       x = tc.test_method()
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_already_converted(self):
 
@@ -295,12 +298,12 @@ class ApiTest(test.TestCase):
     with self.cached_session() as sess:
       x = api.converted_call(f, None, converter.ConversionOptions(),
                              constant_op.constant(0))
-      self.assertTrue(sess.run(x))
+      self.assertTrue(self.evaluate(x))
 
       converted_f = api.to_graph(f)
       x = api.converted_call(converted_f, None, converter.ConversionOptions(),
                              constant_op.constant(0))
-      self.assertTrue(sess.run(x))
+      self.assertTrue(self.evaluate(x))
 
   def test_converted_call_no_user_code(self):
 
@@ -319,6 +322,63 @@ class ApiTest(test.TestCase):
     # The constant has static shape so the result is a primitive not a Tensor.
     self.assertEqual(x, 1)
 
+  def test_converted_call_whitelisted_method(self):
+
+    opts = converter.ConversionOptions()
+
+    model = sequential.Sequential([
+        core.Dense(2)
+    ])
+
+    x = api.converted_call(model.call, None, opts,
+                           constant_op.constant([[0.0]]), training=True)
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_whitelisted_method_extra_self(self):
+
+    opts = converter.ConversionOptions()
+
+    model = sequential.Sequential([
+        core.Dense(2)
+    ])
+
+    x = api.converted_call(model.call, None, opts,
+                           model, constant_op.constant([[0.0]]), training=True)
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_whitelisted_method_via_owner(self):
+
+    opts = converter.ConversionOptions()
+
+    model = sequential.Sequential([
+        core.Dense(2)
+    ])
+
+    x = api.converted_call('call', model, opts,
+                           constant_op.constant([[0.0]]), training=True)
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_lambda(self):
+
+    opts = converter.ConversionOptions()
+
+    l = lambda x: x == 0
+
+    x = api.converted_call(l, None, opts, constant_op.constant(0))
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(True, self.evaluate(x))
+
   def test_to_graph_basic(self):
 
     def test_fn(x, s):
@@ -330,7 +390,7 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       x = compiled_fn(constant_op.constant([4, 8]), 4)
-      self.assertListEqual([1, 2], sess.run(x).tolist())
+      self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
   def test_to_graph_with_defaults(self):
 
@@ -345,7 +405,7 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       x = compiled_fn(constant_op.constant([4, 8]))
-      self.assertListEqual([1, 2], sess.run(x).tolist())
+      self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
   def test_to_code_basic(self):
 
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index ee09b2718eabe4ed2dc0e8d1aabba00837cb15c5..48a9307cabd733f1cf0317d32439c44405995bf4 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -45,6 +45,7 @@ from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import inspect_utils
@@ -108,21 +109,13 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   Raises:
     ValueError: if the entity type is not supported.
   """
-  if program_ctx.options.verbose:
+  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
     logging.info('Converting {}'.format(o))
 
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, program_ctx)
   elif tf_inspect.isfunction(o):
-    # TODO(mdan): This is not a reliable mechanism.
-    # The most reliable way is to check the source code, the AST will contain
-    # a Lambda node instead of a FunctionDef
-    if o.__name__ == '<lambda>':
-      raise NotImplementedError(
-          'lambda functions are not yet supported; declare the function'
-          ' using def instead: %s' % o)
-    else:
-      node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
+    node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
     node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   # TODO(mdan,yashkatariya): Remove when object conversion is implemented.
@@ -151,7 +144,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
 
   program_ctx.add_to_cache(o, node)
 
-  if program_ctx.options.verbose:
+  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
     logging.info('Compiled output of {}:\n\n{}\n'.format(
         o, compiler.ast_to_source(node)))
 
@@ -192,8 +185,7 @@ def class_to_graph(c, program_ctx):
         program_ctx=program_ctx,
         arg_values={},
         arg_types={'self': (c.__name__, c)},
-        owner_type=c,
-        rewrite_errors=False)
+        owner_type=c)
     if class_namespace is None:
       class_namespace = namespace
     else:
@@ -273,6 +265,7 @@ def _add_self_references(namespace, autograph_module):
     # TODO(mdan): Add safeguards against name clashes.
     # We don't want to create a submodule because we want the operators to be
     # accessible as ag__.<operator>
+    ag_internal.__dict__.update(special_functions.__dict__)
     ag_internal.__dict__.update(operators.__dict__)
 
   _add_reserved_symbol(namespace, 'ag__', ag_internal)
@@ -282,12 +275,39 @@ def function_to_graph(f,
                       program_ctx,
                       arg_values,
                       arg_types,
-                      owner_type=None,
-                      rewrite_errors=True):
+                      owner_type=None):
   """Specialization of `entity_to_graph` for callable functions."""
 
   node, source = parser.parse_entity(f)
   node = node.body[0]
+
+  # In general, the output of inspect.getsource is inexact because it uses crude
+  # regex matching methods to search the source file. This is particularly
+  # problematic for lambda functions, where the entire containing lines are
+  # returned. Certain distributions of CPython may also return the enclosing
+  # function for local functions.
+  nodes = ast_util.find_matching_definitions(node, f)
+  if len(nodes) != 1:
+    if f.__name__ == '<lambda>':
+      raise ValueError(
+          'Unable to identify source code of lambda function {}. It was'
+          ' defined on this line: {}, which must contain a single lambda with'
+          ' matching signature. To avoid ambiguity, define each lambda'
+          ' in a separate expression.'.format(f, source))
+    else:
+      # The inspect.getsource bug is currently known to occur in the Windows
+      # integration tests which run Python 3.6.
+      # TODO(mdan): Find out eaxctly which distribution of Python is that.
+      raise ValueError(
+          'Unable to identify source code of function {}. The source code'
+          ' reported by Python did not include exactly one matching signature:'
+          '\n{}\nTo avoid ambiguity, use a unique name for each'
+          ' function.\nNote that some distributions of Python may report source'
+          ' code incorrectly. It may be possible to avoid that bug by'
+          ' organizing the code into smaller units (smaller files, functions or'
+          ' classes), or by turning AutoGraph off.'.format(f, source))
+  node, = nodes
+
   # TODO(znado): Place inside standard_analysis.
   origin_info.resolve(node, source, f)
   namespace = inspect_utils.getnamespace(f)
@@ -302,15 +322,22 @@ def function_to_graph(f,
       arg_types=arg_types,
       owner_type=owner_type)
   context = converter.EntityContext(namer, entity_info, program_ctx)
-  node = node_to_graph(node, context, rewrite_errors=rewrite_errors)
+  node = node_to_graph(node, context)
+
+  if isinstance(node, gast.Lambda):
+    new_name = namer.new_symbol('tf__lambda', ())
+    node = gast.Assign(
+        targets=[gast.Name(new_name, gast.Store(), None)], value=node)
 
-  # TODO(mdan): This somewhat duplicates the call rename logic in call_trees.py
-  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
-  if not did_rename:
-    new_name = f.__name__
-    if node.name != f.__name__:
-      raise NotImplementedError('Strange corner case. Send us offending code!')
-  node.name = new_name
+  else:
+    # TODO(mdan): This somewhat duplicates the renaming logic in call_trees.py
+    new_name, did_rename = namer.compiled_function_name(f.__name__, f,
+                                                        owner_type)
+    if did_rename:
+      node.name = new_name
+    else:
+      new_name = f.__name__
+      assert node.name == new_name
 
   program_ctx.update_name_map(namer)
   # TODO(mdan): Use this at compilation.
@@ -318,13 +345,12 @@ def function_to_graph(f,
   return [node], new_name, namespace
 
 
-def node_to_graph(node, context, rewrite_errors=True):
+def node_to_graph(node, context):
   """Convert Python code to equivalent TF graph mode code.
 
   Args:
     node: AST, the code to convert.
     context: converter.EntityContext
-    rewrite_errors: Boolean, whether or not to rewrite the error traceback.
 
   Returns:
     A tuple (node, deps):
@@ -361,7 +387,9 @@ def node_to_graph(node, context, rewrite_errors=True):
   node = converter.apply_(node, context, logical_expressions)
   if context.program.options.uses(converter.Feature.AUTO_CONTROL_DEPS):
     node = converter.apply_(node, context, side_effect_guards)
-  node = converter.apply_(node, context, function_scopes)
-  if rewrite_errors:
+  # TODO(mdan): If function scopes ever does more, the toggle will need moving.
+  if context.program.options.uses(converter.Feature.NAME_SCOPES):
+    node = converter.apply_(node, context, function_scopes)
+  if context.program.options.uses(converter.Feature.ERROR_REWRITING):
     node = converter.apply_(node, context, error_handlers)
   return node
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index 442d0e31e389176f68b58f9dcbc5b674258bdfd2..9a4fbdad8c1994d8c8cc534b6e0b4af45f5c4c80 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -160,12 +160,67 @@ class ConversionTest(test.TestCase):
                      program_ctx.dependency_cache[TestSubclass][-2].name)
 
   def test_entity_to_graph_lambda(self):
-    f = lambda a: a
+    b = 2
+    f = lambda x: b * x if x > 0 else -x
 
-    with self.assertRaises(NotImplementedError):
-      program_ctx = self._simple_program_ctx()
+    program_ctx = self._simple_program_ctx()
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.Assign)
+    self.assertIsInstance(fn_node.value, gast.Lambda)
+    self.assertEqual('tf__lambda', name)
+    self.assertIs(ns['b'], b)
+
+  def test_entity_to_graph_multiple_lambdas(self):
+    a, b = 1, 2
+    f, _ = (lambda x: a * x, lambda y: b * y)
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.Assign)
+    self.assertIsInstance(fn_node.value, gast.Lambda)
+    self.assertEqual('tf__lambda', name)
+    self.assertIs(ns['a'], a)
+
+  def test_entity_to_graph_multiple_lambdas_ambiguous_definitions(self):
+    a, b = 1, 2
+    f, _ = (lambda x: a * x, lambda x: b * x)
+
+    program_ctx = self._simple_program_ctx()
+    with self.assertRaises(ValueError):
       conversion.entity_to_graph(f, program_ctx, None, None)
 
+  def test_entity_to_graph_lambda_code_with_garbage(self):
+    # pylint:disable=g-long-lambda
+    f = (  # intentional wrap
+        lambda x: (x  # intentional wrap
+                   + 1),)[0]
+    # pylint:enable=g-long-lambda
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, _ = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.Assign)
+    self.assertIsInstance(fn_node.value, gast.Lambda)
+    self.assertEqual('tf__lambda', name)
+
+  def test_entity_to_graph_nested_functions(self):
+    b = 2
+
+    def f(x):
+      def g(x):
+        return b * x
+      return g(x)
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.FunctionDef)
+    self.assertEqual(fn_node.name, 'tf__f')
+    self.assertEqual('tf__f', name)
+    self.assertIs(ns['b'], b)
+
   def test_ag_module_cached(self):
     def callee():
       return range(3)
diff --git a/tensorflow/python/autograph/lang/special_functions.py b/tensorflow/python/autograph/lang/special_functions.py
index 62ac018ac46ffd98e1d8b91d71fe953a0a9f1700..411770692b0d7f35826d6f9e5151dbf2f7e8136d 100644
--- a/tensorflow/python/autograph/lang/special_functions.py
+++ b/tensorflow/python/autograph/lang/special_functions.py
@@ -24,6 +24,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators import data_structures
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_util
 
 
@@ -46,6 +47,13 @@ def _validate_list_constructor(elements, element_dtype, element_shape):
       ' allowed'.format(type(elements)))
 
 
+def match_staging_level(value, like_value):
+  """Casts a value to be staged at the same level as another."""
+  if tensor_util.is_tensor(like_value):
+    return constant_op.constant(value)
+  return value
+
+
 def tensor_list(elements,
                 element_dtype=None,
                 element_shape=None,
diff --git a/tensorflow/python/autograph/lang/special_functions_test.py b/tensorflow/python/autograph/lang/special_functions_test.py
index 206a32d07cd2b8b7aa1357fa9be4e23b03276a71..8d40f4036c5a1892afca6e5fb2daf891c9487800 100644
--- a/tensorflow/python/autograph/lang/special_functions_test.py
+++ b/tensorflow/python/autograph/lang/special_functions_test.py
@@ -30,27 +30,36 @@ from tensorflow.python.platform import test
 
 class SpecialFunctionsTest(test.TestCase):
 
+  def test_match_staging_level(self):
+    some_tensor = constant_op.constant(0)
+    tensor_one = special_functions.match_staging_level(1, some_tensor)
+    python_one = special_functions.match_staging_level(1, 1)
+    with self.cached_session() as sess:
+      self.assertTrue(tensor_util.is_tensor(tensor_one))
+      self.assertAllEqual(self.evaluate(tensor_one), 1)
+      self.assertEqual(python_one, 1)
+
   def test_tensor_list_empty_list(self):
     l = special_functions.tensor_list([],
                                       element_dtype=dtypes.int32,
                                       element_shape=())
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [])
 
     l = special_functions.tensor_list((),
                                       element_dtype=dtypes.int32,
                                       element_shape=())
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [])
 
   def test_tensor_list_tensor(self):
     l = special_functions.tensor_list(
         constant_op.constant([], dtype=dtypes.int32))
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [])
 
   def test_tensor_list_unsupported_initializer(self):
     with self.assertRaisesRegexp(ValueError, 'unknown type'):
@@ -66,16 +75,16 @@ class SpecialFunctionsTest(test.TestCase):
 
     l = special_functions.tensor_list(elements)
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [[1, 2], [3, 4]])
 
   def test_tensor_list_array_from_elements(self):
     elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
 
     l = special_functions.tensor_list(elements, use_tensor_array=True)
     sl = l.stack()
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [[1, 2], [3, 4]])
 
   def test_stack(self):
     self.assertEqual(special_functions.stack(1, strict=False), 1)
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 2dea18dc5faa250784eff815f216396c353e2014..05b5660941d0bafe89bf68f8aa92b2686561fd5c 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -35,7 +35,7 @@ class ForLoopTest(test.TestCase):
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session() as sess:
-      self.assertEqual((10,), sess.run(s))
+      self.assertEqual((10,), self.evaluate(s))
 
   def test_python(self):
     s = control_flow.for_stmt(
@@ -53,7 +53,7 @@ class ForLoopTest(test.TestCase):
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session() as sess:
-      self.assertEqual((10,), sess.run(s))
+      self.assertEqual((10,), self.evaluate(s))
 
 
 class WhileLoopTest(test.TestCase):
@@ -66,7 +66,7 @@ class WhileLoopTest(test.TestCase):
         init_state=(0, 0),
         extra_deps=(n,))
     with self.cached_session() as sess:
-      self.assertEqual((5, 10), sess.run(results))
+      self.assertEqual((5, 10), self.evaluate(results))
 
   def test_python(self):
     n = 5
@@ -90,9 +90,9 @@ class IfStmtTest(test.TestCase):
   def test_tensor(self):
     with self.cached_session() as sess:
       t = self.single_return_if_stmt(constant_op.constant(True))
-      self.assertEqual(1, sess.run(t))
+      self.assertEqual(1, self.evaluate(t))
       t = self.single_return_if_stmt(constant_op.constant(False))
-      self.assertEqual(-1, sess.run(t))
+      self.assertEqual(-1, self.evaluate(t))
 
   def test_python(self):
     self.assertEqual(1, self.single_return_if_stmt(True))
@@ -101,9 +101,9 @@ class IfStmtTest(test.TestCase):
   def test_tensor_multiple_returns(self):
     with self.cached_session() as sess:
       t = self.multi_return_if_stmt(constant_op.constant(True))
-      self.assertAllEqual([1, 2], sess.run(t))
+      self.assertAllEqual([1, 2], self.evaluate(t))
       t = self.multi_return_if_stmt(constant_op.constant(False))
-      self.assertAllEqual([-1, -2], sess.run(t))
+      self.assertAllEqual([-1, -2], self.evaluate(t))
 
   def test_python_multiple_returns(self):
     self.assertEqual((1, 2), self.multi_return_if_stmt(True))
diff --git a/tensorflow/python/autograph/operators/data_structures_test.py b/tensorflow/python/autograph/operators/data_structures_test.py
index 6039b07982c8e4b820acda059c701b8fdb96e295..dc50edb4c987386f14e6d93db153336584516a48 100644
--- a/tensorflow/python/autograph/operators/data_structures_test.py
+++ b/tensorflow/python/autograph/operators/data_structures_test.py
@@ -43,7 +43,7 @@ class ListTest(test.TestCase):
     l = data_structures.tf_tensor_list_new([3, 4, 5])
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_list_new_empty(self):
     l = data_structures.tf_tensor_list_new([],
@@ -51,13 +51,13 @@ class ListTest(test.TestCase):
                                            element_shape=())
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [])
+      self.assertAllEqual(self.evaluate(t), [])
 
   def test_tf_tensor_list_new_from_tensor(self):
     l = data_structures.tf_tensor_list_new(constant_op.constant([3, 4, 5]))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_list_new_illegal_input(self):
     with self.assertRaises(ValueError):
@@ -77,7 +77,7 @@ class ListTest(test.TestCase):
     l = data_structures.tf_tensor_array_new([3, 4, 5])
     t = l.stack()
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_array_new_illegal_input(self):
     with self.assertRaises(ValueError):
@@ -102,7 +102,7 @@ class ListTest(test.TestCase):
 
     t = list_ops.tensor_list_stack(l, element_dtype=x.dtype)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [[1, 2, 3]])
+      self.assertAllEqual(self.evaluate(t), [[1, 2, 3]])
 
   def test_append_tensorarray(self):
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
@@ -131,10 +131,10 @@ class ListTest(test.TestCase):
 
     with self.cached_session() as sess:
       l, x = data_structures.list_pop(l, None, opts)
-      self.assertAllEqual(sess.run(x), [3, 4])
+      self.assertAllEqual(self.evaluate(x), [3, 4])
 
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
-      self.assertAllEqual(sess.run(t), [[1, 2]])
+      self.assertAllEqual(self.evaluate(t), [[1, 2]])
 
   def test_pop_python(self):
     l = [1, 2, 3]
@@ -152,12 +152,11 @@ class ListTest(test.TestCase):
 
     with self.cached_session() as sess:
       t = data_structures.list_stack(l, opts)
-      self.assertAllEqual(sess.run(t), sess.run(initial_list))
+      self.assertAllEqual(sess.run(t), self.evaluate(initial_list))
 
   def test_stack_tensor_list_empty(self):
     l = list_ops.empty_tensor_list(
-        element_shape=-1,
-        element_dtype=dtypes.variant)
+        element_shape=None, element_dtype=dtypes.variant)
 
     opts = data_structures.ListStackOpts(
         element_dtype=dtypes.int32, original_call=None)
diff --git a/tensorflow/python/autograph/operators/logical_test.py b/tensorflow/python/autograph/operators/logical_test.py
index d6649f7b2bfccb17b31689fc8ff460c0c58d522c..ebf6458f01e0161b50a48c89d44218d85aa59f16 100644
--- a/tensorflow/python/autograph/operators/logical_test.py
+++ b/tensorflow/python/autograph/operators/logical_test.py
@@ -45,11 +45,11 @@ class LogicalOperatorsTest(test.TestCase):
   def test_and_tf(self):
     with self.cached_session() as sess:
       t = logical.and_(self._tf_true, self._tf_true)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.and_(self._tf_true, lambda: True)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.and_(self._tf_false, lambda: True)
-      self.assertEqual(sess.run(t), False)
+      self.assertEqual(self.evaluate(t), False)
       # TODO(mdan): Add a test for ops with side effects.
 
   def test_or_python(self):
@@ -63,11 +63,11 @@ class LogicalOperatorsTest(test.TestCase):
   def test_or_tf(self):
     with self.cached_session() as sess:
       t = logical.or_(self._tf_false, self._tf_true)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.or_(self._tf_false, lambda: True)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.or_(self._tf_true, lambda: True)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       # TODO(mdan): Add a test for ops with side effects.
 
   def test_not_python(self):
@@ -78,7 +78,7 @@ class LogicalOperatorsTest(test.TestCase):
   def test_not_tf(self):
     with self.cached_session() as sess:
       t = logical.not_(self._tf_false())
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index d312e6938b2347ff55ac639af44f9970663d6bc4..2f55d538924609f4ad2549acccbc15a57ac13c19 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -216,10 +216,10 @@ def _py_range(start_or_stop, stop, step):
   return range(start_or_stop)
 
 
-SUPPORTED_BUILTINS = set((abs, float, int, len, print, range))
+SUPPORTED_BUILTINS = (abs, float, int, len, print, range)
 
 if six.PY2:
-  SUPPORTED_BUILTINS.add(xrange)
+  SUPPORTED_BUILTINS += (xrange,)
 
 BUILTIN_FUINCTIONS_MAP = {
     'abs': abs_,
diff --git a/tensorflow/python/autograph/operators/py_builtins_test.py b/tensorflow/python/autograph/operators/py_builtins_test.py
index 443e30a475d111e08e587c08fb47f42eb776182a..4d9eec77c38ed8ca4efec3c503b568cab42aa066 100644
--- a/tensorflow/python/autograph/operators/py_builtins_test.py
+++ b/tensorflow/python/autograph/operators/py_builtins_test.py
@@ -38,29 +38,29 @@ class PyBuiltinsTest(test.TestCase):
     self.assertEqual(py_builtins.abs_(-1), 1)
     with self.cached_session() as sess:
       t = py_builtins.abs_(constant_op.constant(-1))
-      self.assertEqual(sess.run(t), 1)
+      self.assertEqual(self.evaluate(t), 1)
       t = py_builtins.abs_(constant_op.constant([-1, 2, -3]))
-      self.assertAllEqual(sess.run(t), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(t), [1, 2, 3])
 
   def test_float(self):
     self.assertEqual(py_builtins.float_(10), 10.0)
     self.assertEqual(py_builtins.float_('10.0'), 10.0)
     with self.cached_session() as sess:
       t = py_builtins.float_(constant_op.constant(1, dtype=dtypes.int64))
-      self.assertEqual(sess.run(t), 1.0)
+      self.assertEqual(self.evaluate(t), 1.0)
       st = py_builtins.float_(constant_op.constant('1.0'))
-      self.assertEqual(sess.run(st), 1.0)
+      self.assertEqual(self.evaluate(st), 1.0)
 
   def test_int(self):
     self.assertEqual(py_builtins.int_(10.0), 10)
     self.assertEqual(py_builtins.int_('11', 2), 3)
     with self.cached_session() as sess:
       t = py_builtins.int_(constant_op.constant(1, dtype=dtypes.float64))
-      self.assertEqual(sess.run(t), 1)
+      self.assertEqual(self.evaluate(t), 1)
       st = py_builtins.int_(constant_op.constant('1'))
-      self.assertEqual(sess.run(st), 1)
+      self.assertEqual(self.evaluate(st), 1)
       st = py_builtins.int_(constant_op.constant('1'), 10)
-      self.assertEqual(sess.run(st), 1)
+      self.assertEqual(self.evaluate(st), 1)
 
   def test_int_unsupported_base(self):
     t = constant_op.constant(1, dtype=dtypes.float64)
@@ -73,9 +73,9 @@ class PyBuiltinsTest(test.TestCase):
       t = py_builtins.len_(constant_op.constant([[1], [2], [3]]))
       self.assertEqual(t, 3)
       ta = py_builtins.len_(tensor_array_ops.TensorArray(dtypes.int32, size=5))
-      self.assertEqual(sess.run(ta), 5)
+      self.assertEqual(self.evaluate(ta), 5)
       tl = py_builtins.len_(data_structures.tf_tensor_list_new([3, 4, 5]))
-      self.assertEqual(sess.run(tl), 3)
+      self.assertEqual(self.evaluate(tl), 3)
 
   def test_len_scalar(self):
     with self.assertRaises(ValueError):
@@ -120,18 +120,18 @@ class PyBuiltinsTest(test.TestCase):
   def test_range_tensor(self):
     with self.cached_session() as sess:
       r = py_builtins.range_(constant_op.constant(3))
-      self.assertAllEqual(sess.run(r), [0, 1, 2])
+      self.assertAllEqual(self.evaluate(r), [0, 1, 2])
       r = py_builtins.range_(1, constant_op.constant(3))
-      self.assertAllEqual(sess.run(r), [1, 2])
+      self.assertAllEqual(self.evaluate(r), [1, 2])
       r = py_builtins.range_(2, 0, constant_op.constant(-1))
-      self.assertAllEqual(sess.run(r), [2, 1])
+      self.assertAllEqual(self.evaluate(r), [2, 1])
 
   def test_range_tensor_empty_range(self):
     with self.session() as sess:
       r = py_builtins.range_(constant_op.constant(-3))
-      self.assertAllEqual(sess.run(r), [])
+      self.assertAllEqual(self.evaluate(r), [])
       r = py_builtins.range_(5, constant_op.constant(2))
-      self.assertAllEqual(sess.run(r), [])
+      self.assertAllEqual(self.evaluate(r), [])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/operators/slices_test.py b/tensorflow/python/autograph/operators/slices_test.py
index 9e4865b3c66923815338e70d4104c42318e56eb3..d444054fd772cf68b2e7c028adc87b6623ccffba 100644
--- a/tensorflow/python/autograph/operators/slices_test.py
+++ b/tensorflow/python/autograph/operators/slices_test.py
@@ -34,7 +34,7 @@ class SlicesTest(test.TestCase):
 
     with self.cached_session() as sess:
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
-      self.assertAllEqual(sess.run(t), [[5, 6], [3, 4]])
+      self.assertAllEqual(self.evaluate(t), [[5, 6], [3, 4]])
 
   def test_get_item_tensor_list(self):
     initial_list = constant_op.constant([[1, 2], [3, 4]])
@@ -44,7 +44,7 @@ class SlicesTest(test.TestCase):
         l, 1, slices.GetItemOpts(element_dtype=initial_list.dtype))
 
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4])
+      self.assertAllEqual(self.evaluate(t), [3, 4])
 
   def test_get_item_tensor_string(self):
     initial_str = constant_op.constant('abcd')
@@ -52,14 +52,14 @@ class SlicesTest(test.TestCase):
                         slices.GetItemOpts(element_dtype=initial_str.dtype))
 
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(t), b'b')
+      self.assertEqual(self.evaluate(t), b'b')
 
     initial_list_str = constant_op.constant(['abcd', 'bcde'])
     t = slices.get_item(initial_list_str, 1,
                         slices.GetItemOpts(element_dtype=initial_str.dtype))
 
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(t), b'bcde')
+      self.assertEqual(self.evaluate(t), b'bcde')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/pyct/ast_util.py b/tensorflow/python/autograph/pyct/ast_util.py
index 7df3b8858c0128de64a928d7daf9db081566d9c6..ea7eca6463a17d43f1a3536ebdd1770cfcf265f7 100644
--- a/tensorflow/python/autograph/pyct/ast_util.py
+++ b/tensorflow/python/autograph/pyct/ast_util.py
@@ -24,6 +24,7 @@ import gast
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.util import tf_inspect
 
 
 class CleanCopier(object):
@@ -311,3 +312,64 @@ def parallel_walk(node, other):
         raise ValueError(
             'inconsistent values for field {}: {} and {}'.format(
                 f, n_child, o_child))
+
+
+class FunctionDefMatcher(gast.NodeVisitor):
+  """Finds nodes that match a given function's signature."""
+
+  def __init__(self, fn):
+    self.fn = fn
+    self.matching_nodes = []
+
+  def _arg_name(self, node):
+    if node is None:
+      return None
+    if isinstance(node, gast.Name):
+      return node.id
+    assert isinstance(node, str)
+    return node
+
+  def _argspec_matches(self, node):
+    arg_spec = tf_inspect.getfullargspec(self.fn)
+
+    node_args = tuple(self._arg_name(arg) for arg in node.args.args)
+    if node_args != tuple(arg_spec.args):
+      return False
+
+    if arg_spec.varargs != self._arg_name(node.args.vararg):
+      return False
+
+    if arg_spec.varkw != self._arg_name(node.args.kwarg):
+      return False
+
+    node_kwonlyargs = tuple(self._arg_name(arg) for arg in node.args.kwonlyargs)
+    if node_kwonlyargs != tuple(arg_spec.kwonlyargs):
+      return False
+
+    return True
+
+  def visit_Lambda(self, node):
+    self.generic_visit(node)
+
+    if self.fn.__name__ != '<lambda>':
+      return
+    if not self._argspec_matches(node):
+      return
+
+    self.matching_nodes.append(node)
+
+  def visit_FunctionDef(self, node):
+    self.generic_visit(node)
+
+    if self.fn.__name__ != node.name:
+      return
+    if not self._argspec_matches(node):
+      return
+
+    self.matching_nodes.append(node)
+
+
+def find_matching_definitions(node, f):
+  matcher = FunctionDefMatcher(f)
+  matcher.visit(node)
+  return tuple(matcher.matching_nodes)
diff --git a/tensorflow/python/autograph/pyct/ast_util_test.py b/tensorflow/python/autograph/pyct/ast_util_test.py
index b1577c466e6e67d6429b5f0eef6916efad16f46b..9fcbbe646c6e558b93fdafb6380ae0a46ee1d60a 100644
--- a/tensorflow/python/autograph/pyct/ast_util_test.py
+++ b/tensorflow/python/autograph/pyct/ast_util_test.py
@@ -22,6 +22,8 @@ import ast
 import collections
 import textwrap
 
+import gast
+
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
@@ -191,6 +193,107 @@ class AstUtilTest(test.TestCase):
       for _ in ast_util.parallel_walk(node_1, node_3):
         pass
 
+  def assertLambdaNodes(self, matching_nodes, expected_bodies):
+    self.assertEqual(len(matching_nodes), len(expected_bodies))
+    for node in matching_nodes:
+      self.assertIsInstance(node, gast.Lambda)
+      self.assertIn(compiler.ast_to_source(node.body).strip(), expected_bodies)
+
+  def test_find_matching_definitions_lambda(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      f = lambda x: 1
+    """))
+    f = lambda x: x
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(1)',))
+
+  def test_find_matching_definitions_lambda_multiple_matches(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      f = lambda x: 1, lambda x: 2
+    """))
+    f = lambda x: x
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(1)', '(2)'))
+
+  def test_find_matching_definitions_lambda_uses_arg_names(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      f = lambda x: 1, lambda y: 2
+    """))
+    f = lambda x: x
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(1)',))
+
+    f = lambda y: y
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(2)',))
+
+  def assertFunctionDefNodes(self, matching_nodes, expected_bodies):
+    self.assertEqual(len(matching_nodes), len(expected_bodies))
+    for node in matching_nodes:
+      self.assertIsInstance(node, gast.FunctionDef)
+      self.assertIn(compiler.ast_to_source(node.body).strip(), expected_bodies)
+
+  def test_find_matching_definitions_function(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(x):
+        return 1
+    """))
+
+    def f(x):
+      return x
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1',))
+
+  def test_find_matching_definitions_nested_functions_same_name(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(x, *args, **kwargs):
+        def f(x, y):
+          return 1
+        return 2
+    """))
+
+    def f(x, y):
+      return x + y
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1',))
+
+  def test_find_matching_definitions_nested_functions_same_args(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def g(x):
+        def f(x):
+          return 1
+        return 2
+    """))
+
+    def f(x):
+      return x
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1',))
+
+  def test_find_matching_definitions_multiple_matches(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(x):
+        return 1
+      def f(x):
+        return 2
+    """))
+
+    def f(x):
+      return x
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1', 'return 2'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/compiler.py b/tensorflow/python/autograph/pyct/compiler.py
index 21281aeb561475db1726ab4c3d80a25622a71ae4..06e66c5b5871d5528bccfcc9fe47268207594ea6 100644
--- a/tensorflow/python/autograph/pyct/compiler.py
+++ b/tensorflow/python/autograph/pyct/compiler.py
@@ -123,26 +123,15 @@ def ast_to_object(nodes,
   compiled_nodes = imp.load_source(module_name, f.name)
 
   # TODO(znado): Clean this up so we don't need to attach it to the namespace.
-  # TODO(znado): This does not work for classes because their methods share a
-  # namespace.
-  # This attaches the source map which is needed for error handling.  Note that
-  # api.to_graph copies this source map into an attribute of the function.
-  #
-  # We need this so the ag_source_map__ variable is available to the call to
-  # rewrite_graph_construction_error in the except block inside each function
-  # that handles graph construction errors.
-  #
   # We cannot get the rewritten function name until it is too late so templating
-  # is hard, and this cleanly fixes the
-  # issues encountered with nested functions because this is attached to the
-  # outermost one.
+  # is hard, and this cleanly fixes the issues encountered with nested functions
+  # because this is attached to the outermost one.
   if include_source_map:
     # TODO(mdan): This name should be decided by the caller.
     source_map_name = 'ag_source_map__'
-    if source_map_name in compiled_nodes.__dict__:
-      raise ValueError('cannot convert %s because is has namespace attribute '
-                       '"%s", which is reserved for AutoGraph.' %
-                       (compiled_nodes, source_map_name))
+    assert source_map_name not in compiled_nodes.__dict__, (
+        'cannot convert %s because is has namespace attribute "%s", which is '
+        'reserved for AutoGraph.') % (compiled_nodes, source_map_name)
     compiled_nodes.__dict__[source_map_name] = source_map
 
   return compiled_nodes, source
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index e078cd56a21085af9682dd36da33af9abf967820..4d56b93671e3305b5099f2ce8976ae629fc087c6 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -185,12 +185,15 @@ def getmethodclass(m):
       return m.__class__
 
   # Instance method and class methods: should be bound to a non-null "self".
-  # If self is a class, then it's a class method.
   if hasattr(m, '__self__'):
-    if m.__self__:
-      if tf_inspect.isclass(m.__self__):
-        return m.__self__
-      return type(m.__self__)
+    if m.__self__ is not None:
+      # A fallback allowing methods to be actually bound to a type different
+      # than __self__. This is useful when a strong reference from the method
+      # to the object is not desired, for example when caching is involved.
+      if hasattr(m.__self__, 'ag_self_weakref__'):
+        return m.__self__.ag_self_weakref__()
+
+      return m.__self__
 
   # Class, static and unbound methods: search all defined classes in any
   # namespace. This is inefficient but more robust method.
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index 7e79b3b9f68702df5c73125a6cc7085624792fdf..622e3bafc0ab3d7dd8876cbbbee45f8055c48056 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -20,11 +20,14 @@ from __future__ import print_function
 
 from functools import wraps
 import imp
+import types
+import weakref
 
 import six
 
 from tensorflow.python import lib
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
 
@@ -184,16 +187,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = TestClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.static_method),
         TestClass)
@@ -242,16 +245,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = LocalClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        LocalClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        LocalClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        LocalClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        LocalClass)
+        test_obj)
 
   def test_getmethodclass_callables(self):
     class TestCallable(object):
@@ -262,6 +265,25 @@ class InspectUtilsTest(test.TestCase):
     c = TestCallable()
     self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
 
+  def test_getmethodclass_weakref_mechanism(self):
+    test_obj = TestClass()
+
+    class WeakrefWrapper(object):
+
+      def __init__(self):
+        self.ag_self_weakref__ = weakref.ref(test_obj)
+
+    def test_fn(self):
+      return self
+
+    bound_method = types.MethodType(test_fn, WeakrefWrapper())
+    self.assertEqual(inspect_utils.getmethodclass(bound_method), test_obj)
+
+  def test_getmethodclass_no_bool_conversion(self):
+
+    tensor = constant_op.constant([1])
+    self.assertEqual(inspect_utils.getmethodclass(tensor.get_shape), tensor)
+
   def test_getdefiningclass(self):
     class Superclass(object):
 
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 63686350d518d806578bf6c49d108ad764ad0bfe..8f4037c5e286accc600dbac97acd7b5fe045b582 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -31,21 +31,61 @@ from tensorflow.python.util import tf_inspect
 def parse_entity(entity):
   """Returns the AST of given entity."""
   source = tf_inspect.getsource(entity)
+
+  def fail(comment):
+    raise ValueError(
+        'Failed to parse source code of {}, which Python reported as:\n{}\n'
+        '{}'.format(entity, source, comment))
+
   # Comments and multiline strings can appear at arbitrary indentation levels,
   # causing textwrap.dedent to not correctly dedent source code.
   # TODO(b/115884650): Automatic handling of comments/multiline strings.
   source = textwrap.dedent(source)
+
   try:
     return parse_str(source), source
+
   except IndentationError:
-    # Because we are parsing the source code of entities that have already
-    # successfully parsed once, any IndentationErrors are guaranteed to be
-    # caused by insufficient dedenting.
-    raise ValueError(
-        'Failed to dedent prior to parsing source code. If you have comments '
-        'or multiline strings in your code, try indenting them. '
-        'Multiline strings can be rewritten using textwrap.dedent.\n'
-        'Offending source code: \n %s' % source)
+    # The text below lists the causes of this error known to us. There may
+    # be more.
+    fail('This may be caused by multiline strings or comments not indented at'
+         'the same level as the code.')
+
+  except SyntaxError as e:
+    if not tf_inspect.isfunction(entity) or entity.__name__ != '<lambda>':
+      raise
+
+    # Certain entities, like lambdas, only hold the raw code lines which defined
+    # them, which may include surrounding tokens and may be syntactically
+    # invalid out of context. For example:
+    #
+    #     l = (
+    #         lambda x: x,)[0]
+    #
+    # will have the dedented source "lambda x: x,)[0]"
+    # Here we make an attempt to stip away the garbage by looking at the
+    # information in the syntax error.
+    lines = source.split('\n')
+    lineno, offset = e.lineno, e.offset  # 1-based
+
+    # Give up if there's nothing we can chip away.
+    if len(lines) == lineno and len(lines[-1]) == offset:
+      fail('If this is a lambda function, the error may be avoided by creating'
+           ' the lambda in a standalone statement.')
+
+    # Drop all lines following the error location
+    # TODO(mdan): What's with the pylint errors?
+    lines = lines[:lineno]  # pylint:disable=invalid-slice-index
+    # Drop all characters following the error location
+    lines[-1] = lines[-1][:offset - 1]  # pylint:disable=invalid-slice-index
+    new_source = '\n'.join(lines)
+
+    try:
+      return parse_str(new_source), new_source
+    except SyntaxError as e:
+      fail('If this is a lambda function, the error may be avoided by creating'
+           ' the lambda in a standalone statement. Tried to strip down the'
+           ' source to:\n{}\nBut that did not work.'.format(new_source))
 
 
 def parse_str(src):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index ad11057a0b0e86e7be9af65e49cbf320c9cae81e..451398f1b70abf56d6c141305930c8a4e1a66a07 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -198,6 +198,13 @@ class Annotator(transformer.Base):
     node = self._block_statement_live_out(node)
     return self._block_statement_live_in(node, node.test)
 
+  def visit_Expr(self, node):
+    node = self.generic_visit(node)
+    cfg_node = self.current_analyzer.graph.index[node]
+    anno.setanno(node, anno.Static.LIVE_VARS_OUT,
+                 frozenset(self.current_analyzer.out[cfg_node]))
+    return node
+
 
 def resolve(node, source_info, graphs):
   """Resolves the live symbols at the exit of control flow statements.
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index 3b6a446340ad942360a12968d15ee2b9df91d5a7..b6830534b3dbf2e2815957b26d715d24dc002da7 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -26,6 +26,7 @@ import six
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import pretty_printer
+from tensorflow.python.autograph.pyct import templates
 
 
 class AutographParseError(SyntaxError):
@@ -280,6 +281,12 @@ class Base(gast.NodeTransformer):
       print(pretty_printer.fmt(node))
     return node
 
+  def create_assignment(self, target, expression):
+    template = """
+      target = expression
+    """
+    return templates.replace(template, target=target, expression=expression)
+
   def visit_block(self, nodes, before_visit=None, after_visit=None):
     """A more powerful version of generic_visit for statement blocks.
 
@@ -316,13 +323,14 @@ class Base(gast.NodeTransformer):
     Args:
       nodes: enumerable of AST node objects. If None, the function returns None.
       before_visit: optional callable that is called before visiting each item
-          in nodes
-      after_visit: optional callable that takes in an AST node and
-          returns a tuple (new_node, new_destination). It is called after
-          visiting each item in nodes. Is used in the same was as the
+        in nodes
+      after_visit: optional callable that takes in an AST node and returns a
+        tuple (new_node, new_destination). It is called after visiting each item
+        in nodes. Is used in the same was as the
           visit_* methods: new_node will replace the node; if not None,
-          new_destination must be a list, and subsequent nodes will be placed
-          in this list instead of the list returned by visit_block.
+            new_destination must be a list, and subsequent nodes will be placed
+            in this list instead of the list returned by visit_block.
+
     Returns:
       A list of AST node objects containing the transformed items fron nodes,
       except those nodes that have been relocated using after_visit.
diff --git a/tensorflow/python/autograph/utils/misc_test.py b/tensorflow/python/autograph/utils/misc_test.py
index 8d2b0d6e13802313abf6751b0e62b2807a866c2f..c813e0f5c96386a0d0fbd078bd5b663c688b0327 100644
--- a/tensorflow/python/autograph/utils/misc_test.py
+++ b/tensorflow/python/autograph/utils/misc_test.py
@@ -32,7 +32,7 @@ class MiscTest(test.TestCase):
     new_a = alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(new_a))
+      self.assertEqual(1, self.evaluate(new_a))
 
   def test_alias_tensors(self):
     a = constant(1)
@@ -47,7 +47,7 @@ class MiscTest(test.TestCase):
     self.assertTrue(new_s is s)
     self.assertTrue(new_l is l)
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(new_a))
+      self.assertEqual(1, self.evaluate(new_a))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/utils/py_func_test.py b/tensorflow/python/autograph/utils/py_func_test.py
index 1c220d94922be680021bd96c6b7ddbf2593c6125..28cefd8c3edb343aa10d458b9e3a3cd55e3418c4 100644
--- a/tensorflow/python/autograph/utils/py_func_test.py
+++ b/tensorflow/python/autograph/utils/py_func_test.py
@@ -34,13 +34,13 @@ class PyFuncTest(test.TestCase):
     with self.cached_session() as sess:
       result = py_func.wrap_py_func(test_fn, dtypes.int64,
                                     (1, constant_op.constant(1), 1))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
       result = py_func.wrap_py_func(test_fn, dtypes.int64, (1, 1, 1))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
       result = py_func.wrap_py_func(
           test_fn, dtypes.int64,
           (constant_op.constant(1), 1, constant_op.constant(1)))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
 
   def test_wrap_py_func_complex_args(self):
 
@@ -54,10 +54,10 @@ class PyFuncTest(test.TestCase):
 
     with self.cached_session() as sess:
       result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass()))
-      self.assertEqual(35, sess.run(result))
+      self.assertEqual(35, self.evaluate(result))
       result = py_func.wrap_py_func(test_fn, dtypes.int64,
                                     (constant_op.constant(7), TestClass()))
-      self.assertEqual(35, sess.run(result))
+      self.assertEqual(35, self.evaluate(result))
 
   def test_wrap_py_func_kwargs(self):
 
@@ -74,13 +74,13 @@ class PyFuncTest(test.TestCase):
           'c': 11,
           'd': TestClass(13)
       })
-      self.assertEqual(178, sess.run(result))
+      self.assertEqual(178, self.evaluate(result))
       result = py_func.wrap_py_func(test_fn, dtypes.int64,
                                     (constant_op.constant(7), TestClass(5)), {
                                         'c': constant_op.constant(11),
                                         'd': TestClass(13)
                                     })
-      self.assertEqual(178, sess.run(result))
+      self.assertEqual(178, self.evaluate(result))
 
   def test_wrap_py_func_dummy_return(self):
 
@@ -91,11 +91,11 @@ class PyFuncTest(test.TestCase):
 
     with self.cached_session() as sess:
       result = py_func.wrap_py_func(test_fn, None, (5,), use_dummy_return=True)
-      self.assertEqual(1, sess.run(result))
+      self.assertEqual(1, self.evaluate(result))
       self.assertEqual([1], side_counter)
       result = py_func.wrap_py_func(
           test_fn, None, (constant_op.constant(5),), use_dummy_return=True)
-      self.assertEqual(1, sess.run(result))
+      self.assertEqual(1, self.evaluate(result))
       self.assertEqual([2], side_counter)
 
 
diff --git a/tensorflow/python/autograph/utils/tensor_list_test.py b/tensorflow/python/autograph/utils/tensor_list_test.py
index 697c166eb12c0f3e5b3782259795fcf2e366cb5d..a5bbd97cf921e0e043b797318d2f2478970d7d81 100644
--- a/tensorflow/python/autograph/utils/tensor_list_test.py
+++ b/tensorflow/python/autograph/utils/tensor_list_test.py
@@ -43,13 +43,13 @@ class TensorListTest(test.TestCase):
     l = tl.dynamic_list_append(l, 1)
     s = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(s), [1])
+      self.assertAllEqual(self.evaluate(s), [1])
 
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l = tl.dynamic_list_append(l, 1)
     s = l.stack()
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(s), [1])
+      self.assertAllEqual(self.evaluate(s), [1])
 
     l = tl.TensorList(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
diff --git a/tensorflow/python/client/client_lib.py b/tensorflow/python/client/client_lib.py
index 80a256bf7a87032a40bfb3fa19fb0162c6dd2393..6efddba9792533c010707422341022477678512e 100644
--- a/tensorflow/python/client/client_lib.py
+++ b/tensorflow/python/client/client_lib.py
@@ -15,7 +15,7 @@
 
 """Support for launching graphs and executing operations.
 
-See the [Client](https://tensorflow.org/api_guides/python/client) guide.
+See the [Client](https://www.tensorflow.org/guide/graphs) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index df020f88a88687ac9616d40618aebb8f7eef2858..224f880ed15f1796b08d1db3ea52c52302a9b83f 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -62,7 +62,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
 
     const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config)
-    output = sess.run(const)
+    output = self.evaluate(const)
     self.assertEqual(17, output)
 
   def testClusterSpecPropagationWorker2Placement(self):
@@ -106,7 +106,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default() as g, ops.device('/job:worker/task:0'):
       const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config, graph=g)
-    output = sess.run(const)
+    output = self.evaluate(const)
     self.assertEqual(17, output)
 
   def testCanonicalDeviceNames(self):
@@ -208,7 +208,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
       with ops.device('/job:worker/task:0/cpu:0'):
         sum3 = sum1 + sum2
     sess = session.Session(server1.target, config=config, graph=g)
-    output = sess.run(sum3)
+    output = self.evaluate(sum3)
     self.assertEqual(40, output)
 
   def testLegacyDeviceNames(self):
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index dfd0147643017b3a7ae17498ac638b7b5e093022..f9bd50957a78e5ba3f8edc810fd8e1766d08c718 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -147,7 +147,7 @@ class TimelineTest(test.TestCase):
         num2 = variables.Variable(2.0, name='num2')
       with ops.device('/cpu:2'):
         result = num1 + num2 + num1 * num2
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(result, options=run_options, run_metadata=run_metadata)
 
     self.assertTrue(run_metadata.HasField('step_stats'))
@@ -176,7 +176,7 @@ class TimelineTest(test.TestCase):
         num2 = variables.Variable(2.0, name='num2')
       with ops.device('/cpu:2'):
         result = num1 + num2 + num1 * num2
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(result, options=run_options, run_metadata=run_metadata)
     self.assertTrue(run_metadata.HasField('step_stats'))
     step_stats = run_metadata.step_stats
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
index 5892e0fc845787a3d690b2085f22905306e9a10b..e82ee0666c30f8dcf71d3e6609fc7d7a8ec7eeed 100644
--- a/tensorflow/python/client/virtual_gpu_test.py
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -216,7 +216,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
       for d in self._util.devices:
         with ops.device(d):
           var = variables.Variable(random_ops.random_uniform(mat_shape))
-          sess.run(var.initializer)
+          self.evaluate(var.initializer)
           data.append(var)
       s = data[0]
       for i in range(1, len(data)):
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 24927f138fafc3f648b529a451ac7398f39b746b..75290f0613c2f258dddf9668570f61672d0c0381 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -26,7 +26,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 1)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 21)
 
 
 @tf_export("compat.forward_compatible")
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index 4166aa61f4ddfc7e8c2540ba6cdeab908174046e..25f63b79a26e37bd381df7c1f3c0ae91667a70bf 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -29,14 +29,16 @@ _NUMPY_RANDOM_SEED = 42
 class RangeBenchmark(test.Benchmark):
   """Benchmarks for `tf.data.Dataset.range()`."""
 
-  def benchmarkRange(self):
-    num_elements = 50000000
+  def _benchmarkRangeHelper(self, modeling_enabled):
+    num_elements = 10000000 if modeling_enabled else 50000000
+    options = dataset_ops.Options()
+    options.experimental_autotune = modeling_enabled
 
     # Use `Dataset.skip()` and `Dataset.take()` to perform the iteration in
     # C++, and focus on the minimal overheads (excluding Python invocation
     # costs).
     dataset = dataset_ops.Dataset.range(num_elements).skip(
-        num_elements - 1).take(1)
+        num_elements - 1).take(1).with_options(options)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
 
@@ -52,10 +54,15 @@ class RangeBenchmark(test.Benchmark):
       end = time.time()
 
       time_per_element = (end - start) / num_elements
-      print("Average time per element: %f nanoseconds" % (
-          time_per_element * 1e9))
+      print("Average time per element (%s modeling): %f nanoseconds" % (
+          "with" if modeling_enabled else "without", time_per_element * 1e9))
       self.report_benchmark(iters=num_elements, wall_time=time_per_element,
-                            name="benchmark_tf_data_dataset_range")
+                            name="benchmark_tf_data_dataset_range%s"
+                            % ("_with_modeling" if modeling_enabled else ""))
+
+  def benchmarkRange(self):
+    for modeling_enabled in [False, True]:
+      self._benchmarkRangeHelper(modeling_enabled)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index d4e7fee9219b9c6d69112e3a463ce11cc941f079..126c2be44209f5697386fe210be853ca676bbd13 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -29,6 +29,8 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@RandomDataset
 @@Reducer
 @@SqlDataset
+@@StatsAggregator
+@@StatsOptions
 @@TFRecordWriter
 
 @@bucket_by_sequence_length
@@ -52,9 +54,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@rejection_resample
 @@sample_from_datasets
 @@scan
-@@set_stats_aggregator
 @@shuffle_and_repeat
-@@StatsAggregator
 @@unbatch
 @@unique
 
@@ -98,9 +98,9 @@ from tensorflow.python.data.experimental.ops.readers import SqlDataset
 from tensorflow.python.data.experimental.ops.resampling import rejection_resample
 from tensorflow.python.data.experimental.ops.scan_ops import scan
 from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repeat
+from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator
 from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
-from tensorflow.python.data.experimental.ops.stats_ops import set_stats_aggregator
-from tensorflow.python.data.experimental.ops.stats_ops import StatsAggregator
+from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
 from tensorflow.python.data.experimental.ops.unique import unique
 from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
 from tensorflow.python.data.ops.iterator_ops import get_next_as_optional
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index a1382f759828b67e87002bf67c5eb5e3dc639d23..6a387f55bdf7183519a5d49d64051136ba9b6c47 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -38,6 +38,7 @@ cuda_py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python/compat:compat",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -279,6 +280,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/ops:readers",
@@ -616,7 +618,10 @@ py_test(
     size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
     deps = [
         ":reader_dataset_ops_test_base",
         ":stats_dataset_test_base",
@@ -624,9 +629,15 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:stats_aggregator",
         "//tensorflow/python/data/experimental/ops:stats_ops",
+        "//tensorflow/python/data/experimental/ops:stats_options",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
index e896752a269c9fee5430f96a32fc13f41098b3ce..dbb780c47d9805dd4bfdd5bb4eb6b959251020ae 100644
--- a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
@@ -53,10 +53,10 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)], results.indices)
@@ -81,10 +81,10 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j, z]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)
@@ -141,7 +141,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
       for i in range(4):
-        self.assertEqual(i, sess.run(next_elem))
+        self.assertEqual(i, self.evaluate(next_elem))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_elem)
 
@@ -159,7 +159,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i,) * 3, sess.run(op))
+        self.assertEqual((i,) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
@@ -179,7 +179,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+        self.assertEqual((i, compat.as_bytes(str(i)), i), self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
@@ -198,7 +198,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        st_row = sess.run(next_element)
+        st_row = self.evaluate(next_element)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
         self.assertEqual([10], st_row.dense_shape)
@@ -219,7 +219,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        dense_elem, st_row = sess.run(next_element)
+        dense_elem, st_row = self.evaluate(next_element)
         self.assertEqual(i, dense_elem)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
@@ -241,7 +241,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual(((i,),) * 3, sess.run(op))
+        self.assertEqual(((i,),) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
@@ -354,7 +354,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, feed_dict={count: 28, batch_size: 14})
       num_batches = (28 * 7) // 14
       for i in range(num_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
             self.assertAllEqual(component[(i * 14 + j) % 7]**2,
@@ -369,12 +369,12 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       # We expect (num_batches - 1) full-sized batches.
       num_batches = int(math.ceil((14 * 7) / 8))
       for i in range(num_batches - 1):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
             self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
-      result = sess.run(get_next)
+      result = self.evaluate(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
           self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
@@ -408,10 +408,10 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
     with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
       if not drop_remainder:
-        self.assertAllEqual([[64], [81]], sess.run(next_element))
+        self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -423,9 +423,9 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
     with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      self.assertAllEqual([[64], [81]], sess.run(next_element))
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
+      self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -439,7 +439,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       elements.append(iterator.get_next())
     with self.cached_session() as sess:
       for i in range(5):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
@@ -459,7 +459,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       elements.append(iterator.get_next())
     with self.cached_session() as sess:
       for i in range(4):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
@@ -480,9 +480,9 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(2):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected = sparse_tensor.SparseTensorValue(
             indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
             values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
@@ -524,7 +524,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "number of elements does not match"):
         sess.run(get_next)
@@ -576,7 +576,8 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
+        self.assertAllEqual([i * 10 + j for j in range(10)],
+                            self.evaluate(get_next))
       if threshold % 10 != 0:
         self.assertAllEqual(
             [threshold // 10 * 10 + j for j in range(threshold % 10)],
@@ -609,7 +610,8 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
+        self.assertAllEqual([element for _ in range(10)],
+                            self.evaluate(get_next))
 
 
 class UnbatchDatasetBenchmark(test.Benchmark):
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 3903ec49b98447bc69e37107c359be748818f1f1..4263a90f4cc3babcb10ef48b49eb34c4017053a3 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -300,7 +300,7 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       with self.cached_session() as sess:
         with self.assertRaises(errors.OutOfRangeError):
           while True:
-            output = sess.run(batch)
+            output = self.evaluate(batch)
             sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
                            tuple(output.values))
             all_sparse_tensors.add(sprs_tensor)
diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index adfacf1c9f856e08d6bc60f1197391e0d57765bb..6d063ac9c8f11288024b5c5a5ac8d9f877ebbc7b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -28,7 +28,9 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat as util_compat
 
 
 class CopyToDeviceTest(test_base.DatasetTestBase):
@@ -55,7 +57,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -80,7 +82,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -106,7 +108,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -132,7 +134,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -158,7 +160,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -184,7 +186,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -215,7 +217,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
@@ -249,7 +251,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
@@ -269,9 +271,9 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -288,9 +290,45 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testCopyToDeviceGpuWithMap(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    def generator():
+      for i in range(10):
+        yield i, float(i), str(i)
+
+    host_dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=(dtypes.int32, dtypes.float32, dtypes.string))
+    device_dataset = host_dataset.apply(
+        prefetching_ops.copy_to_device("/gpu:0"))
+
+    def gpu_map_func(x, y, z):
+      return math_ops.square(x), math_ops.square(y), z
+
+    device_dataset = device_dataset.apply(
+        prefetching_ops.map_on_gpu(gpu_map_func))
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    device_dataset = device_dataset.with_options(options)
+
+    with ops.device("/gpu:0"):
+      iterator = device_dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      self.evaluate(iterator.initializer)
+      for i in range(10):
+        x, y, z = self.evaluate(next_element)
+        self.assertEqual(i**2, x)
+        self.assertEqual(float(i**2), y)
+        self.assertEqual(util_compat.as_bytes(str(i)), z)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -307,8 +345,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -325,8 +363,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -343,8 +381,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([b"a", b"b", b"c"], sess.run(next_element))
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -361,8 +399,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([b"a", b"b", b"c"], sess.run(next_element))
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -382,9 +420,9 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         next_element = iterator.get_next()
 
       with self.cached_session() as sess:
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for i in range(10):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
@@ -409,12 +447,12 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -439,12 +477,12 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -461,12 +499,12 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -483,12 +521,12 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -515,7 +553,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
 
       # For each element of the dataset, assert that the optional evaluates to
       # the expected value.
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(3):
         elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
         self.assertTrue(elem_has_value)
@@ -524,7 +562,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       # After exhausting the iterator, `next_elem.has_value()` will evaluate to
       # false, and attempting to get the value will fail.
       for _ in range(2):
-        self.assertFalse(sess.run(elem_has_value_t))
+        self.assertFalse(self.evaluate(elem_has_value_t))
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(elem_value_t)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
index 4e114ac47914f89666f35a9fbc3c4a0099f0e6b1..d1dd07a87943a3250e17f05867e9b9fe8af906f8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -38,13 +38,13 @@ class CounterTest(test_base.DatasetTestBase):
     negative_get_next = negative_iterator.get_next()
 
     with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(get_next))
-      self.assertEqual(3 + 4, sess.run(get_next))
-      self.assertEqual(3 + 2 * 4, sess.run(get_next))
+      self.assertEqual(3, self.evaluate(get_next))
+      self.assertEqual(3 + 4, self.evaluate(get_next))
+      self.assertEqual(3 + 2 * 4, self.evaluate(get_next))
 
-      self.assertEqual(0, sess.run(negative_get_next))
-      self.assertEqual(-1, sess.run(negative_get_next))
-      self.assertEqual(-2, sess.run(negative_get_next))
+      self.assertEqual(0, self.evaluate(negative_get_next))
+      self.assertEqual(-1, self.evaluate(negative_get_next))
+      self.assertEqual(-2, self.evaluate(negative_get_next))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
index 73be6cbcca8a204ac87cfb6ac8ae87f1d84ffa15..9fe2ee43ed99096df550cf041f977fe03e879d6a 100644
--- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
@@ -41,10 +41,10 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)], results.indices)
@@ -69,10 +69,10 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j, z]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)
diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
index 796a692c56ffb3cbd1347270ed31b3abcbef1739..234fd86bdde98ef8e04e6c2c2e48f662c42ec7fd 100644
--- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
@@ -40,10 +40,10 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for _ in range(100):
         for i in range(10):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -107,7 +107,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       for i in choice_array:
-        self.assertEqual(words[i], sess.run(next_element))
+        self.assertEqual(words[i], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index e54235d9f80c2dc0eaf2c30a8e5eda58310b3284..78805bb801e9c54fc37ce68b34c9af420097cb72 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -44,9 +44,9 @@ class EnumerateDatasetTest(test_base.DatasetTestBase):
                      [t.shape for t in get_next[1]])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((20, (b"a", 1, 37.0)), sess.run(get_next))
-      self.assertEqual((21, (b"b", 2, 38.0)), sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertEqual((20, (b"a", 1, 37.0)), self.evaluate(get_next))
+      self.assertEqual((21, (b"b", 2, 38.0)), self.evaluate(get_next))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
diff --git a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py b/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
index d38452e265a6c48828cee8f9350c3754d1e32210..860442571ebaab39666e8b1d1873c21a791c4cda 100644
--- a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
@@ -94,18 +94,18 @@ class FunctionBufferingResourceTest(test_base.DatasetTestBase):
                                                   device0, device1)
 
     with self.test_session(config=worker_config) as sess:
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [4.0])
       self._event.wait()
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
+      self.evaluate(destroy_op)
 
   def testSameDeviceCPU(self):
     self._prefetch_fn_helper_one_shot("same_device_cpu",
@@ -135,35 +135,35 @@ class FunctionBufferingResourceTest(test_base.DatasetTestBase):
         ds, ds_iterator, "reinit", device0, device1)
 
     with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
+      self.evaluate(ds_iterator.initializer)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [4.0])
       self._event.wait()
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [5.0])
       # Lets reset the function buffering resource and reinitialize the
       # iterator. Should be able to go through this again.
       self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
+      self.evaluate(reset_op)
+      self.evaluate(ds_iterator.initializer)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [4.0])
       self._event.wait()
-      elem = sess.run(prefetch_op)
+      elem = self.evaluate(prefetch_op)
       self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
+      self.evaluate(destroy_op)
 
   def testReinitializationOutOfRange(self):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
@@ -175,30 +175,30 @@ class FunctionBufferingResourceTest(test_base.DatasetTestBase):
         ds, ds_iterator, "reinit", device0, device1)
 
     with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
+      self.evaluate(ds_iterator.initializer)
       for i in range(1, 10):
-        elem = sess.run(prefetch_op)
+        elem = self.evaluate(prefetch_op)
         self.assertEqual(elem, [float(i)])
       # Try fetching after its over twice to test out end of sequence.
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
+        self.evaluate(prefetch_op)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
+        self.evaluate(prefetch_op)
 
       # Now reset everything and try it out again.
       self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
+      self.evaluate(reset_op)
+      self.evaluate(ds_iterator.initializer)
       for i in range(1, 10):
-        elem = sess.run(prefetch_op)
+        elem = self.evaluate(prefetch_op)
         self.assertEqual(elem, [float(i)])
       # Try fetching after its over twice to test out end of sequence.
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
+        self.evaluate(prefetch_op)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
+        self.evaluate(prefetch_op)
 
-      sess.run(destroy_op)
+      self.evaluate(destroy_op)
 
   def testStringsGPU(self):
     if not test_util.is_gpu_available():
@@ -235,13 +235,13 @@ class FunctionBufferingResourceTest(test_base.DatasetTestBase):
           buffer_resource_handle, ignore_lookup_error=True)
 
     with self.cached_session() as sess:
-      self.assertEqual([b"a"], sess.run(prefetch_op))
-      self.assertEqual([b"b"], sess.run(prefetch_op))
-      self.assertEqual([b"c"], sess.run(prefetch_op))
+      self.assertEqual([b"a"], self.evaluate(prefetch_op))
+      self.assertEqual([b"b"], self.evaluate(prefetch_op))
+      self.assertEqual([b"c"], self.evaluate(prefetch_op))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
+        self.evaluate(prefetch_op)
 
-      sess.run(destroy_op)
+      self.evaluate(destroy_op)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index 8c07afbac57944593ba48f2116f876dbe7ab9e76..0147988c595eeba3f6bce81ccc49b93dcee71717 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -67,6 +67,17 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
         with self.assertRaisesRegexp(error, error_msg):
           sess.run(element, feed_dict={skip_t: skip, take_t: take})
 
+  def testWindow(self):
+    """Test that `get_single_element()` can consume a nested dataset."""
+    def flat_map_func(ds):
+      batched = ds.batch(2)
+      element = get_single_element.get_single_element(batched)
+      return dataset_ops.Dataset.from_tensors(element)
+
+    dataset = dataset_ops.Dataset.range(10).window(2).flat_map(flat_map_func)
+    self.assertDatasetProduces(
+        dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
index 9030328593181c15981c889cd7b0c0dc370f060d..15396f329d02a59b634f9b10c42edd3191db42ca 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -39,7 +39,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     get_next = dataset.make_one_shot_iterator().get_next()
     with self.cached_session() as sess:
       for expected in values:
-        got = sess.run(get_next)
+        got = self.evaluate(get_next)
         self.assertEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -127,7 +127,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       iterator = dataset.make_one_shot_iterator()
       get_next = iterator.get_next()
       with self.cached_session() as sess:
-        x, y = sess.run(get_next)
+        x, y = self.evaluate(get_next)
         self.assertAllEqual([0] * (2**i), x)
         self.assertAllEqual(np.array(1, ndmin=i), y)
         with self.assertRaises(errors.OutOfRangeError):
@@ -190,7 +190,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
             grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
     get_next = dataset.make_one_shot_iterator().get_next()
     with self.cached_session() as sess:
-      x, y = sess.run(get_next)
+      x, y = self.evaluate(get_next)
       self.assertAllEqual(x, np.asarray([x for x in range(10)]))
       self.assertEqual(y, 45)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
index 557d56e8b9a60ec4cd4fb248dd6dfeb1c2ed4589..cfc357ba13aaa7d52bda6bbfef26352c45741f01 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -68,9 +68,9 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
-      which_bucket, bucketed_values = sess.run(get_next)
+      which_bucket, bucketed_values = self.evaluate(get_next)
 
       self.assertEqual(0, which_bucket)
 
@@ -103,11 +103,11 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       # Get two minibatches (one containing even values, one containing odds)
-      which_bucket_even, bucketed_values_even = sess.run(get_next)
-      which_bucket_odd, bucketed_values_odd = sess.run(get_next)
+      which_bucket_even, bucketed_values_even = self.evaluate(get_next)
+      which_bucket_odd, bucketed_values_odd = self.evaluate(get_next)
 
       # Count number of bucket_tensors.
       self.assertEqual(3, len(bucketed_values_even))
@@ -174,11 +174,11 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       # Get two minibatches ([0, 2, ...] and [64, 66, ...])
-      which_bucket0, bucketed_values_even0 = sess.run(get_next)
-      which_bucket1, bucketed_values_even1 = sess.run(get_next)
+      which_bucket0, bucketed_values_even0 = self.evaluate(get_next)
+      which_bucket1, bucketed_values_even1 = self.evaluate(get_next)
 
       # Ensure that bucket 1 was completely filtered out
       self.assertAllEqual(0, which_bucket0)
@@ -207,11 +207,11 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.OutOfRangeError):
         batches = 0
         while True:
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           is_even = all(x % 2 == 0 for x in result)
           is_odd = all(x % 2 == 1 for x in result)
           self.assertTrue(is_even or is_odd)
@@ -232,11 +232,11 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       counts = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           self.assertTrue(
               all(x % 2 == 0
                   for x in result) or all(x % 2 == 1)
@@ -259,16 +259,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       # The input is infinite, so this test demonstrates that:
       # 1. We produce output without having to consume the entire input,
       # 2. Different buckets can produce output at different rates, and
       # 3. For deterministic input, the output is deterministic.
       for _ in range(3):
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
-        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
+        self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
+        self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next))
+        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
 
   def testSmallGroups(self):
     components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
@@ -280,13 +280,13 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
+      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
       # The small outputs at the end are deterministically produced in key
       # order.
-      self.assertAllEqual([0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1], sess.run(get_next))
+      self.assertAllEqual([0, 0, 0], self.evaluate(get_next))
+      self.assertAllEqual([1], self.evaluate(get_next))
 
   def testEmpty(self):
     iterator = (
@@ -297,7 +297,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "Window size must be greater than zero, but got 0."):
@@ -323,7 +323,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(get_next)
 
@@ -351,11 +351,11 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       counts = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
-          tight_result, multiple_of_10_result = sess.run(get_next)
+          tight_result, multiple_of_10_result = self.evaluate(get_next)
           self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
           self.assertAllEqual(tight_result,
                               multiple_of_10_result[:, :tight_result.shape[1]])
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
index c0ec1486ab8d49e8f1fc3a6ac98fe32cefba605b..cb0fc139145b6c9d0075639830982a45e755bfb8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -47,9 +47,9 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -65,9 +65,9 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -93,9 +93,9 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       # All of the files are present.
-      sess.run(init_op)
+      self.evaluate(init_op)
       for filename in filenames:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -104,9 +104,9 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
 
       # Attempting to read filenames[0] will fail, but ignore_errors()
       # will catch the error.
-      sess.run(init_op)
+      self.evaluate(init_op)
       for filename in filenames[1:]:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
index c93a8353ce01063f52ecc68253df7d02a7689603..c4076daef2a0b6f8f2833d001f21b9285bfe5fc9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
@@ -53,7 +53,7 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
     materialized = ds.materialize()
     with self.cached_session() as sess:
-      sess.run(materialized.initializer)
+      self.evaluate(materialized.initializer)
       placeholder = array_ops.placeholder(dtypes.uint64, shape=[])
       for i in range(16):
         output = sess.run(
@@ -68,9 +68,9 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
     itr = ds.make_initializable_iterator()
     n = itr.get_next()
     with self.cached_session() as sess:
-      sess.run(itr.initializer)
+      self.evaluate(itr.initializer)
       for i in range(16):
-        output = sess.run(n)
+        output = self.evaluate(n)
         self.assertEqual(i, output)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(n)
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
index 5ee94e14dcdd77ad4317d5fee022975bb74b9f39..c6cefa7034e00b63be4c9851e4ed40c4a2361d91 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 
@@ -110,10 +112,10 @@ class MakeBatchedFeaturesDatasetTest(
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
           range(self._num_files), 2, 10):
-        actual_batch = sess.run(next_element)
+        actual_batch = self.evaluate(next_element)
         self.assertAllEqual(file_batch, actual_batch["file"])
         self.assertAllEqual(record_batch, actual_batch["record"])
       with self.assertRaises(errors.OutOfRangeError):
@@ -234,6 +236,20 @@ class MakeBatchedFeaturesDatasetTest(
       if issubclass(clazz, ops.Tensor):
         self.assertEqual(32, shape[0])
 
+  def testOldStyleReader(self):
+    with self.assertRaisesRegexp(
+        TypeError, r"The `reader` argument must return a `Dataset` object. "
+        r"`tf.ReaderBase` subclasses are not supported."):
+      _ = readers.make_batched_features_dataset(
+          file_pattern=self.test_filenames[0], batch_size=32,
+          features={
+              "file": parsing_ops.FixedLenFeature([], dtypes.int64),
+              "record": parsing_ops.FixedLenFeature([], dtypes.int64),
+              "keywords": parsing_ops.VarLenFeature(dtypes.string),
+              "label": parsing_ops.FixedLenFeature([], dtypes.string),
+          },
+          reader=io_ops.TFRecordReader)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index e4bf08918420b7b63fbb0d3a0ae56c7395ff9e97..5486369462d4d0b18b2d384b1fc84caf35765386 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -90,7 +90,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         batch_size,
         num_epochs,
     ):
-      actual_features = sess.run(nxt)
+      actual_features = self.evaluate(nxt)
 
       if label_name is not None:
         expected_labels = expected_features.pop(label_name)
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
index 657cf3c00ee899a9a5718d808ba3d7ee2454bf6b..404edf2fdab82eba1bd2006235bde523e5a3a502 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -105,7 +105,7 @@ class MakeTFRecordDatasetTest(
     for expected_batch in self._next_expected_batch(
         file_indices, batch_size, num_epochs, interleave_cycle_length,
         drop_final_batch, use_parser_fn):
-      actual_batch = sess.run(outputs)
+      actual_batch = self.evaluate(outputs)
       self.assertAllEqual(expected_batch, actual_batch)
 
   def _read_test(self, batch_size, num_epochs, file_index=None,
@@ -188,7 +188,7 @@ class MakeTFRecordDatasetTest(
         iterator = dataset.make_initializable_iterator()
         next_element = iterator.get_next()
 
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         first_batches = []
         try:
           while True:
@@ -196,7 +196,7 @@ class MakeTFRecordDatasetTest(
         except errors.OutOfRangeError:
           pass
 
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         second_batches = []
         try:
           while True:
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index 5ead6d1c7547fe2a39ffa826ac3f9a28f3bec90e..b4bc4a617fefa9119ddddaf8fa1336403d49e5ad 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -89,7 +89,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, feed_dict={count: 28, batch_size: 14})
       num_batches = (28 * 7) // 14
       for i in range(num_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
             self.assertAllEqual(component[(i * 14 + j) % 7]**2,
@@ -104,12 +104,12 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       # We expect (num_batches - 1) full-sized batches.
       num_batches = int(math.ceil((14 * 7) / 8))
       for i in range(num_batches - 1):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
             self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
-      result = sess.run(get_next)
+      result = self.evaluate(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
           self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
@@ -152,10 +152,10 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
     with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
       if not drop_remainder:
-        self.assertAllEqual([[64], [81]], sess.run(next_element))
+        self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -177,9 +177,9 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
     with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      self.assertAllEqual([[64], [81]], sess.run(next_element))
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
+      self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -201,7 +201,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       elements.append(iterator.get_next())
     with self.cached_session() as sess:
       for i in range(5):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
@@ -230,7 +230,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       elements.append(iterator.get_next())
     with self.cached_session() as sess:
       for i in range(4):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
@@ -261,9 +261,9 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(2):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected = sparse_tensor.SparseTensorValue(
             indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
             values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
@@ -321,7 +321,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "number of elements does not match"):
         sess.run(get_next)
@@ -393,7 +393,8 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
+        self.assertAllEqual([i * 10 + j for j in range(10)],
+                            self.evaluate(get_next))
       if threshold % 10 != 0:
         self.assertAllEqual(
             [threshold // 10 * 10 + j for j in range(threshold % 10)],
@@ -442,7 +443,8 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
+        self.assertAllEqual([element for _ in range(10)],
+                            self.evaluate(get_next))
 
   @parameterized.named_parameters(
       ("Identity", None, lambda x: x, None),
@@ -462,7 +464,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       else:
         expected = map_fn(
             sess.run(self.structuredElement(structure, shape=[10])))
-      self.assertAllEqual(expected, sess.run(get_next))
+      self.assertAllEqual(expected, self.evaluate(get_next))
 
   def testShortCircuitCapturedInput(self):
     captured_t = array_ops.placeholder(dtypes.int64, shape=[])
@@ -473,7 +475,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertAllEqual([42] * 10, sess.run(get_next))
+      self.assertAllEqual([42] * 10, self.evaluate(get_next))
 
   @parameterized.named_parameters(
       ("Normal", False),
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 11694540fae9996c9249d3a3d3a7c308e2a6f131..3cf3b89c3f248bd1854763119c980911c2323415 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -218,7 +218,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def _assert_op_cancelled(self, sess, map_defun_op):
     with self.assertRaisesRegexp(errors.CancelledError, "was cancelled"):
-      sess.run(map_defun_op)
+      self.evaluate(map_defun_op)
 
   def testMapDefunWithParentCancellation(self):
     # Checks that a cancellation of the parent graph is threaded through to
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index 5b75e54f66cd60e148f7758fb07b6f7bc8fcdb4e..1d0e6af64934077d333bea127e89c03aee1236b5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -89,6 +89,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:stats_aggregator",
         "//tensorflow/python/data/experimental/ops:stats_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
@@ -201,6 +202,7 @@ py_test(
     name = "map_vectorization_test",
     size = "medium",
     srcs = ["map_vectorization_test.py"],
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
index c2a5da3af00f9e4c38e3c4aac0e9c736957a8284..9b8248a78da11d99e3cf6cd87ab69d30d4d369d6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
@@ -21,44 +21,39 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class AssertNextDatasetTest(test_base.DatasetTestBase):
 
   def testAssertNext(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Map"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testAssertNextInvalid(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Whoops"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Asserted Whoops transformation at offset 0 but encountered "
-          "Map transformation instead."):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Asserted Whoops transformation at offset 0 but encountered "
+            "Map transformation instead."))
 
   def testAssertNextShort(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Map", "Whoops"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Asserted next 2 transformations but encountered only 1."):
-        sess.run(get_next)
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Asserted next 2 transformations but encountered only 1."))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
index 80a0d879dc2088024d1a2a7117f79758c779d5d0..fe2c104e94b77f8f9847d5b591b192ee64c363e8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
@@ -24,7 +24,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -58,6 +58,7 @@ def _filter_fusion_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_filter_fusion_test_cases())
@@ -72,26 +73,22 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
     options = dataset_ops.Options()
     options.experimental_filter_fusion = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(5):
-        r = map_function(x)
-        filtered = False
-        for predicate in predicates:
-          if isinstance(r, tuple):
-            b = predicate(*r)  # Pass tuple as multiple arguments.
-          else:
-            b = predicate(r)
-          if not sess.run(b):
-            filtered = True
-            break
+    expected_output = []
+    for x in range(5):
+      r = map_function(x)
+      filtered = False
+      for predicate in predicates:
+        if isinstance(r, tuple):
+          b = predicate(*r)  # Pass tuple as multiple arguments.
+        else:
+          b = predicate(r)
+        if not self.evaluate(b):
+          filtered = True
+          break
 
-        if not filtered:
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+      if not filtered:
+        expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index 9f7fbfeba0d0a2d0c503106060985c2e27d6d364..e86b19438ef45044a9120354b534b55976f45f32 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -22,10 +22,12 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -58,23 +60,29 @@ def _hoist_random_uniform_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _testDataset(self, dataset):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
     previous_result = 0
-    with self.cached_session() as sess:
-      for _ in range(5):
-        result = sess.run(get_next)
-        self.assertLessEqual(1, result)
-        self.assertLessEqual(result, 10)
-        # This checks if the result is somehow random by checking if we are not
-        # generating the same values.
-        self.assertNotEqual(previous_result, result)
-        previous_result = result
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    if context.executing_eagerly():
+      iterator = dataset.__iter__()
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset.make_one_shot_iterator()
+      get_next = iterator.get_next
+    for _ in range(5):
+      result = self.evaluate(get_next())
+      self.assertLessEqual(1, result)
+      self.assertLessEqual(result, 10)
+      # This checks if the result is somehow random by checking if we are not
+      # generating the same values.
+      self.assertNotEqual(previous_result, result)
+      previous_result = result
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(*_hoist_random_uniform_test_cases())
   def testHoisting(self, function, will_optimize):
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index 469b05399a1681e28311969f1b15d4a7e0767dd9..c2665cf69ae82287edb104f8143443bfdf9fcde8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -19,40 +19,61 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops import stats_ops
+from tensorflow.python.data.experimental.ops import stats_aggregator
+from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
 
   def testLatencyStatsOptimization(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.from_tensors(1).apply(
         optimization.assert_next(
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
-             "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply(
-                 stats_ops.set_stats_aggregator(stats_aggregator))
+             "LatencyStats"])).map(lambda x: x * x).prefetch(1)
     options = dataset_ops.Options()
-    options.experimental_latency_all_edges = True
+    options.experimental_stats = stats_options.StatsOptions()
+    options.experimental_stats.latency_all_edges = True
+    options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[1],
+        requires_initialization=True,
+        num_test_iterations=1)
+    summary_t = aggregator.get_summary()
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
+                                1)
+    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
+    self._assertSummaryHasCount(summary_str,
+                                "record_latency_PrefetchDataset/_6", 1)
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertEqual(1, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      summary_str = sess.run(summary_t)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_TensorDataset/_1", 1)
-      self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4",
-                                  1)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_PrefetchDataset/_6", 1)
+  def testLatencyStatsOptimizationV2(self):
+    aggregator = stats_aggregator.StatsAggregator()
+    dataset = dataset_ops.Dataset.from_tensors(1).apply(
+        optimization.assert_next(
+            ["LatencyStats", "Map", "LatencyStats", "Prefetch",
+             "LatencyStats"])).map(lambda x: x * x).prefetch(1)
+    options = dataset_ops.Options()
+    options.experimental_stats = stats_options.StatsOptions(aggregator)
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[1],
+        requires_initialization=True,
+        num_test_iterations=1)
+    summary_t = aggregator.get_summary()
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
+                                1)
+    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
+    self._assertSummaryHasCount(summary_str,
+                                "record_latency_PrefetchDataset/_6", 1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
index 6191a7db0840329ecd9f2de0a112aaf1af2ef8b3..2386dd5f116d660eb93213c935b662c05d90011d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
@@ -21,10 +21,11 @@ from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeNumaAwareTest(test_base.DatasetTestBase):
 
   def testMakeNumaAware(self):
@@ -34,13 +35,8 @@ class MakeNumaAwareTest(test_base.DatasetTestBase):
     options = dataset_ops.Options()
     options.experimental_numa_aware = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset, expected_output=[[x * x for x in range(10)]])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
index ddf3cbbcc358d765beb4bca3ae4ffdf26f2da9ca..67f3ceeabef764130395eafd08e2d08a46a49cb6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
@@ -20,10 +20,11 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndBatchFusionTest(test_base.DatasetTestBase):
 
   def testMapAndBatchFusion(self):
@@ -33,13 +34,8 @@ class MapAndBatchFusionTest(test_base.DatasetTestBase):
     options = dataset_ops.Options()
     options.experimental_map_and_batch_fusion = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset, expected_output=[[x * x for x in range(10)]])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index 3b4ca62340917de16b2d62d65da2f8cd924e2478..a898c3844003bb803b1ed38bc18d95b6ddbae0ee 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -24,7 +24,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -62,23 +62,20 @@ def _map_and_filter_fusion_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _testMapAndFilter(self, dataset, function, predicate):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(10):
-        r = function(x)
-        if isinstance(r, tuple):
-          b = predicate(*r)  # Pass tuple as multiple arguments.
-        else:
-          b = predicate(r)
-        if sess.run(b):
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    expected_output = []
+    for x in range(10):
+      r = function(x)
+      if isinstance(r, tuple):
+        b = predicate(*r)  # Pass tuple as multiple arguments.
+      else:
+        b = predicate(r)
+      if self.evaluate(b):
+        expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   @parameterized.named_parameters(*_map_and_filter_fusion_test_cases())
   def testMapFilterFusion(self, function, predicate):
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
index ec63ad72006502afb9f23752c00d5926e8bc9f04..47a1b0896cdfd71612b3c3ce4a20e62a93c677f0 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
@@ -22,7 +22,7 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -62,6 +62,7 @@ def _map_fusion_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_map_fusion_test_cases())
@@ -75,21 +76,16 @@ class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
     options = dataset_ops.Options()
     options.experimental_map_fusion = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        r = x
-        for function in functions:
-          if isinstance(r, tuple):
-            r = function(*r)  # Pass tuple as multiple arguments.
-          else:
-            r = function(r)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    expected_output = []
+    for x in range(5):
+      r = x
+      for function in functions:
+        if isinstance(r, tuple):
+          r = function(*r)  # Pass tuple as multiple arguments.
+        else:
+          r = function(r)
+      expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index c95f7b2eb191e5d1c9abba0605f820568aa3225c..042b9ce54bb3ed459691163c722bbfc4565d60ae 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -23,8 +23,8 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -58,6 +58,7 @@ def _map_parallelization_test_cases():
           ("AssertWithRandom", assert_with_random, False))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_map_parallelization_test_cases())
@@ -68,21 +69,9 @@ class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     options = dataset_ops.Options()
     options.experimental_map_parallelization = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        # No need to run the pipeline if it was not optimized.  Also the results
-        # might be hard to check because of random.
-        if not should_optimize:
-          return
-        r = function(x)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    if should_optimize:
+      self.assertDatasetProduces(
+          dataset, expected_output=[function(x) for x in range(5)])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index f10b66ff69159ef8c88232cfd9eebf545aed1771..470de580e8375974d09a8ef10f2943b7cb4c5964 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import check_ops
@@ -319,6 +320,7 @@ def _generate_optimization_test_cases():
   } for x in test_cases for num_parallel_calls in (None, 12)]
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _get_test_datasets(self,
@@ -366,7 +368,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                      num_parallel_calls)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  def testOptimizationBadMapFn(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationBadMapFn(self):
     # Test map functions that give an error
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
@@ -394,7 +397,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=True)
     self.assertDatasetsEqual(optimized, unoptimized)
 
-  def testOptimizationIgnoreStateful(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationIgnoreStateful(self):
 
     def map_fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
@@ -420,7 +424,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=False)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  def testOptimizationIgnoreRaggedMap(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
     def map_fn(x):
       return array_ops.tile(x, x)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
index 5b49bdf453207c70b232cd68bedfaf1a19e08c79..f5a839912448406ab56649734cdfe5e66beab13d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): Add eager coverage for the following tests.
 class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testModelMap(self):
@@ -72,7 +73,7 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(1000):
+      for _ in range(100):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
@@ -136,7 +137,7 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(1000):
+      for _ in range(100):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index ddcd7f4da4bdbcf9d6b871192c73bc1b0239c5dd..d957e8007cb4a13b25798b9cb8b4b9e10f910fe7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -22,11 +22,12 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NoopEliminationTest(test_base.DatasetTestBase):
 
   def testNoopElimination(self):
@@ -43,16 +44,7 @@ class NoopEliminationTest(test_base.DatasetTestBase):
     options = dataset_ops.Options()
     options.experimental_noop_elimination = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        self.assertAllEqual(result, x)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(dataset, expected_output=range(5))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index c04bef89f55b7fc3633b0762f7899ddf3355e727..510b197ddf7604636052b34d5b89a9c325a74ca0 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -24,25 +24,24 @@ from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class OptimizeDatasetTest(test_base.DatasetTestBase):
 
   def testOptimizationStatefulFunction(self):
     dataset = dataset_ops.Dataset.range(10).map(
         lambda _: random_ops.random_uniform([])).batch(10)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(get_next)
+    get_next = self.getNext(dataset)
+    self.evaluate(get_next())
 
-  def testOptimizationLargeInputFromTensor(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationLargeInputFromTensor(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
     dataset = dataset_ops.Dataset.from_tensors(input_t)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
@@ -54,7 +53,8 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
       sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
       sess.run(get_next)
 
-  def testOptimizationLargeInputFromTensorSlices(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
     dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
@@ -78,13 +78,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(1)
     dataset = dataset.flat_map(flat_map_fn)
     dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEquals(0, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationThreadPoolDataset(self):
     dataset = dataset_ops.Dataset.range(10).batch(10)
@@ -95,14 +89,29 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
             2, display_name="private_thread_pool_%d" % 2))
 
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[list(range(10))],
+        requires_initialization=True)
+
+  def testOptimizationNonSerializable(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset.apply(optimization.assert_next(["FiniteSkip"]))
+    dataset = dataset.skip(0)  # Should not be removed by noop elimination
+    dataset = dataset.apply(optimization.non_serializable())
+    dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
+    dataset = dataset.skip(0)  # Should be removed by noop elimination
+    dataset = dataset.cache()
+    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual(list(range(10)), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testOptimizationNonSerializableAsDirectInput(self):
+    """Tests that non-serializable dataset can be OptimizeDataset's input.
+    """
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset.apply(optimization.non_serializable())
+    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
index 36582f449f3a3de6ef9c8b710348bed21ff83880..f1d00a59c4f8f0b5aa28d789f2b2db1aad53200f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
@@ -21,9 +21,11 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
 
   def testShuffleAndRepeatFusion(self):
@@ -32,17 +34,17 @@ class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
     options = dataset_ops.Options()
     options.experimental_shuffle_and_repeat_fusion = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      for _ in range(2):
-        results = []
-        for _ in range(10):
-          results.append(sess.run(get_next))
-        self.assertAllEqual([x for x in range(10)], sorted(results))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    for _ in range(2):
+      results = []
+      for _ in range(10):
+        results.append(self.evaluate(get_next()))
+      self.assertAllEqual([x for x in range(10)], sorted(results))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
index 5e419a9b2f9e9debef63446263dc51b5c079a495..ca8bc5ff97ad93c7ccb7fe996f2bf5648bc7fc33 100644
--- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
@@ -72,7 +72,7 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       thread_ids = []
       try:
         while True:
diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 90ac250df70bfac8c0d73836391900cf83a603e5..91908f5582f0daf1efddcf4e669e3202bd20e90d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -637,11 +637,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
         for j in range(2):
           expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
+          self.assertAllEqual(expected, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -796,7 +796,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     with self.cached_session() as sess:
       for _ in range(2):
         elements = []
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         try:
           while True:
             elements.extend(sess.run(next_element))
diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index f9ea4c3b545afd487f3e9be6e970a1ba659c584e..c74f754fefbc88d685d593c3545d34107f5ca2af 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -27,7 +27,6 @@ from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
@@ -35,7 +34,6 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging
 
 # Helpers for creating Example objects
 example = example_pb2.Example
@@ -50,33 +48,20 @@ feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d)
 sequence_example = example_pb2.SequenceExample
 
 
-def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
-                                flat_output):
-  tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
-
-  i = 0  # Index into the flattened output
-  for k, v in sorted(dict_tensors.items()):
-    # TODO(shivaniagrawal): flat_output is same as v.
-    expected_v = expected_tensors[k]
-    tf_logging.info("Comparing key: %s", k)
-    print("i", i, "flat_output", flat_output[i], "expected_v", expected_v)
-    if sparse_tensor.is_sparse(v):
-      # Three outputs for SparseTensor : indices, values, shape.
-      tester.assertEqual([k, len(expected_v)], [k, 3])
-      print("i", i, "flat_output", flat_output[i].indices, "expected_v",
-            expected_v[0])
-      tester.assertAllEqual(expected_v[0], flat_output[i].indices)
-      tester.assertAllEqual(expected_v[1], flat_output[i].values)
-      tester.assertAllEqual(expected_v[2], flat_output[i].dense_shape)
-    else:
-      # One output for standard Tensor.
-      tester.assertAllEqual(expected_v, flat_output[i])
-    i += 1
-
-
 @test_util.run_all_in_graph_and_eager_modes
 class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
+  def _compare_output_to_expected(self, dict_tensors, expected_tensors):
+    self.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
+
+    for k, v in sorted(dict_tensors.items()):
+      expected_v = expected_tensors[k]
+      if sparse_tensor.is_sparse(v):
+        self.assertSparseValuesEqual(expected_v, v)
+      else:
+        # One output for standard Tensor.
+        self.assertAllEqual(expected_v, v)
+
   def _test(self,
             input_tensor,
             feature_val,
@@ -99,26 +84,29 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
           contrib_parsing_ops.parse_example_dataset(feature_val))
       get_next = self.getNext(dataset)
       result = self.evaluate(get_next())
-      flattened = nest.flatten(result)
-      _compare_output_to_expected(self, result, expected_values, flattened)
+      self._compare_output_to_expected(result, expected_values)
+      with self.assertRaises(errors_impl.OutOfRangeError):
+        self.evaluate(get_next())
+      with self.assertRaises(errors_impl.OutOfRangeError):
+        self.evaluate(get_next())
       if create_iterator_twice:
         get_next = self.getNext(dataset)
         result = self.evaluate(get_next())
-        flattened = nest.flatten(result)
-        _compare_output_to_expected(self, result, expected_values, flattened)
+        self._compare_output_to_expected(result, expected_values)
+        with self.assertRaises(errors_impl.OutOfRangeError):
+          self.evaluate(get_next())
     # Check shapes; if serialized is a Tensor we need its size to
     # properly check.
     batch_size = (
         self.evaluate(input_tensor).size if isinstance(input_tensor, ops.Tensor)
         else np.asarray(input_tensor).size)
     for k, f in feature_val.items():
-      print("output_shapes as list ", tuple(dataset.output_shapes[k].as_list()))
       if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
         self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size)
       elif isinstance(f, parsing_ops.VarLenFeature):
         self.assertEqual(dataset.output_shapes[k].as_list()[1], None)
 
-  def testSkipEagerEmptySerializedWithAllDefaults(self):
+  def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
     a_name = "a"
     b_name = "b"
@@ -127,13 +115,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     b_default = np.random.rand(3, 3).astype(bytes)
     c_default = np.random.rand(2).astype(np.float32)
 
-    expected_st_a = (  # indices, values, shape
-        np.empty(
-            (0, 2), dtype=np.int64),  # indices
-        np.empty(
-            (0,), dtype=np.int64),  # sp_a is DT_INT64
-        np.array(
-            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+    expected_st_a = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.empty((0, 2), dtype=np.int64),  # indices
+        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
 
     expected_output = {
         sparse_name: expected_st_a,
@@ -219,7 +204,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         {"a": parsing_ops.FixedLenFeature(None, dtypes.float32)},
         expected_err=(ValueError, "Missing shape for feature a"))
 
-  def testSkipEagerSerializedContainingSparse(self):
+  def testSerializedContainingSparse(self):
     original = [
         example(features=features({
             "st_c": float_feature([3, 4])
@@ -238,17 +223,14 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_st_c = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array(
-                [3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), np.array(
-                    [4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
+    expected_st_c = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64),
+        np.array([3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32),
+        np.array([4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
 
-    expected_st_d = (  # indices, values, shape
-        np.array(
-            [[3, 0]], dtype=np.int64), np.array(
-                ["hi"], dtype=bytes), np.array(
-                    [4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
+    expected_st_d = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[3, 0]], dtype=np.int64), np.array(["hi"], dtype=bytes),
+        np.array([4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
 
     expected_output = {
         "st_c": expected_st_c,
@@ -263,7 +245,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
-  def testSkipEagerSerializedContainingSparseFeature(self):
+  def testSerializedContainingSparseFeature(self):
     original = [
         example(features=features({
             "val": float_feature([3, 4]),
@@ -286,12 +268,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_sp = (  # indices, values, shape
-        np.array(
-            [[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64),
-        np.array(
-            [3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), np.array(
-                [4, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+    expected_sp = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64),
+        np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32),
+        np.array([4, 13], dtype=np.int64))  # batch == 4, max_elems = 13
 
     expected_output = {"sp": expected_sp,}
 
@@ -301,7 +281,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
-  def testSkipEagerSerializedContainingSparseFeatureReuse(self):
+  def testSerializedContainingSparseFeatureReuse(self):
     original = [
         example(features=features({
             "val1": float_feature([3, 4]),
@@ -316,17 +296,15 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_sp1 = (  # indices, values, shape
-        np.array(
-            [[0, 5], [0, 10]], dtype=np.int64), np.array(
-                [3.0, 4.0], dtype=np.float32), np.array(
-                    [2, 13], dtype=np.int64))  # batch == 2, max_elems = 13
+    expected_sp1 = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 5], [0, 10]], dtype=np.int64),
+        np.array([3.0, 4.0], dtype=np.float32),
+        np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13
 
-    expected_sp2 = (  # indices, values, shape
-        np.array(
-            [[0, 5], [0, 10]], dtype=np.int64), np.array(
-                [5.0, 6.0], dtype=np.float32), np.array(
-                    [2, 7], dtype=np.int64))  # batch == 2, max_elems = 13
+    expected_sp2 = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 5], [0, 10]], dtype=np.int64),
+        np.array([5.0, 6.0], dtype=np.float32),
+        np.array([2, 7], dtype=np.int64))  # batch == 2, max_elems = 13
 
     expected_output = {
         "sp1": expected_sp1,
@@ -344,7 +322,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
-  def testSkipEagerSerializedContaining3DSparseFeature(self):
+  def testSerializedContaining3DSparseFeature(self):
     original = [
         example(features=features({
             "val": float_feature([3, 4]),
@@ -369,11 +347,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_sp = (
+    expected_sp = sparse_tensor.SparseTensorValue(
         # indices
-        np.array(
-            [[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]],
-            dtype=np.int64),
+        np.array([[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]],
+                 dtype=np.int64),
         # values
         np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32),
         # shape batch == 4, max_elems = 13
@@ -534,20 +511,15 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
-  def testSkipEagerSerializedSparseAndSparseFeatureAndDenseWithNoDefault(
-      self):
-    expected_st_a = (  # indices, values, shape
-        np.empty(
-            (0, 2), dtype=np.int64),  # indices
-        np.empty(
-            (0,), dtype=np.int64),  # sp_a is DT_INT64
-        np.array(
-            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
-    expected_sp = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 3], [1, 7]], dtype=np.int64), np.array(
-                ["a", "b", "c"], dtype="|S"), np.array(
-                    [2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+  def testSerializedSparseAndSparseFeatureAndDenseWithNoDefault(self):
+    expected_st_a = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.empty((0, 2), dtype=np.int64),  # indices
+        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+    expected_sp = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64),
+        np.array(["a", "b", "c"], dtype="|S"),
+        np.array([2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
 
     original = [
         example(features=features({
@@ -594,18 +566,16 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
-  def testSkipEagererializedContainingSparseAndSparseFeatureWithReuse(self):
-    expected_idx = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
-        np.array([0, 3, 7, 1]), np.array(
-            [2, 2], dtype=np.int64))  # batch == 4, max_elems = 2
+  def testerializedContainingSparseAndSparseFeatureWithReuse(self):
+    expected_idx = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
+        np.array([0, 3, 7, 1]),
+        np.array([2, 2], dtype=np.int64))  # batch == 4, max_elems = 2
 
-    expected_sp = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64), np.array(
-                ["a", "b", "d", "c"], dtype="|S"), np.array(
-                    [2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+    expected_sp = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64),
+        np.array(["a", "b", "d", "c"], dtype="|S"),
+        np.array([2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
 
     original = [
         example(features=features({
@@ -694,16 +664,15 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
-  def testSkipEagerSerializedContainingVarLenDenseLargerBatch(self):
+  def testSerializedContainingVarLenDenseLargerBatch(self):
     np.random.seed(3456)
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
-  def testSkipEagerSerializedContainingVarLenDense(self):
+  def testSkipEagerSerializedShapeMismatch(self):
     aname = "a"
     bname = "b"
     cname = "c"
-    dname = "d"
     original = [
         example(features=features({
             cname: int64_feature([2]),
@@ -722,6 +691,47 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         })),
     ]
 
+    serialized = [m.SerializeToString() for m in original]
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature((2, 1),
+                                                    dtype=dtypes.float32,
+                                                    allow_missing=True,
+                                                    default_value=[]),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+        },
+        expected_err=(ValueError,
+                      "Cannot reshape a tensor with 0 elements to shape"))
+
+  def testSerializedContainingVarLenDense(self):
+    aname = "a"
+    bname = "b"
+    cname = "c"
+    dname = "d"
+    original = [
+        example(features=features({
+            cname: int64_feature([2]),
+        })),
+        example(
+            features=features({
+                aname: float_feature([1, 1]),
+                bname: bytes_feature([b"b0_str", b"b1_str"]),
+            })),
+        example(
+            features=features({
+                aname: float_feature([-1, -1, 2, 2]),
+                bname: bytes_feature([b"b1"]),
+            })),
+        example(
+            features=features({
+                aname: float_feature([]),
+                cname: int64_feature([3]),
+            })),
+    ]
+
     serialized = [m.SerializeToString() for m in original]
 
     expected_output = {
@@ -807,21 +817,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             errors_impl.OpError, "Key: b, Index: 2.  "
             "Number of bytes values is not a multiple of stride length."))
 
-    self._test(
-        ops.convert_to_tensor(serialized), {
-            aname:
-                parsing_ops.FixedLenSequenceFeature(
-                    (2, 1),
-                    dtype=dtypes.float32,
-                    allow_missing=True,
-                    default_value=[]),
-            bname:
-                parsing_ops.FixedLenSequenceFeature(
-                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
-        },
-        expected_err=(ValueError,
-                      "Cannot reshape a tensor with 0 elements to shape"))
-
     self._test(
         ops.convert_to_tensor(serialized), {
             aname:
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
index f73725366c46e1b0dca88e3d1b09147a23966eaf..60c3741d32d763df4a3a68d82ba269aa0b2be3ab 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -57,7 +57,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -87,7 +87,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -117,7 +117,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -150,7 +150,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
@@ -170,7 +170,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -199,12 +199,12 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -220,12 +220,12 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index 0730455431f9a3faaeb22b62f59d45c04d07c208..0e9bb462f30cc52c027553d0f0fb194e481d0971 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -60,7 +60,7 @@ class ScanTest(test_base.DatasetTestBase):
                  feed_dict={start: start_val, step: step_val, take: take_val})
         for expected, _ in zip(
             itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, sess.run(next_element))
+          self.assertEqual(expected, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
@@ -110,7 +110,7 @@ class ScanTest(test_base.DatasetTestBase):
                  feed_dict={start: start_val, step: step_val, take: take_val})
         for expected, _ in zip(
             itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, sess.run(next_element).values[0])
+          self.assertEqual(expected, self.evaluate(next_element).values[0])
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
@@ -136,7 +136,7 @@ class ScanTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       for i in range(5):
-        (longer_vector_val, larger_rank_val), _ = sess.run(next_element)
+        (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element)
         self.assertAllEqual([0] * (2**i), longer_vector_val)
         self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
       with self.assertRaises(errors.OutOfRangeError):
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 66bc3833a7351db65544b74da0b921c51f472c1a..2cfb57590367e586fb4a3195b11b0eee681d9f61 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -651,6 +651,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:stats_aggregator",
         "//tensorflow/python/data/experimental/ops:stats_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
index ef99d01c73ce164265c06bdf08b76ff67a90dd89..704a40721f8eff7c2ec3ef74ddce0ce2b28aa4bf 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
@@ -71,19 +71,19 @@ class RangeDatasetSerializationTest(
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
+        self.evaluate(init_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -91,14 +91,14 @@ class RangeDatasetSerializationTest(
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-        sess.run(restore_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
index 88d5c896c9fd9710e41026b321daa1fc90a7c66f..496fd4594777a72a8523f4abd41692c2bcfa39c7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
@@ -62,7 +62,7 @@ class SerializationIntegrationTest(test.TestCase):
       with self.session(graph=g) as sess:
         sess.run(init_ops)
         for _ in range(break_point):
-          output = sess.run(get_next_ops)
+          output = self.evaluate(get_next_ops)
           for i in range(num_pipelines):
             all_outputs[i].append(output[i])
         saver.save(sess, self._ckpt_path())
@@ -73,7 +73,7 @@ class SerializationIntegrationTest(test.TestCase):
       with self.session(graph=g) as sess:
         saver.restore(sess, self._ckpt_path())
         for _ in range(num_outputs - break_point):
-          output = sess.run(get_next_ops)
+          output = self.evaluate(get_next_ops)
           for i in range(num_pipelines):
             all_outputs[i].append(output[i])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
index ef7061b190473f726673e85fbd19fc8da8584052..662d768b4896f846e7d0cad078838a7c12590c04 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
@@ -92,9 +93,9 @@ class StatsDatasetSerializationTest(
         None, num_outputs)
 
   def _build_dataset_stats_aggregator(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+    aggregator = stats_aggregator.StatsAggregator()
     return dataset_ops.Dataset.range(10).apply(
-        stats_ops.set_stats_aggregator(stats_aggregator))
+        stats_ops.set_stats_aggregator(aggregator))
 
   def test_set_stats_aggregator_not_support_checkpointing(self):
     with self.assertRaisesRegexp(errors.UnimplementedError,
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index c208963a8612228ecf9ff8b91328a2d02c0d3890..5f7d9051eca482c29747dafca618cc11761c573a 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -108,7 +108,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
           shuffle_ops.shuffle_and_repeat(buffer_size=21))
       get_next_op = ds.make_one_shot_iterator().get_next()
       with self.session(graph=g) as sess:
-        sess.run(get_next_op)
+        self.evaluate(get_next_op)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
index bf53acc82a8259e04e470ab5e7b87ec3ab00911f..f7d42bc5b34710ee00aecf2dc1832feff53dc850 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
@@ -38,10 +38,10 @@ class SleepTest(test_base.DatasetTestBase):
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       start_time = time.time()
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       end_time = time.time()
       self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
       with self.assertRaises(errors.OutOfRangeError):
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
index a2c11696387ddbf81546765734854897a279adbf..e11bad7969c49bff2912a402f3550b69c3e181dc 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
@@ -39,8 +39,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                             "ORDER BY first_name DESC"
             })
         for _ in range(2):  # Dataset is repeated. See setUp.
-          self.assertEqual((b"John", b"Doe", b"Hi!"), sess.run(get_next))
-          self.assertEqual((b"Jane", b"Moe", b"Hi again!"), sess.run(get_next))
+          self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
+          self.assertEqual((b"Jane", b"Moe", b"Hi again!"),
+                           self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -58,7 +59,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ON students.first_name = people.first_name "
                   "AND students.last_name = people.last_name"
           })
-      self.assertEqual((b"John", b"California", b"Hi!"), sess.run(get_next))
+      self.assertEqual((b"John", b"California", b"Hi!"),
+                       self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -75,8 +77,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, last_name, favorite_nonsense_word "
                   "FROM students ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"n\0nsense"), sess.run(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next))
+      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
+                       self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -93,8 +96,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, last_name, motto FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"Hi!"), sess.run(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
+      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
       sess.run(
@@ -103,7 +106,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, last_name, state FROM people "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"California"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"California"),
+                       self.evaluate(get_next))
       self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
                        sess.run(get_next))
       with self.assertRaises(errors.OutOfRangeError):
@@ -212,8 +216,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -230,7 +234,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students "
                           "WHERE first_name = 'John' ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0, -2), sess.run(get_next))
+      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -246,9 +250,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT desk_number, favorite_negative_number FROM students "
                   "ORDER BY first_name DESC"
           })
-      self.assertEqual((9, -2), sess.run(get_next))
+      self.assertEqual((9, -2), self.evaluate(get_next))
       # Max and min values of int8
-      self.assertEqual((127, -128), sess.run(get_next))
+      self.assertEqual((127, -128), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -263,8 +267,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -281,7 +285,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students "
                           "WHERE first_name = 'John' ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0, -2), sess.run(get_next))
+      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -297,9 +301,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students ORDER BY first_name DESC"
           })
       # Max value of int16
-      self.assertEqual((b"John", 32767), sess.run(get_next))
+      self.assertEqual((b"John", 32767), self.evaluate(get_next))
       # Min value of int16
-      self.assertEqual((b"Jane", -32768), sess.run(get_next))
+      self.assertEqual((b"Jane", -32768), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -314,8 +318,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int32` tensor.
@@ -328,8 +332,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, income FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0), sess.run(get_next))
-      self.assertEqual((b"Jane", -20000), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
+      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -345,9 +349,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Max value of int32
-      self.assertEqual((b"John", 2147483647), sess.run(get_next))
+      self.assertEqual((b"John", 2147483647), self.evaluate(get_next))
       # Min value of int32
-      self.assertEqual((b"Jane", -2147483648), sess.run(get_next))
+      self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -362,8 +366,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, school_id FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 123), sess.run(get_next))
-      self.assertEqual((b"Jane", 1000), sess.run(get_next))
+      self.assertEqual((b"John", 123), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 1000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -378,8 +382,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -394,8 +398,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, income FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0), sess.run(get_next))
-      self.assertEqual((b"Jane", -20000), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
+      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -412,9 +416,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       # Max value of int64
-      self.assertEqual((b"John", 9223372036854775807), sess.run(get_next))
+      self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next))
       # Min value of int64
-      self.assertEqual((b"Jane", -9223372036854775808), sess.run(get_next))
+      self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -429,8 +433,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -446,9 +450,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Min value of uint8
-      self.assertEqual((b"John", 0), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
       # Max value of uint8
-      self.assertEqual((b"Jane", 255), sess.run(get_next))
+      self.assertEqual((b"Jane", 255), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -463,8 +467,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -480,9 +484,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Min value of uint16
-      self.assertEqual((b"John", 0), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
       # Max value of uint16
-      self.assertEqual((b"Jane", 65535), sess.run(get_next))
+      self.assertEqual((b"Jane", 65535), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
@@ -499,8 +503,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, registration_complete FROM students "
                   "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", True), sess.run(get_next))
-      self.assertEqual((b"Jane", False), sess.run(get_next))
+      self.assertEqual((b"John", True), self.evaluate(get_next))
+      self.assertEqual((b"Jane", False), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -515,8 +519,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, favorite_medium_sized_number "
                           "FROM students ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", True), sess.run(get_next))
-      self.assertEqual((b"Jane", True), sess.run(get_next))
+      self.assertEqual((b"John", True), self.evaluate(get_next))
+      self.assertEqual((b"Jane", True), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -533,8 +537,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, last_name, victories FROM townspeople "
                   "ORDER BY first_name"
           })
-      self.assertEqual((b"George", b"Washington", 20.0), sess.run(get_next))
-      self.assertEqual((b"John", b"Adams", -19.95), sess.run(get_next))
+      self.assertEqual((b"George", b"Washington", 20.0),
+                       self.evaluate(get_next))
+      self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 4d794b4b8458d830361a4445f71da8f27360c2ae..958c3f0038ad23d9c9543682488361f80bf2c62d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -17,13 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_ops
+from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -32,68 +35,95 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+def function_set_stats_aggregator(dataset,
+                                  aggregator,
+                                  prefix="",
+                                  counter_prefix=""):
+  return dataset.apply(
+      stats_ops.set_stats_aggregator(aggregator, prefix, counter_prefix))
+
+
+def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
+  options = dataset_ops.Options()
+  options.experimental_stats = stats_options.StatsOptions(aggregator)
+  options.experimental_stats.latency_all_edges = False
+  if prefix:
+    options.experimental_stats.prefix = prefix
+  if counter_prefix:
+    options.experimental_stats.counter_prefix = counter_prefix
+  return dataset.with_options(options)
+
+
+@parameterized.named_parameters(
+    dict(
+        testcase_name="SetStatsAggregator",
+        dataset_transformation=function_set_stats_aggregator),
+    dict(
+        testcase_name="StatsOptions",
+        dataset_transformation=function_apply_options))
 class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
-  def testBytesProduced(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testBytesProduced(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
-            stats_ops.bytes_produced_stats("bytes_produced")).apply(
-                stats_ops.set_stats_aggregator(stats_aggregator))
+            stats_ops.bytes_produced_stats("bytes_produced"))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       expected_sum = 0.0
       for i in range(100):
         self.assertAllEqual(
             np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
         expected_sum += i * 8.0
         self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
-      summary_str = sess.run(summary_t)
+      summary_str = self.evaluate(summary_t)
       self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
-  def testLatencyStats(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testLatencyStats(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
             sess.run(summary_t), "record_latency", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 100.0)
 
-  def testPrefetchBufferUtilization(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testPrefetchBufferUtilization(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
-        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(
-            -1).apply(stats_ops.set_stats_aggregator(stats_aggregator))
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1)
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
         self.assertAllEqual(
             np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
                                     float(i + 1))
         self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
@@ -102,25 +132,25 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
                                     0, 1)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
-      summary_str = sess.run(summary_t)
+      summary_str = self.evaluate(summary_t)
       self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
                                   100)
 
-  def testPrefetchBufferScalars(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testPrefetchBufferScalars(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(
-            0).apply(stats_ops.set_stats_aggregator(stats_aggregator))
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(0)
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(10):
         self.assertAllEqual(
             np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasScalarValue(summary_str,
                                           "Prefetch::buffer_capacity", 0)
         self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size",
@@ -128,19 +158,19 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
-  def testFilteredElementsStats(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testFilteredElementsStats(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(101).filter(
-        lambda x: math_ops.equal(math_ops.mod(x, 3), 0)).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
+        lambda x: math_ops.equal(math_ops.mod(x, 3), 0))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(34):
-        self.assertEqual(i * 3, sess.run(next_element))
+        self.assertEqual(i * 3, self.evaluate(next_element))
         if i is not 0:
           self._assertSummaryHasScalarValue(
               sess.run(summary_t), "Filter::dropped_elements", float(i * 2))
@@ -153,7 +183,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       self._assertSummaryHasScalarValue(
           sess.run(summary_t), "Filter::filtered_elements", 34.0)
 
-  def testMapBufferUtilization(self):
+  def testMapBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
       return dataset_ops.Dataset.range(10).map(
@@ -161,9 +191,13 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
           num_parallel_calls=4)
 
     self._testParallelCallsStats(
-        dataset_fn, "ParallelMap", 10, function_processing_time=True)
+        dataset_fn,
+        "ParallelMap",
+        10,
+        dataset_transformation,
+        function_processing_time=True)
 
-  def testMapAutoTuneBufferUtilization(self):
+  def testMapAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
       dataset = dataset_ops.Dataset.range(10).map(
@@ -174,9 +208,13 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       return dataset.with_options(options)
 
     self._testParallelCallsStats(
-        dataset_fn, "ParallelMap", 10, function_processing_time=True)
+        dataset_fn,
+        "ParallelMap",
+        10,
+        dataset_transformation,
+        function_processing_time=True)
 
-  def testInterleaveAutoTuneBufferUtilization(self):
+  def testInterleaveAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
       dataset = dataset_ops.Dataset.range(10).map(
@@ -189,9 +227,10 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       options.experimental_autotune = True
       return dataset.with_options(options)
 
-    self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10)
+    self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10,
+                                 dataset_transformation)
 
-  def testMapAndBatchAutoTuneBufferUtilization(self):
+  def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
       dataset = dataset_ops.Dataset.range(100).apply(
@@ -208,23 +247,24 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_fn,
         "MapAndBatch",
         num_output,
+        dataset_transformation,
         check_elements=False,
         function_processing_time=True)
 
-  def testReinitialize(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testReinitialize(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
       for j in range(5):
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for i in range(100):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
           self._assertSummaryHasCount(
               sess.run(summary_t), "record_latency", float((j * 100) + i + 1))
         with self.assertRaises(errors.OutOfRangeError):
@@ -232,100 +272,103 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         self._assertSummaryHasCount(
             sess.run(summary_t), "record_latency", (j + 1) * 100.0)
 
-  def testNoAggregatorRegistered(self):
+  def testNoAggregatorRegistered(self, dataset_transformation):
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
-  def testMultipleTags(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testMultipleTags(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency_2")).apply(
-                stats_ops.set_stats_aggregator(stats_aggregator))
+            stats_ops.latency_stats("record_latency_2"))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
             sess.run(summary_t), "record_latency", float(i + 1))
         self._assertSummaryHasCount(
             sess.run(summary_t), "record_latency_2", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 100.0)
       self._assertSummaryHasCount(
           sess.run(summary_t), "record_latency_2", 100.0)
 
-  def testRepeatedTags(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testRepeatedTags(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency")).apply(
-                stats_ops.set_stats_aggregator(stats_aggregator))
+            stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
             sess.run(summary_t), "record_latency", float(2 * (i + 1)))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 200.0)
 
-  def testMultipleIteratorsSameAggregator(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testMultipleIteratorsSameAggregator(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
     iterator_0 = dataset.make_initializable_iterator()
     iterator_1 = dataset.make_initializable_iterator()
     next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
       sess.run([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
-        self.assertEqual(i * 2, sess.run(next_element))
+        self.assertEqual(i * 2, self.evaluate(next_element))
         self._assertSummaryHasCount(
             sess.run(summary_t), "record_latency", float(2 * (i + 1)))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 200.0)
 
-  def testMultipleDatasetWithTags(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  def testMultipleDatasetWithPrefixes(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator, "dataset1"))
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator, prefix="dataset1")
     dataset2 = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator, "dataset2"))
+        stats_ops.latency_stats("record_latency"))
+    dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2")
     iterator_0 = dataset.make_initializable_iterator()
     iterator_1 = dataset2.make_initializable_iterator()
     next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
       sess.run([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
-        self.assertEqual(i * 2, sess.run(next_element))
+        self.assertEqual(i * 2, self.evaluate(next_element))
         self._assertSummaryHasCount(
             sess.run(summary_t), "dataset1_record_latency", float(i + 1))
         self._assertSummaryHasCount(
@@ -338,15 +381,22 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
           sess.run(summary_t), "dataset2_record_latency", 100.0)
 
 
+@parameterized.named_parameters(
+    dict(
+        testcase_name="SetStatsAggregator",
+        dataset_transformation=function_set_stats_aggregator),
+    dict(
+        testcase_name="StatsOptions",
+        dataset_transformation=function_apply_options))
 class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
-  def testFeaturesStats(self):
+  def testFeaturesStats(self, dataset_transformation):
     num_epochs = 5
     total_records = num_epochs * self._num_records
     batch_size = 2
-    stats_aggregator = stats_ops.StatsAggregator()
+    aggregator = stats_aggregator.StatsAggregator()
 
     def dataset_fn():
       return self.make_batch_feature(
@@ -362,16 +412,20 @@ class FeatureStatsDatasetTest(
       num_output = total_records // batch_size + 1
 
     self._testParallelCallsStats(
-        dataset_fn, "ParseExample", num_output, check_elements=False)
+        dataset_fn,
+        "ParseExample",
+        num_output,
+        dataset_transformation,
+        check_elements=False)
 
-    iterator = dataset_fn().apply(
-        stats_ops.set_stats_aggregator(
-            stats_aggregator, "record_stats")).make_initializable_iterator()
+    dataset = dataset_transformation(
+        dataset_fn(), aggregator, prefix="record_stats")
+    iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for _ in range(num_output):
         sess.run(next_element)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index a4e6242b00c849719c558e06617054e43fcc486c..c5bf9267590b105bcb681455d9488d09451345b9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.core.framework import summary_pb2
-from tensorflow.python.data.experimental.ops import stats_ops
+from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import errors
 
@@ -87,14 +87,15 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
                               dataset_fn,
                               dataset_name,
                               num_output,
+                              dataset_transformation,
                               function_processing_time=False,
                               check_elements=True):
-    stats_aggregator = stats_ops.StatsAggregator()
-    dataset = dataset_fn().apply(
-        stats_ops.set_stats_aggregator(stats_aggregator))
+    aggregator = stats_aggregator.StatsAggregator()
+    dataset = dataset_fn()
+    dataset = dataset_transformation(dataset, aggregator)
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
       sess.run(iterator.initializer)
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index 0278a208cbba5c84cb19732172277cf6685d5520..755294ac451d2c7c95837dbf962b2faea8d91774 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -50,7 +50,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
       for i in range(4):
-        self.assertEqual(i, sess.run(next_elem))
+        self.assertEqual(i, self.evaluate(next_elem))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_elem)
 
@@ -68,7 +68,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i,) * 3, sess.run(op))
+        self.assertEqual((i,) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
@@ -88,7 +88,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+        self.assertEqual((i, compat.as_bytes(str(i)), i), self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
@@ -107,7 +107,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        st_row = sess.run(next_element)
+        st_row = self.evaluate(next_element)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
         self.assertEqual([10], st_row.dense_shape)
@@ -128,7 +128,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        dense_elem, st_row = sess.run(next_element)
+        dense_elem, st_row = self.evaluate(next_element)
         self.assertEqual(i, dense_elem)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
@@ -150,7 +150,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual(((i,),) * 3, sess.run(op))
+        self.assertEqual(((i,),) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
index 847cff26b0d047f852658344529750b908250a19..4b14a7e96351e6a700ac48646f67947dbdc0624b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
@@ -49,11 +49,11 @@ class UniqueTest(test_base.DatasetTestBase):
     with self.cached_session() as sess:
       for test_case, expected in test_cases:
         current_test_case = test_case
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for element in expected:
           if dtype == dtypes.string:
             element = compat.as_bytes(element)
-          self.assertAllEqual(element, sess.run(next_element))
+          self.assertAllEqual(element, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 323298e33a6b1b2f3bad29b36db6ff07266f5673..170fda90b68a05c7732ce607e26ef06b1a82528c 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -82,6 +82,7 @@ py_library(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:tensor_shape",
@@ -271,6 +272,16 @@ py_library(
     ],
 )
 
+py_library(
+    name = "stats_aggregator",
+    srcs = ["stats_aggregator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "stats_ops",
     srcs = ["stats_ops.py"],
@@ -286,6 +297,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "stats_options",
+    srcs = ["stats_options.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":stats_aggregator",
+    ],
+)
+
 py_library(
     name = "threadpool",
     srcs = ["threadpool.py"],
diff --git a/tensorflow/python/data/experimental/ops/counter.py b/tensorflow/python/data/experimental/ops/counter.py
index 42200eaef9cb078afa0a9f598b6fa21e5e91f04b..652eb9d002992a737f3f8f0018db3a7316d0091e 100644
--- a/tensorflow/python/data/experimental/ops/counter.py
+++ b/tensorflow/python/data/experimental/ops/counter.py
@@ -25,8 +25,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.Counter")
-def Counter(start=0, step=1, dtype=dtypes.int64):
+@tf_export("data.experimental.Counter", v1=[])
+def CounterV2(start=0, step=1, dtype=dtypes.int64):
   """Creates a `Dataset` that counts from `start` in steps of size `step`.
 
   For example:
@@ -53,3 +53,13 @@ def Counter(start=0, step=1, dtype=dtypes.int64):
     step = ops.convert_to_tensor(step, dtype=dtype, name="step")
     return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
         scan_ops.scan(start, lambda state, _: (state + step, state)))
+
+
+@tf_export(v1=["data.experimental.Counter"])
+def CounterV1(start=0, step=1, dtype=dtypes.int64):
+  return dataset_ops.DatasetV1Adapter(CounterV2(start, step, dtype))
+CounterV1.__doc__ = CounterV2.__doc__
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+Counter = CounterV1  # pylint: disable=invalid-name
diff --git a/tensorflow/python/data/experimental/ops/error_ops.py b/tensorflow/python/data/experimental/ops/error_ops.py
index 82e274b70c5b703c62dcc143df371fae3d80065e..879b13ce092f20c2a6cfc911ba4c6e11992e23a8 100644
--- a/tensorflow/python/data/experimental/ops/error_ops.py
+++ b/tensorflow/python/data/experimental/ops/error_ops.py
@@ -52,7 +52,7 @@ def ignore_errors():
   return _apply_fn
 
 
-class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
+class _IgnoreErrorsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that silently ignores errors when computing its input."""
 
   def __init__(self, input_dataset):
@@ -64,15 +64,3 @@ class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
index 132526166cfe49e267b1569b9e7851c8256234dd..73116edf1288bf252721a5f96cf69b8d590dff14 100644
--- a/tensorflow/python/data/experimental/ops/get_single_element.py
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -60,7 +60,7 @@ def get_single_element(dataset):
     InvalidArgumentError (at runtime): if `dataset` does not contain exactly
       one element.
   """
-  if not isinstance(dataset, dataset_ops.Dataset):
+  if not isinstance(dataset, dataset_ops.DatasetV2):
     raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
 
   nested_ret = nest.pack_sequence_as(
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 026867d405fc47f12ae251e851bf8669ad29d7d1..80ca7104d851ab51d2da705b32a4facfb5a9fabb 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -454,8 +454,7 @@ class _GroupByWindowDataset(dataset_ops.UnaryDataset):
         self._transformation_name(),
         input_classes=(ops.Tensor, nested_dataset),
         input_shapes=(tensor_shape.scalar(), nested_dataset),
-        input_types=(dtypes.int64, nested_dataset),
-        experimental_nested_dataset_support=True)
+        input_types=(dtypes.int64, nested_dataset))
     if not isinstance(
         wrapped_func.output_classes, dataset_ops._NestedDatasetComponent):  # pylint: disable=protected-access
       raise TypeError("`reduce_func` must return a `Dataset` object.")
@@ -528,10 +527,7 @@ class _MapXDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
 
     wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        map_func,
-        self._transformation_name(),
-        dataset=input_dataset,
-        experimental_nested_dataset_support=True)
+        map_func, self._transformation_name(), dataset=input_dataset)
     self._output_classes = wrapped_func.output_classes
     self._output_shapes = wrapped_func.output_shapes
     self._output_types = wrapped_func.output_types
diff --git a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
index 9c06474a2f8076d3ded5fd798665ea05930ecfe5..570f0116f7686327f147f96447e87e5ddf8a927c 100644
--- a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
+++ b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
@@ -65,6 +65,7 @@ class MaterializedIndexedDataset(object):
             sparse.as_dense_types(self._output_shapes, self._output_classes)))
 
 
+# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
 class IndexedDataset(dataset_ops.Dataset):
   """IndexedDataset is highly experimental!
   """
@@ -149,6 +150,7 @@ class IndexedDataset(dataset_ops.Dataset):
     raise NotImplementedError("IndexedDataset._as_variant_tensor")
 
 
+# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
 class IdentityIndexedDataset(IndexedDataset):
   """IdentityIndexedDataset is a trivial indexed dataset used for testing.
   """
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index a3c094859efb7586b3ddcf1823ab27bf0a733445..8b0fdfce11d26889770aa84403829b87c6528191 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -133,8 +133,8 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     return self._data_inputs[0].output_types
 
 
-@tf_export("data.experimental.sample_from_datasets")
-def sample_from_datasets(datasets, weights=None, seed=None):
+@tf_export("data.experimental.sample_from_datasets", v1=[])
+def sample_from_datasets_v2(datasets, weights=None, seed=None):
   """Samples elements at random from the datasets in `datasets`.
 
   Args:
@@ -158,7 +158,7 @@ def sample_from_datasets(datasets, weights=None, seed=None):
       length of the `datasets` element.
   """
   num_datasets = len(datasets)
-  if not isinstance(weights, dataset_ops.Dataset):
+  if not isinstance(weights, dataset_ops.DatasetV2):
     if weights is None:
       # Select inputs with uniform probability.
       logits = [[1.0] * num_datasets]
@@ -217,8 +217,15 @@ def sample_from_datasets(datasets, weights=None, seed=None):
   return _DirectedInterleaveDataset(selector_input, datasets)
 
 
-@tf_export("data.experimental.choose_from_datasets")
-def choose_from_datasets(datasets, choice_dataset):
+@tf_export(v1=["data.experimental.sample_from_datasets"])
+def sample_from_datasets_v1(datasets, weights=None, seed=None):
+  return dataset_ops.DatasetV1Adapter(
+      sample_from_datasets_v2(datasets, weights, seed))
+sample_from_datasets_v1.__doc__ = sample_from_datasets_v2.__doc__
+
+
+@tf_export("data.experimental.choose_from_datasets", v1=[])
+def choose_from_datasets_v2(datasets, choice_dataset):
   """Creates a dataset that deterministically chooses elements from `datasets`.
 
   For example, given the following datasets:
@@ -260,3 +267,16 @@ def choose_from_datasets(datasets, choice_dataset):
     raise TypeError("`choice_dataset` must be a dataset of scalar "
                     "`tf.int64` tensors.")
   return _DirectedInterleaveDataset(choice_dataset, datasets)
+
+
+@tf_export(v1=["data.experimental.choose_from_datasets"])
+def choose_from_datasets_v1(datasets, choice_dataset):
+  return dataset_ops.DatasetV1Adapter(
+      choose_from_datasets_v2(datasets, choice_dataset))
+choose_from_datasets_v1.__doc__ = choose_from_datasets_v2.__doc__
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+choose_from_datasets = choose_from_datasets_v1
+sample_from_datasets = sample_from_datasets_v1
diff --git a/tensorflow/python/data/experimental/ops/map_defun.py b/tensorflow/python/data/experimental/ops/map_defun.py
index ec1a3adf0c17e83c308e18afb71c95983a508ce1..5d729d392ac5ec9745cbfdd269bc536a74f3e865 100644
--- a/tensorflow/python/data/experimental/ops/map_defun.py
+++ b/tensorflow/python/data/experimental/ops/map_defun.py
@@ -52,7 +52,7 @@ def map_defun(fn, elems, output_dtypes, output_shapes):
     raise ValueError("`output_shapes` must be a list of `tf.TensorShape` "
                      "objects.")
 
-  concrete_fn = fn.get_concrete_function()
+  concrete_fn = fn._get_concrete_function_internal()  # pylint: disable=protected-access
   # TODO(shivaniagrawal/rachelim): what about functions created without
   # input_signature.
   elems = [ops.convert_to_tensor(e) for e in elems]
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index 8e1de136b6644863d70877df92093a2beadd9d1f..c6c7de9265c32245dfbc348a4e7c4fd06eda653b 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -65,6 +65,21 @@ def model():
   return _apply_fn
 
 
+def non_serializable():
+  """A non-serializable identity transformation.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    return _NonSerializableDataset(dataset)
+
+  return _apply_fn
+
+
 def optimize(optimizations=None):
   """A transformation that applies optimizations.
 
@@ -85,7 +100,7 @@ def optimize(optimizations=None):
   return _apply_fn
 
 
-class _AssertNextDataset(dataset_ops.UnaryDataset):
+class _AssertNextDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that asserts which transformations happen next."""
 
   def __init__(self, input_dataset, transformations):
@@ -103,15 +118,16 @@ class _AssertNextDataset(dataset_ops.UnaryDataset):
         self._transformations,
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
+class _NonSerializableDataset(dataset_ops.UnaryUnchangedStructureDataset):
+  """A `Dataset` that performs non-serializable identity transformation."""
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def __init__(self, input_dataset):
+    """See `non_serializable()` for details."""
+    super(_NonSerializableDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
 
+  def _as_variant_tensor(self):
+    return gen_experimental_dataset_ops.experimental_non_serializable_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        **dataset_ops.flat_structure(self))
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
index 2add95558d5bcc7bbe355bad6d941b9179b41919..1c4e14ce32eb75b8be591540a7e9c999b332c877 100644
--- a/tensorflow/python/data/experimental/ops/prefetching_ops.py
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -138,7 +138,7 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
-    self._prefetch_fn = _prefetch_fn.get_concrete_function()
+    self._prefetch_fn = _prefetch_fn._get_concrete_function_internal()  # pylint: disable=protected-access
 
     iterator_device = ged_ops.experimental_iterator_get_device(
         self._input_iterator._iterator_resource)
@@ -237,7 +237,7 @@ class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
-    self._prefetch_fn = _prefetch_fn.get_concrete_function()
+    self._prefetch_fn = _prefetch_fn._get_concrete_function_internal()  # pylint: disable=protected-access
 
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
@@ -265,7 +265,7 @@ class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
 # pylint: enable=protected-access
 
 
-class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
+class _PrefetchToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` whose iterator prefetches elements to another device."""
 
   def __init__(self, input_dataset, device, buffer_size):
@@ -322,18 +322,6 @@ class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
     raise NotImplementedError("`prefetch_to_device()` must be the last "
                               "transformation in a dataset pipeline.")
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
 
 @tf_export("data.experimental.prefetch_to_device")
 def prefetch_to_device(device, buffer_size=None):
@@ -380,7 +368,7 @@ def copy_to_device(target_device, source_device="/cpu:0"):
 # TODO(rohanj): Use the _input_hostmem attr on the RemoteCall ops to indicate
 # all inputs to the Op are in host memory, thereby avoiding some unnecessary
 # Sends and Recvs.
-class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
+class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that copies elements to another device."""
 
   def __init__(self, input_dataset, target_device, source_device="/cpu:0"):
@@ -422,7 +410,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           [gen_dataset_ops.make_iterator(ds_variant, resource)]):
         return gen_dataset_ops.iterator_to_string_handle(resource)
 
-    init_func_concrete = _init_func.get_concrete_function()
+    init_func_concrete = _init_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
     @function.defun()
     def _remote_init_func():
       return functional_ops.remote_call(
@@ -431,7 +420,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           Tout=[dtypes.string],
           f=init_func_concrete)
 
-    self._init_func = _remote_init_func.get_concrete_function()
+    self._init_func = _remote_init_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._init_captured_args = self._init_func.captured_inputs
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
@@ -450,7 +439,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       ret = iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
-    next_func_concrete = _next_func.get_concrete_function()
+    next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
@@ -460,7 +450,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           Tout=self._flat_output_types,
           f=next_func_concrete)
 
-    self._next_func = _remote_next_func.get_concrete_function()
+    self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._next_captured_args = self._next_func.captured_inputs
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
@@ -481,7 +471,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
               iterator_resource, ignore_lookup_error=True)]):
         return array_ops.constant(0, dtypes.int64)
 
-    finalize_func_concrete = _finalize_func.get_concrete_function()
+    finalize_func_concrete = _finalize_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
@@ -491,7 +482,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           Tout=[dtypes.int64],
           f=finalize_func_concrete)
 
-    self._finalize_func = _remote_finalize_func.get_concrete_function()
+    self._finalize_func = _remote_finalize_func._get_concrete_function_internal(  # pylint: disable=protected-access
+    )
     self._finalize_captured_args = self._finalize_func.captured_inputs
 
     g = ops.get_default_graph()
@@ -525,14 +517,70 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           output_types=self._flat_output_types,
           output_shapes=self._flat_output_shapes)
 
+
+class _MapOnGpuDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that maps a function over elements in its using a GPU."""
+
+  def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
+    """See `Dataset.map()` for details."""
+    super(_MapOnGpuDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._use_inter_op_parallelism = use_inter_op_parallelism
+
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        defun_kwargs={"experimental_ints_on_device": True})
+    self._output_classes = wrapped_func.output_classes
+    self._output_shapes = wrapped_func.output_shapes
+    self._output_types = wrapped_func.output_types
+    self._map_func = wrapped_func.function
+
+  def _as_variant_tensor(self):
+    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    return ged_ops.experimental_map_dataset(
+        input_t,
+        self._map_func.captured_inputs,
+        f=self._map_func,
+        use_inter_op_parallelism=self._use_inter_op_parallelism,
+        **dataset_ops.flat_structure(self))
+
   @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
-    return self._input_dataset.output_shapes
+    return self._output_shapes
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
+  def output_types(self):
+    return self._output_types
+
+  def _transformation_name(self):
+    return "map_on_gpu()"
+
+
+def map_on_gpu(map_func):
+  """Maps `map_func` across the elements of this dataset.
+
+  NOTE: This is a highly experimental version of `tf.data.Dataset.map` that runs
+  `map_func` on GPU. It must be used after applying the
+  `tf.data.experimental.copy_to_device` transformation with a GPU device
+  argument.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors (having shapes
+      and types defined by `self.output_shapes` and `self.output_types`) to
+      another nested structure of tensors.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _MapOnGpuDataset(dataset, map_func)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index e3a2aeab31ea35ee9636821e3e8b8db35ed72b65..7bf703502be2a9ca7853873b909b4692f89f3476 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.framework import dtypes
@@ -26,13 +28,13 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.RandomDataset")
-class RandomDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.RandomDataset", v1=[])
+class RandomDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` of pseudorandom values."""
 
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
-    super(RandomDataset, self).__init__()
+    super(RandomDatasetV2, self).__init__()
     self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
@@ -52,3 +54,18 @@ class RandomDataset(dataset_ops.DatasetSource):
   @property
   def output_types(self):
     return dtypes.int64
+
+
+@tf_export(v1=["data.experimental.RandomDataset"])
+class RandomDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` of pseudorandom values."""
+
+  @functools.wraps(RandomDatasetV2.__init__)
+  def __init__(self, seed=None):
+    wrapped = RandomDatasetV2(seed)
+    super(RandomDatasetV1, self).__init__(wrapped)
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+RandomDataset = RandomDatasetV1
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index 3b2d0945148e44a0c800b4a661b88fc921e93507..1ba26ed5b9f491d51fde753fd2bb64fb0e992b48 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 import csv
+import functools
 
 import numpy as np
 
@@ -38,6 +39,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util.tf_export import tf_export
 
@@ -306,8 +308,8 @@ def make_tf_record_dataset(file_pattern,
     return dataset.prefetch(buffer_size=prefetch_buffer_size)
 
 
-@tf_export("data.experimental.make_csv_dataset")
-def make_csv_dataset(
+@tf_export("data.experimental.make_csv_dataset", v1=[])
+def make_csv_dataset_v2(
     file_pattern,
     batch_size,
     column_names=None,
@@ -506,11 +508,42 @@ def make_csv_dataset(
   return dataset
 
 
+@tf_export(v1=["data.experimental.make_csv_dataset"])
+def make_csv_dataset_v1(
+    file_pattern,
+    batch_size,
+    column_names=None,
+    column_defaults=None,
+    label_name=None,
+    select_columns=None,
+    field_delim=",",
+    use_quote_delim=True,
+    na_value="",
+    header=True,
+    num_epochs=None,
+    shuffle=True,
+    shuffle_buffer_size=10000,
+    shuffle_seed=None,
+    prefetch_buffer_size=optimization.AUTOTUNE,
+    num_parallel_reads=1,
+    sloppy=False,
+    num_rows_for_inference=100,
+    compression_type=None,
+):  # pylint: disable=missing-docstring
+  return dataset_ops.DatasetV1Adapter(make_csv_dataset_v2(
+      file_pattern, batch_size, column_names, column_defaults, label_name,
+      select_columns, field_delim, use_quote_delim, na_value, header,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
+      compression_type))
+make_csv_dataset_v1.__doc__ = make_csv_dataset_v2.__doc__
+
+
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024  # 4 MB
 
 
-@tf_export("data.experimental.CsvDataset")
-class CsvDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.CsvDataset", v1=[])
+class CsvDatasetV2(dataset_ops.DatasetSource):
   """A Dataset comprising lines from one or more CSV files."""
 
   def __init__(self,
@@ -593,7 +626,7 @@ class CsvDataset(dataset_ops.DatasetSource):
         the input data. If specified, only this subset of columns will be
         parsed. Defaults to parsing all columns.
     """
-    super(CsvDataset, self).__init__()
+    super(CsvDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -657,22 +690,43 @@ class CsvDataset(dataset_ops.DatasetSource):
     return self._output_classes
 
 
-@tf_export("data.experimental.make_batched_features_dataset")
-def make_batched_features_dataset(file_pattern,
-                                  batch_size,
-                                  features,
-                                  reader=core_readers.TFRecordDataset,
-                                  label_key=None,
-                                  reader_args=None,
-                                  num_epochs=None,
-                                  shuffle=True,
-                                  shuffle_buffer_size=10000,
-                                  shuffle_seed=None,
-                                  prefetch_buffer_size=optimization.AUTOTUNE,
-                                  reader_num_threads=1,
-                                  parser_num_threads=2,
-                                  sloppy_ordering=False,
-                                  drop_final_batch=False):
+@tf_export(v1=["data.experimental.CsvDataset"])
+class CsvDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A Dataset comprising lines from one or more CSV files."""
+
+  @functools.wraps(CsvDatasetV2.__init__)
+  def __init__(self,
+               filenames,
+               record_defaults,
+               compression_type=None,
+               buffer_size=None,
+               header=False,
+               field_delim=",",
+               use_quote_delim=True,
+               na_value="",
+               select_cols=None):
+    wrapped = CsvDatasetV2(filenames, record_defaults, compression_type,
+                           buffer_size, header, field_delim, use_quote_delim,
+                           na_value, select_cols)
+    super(CsvDatasetV1, self).__init__(wrapped)
+
+
+@tf_export("data.experimental.make_batched_features_dataset", v1=[])
+def make_batched_features_dataset_v2(file_pattern,
+                                     batch_size,
+                                     features,
+                                     reader=core_readers.TFRecordDataset,
+                                     label_key=None,
+                                     reader_args=None,
+                                     num_epochs=None,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     shuffle_seed=None,
+                                     prefetch_buffer_size=optimization.AUTOTUNE,
+                                     reader_num_threads=1,
+                                     parser_num_threads=2,
+                                     sloppy_ordering=False,
+                                     drop_final_batch=False):
   """Returns a `Dataset` of feature dictionaries from `Example` protos.
 
   If label_key argument is provided, returns a `Dataset` of tuple
@@ -760,6 +814,7 @@ def make_batched_features_dataset(file_pattern,
     Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.
 
   Raises:
+    TypeError: If `reader` is a `tf.ReaderBase` subclass.
     ValueError: If `label_key` is not one of the `features` keys.
   """
   # Create dataset of all matching filenames
@@ -768,6 +823,12 @@ def make_batched_features_dataset(file_pattern,
   if shuffle:
     dataset = dataset.shuffle(len(filenames), shuffle_seed)
 
+  if isinstance(reader, type) and issubclass(reader, io_ops.ReaderBase):
+    raise TypeError("The `reader` argument must return a `Dataset` object. "
+                    "`tf.ReaderBase` subclasses are not supported. For "
+                    "example, pass `tf.data.TFRecordDataset` instead of "
+                    "`tf.TFRecordReader`.")
+
   # Read `Example` records from files as tensor objects.
   if reader_args is None:
     reader_args = []
@@ -811,6 +872,31 @@ def make_batched_features_dataset(file_pattern,
   return dataset
 
 
+@tf_export(v1=["data.experimental.make_batched_features_dataset"])
+def make_batched_features_dataset_v1(file_pattern,  # pylint: disable=missing-docstring
+                                     batch_size,
+                                     features,
+                                     reader=core_readers.TFRecordDataset,
+                                     label_key=None,
+                                     reader_args=None,
+                                     num_epochs=None,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     shuffle_seed=None,
+                                     prefetch_buffer_size=optimization.AUTOTUNE,
+                                     reader_num_threads=1,
+                                     parser_num_threads=2,
+                                     sloppy_ordering=False,
+                                     drop_final_batch=False):
+  return dataset_ops.DatasetV1Adapter(make_batched_features_dataset_v2(
+      file_pattern, batch_size, features, reader, label_key, reader_args,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, reader_num_threads, parser_num_threads,
+      sloppy_ordering, drop_final_batch))
+make_batched_features_dataset_v2.__doc__ = (
+    make_batched_features_dataset_v1.__doc__)
+
+
 def _get_file_names(file_pattern, shuffle):
   """Parse list of file names from pattern, optionally shuffled.
 
@@ -842,8 +928,8 @@ def _get_file_names(file_pattern, shuffle):
   return file_names
 
 
-@tf_export("data.experimental.SqlDataset")
-class SqlDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.SqlDataset", v1=[])
+class SqlDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` consisting of the results from a SQL query."""
 
   def __init__(self, driver_name, data_source_name, query, output_types):
@@ -875,7 +961,7 @@ class SqlDataset(dataset_ops.DatasetSource):
       output_types: A tuple of `tf.DType` objects representing the types of the
         columns returned by `query`.
     """
-    super(SqlDataset, self).__init__()
+    super(SqlDatasetV2, self).__init__()
     self._driver_name = ops.convert_to_tensor(
         driver_name, dtype=dtypes.string, name="driver_name")
     self._data_source_name = ops.convert_to_tensor(
@@ -902,3 +988,21 @@ class SqlDataset(dataset_ops.DatasetSource):
   @property
   def output_types(self):
     return self._output_types
+
+
+@tf_export(v1=["data.experimental.SqlDataset"])
+class SqlDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` consisting of the results from a SQL query."""
+
+  @functools.wraps(SqlDatasetV2.__init__)
+  def __init__(self, driver_name, data_source_name, query, output_types):
+    wrapped = SqlDatasetV2(driver_name, data_source_name, query, output_types)
+    super(SqlDatasetV1, self).__init__(wrapped)
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+CsvDataset = CsvDatasetV1
+SqlDataset = SqlDatasetV1
+make_batched_features_dataset = make_batched_features_dataset_v1
+make_csv_dataset = make_csv_dataset_v1
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index a4307212daf488deae986073264911fcf778588f..d12328a7145992880aedd939d7a02a8a12c61d4c 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
+class _ShuffleAndRepeatDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that fuses `shuffle` and `repeat`."""
 
   def __init__(self, input_dataset, buffer_size, count=None, seed=None):
@@ -53,18 +53,6 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 @tf_export("data.experimental.shuffle_and_repeat")
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
diff --git a/tensorflow/python/data/experimental/ops/sleep.py b/tensorflow/python/data/experimental/ops/sleep.py
index 7e7d370f702e2c0bc037a1e1455728c52d476327..5e9d021ada9ce4bd068f8d899d570683e7e5d80b 100644
--- a/tensorflow/python/data/experimental/ops/sleep.py
+++ b/tensorflow/python/data/experimental/ops/sleep.py
@@ -21,7 +21,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 
 
-class _SleepDataset(dataset_ops.UnaryDataset):
+class _SleepDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that sleeps before producing each upstream element."""
 
   def __init__(self, input_dataset, sleep_microseconds):
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
new file mode 100644
index 0000000000000000000000000000000000000000..5274c816a49bf70bf25b18cf7d981b90e100ba10
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -0,0 +1,84 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""StatsAggregator for aggregating statistics from `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.StatsAggregator")
+class StatsAggregator(object):
+  """A stateful resource that aggregates statistics from one or more iterators.
+
+  To record statistics, use one of the custom transformation functions defined
+  in this module when defining your `tf.data.Dataset`. All statistics will be
+  aggregated by the `StatsAggregator` that is associated with a particular
+  iterator (see below). For example, to record the latency of producing each
+  element by iterating over a dataset:
+
+  ```python
+  dataset = ...
+  dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes"))
+  ```
+
+  To associate a `StatsAggregator` with a `tf.data.Dataset` object, use
+  the following pattern:
+
+  ```python
+  aggregator = tf.data.experimental.StatsAggregator()
+  dataset = ...
+
+  # Apply `StatsOptions` to associate `dataset` with `aggregator`.
+  options = dataset_ops.Options()
+  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  dataset = dataset.with_options(options)
+  iterator = dataset.make_one_shot_iterator()
+  ```
+
+  To get a protocol buffer summary of the currently aggregated statistics,
+  use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
+  is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection,
+  so that the summaries will be included with any existing summaries.
+
+  ```python
+  aggregator = tf.data.experimental.StatsAggregator()
+  # ...
+  stats_summary = aggregator.get_summary()
+  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
+  ```
+
+  Note: This interface is experimental and expected to change. In particular,
+  we expect to add other implementations of `StatsAggregator` that provide
+  different ways of exporting statistics, and add more types of statistics.
+  """
+
+  def __init__(self):
+    """Creates a `StatsAggregator`."""
+    self._resource = gen_dataset_ops.stats_aggregator_handle()
+
+  # TODO(b/116314787): Update this/add support for V2 summary API.
+  def get_summary(self):
+    """Returns a string `tf.Tensor` that summarizes the aggregated statistics.
+
+    The returned tensor will contain a serialized `tf.summary.Summary` protocol
+    buffer, which can be used with the standard TensorBoard logging facilities.
+
+    Returns:
+      A scalar string `tf.Tensor` that summarizes the aggregated statistics.
+    """
+    return gen_dataset_ops.stats_aggregator_summary(self._resource)
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index fb93b86b2911485108e9ea75118fb57923fa2a2f..95689433bd076c8afd8f027a5b867575dcb68daa 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -21,110 +21,18 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.StatsAggregator")
-class StatsAggregator(object):
-  """A stateful resource that aggregates statistics from one or more iterators.
-
-  To record statistics, use one of the custom transformation functions defined
-  in this module when defining your `tf.data.Dataset`. All statistics will be
-  aggregated by the `StatsAggregator` that is associated with a particular
-  iterator (see below). For example, to record the latency of producing each
-  element by iterating over a dataset:
-
-  ```python
-  dataset = ...
-  dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes"))
-  ```
-
-  To associate a `StatsAggregator` with a `tf.data.Dataset` object, use
-  the following pattern:
-
-  ```python
-  stats_aggregator = stats_ops.StatsAggregator()
-  dataset = ...
-
-  # Apply `set_stats_aggregator` to associate `dataset` with `stats_aggregator`.
-  dataset = dataset.apply(
-      tf.data.experimental.set_stats_aggregator(stats_aggregator))
-  iterator = dataset.make_one_shot_iterator()
-  ```
-
-  To get a protocol buffer summary of the currently aggregated statistics,
-  use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
-  is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection,
-  so that the summaries will be included with any existing summaries.
-
-  ```python
-  stats_aggregator = stats_ops.StatsAggregator()
-  # ...
-  stats_summary = stats_aggregator.get_summary()
-  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
-  ```
-
-  Note: This interface is experimental and expected to change. In particular,
-  we expect to add other implementations of `StatsAggregator` that provide
-  different ways of exporting statistics, and add more types of statistics.
-  """
-
-  def __init__(self):
-    """Creates a `StatsAggregator`."""
-    self._resource = gen_dataset_ops.stats_aggregator_handle()
-
-  # TODO(b/116314787): Update this/add support for V2 summary API.
-  def get_summary(self):
-    """Returns a string `tf.Tensor` that summarizes the aggregated statistics.
-
-    The returned tensor will contain a serialized `tf.summary.Summary` protocol
-    buffer, which can be used with the standard TensorBoard logging facilities.
-
-    Returns:
-      A scalar string `tf.Tensor` that summarizes the aggregated statistics.
-    """
-    return gen_dataset_ops.stats_aggregator_summary(self._resource)
-
-
-class _SetStatsAggregatorDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that acts as an identity, and sets given stats_aggregator."""
-
-  def __init__(self, input_dataset, stats_aggregator, tag, prefix):
-    super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._stats_aggregator = stats_aggregator
-    self._tag = tag
-    self._prefix = prefix
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.set_stats_aggregator_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._stats_aggregator._resource,  # pylint: disable=protected-access
-        self._tag,
-        self._prefix,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-
-@tf_export("data.experimental.set_stats_aggregator")
-def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""):
+@deprecation.deprecated(None, "Use `tf.data.experimental.StatsOptions`.")
+def set_stats_aggregator(stats_aggregator, prefix="", counter_prefix=""):
   """Set the given `stats_aggregator` for aggregating the input dataset stats.
 
   Args:
     stats_aggregator: A `tf.contrib.data.StatsAggregator` object.
-    tag: (Optional) String, all statistics recorded for the input `dataset`
-      will have given `tag` prepend with the name.
+    prefix: (Optional) String, all statistics recorded for the input `dataset`
+      will have given `prefix` prepend with the name.
     counter_prefix: (Optional) String, all statistics recorded as `counters`
       will have the given `prefix` for the counter. Defaults to "/tensorflow".
 
@@ -134,8 +42,8 @@ def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""):
   """
 
   def _apply_fn(dataset):
-    return _SetStatsAggregatorDataset(dataset, stats_aggregator, tag,
-                                      counter_prefix)
+    return dataset_ops._SetStatsAggregatorDataset(  # pylint: disable=protected-access
+        dataset, stats_aggregator, prefix, counter_prefix)
 
   return _apply_fn
 
@@ -186,7 +94,7 @@ def latency_stats(tag):
   return _apply_fn
 
 
-class _StatsDataset(dataset_ops.UnaryDataset):
+class _StatsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and also records statistics."""
 
   def __init__(self, input_dataset, op_function, tag):
@@ -200,15 +108,3 @@ class _StatsDataset(dataset_ops.UnaryDataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._tag,
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
diff --git a/tensorflow/python/data/experimental/ops/stats_options.py b/tensorflow/python/data/experimental/ops/stats_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..c088d3d8881a23bc58742aa64b8368601503f058
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/stats_options.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""StatsOptions to configure stats aggregation options for `tf.data` pipelines.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import stats_aggregator
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.StatsOptions")
+class StatsOptions(object):
+  """Represents options for collecting dataset stats using `StatsAggregator`.
+
+  To apply `StatsOptions` with a `tf.data.Dataset` object, use the following
+  pattern:
+
+  ```python
+  aggretator = tf.data.experimental.StatsAggregator()
+
+  options = dataset_ops.Options()
+  options.experimental_stats = tf.data.experimental.StatsOptions()
+  options.experimental_stats.aggregator = aggregator
+  dataset = dataset.with_options(options)
+
+  iterator = dataset.make_one_shot_iterator()
+  ```
+
+  Note: a `StatsAggregator` object can be attached either duing construction or
+  can be provided later like in above example.
+
+  ```python
+  aggretator = tf.data.experimental.StatsAggregator()
+  # attach aggregator during construction
+  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  .....
+  ```
+  """
+
+  for _name, _ty, _default, _docstring in [
+      ("aggregator", stats_aggregator.StatsAggregator, None,
+       "Associate the given statistics options with the dataset pipeline."),
+      ("prefix", str, "",
+       "Prefix to prepend all statistics recorded for the input `dataset` with."
+      ),
+      ("counter_prefix", str, "",
+       "Prefix for the statistics recorded as counter."),
+      ("latency_all_edges", bool, True,
+       "Whether to add latency measurements on all edges."),
+  ]:
+
+    def _make_getter(name):  # pylint: disable=no-self-argument
+
+      def getter(self):
+        return getattr(self, "_" + name)
+
+      return getter
+
+    def _make_setter(name, ty):  # pylint: disable=no-self-argument
+
+      def setter(self, value):
+        if not isinstance(value, ty):
+          raise TypeError(
+              "Attempting to set the option %s to incompatible value: %r when "
+              "it expects  %r" % (name, value, ty))
+        setattr(self, "_" + name, value)
+
+      return setter
+
+    vars()["_" + _name] = _default
+    vars()[_name] = property(
+        _make_getter(_name), _make_setter(_name, _ty), _default, _docstring)
+
+  def __init__(self, aggregator=None):
+    if aggregator:
+      self.aggregator = aggregator
+
+  def __eq__(self, other):
+    if isinstance(other, self.__class__):
+      return self.__dict__ == other.__dict__
+    else:
+      return False
+
+  def __ne__(self, other):
+    return not self.__eq__(other)
+
+  def __str__(self):
+    return str(self.__dict__)
diff --git a/tensorflow/python/data/experimental/ops/threadpool.py b/tensorflow/python/data/experimental/ops/threadpool.py
index 3ea017c6e80a1a22a6bd82770db1952aebd38849..69e8829d687fb54767bca1716c259efa150b4887 100644
--- a/tensorflow/python/data/experimental/ops/threadpool.py
+++ b/tensorflow/python/data/experimental/ops/threadpool.py
@@ -60,7 +60,7 @@ class PrivateThreadPool(object):
           display_name=display_name)
 
 
-class _ThreadPoolDataset(dataset_ops.UnaryDataset):
+class _ThreadPoolDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and sets a custom threadpool."""
 
   def __init__(self, input_dataset, thread_pool):
@@ -74,18 +74,6 @@ class _ThreadPoolDataset(dataset_ops.UnaryDataset):
         self._thread_pool._resource,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
 
 # TODO(b/73383364): Properly export in the `tf.data.experimental` API when
 # stable or make private / remove.
diff --git a/tensorflow/python/data/experimental/ops/unique.py b/tensorflow/python/data/experimental/ops/unique.py
index 2a7775c456e86a9339cdfccf1e05f545238bb145..55ed98d8542187b1bd353e2ca581ef2fd2180875 100644
--- a/tensorflow/python/data/experimental/ops/unique.py
+++ b/tensorflow/python/data/experimental/ops/unique.py
@@ -48,7 +48,7 @@ def unique():
   return _apply_fn
 
 
-class _UniqueDataset(dataset_ops.UnaryDataset):
+class _UniqueDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` contains the unique elements from its input."""
 
   def __init__(self, input_dataset):
@@ -65,15 +65,3 @@ class _UniqueDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_unique_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
index 994447cb4db352432e6f2a672c45ba8242930126..cc0a80336c515e629874a5987219e05f0a8918b0 100644
--- a/tensorflow/python/data/experimental/ops/writers.py
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -48,7 +48,7 @@ class TFRecordWriter(object):
     Returns:
       A `tf.Operation` that, when run, writes contents of `dataset` to a file.
     """
-    if not isinstance(dataset, dataset_ops.Dataset):
+    if not isinstance(dataset, dataset_ops.DatasetV2):
       raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
     if (dataset.output_types != dtypes.string or
         dataset.output_shapes != tensor_shape.scalar()):
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 6219d1491ff18de84ead3cf7f9a90cefd0a0b4fa..21eed2b070a70c13658246fda5693c8c0a4e9573 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -173,6 +173,20 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "inputs_test",
+    size = "small",
+    srcs = ["inputs_test.py"],
+    additional_deps = [
+        ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 tf_py_test(
     name = "interleave_dataset_op_test",
     size = "small",
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index e8decb9ad0ecf7768f4bf0f77ff74f9b79bff791..10a0427c7f7acbe46f307a61db2575cb86b34cda 100644
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -93,13 +93,13 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           })
       num_full_batches = (count * 7) // batch_size
       for i in range(num_full_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(batch_size):
             self.assertAllEqual(component[(i * batch_size + j) % 7]**2,
                                 result_component[j])
       if not drop_remainder and (count * 7) % batch_size > 0:
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range((count * 7) % batch_size):
             self.assertAllEqual(
@@ -128,9 +128,9 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(2):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected = sparse_tensor.SparseTensorValue(
             indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
             values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
@@ -155,9 +155,9 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(2):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected_indices = []
         expected_values = []
         for j in range(5):
@@ -185,8 +185,8 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      actual = sess.run(get_next)
+      self.evaluate(init_op)
+      actual = self.evaluate(get_next)
       expected = sparse_tensor.SparseTensorValue(
           indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
                    [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
@@ -211,7 +211,7 @@ class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'Cannot batch tensors with different shapes in component 0. '
@@ -271,7 +271,7 @@ class PaddedBatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       num_full_batches = len(seq_lens) // batch_size
 
       for i in range(num_full_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         padded_len = padded_shapes[0]
         if padded_len is None or padded_len == -1:
           padded_len = np.max(result) if result.size > 0 else 0
@@ -283,7 +283,7 @@ class PaddedBatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
                               [0] * (padded_len - seq_len))
 
       if not drop_remainder and len(seq_lens) % batch_size > 0:
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         padded_len = np.max(result) if result.size > 0 else 0
         self.assertEqual((len(seq_lens) % batch_size, padded_len),
                          result.shape)
@@ -315,7 +315,7 @@ class PaddedBatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      result = sess.run(get_next)
+      result = self.evaluate(get_next)
       self.assertAllEqual([[], [], [], []], result)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -347,7 +347,7 @@ class PaddedBatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
               seq_lens: random_seq_lens
           })
       for i in range(8):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         padded_len = np.max(result[0])
         self.assertEqual((4, padded_len), result[0].shape)
         self.assertEqual((4, padded_len), result[1].shape)
diff --git a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
index 63625fac03beeb3f8756bfa5c8e543fdc3488fc4..1f351279c69e766f3c9310509effbc9c9ddcbf0f 100644
--- a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
@@ -71,7 +71,7 @@ class FileCacheDatasetTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       # First run without caching to collect the "ground truth".
-      sess.run(init_fifo_op)
+      self.evaluate(init_fifo_op)
       elements = []
       for _ in range(20):
         elements.append(sess.run(get_next))
@@ -220,14 +220,14 @@ class MemoryCacheDatasetTest(test_base.DatasetTestBase):
 
       with self.cached_session() as sess:
 
-        sess.run(repeat_count.initializer)
-        sess.run(cached_iterator.initializer)
-        sess.run(uncached_iterator.initializer)
+        self.evaluate(repeat_count.initializer)
+        self.evaluate(cached_iterator.initializer)
+        self.evaluate(uncached_iterator.initializer)
 
         for i in range(3):
           for _ in range(10):
-            self.assertEqual(sess.run(cached_next), i)
-            self.assertEqual(sess.run(uncached_next), i)
+            self.assertEqual(self.evaluate(cached_next), i)
+            self.assertEqual(self.evaluate(uncached_next), i)
 
         sess.run(repeat_count.assign(0))
 
@@ -238,7 +238,7 @@ class MemoryCacheDatasetTest(test_base.DatasetTestBase):
         # The cached iterator replays from cache.
         for i in range(3):
           for _ in range(10):
-            self.assertEqual(sess.run(cached_next), i)
+            self.assertEqual(self.evaluate(cached_next), i)
 
         # The cached iterator should now be empty.
         with self.assertRaises(errors.OutOfRangeError):
@@ -280,7 +280,7 @@ class MemoryCacheDatasetTest(test_base.DatasetTestBase):
     i2 = d2.make_initializable_iterator()
 
     with self.cached_session() as sess:
-      sess.run(i1.initializer)
+      self.evaluate(i1.initializer)
 
       self.assertEqual(1, sess.run(i1.get_next()))
       self.assertEqual(2, sess.run(i1.get_next()))
@@ -307,7 +307,7 @@ class MemoryCacheDatasetTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       for i, expected in enumerate(expected_values):
-        self.assertEqual(expected, sess.run(n),
+        self.assertEqual(expected, self.evaluate(n),
                          "Unexpected value at index %s" % i)
 
       with self.assertRaises(errors.OutOfRangeError):
diff --git a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
index 83af31f380efabc0d8654668a9a81d5789b8eeb1..a0ef69f0823ebefc3fbbee4e11611caac7ca7306 100644
--- a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
@@ -51,9 +51,9 @@ class ConcatenateDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(9):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         if i < 4:
           for component, result_component in zip(input_components, result):
             self.assertAllEqual(component[i], result_component)
@@ -85,9 +85,9 @@ class ConcatenateDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(9):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         if i < 4:
           for component, result_component in zip(input_components, result):
             self.assertAllEqual(component[i], result_component)
diff --git a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
index bc6b36285aa417e6812e44e97e4f3a30ceb8e6a0..f7b500881c7cd62ca4d5362f454e3eec6f209dcd 100644
--- a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
@@ -52,8 +52,8 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
                      [t.shape for t in get_next])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
+      self.evaluate(init_op)
+      results = self.evaluate(get_next)
       for component, result_component in zip(components, results):
         self.assertAllEqual(component, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -81,8 +81,8 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
         [shape for shape in iterator.output_shapes])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
+      self.evaluate(init_op)
+      results = self.evaluate(get_next)
       for component, result_component in zip(components, results):
         self.assertSparseValuesEqual(component, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -112,8 +112,8 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     ], [shape for shape in iterator.output_shapes])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
+      self.evaluate(init_op)
+      results = self.evaluate(get_next)
       for component, result_component in zip(components, results):
         if sparse_tensor.is_sparse(component):
           self.assertSparseValuesEqual(component, result_component)
@@ -139,9 +139,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
                      [t.shape for t in get_next])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component[i], result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -169,7 +169,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
         [shape for shape in iterator.output_shapes])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       expected = [
           (sparse_tensor.SparseTensorValue(
               indices=np.array([[0]]),
@@ -197,7 +197,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
                dense_shape=np.array([3]))),
       ]
       for i in range(3):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(expected[i], results):
           self.assertSparseValuesEqual(component, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -229,7 +229,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     ], [shape for shape in iterator.output_shapes])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       expected = [
           (sparse_tensor.SparseTensorValue(
               indices=np.array([[0]]),
@@ -257,7 +257,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
                dense_shape=np.array([3]))),
       ]
       for i in range(3):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(
             (list(zip(*components[:3]))[i] + expected[i]), results):
           if sparse_tensor.is_sparse(component):
@@ -280,9 +280,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     self.assertEqual((1,), iterator.output_shapes["bar"])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(3):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertEqual(components["foo"][i], results["foo"])
         self.assertEqual(components["bar"][i], results["bar"])
       with self.assertRaises(errors.OutOfRangeError):
@@ -308,7 +308,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
                                                     dense_shape)
       sess.run(init_op, feed_dict={st: sparse_feed})
       for i, s in enumerate(slices):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual(s, results.values)
         expected_indices = np.array(
             [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
@@ -474,15 +474,15 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with ops.device("/cpu:0"):
         var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
         dataset = dataset.map(lambda x: x + var_0.read_value())
-      sess.run(var_0.initializer)
+      self.evaluate(var_0.initializer)
 
       with ops.device("/cpu:1"):
         var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
         dataset = dataset.map(lambda x: x + var_1.read_value())
-      sess.run(var_1.initializer)
+      self.evaluate(var_1.initializer)
 
       iterator = dataset.make_initializable_iterator()
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
 
       with self.assertRaisesRegexp(
           errors.FailedPreconditionError,
@@ -506,7 +506,7 @@ class DatasetConstructorBenchmark(test.Benchmark):
     next_element = iterator.get_next()
 
     with session.Session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       # Run one whole epoch to burn in the computation.
       for _ in range(input_size // batch_size):
         sess.run(next_element)
@@ -543,7 +543,7 @@ class DatasetConstructorBenchmark(test.Benchmark):
     next_element = iterator.get_next()
 
     with session.Session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       get_next_element = sess.make_callable(next_element)
       # Run one whole epoch to burn in the computation.
       for _ in range(input_size // batch_size):
@@ -582,7 +582,7 @@ class DatasetConstructorBenchmark(test.Benchmark):
     next_element = iterator.get_next()
 
     with session.Session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       get_next_element = sess.make_callable(next_element)
       # Run one whole epoch to burn in the computation.
       for _ in range(input_size // batch_size):
@@ -620,7 +620,7 @@ class DatasetConstructorBenchmark(test.Benchmark):
     next_element = iterator.get_next()
 
     with session.Session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       get_next_element = sess.make_callable(next_element)
       # Run one whole epoch to burn in the computation.
       for _ in range(input_size // batch_size):
diff --git a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
index cb8cb9a77df0b897a87dfecb96395c1bbee450b0..7087b4dd57fa6154eb1b22b75b8195634fd22363 100644
--- a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
@@ -47,10 +47,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
 
     with self.cached_session() as sess:
       for _ in range(2):  # Run twice to test reinitialization.
-        sess.run(init_op)
+        self.evaluate(init_op)
         for _ in range(num_repeats):
           for elem in elem_sequence:
-            self.assertAllEqual(elem, sess.run(get_next))
+            self.assertAllEqual(elem, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -65,7 +65,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     with self.cached_session() as sess:
       for _ in range(num_repeats):
         for elem in elem_sequence:
-          self.assertAllEqual(elem, sess.run(get_next))
+          self.assertAllEqual(elem, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -133,10 +133,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for _ in range(num_inner_repeats * num_outer_repeats):
         for elem in input_list:
-          val0, val1 = sess.run(get_next)
+          val0, val1 = self.evaluate(get_next)
           self.assertAllEqual(elem[0], val0)
           self.assertAllEqual(elem[1], val1)
       with self.assertRaises(errors.OutOfRangeError):
@@ -192,10 +192,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for elem in [0, 1]:
         for _ in range(num_parallel_iterators):
-          self.assertAllEqual(elem, sess.run(get_next))
+          self.assertAllEqual(elem, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -215,9 +215,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       self.assertEqual(dtype, get_next.dtype)
 
       with self.cached_session() as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         for expected in [[1], [2], [3]]:
-          next_val = sess.run(get_next)
+          next_val = self.evaluate(get_next)
           self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
           self.assertAllEqual(expected, next_val)
         with self.assertRaises(errors.OutOfRangeError):
@@ -236,9 +236,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for expected in [b"foo", b"bar", b"baz"]:
-        next_val = sess.run(get_next)
+        next_val = self.evaluate(get_next)
         self.assertAllEqual(expected, next_val)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -257,12 +257,12 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual([1, 2, 3], self.evaluate(get_next))
+      self.assertAllEqual([4, 5, 6], self.evaluate(get_next))
       with self.assertRaisesOpError("The expected type was int64"):
         sess.run(get_next)
-      self.assertAllEqual([7, 8, 9], sess.run(get_next))
+      self.assertAllEqual([7, 8, 9], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -280,12 +280,12 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual([1, 2, 3], self.evaluate(get_next))
+      self.assertAllEqual([4, 5, 6], self.evaluate(get_next))
       with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
         sess.run(get_next)
-      self.assertAllEqual([11, 12, 13], sess.run(get_next))
+      self.assertAllEqual([11, 12, 13], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -304,16 +304,16 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((1, 2), sess.run(get_next))
-      self.assertEqual((3, 4), sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertEqual((1, 2), self.evaluate(get_next))
+      self.assertEqual((3, 4), self.evaluate(get_next))
       with self.assertRaisesOpError(
           r"The expected structure was \(tf\.int64, tf\.int64\)"):
         sess.run(get_next)
       with self.assertRaisesOpError(
           r"The expected structure was \(tf\.int64, tf\.int64\)"):
         sess.run(get_next)
-      self.assertEqual((9, 10), sess.run(get_next))
+      self.assertEqual((9, 10), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -329,9 +329,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(1, sess.run(get_next))
-      self.assertAllEqual([2, 3], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual(1, self.evaluate(get_next))
+      self.assertAllEqual([2, 3], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -349,9 +349,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(0, sess.run(get_next))
-      self.assertAllEqual(1, sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual(0, self.evaluate(get_next))
+      self.assertAllEqual(1, self.evaluate(get_next))
 
   def testFromGeneratorDestructorCalled(self):
     # Use an `Event` to signal that the generator has been deleted.
@@ -378,9 +378,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with session.Session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(42, sess.run(get_next))
-      self.assertAllEqual(42, sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual(42, self.evaluate(get_next))
+      self.assertAllEqual(42, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
       # Test that `GeneratorWrapper` object is destroyed when the
@@ -407,10 +407,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       expected = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
       for x in expected:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -436,13 +436,13 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       expected = [(0, b"Hi!"),
                   (0, b"Hi!"), (1, b"Hi!"),
                   (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"),
                   (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"), (3, b"Hi!")]
       for x in expected:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -470,9 +470,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(37, sess.run(get_next))
-      self.assertAllEqual(37, sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual(37, self.evaluate(get_next))
+      self.assertAllEqual(37, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
         self.assertTrue(event.is_set())
diff --git a/tensorflow/python/data/kernel_tests/dataset_ops_test.py b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
index 63d2be4371c3d8a5c921445e240b525ee21ff784..a5324af4d0cf951c9f228758df09b9912532819e 100644
--- a/tensorflow/python/data/kernel_tests/dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
@@ -226,7 +226,8 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
     ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
         options2)
     self.assertTrue(ds.options().experimental_autotune)
-    self.assertFalse(ds.options().experimental_filter_fusion)
+    # Explicitly check that flag is False since assertFalse allows None
+    self.assertIs(ds.options().experimental_filter_fusion, False)
 
   def testOptionsTwiceDifferentError(self):
     options1 = dataset_ops.Options()
@@ -237,6 +238,17 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
                                  "Cannot merge incompatible values of option"):
       dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
 
+  def testOptionsMergeOptionsFromMultipleInputs(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_filter_fusion = True
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(0).with_options(options1),
+         dataset_ops.Dataset.range(0).with_options(options2)))
+    self.assertTrue(ds.options().experimental_autotune)
+    self.assertTrue(ds.options().experimental_filter_fusion)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index a0c6b37a6dc0c7f4cec829efb26bec08899b8b34..5ddb22285f9de8beef8f771e5e43d4401b2adb1d 100644
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -67,7 +67,7 @@ class FilterDatasetTest(test_base.DatasetTestBase):
         sess.run(init_op, feed_dict={count: count_val, modulus: modulus_val})
         for _ in range(count_val):
           for i in [x for x in range(7) if x**2 % modulus_val == 0]:
-            result = sess.run(get_next)
+            result = self.evaluate(get_next)
             for component, result_component in zip(components, result):
               self.assertAllEqual(component[i]**2, result_component)
         with self.assertRaises(errors.OutOfRangeError):
@@ -86,9 +86,9 @@ class FilterDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
-      self.assertEqual(1, sess.run(get_next))
-      self.assertEqual(3, sess.run(get_next))
+      self.assertEqual(0, self.evaluate(get_next))
+      self.assertEqual(1, self.evaluate(get_next))
+      self.assertEqual(3, self.evaluate(get_next))
 
   def testFilterDict(self):
     iterator = (dataset_ops.Dataset.range(10)
@@ -100,10 +100,10 @@ class FilterDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
         if (i ** 2) % 2 == 0:
-          self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
+          self.assertEqual(i * 2 + i**2, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -125,8 +125,8 @@ class FilterDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(input_data[0], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual(input_data[0], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -148,9 +148,9 @@ class FilterDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(5):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
         self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
       with self.assertRaises(errors.OutOfRangeError):
@@ -166,9 +166,9 @@ class FilterDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual((i, True), sess.run(get_next))
+        self.assertEqual((i, True), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -178,7 +178,7 @@ class FilterDatasetTest(test_base.DatasetTestBase):
     iterators = [dataset.make_one_shot_iterator() for _ in range(10)]
     next_elements = [iterator.get_next() for iterator in iterators]
     with self.cached_session() as sess:
-      self.assertEqual([0 for _ in range(10)], sess.run(next_elements))
+      self.assertEqual([0 for _ in range(10)], self.evaluate(next_elements))
 
 
 class FilterDatasetBenchmark(test.Benchmark):
diff --git a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
index 68038f9cfc09efcc08c5fa2d8d8af93a4a3c50db..02979fc2c40695c525f5dbe00bf55416698cf27a 100644
--- a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
@@ -45,10 +45,10 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in repeats:
         for _ in range(i):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -64,11 +64,11 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for row in repeats:
         for i in row:
           for _ in range(i):
-            self.assertEqual(i, sess.run(get_next))
+            self.assertEqual(i, self.evaluate(get_next))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -94,12 +94,12 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
       with session.Session(server.target) as sess2:
         for _ in range(3):
           sess = random.choice([sess1, sess2])
-          sess.run(init_op)
+          self.evaluate(init_op)
           for row in repeats:
             for i in row:
               for _ in range(i):
                 sess = random.choice([sess1, sess2])
-                self.assertEqual(i, sess.run(get_next))
+                self.assertEqual(i, self.evaluate(get_next))
 
         with self.assertRaises(errors.OutOfRangeError):
           sess = random.choice([sess1, sess2])
@@ -115,10 +115,10 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
         for _ in range(i ** 2):
-          self.assertEqual(i * 2, sess.run(get_next))
+          self.assertEqual(i * 2, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
   # pylint: enable=g-long-lambda
@@ -139,11 +139,11 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
         for j in range(2):
           expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
+          self.assertAllEqual(expected, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
index b911c249ced1286223ae5477df75c71b3fececab..56434d6e4c4836bb3a8dc244f3056bbc3a1c356c 100644
--- a/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
@@ -196,7 +196,7 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       for expected_element in _interleave(
           _repeat(input_values, count), cycle_length, block_length):
-        self.assertEqual(expected_element, sess.run(get_next))
+        self.assertEqual(expected_element, self.evaluate(get_next))
 
       for _ in range(2):
         with self.assertRaises(errors.OutOfRangeError):
@@ -231,7 +231,7 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           with self.assertRaises(errors.InvalidArgumentError):
             sess.run(get_next)
         else:
-          self.assertEqual(value, sess.run(get_next))
+          self.assertEqual(value, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -254,7 +254,7 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       for i in range(10):
         for j in range(2):
           expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
+          self.assertAllEqual(expected, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -308,7 +308,7 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
       for element in elements:
         coordination_events[element].set()
-        self.assertEqual(element * element, sess.run(get_next))
+        self.assertEqual(element * element, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
index bf5fd781d65cd11ded307221dc80cb58567a41b6..cb38728f23884a1844905662e527c5f386398a1b 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
@@ -57,7 +57,7 @@ class IteratorClusterTest(test.TestCase):
 
     with session.Session(worker[0].target) as sess:
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next_op)
+        self.evaluate(get_next_op)
 
   def _testRemoteIteratorHelper(self, device0, device1, target):
     with ops.device(device1):
@@ -134,12 +134,12 @@ class IteratorClusterTest(test.TestCase):
     get_next = iterator.get_next()
 
     with session.Session(worker[0].target) as sess:
-      sess.run(table.initializer)
-      sess.run(init_op)
-      self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))
+      self.evaluate(table.initializer)
+      self.evaluate(init_op)
+      self.assertAllEqual([0, 0, -1, 1, 2], self.evaluate(get_next))
 
     with session.Session(worker[0].target) as sess:
-      self.assertAllEqual([2, 0], sess.run(get_next))
+      self.assertAllEqual([2, 0], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -166,7 +166,7 @@ class IteratorClusterTest(test.TestCase):
     get_next = iterator.get_next()
 
     with session.Session(worker[0].target) as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for _ in range(3):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
index a2a3528cc620df85ea797aaa7657cc79ff320285..405d94d95648aaea4e7ec2596a119bd6a6f27c1a 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -97,7 +97,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       for _ in range(14):
         for i in range(7):
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           for component, result_component in zip(components, result):
             self.assertAllEqual(component[i]**2, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -123,7 +123,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       for _ in range(14):
         for i in range(7):
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           for component, result_component in zip(components, result):
             self.assertAllEqual(component[i]**2, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -159,7 +159,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
         for _ in range(14):
           for i in range(7):
-            result = sess.run(get_next)
+            result = self.evaluate(get_next)
             for component, result_component in zip(components, result):
               self.assertAllEqual(component[i]**2, result_component)
         with self.assertRaises(errors.OutOfRangeError):
@@ -175,7 +175,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     config = config_pb2.ConfigProto(
         inter_op_parallelism_threads=1, use_per_session_threads=True)
     with session.Session(config=config) as sess:
-      self.assertAllEqual([1, 4, 9], sess.run(next_element))
+      self.assertAllEqual([1, 4, 9], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -254,15 +254,15 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       get_next = iterator.get_next()
 
       with session.Session(server.target) as sess:
-        sess.run(init_op)
-        results = sess.run(get_next)
+        self.evaluate(init_op)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component, result_component)
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
         # Re-initialize the iterator in the first session.
-        sess.run(init_op)
+        self.evaluate(init_op)
 
     with ops.Graph().as_default():
       # Re-define the iterator manually, without defining any of the
@@ -277,7 +277,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
       with session.Session(server.target) as sess:
         # Use the iterator without re-initializing in the second session.
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component, result_component)
         with self.assertRaises(errors.OutOfRangeError):
@@ -317,23 +317,49 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         sess.run(get_next)
 
       # Initialize with one dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      self.evaluate(dataset_3_init_op)
+      self.assertAllEqual([1, 2, 3], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Initialize with a different dataset.
-      sess.run(dataset_4_init_op)
-      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
+      self.evaluate(dataset_4_init_op)
+      self.assertAllEqual([4, 5, 6, 7], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Reinitialize with the first dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      self.evaluate(dataset_3_init_op)
+      self.assertAllEqual([1, 2, 3], self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testReinitializableIteratorWithFunctions(self):
+
+    def g():
+      for i in range(10):
+        yield i
+
+    iterator = iterator_ops.Iterator.from_structure(dtypes.int64, [])
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      dataset_1 = dataset_ops.Dataset.from_generator(
+          g, output_types=dtypes.int64)
+      sess.run(iterator.make_initializer(dataset_1))
+      for expected in range(10):
+        self.assertEqual(expected, self.evaluate(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      dataset_2 = dataset_ops.Dataset.from_generator(
+          g, output_types=dtypes.int64)
+      sess.run(iterator.make_initializer(dataset_2))
+      for expected in range(10):
+        self.assertEqual(expected, self.evaluate(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testReinitializableIteratorStaticErrors(self):
     # Non-matching structure for types and shapes.
     with self.assertRaises(TypeError):
@@ -653,10 +679,10 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       n = itr.get_next()
 
     with session.Session(s3.target, config=config) as sess:
-      sess.run(itr.initializer)
+      self.evaluate(itr.initializer)
       expected_values = worker_devices
       for expected in expected_values:
-        self.assertEqual((compat.as_bytes(expected),), sess.run(n))
+        self.assertEqual((compat.as_bytes(expected),), self.evaluate(n))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(n)
@@ -760,8 +786,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     with ops.Graph().as_default() as g:
       init_op, _, save_op, _ = _build_range_dataset_graph()
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(save_op)
+        self.evaluate(init_op)
+        self.evaluate(save_op)
 
     # Attempt to restore the saved iterator into an IteratorResource of
     # incompatible type. An iterator of RangeDataset has output type int64,
@@ -772,7 +798,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       _, _, _, restore_op = _build_reader_dataset_graph()
       with self.session(graph=g) as sess:
         with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
 
   def testRepeatedGetNextWarning(self):
     iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
@@ -923,7 +949,7 @@ class IteratorCheckpointingTest(test.TestCase):
         checkpoint.restore(checkpoint_management.latest_checkpoint(
             checkpoint_directory)).initialize_or_restore(sess)
         for j in range(2):
-          self.assertEqual(i * 2 + j, sess.run(get_next))
+          self.assertEqual(i * 2 + j, self.evaluate(get_next))
         checkpoint.save(file_prefix=checkpoint_prefix)
 
 
diff --git a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
index b58c1444daeb03e9fa0b02a7288cbdaebbc0e42e..ac6fbabcd5922661200a710378a29113d411b2f5 100644
--- a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
@@ -102,7 +102,7 @@ class ListFilesDatasetOpTest(test_base.DatasetTestBase):
       all_produced_filenames = []
       for _ in range(3):
         produced_filenames = []
-        sess.run(itr.initializer)
+        self.evaluate(itr.initializer)
         try:
           while True:
             produced_filenames.append(sess.run(next_element))
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 81ef7d16be2c9d7eb8513ebdcb83d93b750670c8..8f7a19d7e1b9d4c75103b81cc006f6f01d3c24c6 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -114,7 +114,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, feed_dict={count: 14})
       for _ in range(14):
         for i in range(7):
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           for component, result_component in zip(components, result):
             self.assertAllEqual(component[i]**2, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -185,7 +185,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
             output_buffer_size: output_buffer_size_val})
         for _ in range(14):
           for i in range(7):
-            result = sess.run(get_next)
+            result = self.evaluate(get_next)
             for component, result_component in zip(components, result):
               self.assertAllEqual(component[i]**2, result_component)
         with self.assertRaises(errors.OutOfRangeError):
@@ -242,7 +242,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for _ in range(3):
         sess.run(get_next)
 
@@ -257,7 +257,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for _ in range(3):
         sess.run(get_next)
 
@@ -272,7 +272,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for _ in range(3):
         sess.run(get_next)
       # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
@@ -293,7 +293,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for _ in range(3):
         sess.run(get_next)
       # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
@@ -325,10 +325,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with ops.Graph().as_default() as g:
       captured_init_op, init_op, get_next = _build_graph()
       with self.session(graph=g) as sess:
-        sess.run(captured_init_op)
-        sess.run(init_op)
+        self.evaluate(captured_init_op)
+        self.evaluate(init_op)
         for i in range(10):
-          self.assertEqual(i * i, sess.run(get_next))
+          self.assertEqual(i * i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -353,8 +353,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(table.initializer)
-      sess.run(init_op)
+      self.evaluate(table.initializer)
+      self.evaluate(init_op)
       sess.run(get_next)
       sess.run(get_next)
       with self.assertRaises(errors.OutOfRangeError):
@@ -371,11 +371,11 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
+      self.evaluate(enqueue_op)
+      self.evaluate(close_op)
+      self.evaluate(init_op)
       for element in elements:
-        self.assertEqual(element, sess.run(get_next))
+        self.assertEqual(element, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -396,9 +396,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
+      self.evaluate(enqueue_op)
+      self.evaluate(close_op)
+      self.evaluate(init_op)
       for i in range(100):
         self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
                          sorted(sess.run(get_next)))
@@ -415,15 +415,15 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
+      self.evaluate(counter_var.initializer)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i + 1, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
+        self.assertEqual(i, self.evaluate(counter_var))
+        self.assertEqual(i + 1, self.evaluate(get_next))
+      self.assertEqual(10, self.evaluate(counter_var))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
+      self.assertEqual(10, self.evaluate(counter_var))
 
   def testCaptureUninitializedVariableError(self):
     counter_var = variable_scope.get_variable(
@@ -435,7 +435,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.NotFoundError):
         sess.run(get_next)
 
@@ -447,14 +447,14 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       random_values = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
           random_values.extend(sess.run(get_next))
       self.assertEqual(10, len(random_values))
       self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
-      sess.run(init_op)
+      self.evaluate(init_op)
       random_values_2 = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
@@ -473,8 +473,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = sess.run(get_next)
+      self.evaluate(init_op)
+      random_values = self.evaluate(get_next)
 
       # Assert that one of the next 99 batches yielded by the iterator is
       # different from the first.
@@ -500,15 +500,15 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
+      self.evaluate(counter_var.initializer)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
+        self.assertEqual(i, self.evaluate(counter_var))
+        self.assertEqual(i, self.evaluate(get_next))
+      self.assertEqual(10, self.evaluate(counter_var))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
+      self.assertEqual(10, self.evaluate(counter_var))
 
   def testMapDict(self):
     iterator = (dataset_ops.Dataset.range(10)
@@ -519,9 +519,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
+        self.assertEqual(i * 2 + i**2, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -569,8 +569,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(row ** 2, sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual(row**2, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -611,7 +611,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       row = np.arange(6)
       for num in [2, 3, 4]:
         init_op, get_next = build_dataset(row, num)
-        sess.run(init_op)
+        self.evaluate(init_op)
         for i in range(6):
           self.assertEqual(
               (i // 2 if i % 2 else i * 2) if (num == 2 or num == 3) else i * 2,
@@ -652,7 +652,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       row = np.arange(6)
       for num in [2, 3, 4]:
         init_op, get_next = build_dataset(row, num)
-        sess.run(init_op)
+        self.evaluate(init_op)
         self.assertAllEqual(
             [x // 2 if (num == 2 or num == 3) else x * 2 for x in row],
             sess.run(get_next))
@@ -697,7 +697,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       self.assertAllEqual([(x // 2 if x % 2 else x * 2) if
                            (num == 2 or num == 3) else x * 2 for x in row],
                           sess.run(get_next))
@@ -735,7 +735,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       for buffer_size in [1, 10, 100, 1000]:
         sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
         for i in range(100):
-          self.assertEqual(i * i, sess.run(get_next))
+          self.assertEqual(i * i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -753,10 +753,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
         for i in range(event_will_be_set_after_consuming):
           self.assertFalse(ev.is_set())
-          self.assertEqual(i * i, sess.run(get_next))
+          self.assertEqual(i * i, self.evaluate(get_next))
         ev.wait()
         for i in range(event_will_be_set_after_consuming, 100):
-          self.assertEqual(i * i, sess.run(get_next))
+          self.assertEqual(i * i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -768,9 +768,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
+        self.assertEqual((i, 37.0), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -789,9 +789,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
+        self.assertEqual((i, 37.0), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -810,9 +810,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
         self.assertSparseValuesEqual(actual, _sparse(i))
       with self.assertRaises(errors.OutOfRangeError):
@@ -837,9 +837,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
         self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
       with self.assertRaises(errors.OutOfRangeError):
@@ -861,9 +861,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(100):
-        self.assertEqual(i, sess.run(get_next))
+        self.assertEqual(i, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -875,9 +875,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
-        self.assertEqual((i, b"hello", 10), sess.run(get_next))
+        self.assertEqual((i, b"hello", 10), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -901,12 +901,14 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         break
     self.assertTrue(found_warning)
 
-  def testNestedDatasetError(self):
-    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
-    with self.assertRaisesRegexp(
-        NotImplementedError, r"The Dataset.map\(\) transformation does not "
-        "currently support nested datasets as outputs."):
-      _ = dataset.map(dataset_ops.Dataset.from_tensor_slices)
+  def testNestedDatasetMap(self):
+    # TODO(b/110122868): When iterators can yield a `tf.data.Dataset`, remove
+    # the `get_single_element()` call.
+    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]).map(
+        dataset_ops.Dataset.from_tensor_slices).map(
+            lambda ds: ds.batch(3)).flat_map(lambda x: x)
+
+    self.assertDatasetProduces(dataset, [[1.0, 2.0, 3.0]])
 
   def testReturnValueError(self):
     dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
@@ -943,7 +945,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
 
 # pylint: disable=g-long-lambda
   @parameterized.named_parameters(
@@ -970,7 +972,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      tids = sess.run(get_next)
+      tids = self.evaluate(get_next)
       self.assertTrue(all(tids[0] == tid for tid in tids))
 # pylint: enable=g-long-lambda
 
@@ -994,7 +996,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         expected = map_fn(*sess.run(self.structuredElement(structure)))
       else:
         expected = map_fn(sess.run(self.structuredElement(structure)))
-      self.assertEqual(expected, sess.run(get_next))
+      self.assertEqual(expected, self.evaluate(get_next))
 
   @parameterized.named_parameters(
       ("Sequential", None),
@@ -1009,7 +1011,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertEqual(42, sess.run(get_next))
+      self.assertEqual(42, self.evaluate(get_next))
 
   @parameterized.named_parameters(
       ("1", 1, 1),
@@ -1028,7 +1030,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session(config=config) as sess:
       for i in range(num_elements):
         coordination_events[i].set()
-        self.assertEqual(i * i, sess.run(get_next))
+        self.assertEqual(i * i, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -1050,7 +1052,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
       for element in elements:
         coordination_events[element].set()
-        self.assertEqual(element * element, sess.run(get_next))
+        self.assertEqual(element * element, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 42ee1e218644291e48bd06757c183c37f9c5e8a4..ea6828e575b7fbeba2b9844328870ecb5738b7d4 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -40,7 +40,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
 
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
@@ -50,10 +50,10 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -67,10 +67,10 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -85,12 +85,12 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 20, 4):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
-        self.assertEqual(i + 2, sess.run(elem_on_3))
-        self.assertEqual(i + 3, sess.run(elem_on_4))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+        self.assertEqual(i + 2, self.evaluate(elem_on_3))
+        self.assertEqual(i + 3, self.evaluate(elem_on_4))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -105,11 +105,11 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
-      self.assertEqual(8, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+      self.assertEqual(8, self.evaluate(elem_on_1))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -126,7 +126,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
         elem_on_1_has_value, elem_on_1_value = sess.run(
             [elem_on_1_has_value_t, elem_on_1_t])
@@ -140,8 +140,8 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
           [elem_on_1_has_value_t, elem_on_1_t])
       self.assertTrue(elem_on_1_has_value)
       self.assertEqual(8, elem_on_1_value)
-      self.assertFalse(sess.run(elem_on_1_has_value_t))
-      self.assertFalse(sess.run(elem_on_2_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_1_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_2_has_value_t))
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(elem_on_1_t)
       with self.assertRaises(errors.InvalidArgumentError):
@@ -155,11 +155,11 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -192,10 +192,10 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -211,11 +211,11 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
@@ -235,7 +235,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
         elem_on_1_has_value, elem_on_1_value = sess.run(
             [elem_on_1_has_value_t, elem_on_1_t])
@@ -249,8 +249,8 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
           [elem_on_1_has_value_t, elem_on_1_t])
       self.assertTrue(elem_on_1_has_value)
       self.assertEqual(8, elem_on_1_value)
-      self.assertFalse(sess.run(elem_on_1_has_value_t))
-      self.assertFalse(sess.run(elem_on_2_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_1_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_2_has_value_t))
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(elem_on_1_t)
       with self.assertRaises(errors.InvalidArgumentError):
@@ -272,10 +272,10 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(elem_on_1)
         sess.run(elem_on_2)
diff --git a/tensorflow/python/data/kernel_tests/optional_ops_test.py b/tensorflow/python/data/kernel_tests/optional_ops_test.py
index 604e3ad88ec96233771b475705ecac016ac6978c..0981ff9651abe00770b0acc74086f27d258bbf7f 100644
--- a/tensorflow/python/data/kernel_tests/optional_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_ops_test.py
@@ -227,7 +227,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
 
       # For each element of the dataset, assert that the optional evaluates to
       # the expected value.
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for _ in range(3):
         elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
         self.assertTrue(elem_has_value)
@@ -236,7 +236,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       # After exhausting the iterator, `next_elem.has_value()` will evaluate to
       # false, and attempting to get the value will fail.
       for _ in range(2):
-        self.assertFalse(sess.run(elem_has_value_t))
+        self.assertFalse(self.evaluate(elem_has_value_t))
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(elem_value_t)
 
diff --git a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
index 76e2697b29d368f5607c827fe32d017fbefd5ecd..af326ec210f4d81121691531c11da86300b75404 100644
--- a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
@@ -40,7 +40,7 @@ class PrefetchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       sess.run(init_op, feed_dict={buffer_size_t: buffer_size})
       for m in range(10):
-        self.assertEqual(m, sess.run(get_next))
+        self.assertEqual(m, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
index b7e2a5f615ea970525c0aa3138ac3567bd5c70bc..fcb025c8b881fdff82880c8631e4b76806dddc3a 100644
--- a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
@@ -35,139 +35,58 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class RangeDatasetTest(test_base.DatasetTestBase):
 
-  def tearDown(self):
-    # Remove all checkpoint files.
-    prefix = self._iterator_checkpoint_prefix()
-    pattern = prefix + "*"
-    files = gfile.Glob(pattern)
-    map(gfile.Remove, files)
-
   def testStop(self):
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={stop: 5})
-      for i in range(5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(5)
+    self.assertDatasetProduces(dataset, expected_output=range(5))
 
   def testStartStop(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 5})
-      for i in range(2, 5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    start, stop = 2, 5
+    dataset = dataset_ops.Dataset.range(start, stop)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 5))
 
   def testStartStopStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: 2})
-      for i in range(2, 10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    start, stop, step = 2, 10, 2
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 10, 2))
 
   def testZeroStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={start: 2, stop: 10, step: 0})
+    start, stop, step = 2, 10, 0
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ""))
 
   def testNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: -1})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(2, 10, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    start, stop, step = 2, 10, -1
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 10, -1))
 
   def testStopLessThanStart(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    start, stop = 10, 2
+    dataset = dataset_ops.Dataset.range(start, stop)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2))
 
   def testStopLessThanStartWithPositiveStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    start, stop, step = 10, 2, 2
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2, 2))
 
   def testStopLessThanStartWithNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: -1})
-      for i in range(10, 2, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    start, stop, step = 10, 2, -1
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2, -1))
+
+
+class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
+
+  def tearDown(self):
+    # Remove all checkpoint files.
+    prefix = self._iterator_checkpoint_prefix()
+    pattern = prefix + "*"
+    files = gfile.Glob(pattern)
+    map(gfile.Remove, files)
 
   def _iterator_checkpoint_prefix(self):
     return os.path.join(self.get_temp_dir(), "iterator")
@@ -205,19 +124,19 @@ class RangeDatasetTest(test_base.DatasetTestBase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
+        self.evaluate(init_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -225,14 +144,14 @@ class RangeDatasetTest(test_base.DatasetTestBase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-        sess.run(restore_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -256,14 +175,14 @@ class RangeDatasetTest(test_base.DatasetTestBase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop, num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for _ in range(break_epoch):
           for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
+            self.assertEqual(i, self.evaluate(get_next))
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       # Create an empty IteratorResource and restore the Iterator into it.
@@ -274,12 +193,12 @@ class RangeDatasetTest(test_base.DatasetTestBase):
       restore_op = self._restore_op(iterator._iterator_resource)
       get_next = iterator.get_next()
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         for _ in range(break_epoch + 1, num_epochs):
           for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
+            self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -302,20 +221,20 @@ class RangeDatasetTest(test_base.DatasetTestBase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       # Intentionally build a graph with a different value for stop to make sure
       # the original dataset graph is actually getting loaded.
       init_op, get_next, _, restore_op = _build_graph(start, stop_1)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -340,19 +259,19 @@ class RangeDatasetTest(test_base.DatasetTestBase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
+        self.evaluate(init_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -375,27 +294,27 @@ class RangeDatasetTest(test_base.DatasetTestBase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point1):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for i in range(break_point1, break_point2):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     break_point2 = 7
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for i in range(break_point2, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -419,28 +338,28 @@ class RangeDatasetTest(test_base.DatasetTestBase):
       init_op, get_next, save_op, restore_op = _build_graph(
           start, stop, num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for _ in range(break_epoch - 1):
           for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
+            self.assertEqual(i, self.evaluate(get_next))
         for i in range(start, break_range):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for i in range(break_range, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         for _ in range(break_epoch, num_epochs):
           for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
+            self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
@@ -462,23 +381,23 @@ class RangeDatasetTest(test_base.DatasetTestBase):
       init_op, get_next, save_op, restore_op = _build_graph(
           start, stop, num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for _ in range(num_epochs):
           for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
+            self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
-        sess.run(save_op)
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
index aef2dd1d9c6a4fc5094bbe79e4714022633c2ed2..e26381e902b1cf61dbc2813038713226765bcd68 100644
--- a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
@@ -107,7 +107,7 @@ class TextLineDatasetTest(test_base.DatasetTestBase):
           init_op, feed_dict={filenames: [test_filenames[0]],
                               num_epochs: 1})
       for i in range(5):
-        self.assertEqual(self._lineText(0, i), sess.run(get_next))
+        self.assertEqual(self._lineText(0, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -116,7 +116,7 @@ class TextLineDatasetTest(test_base.DatasetTestBase):
           init_op, feed_dict={filenames: [test_filenames[1]],
                               num_epochs: 1})
       for i in range(5):
-        self.assertEqual(self._lineText(1, i), sess.run(get_next))
+        self.assertEqual(self._lineText(1, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -124,7 +124,7 @@ class TextLineDatasetTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
       for j in range(2):
         for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(get_next))
+          self.assertEqual(self._lineText(j, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -133,7 +133,7 @@ class TextLineDatasetTest(test_base.DatasetTestBase):
       for _ in range(10):
         for j in range(2):
           for i in range(5):
-            self.assertEqual(self._lineText(j, i), sess.run(get_next))
+            self.assertEqual(self._lineText(j, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -213,27 +213,47 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
   def _record(self, f, r):
     return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
 
-  def _createFiles(self):
+  def _createFiles(self, compression_type=None):
     filenames = []
     for i in range(self._num_files):
       fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn, "wb") as f:
-        f.write(b"H" * self._header_bytes)
-        for j in range(self._num_records):
-          f.write(self._record(i, j))
-        f.write(b"F" * self._footer_bytes)
+
+      contents = []
+      contents.append(b"H" * self._header_bytes)
+      for j in range(self._num_records):
+        contents.append(self._record(i, j))
+      contents.append(b"F" * self._footer_bytes)
+      contents = b"".join(contents)
+
+      if not compression_type:
+        with open(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "GZIP":
+        with gzip.GzipFile(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "ZLIB":
+        contents = zlib.compress(contents)
+        with open(fn, "wb") as f:
+          f.write(contents)
+      else:
+        raise ValueError("Unsupported compression_type", compression_type)
+
     return filenames
 
-  def testFixedLengthRecordDataset(self):
-    test_filenames = self._createFiles()
+  def _testFixedLengthRecordDataset(self, compression_type=None):
+    test_filenames = self._createFiles(compression_type=compression_type)
     filenames = array_ops.placeholder(dtypes.string, shape=[None])
     num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
     batch_size = array_ops.placeholder(dtypes.int64, shape=[])
 
-    repeat_dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-                      .repeat(num_epochs))
+    repeat_dataset = (
+        readers.FixedLengthRecordDataset(
+            filenames,
+            self._record_bytes,
+            self._header_bytes,
+            self._footer_bytes,
+            compression_type=compression_type).repeat(num_epochs))
     batch_dataset = repeat_dataset.batch(batch_size)
 
     iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
@@ -247,7 +267,7 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
           init_op, feed_dict={filenames: [test_filenames[0]],
                               num_epochs: 1})
       for i in range(self._num_records):
-        self.assertEqual(self._record(0, i), sess.run(get_next))
+        self.assertEqual(self._record(0, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -256,7 +276,7 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
           init_op, feed_dict={filenames: [test_filenames[1]],
                               num_epochs: 1})
       for i in range(self._num_records):
-        self.assertEqual(self._record(1, i), sess.run(get_next))
+        self.assertEqual(self._record(1, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -264,7 +284,7 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
       for j in range(self._num_files):
         for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(get_next))
+          self.assertEqual(self._record(j, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -273,7 +293,7 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       for _ in range(10):
         for j in range(self._num_files):
           for i in range(self._num_records):
-            self.assertEqual(self._record(j, i), sess.run(get_next))
+            self.assertEqual(self._record(j, i), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -293,6 +313,15 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testFixedLengthRecordDatasetNoCompression(self):
+    self._testFixedLengthRecordDataset()
+
+  def testFixedLengthRecordDatasetGzipCompression(self):
+    self._testFixedLengthRecordDataset(compression_type="GZIP")
+
+  def testFixedLengthRecordDatasetZlibCompression(self):
+    self._testFixedLengthRecordDataset(compression_type="ZLIB")
+
   def testFixedLengthRecordDatasetBuffering(self):
     test_filenames = self._createFiles()
     dataset = readers.FixedLengthRecordDataset(
@@ -376,19 +405,19 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
               if (epoch == epoch_break and f == file_break and
                   r == record_break):
-                sess.run(save_op)
+                self.evaluate(save_op)
                 break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
             else:
               continue
             break
@@ -397,13 +426,13 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
           break
         else:
           with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
+            self.evaluate(get_next_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
@@ -412,9 +441,9 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
                   (epoch == epoch_break and f == file_break and
                    r < record_break)):
                 continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
 
   def testInitThenRestore(self):
     # Note: Calling init_op before restore_op is redundant. This test just makes
@@ -429,19 +458,19 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
               if (epoch == epoch_break and f == file_break and
                   r == record_break):
-                sess.run(save_op)
+                self.evaluate(save_op)
                 break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
             else:
               continue
             break
@@ -450,14 +479,14 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
           break
         else:
           with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
+            self.evaluate(get_next_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
+        self.evaluate(init_op)
+        self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
@@ -466,9 +495,9 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
                   (epoch == epoch_break and f == file_break and
                    r < record_break)):
                 continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
 
   def testRestoreInModifiedGraph(self):
     num_epochs = 10
@@ -481,19 +510,19 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
               if (epoch == epoch_break and f == file_break and
                   r == record_break):
-                sess.run(save_op)
+                self.evaluate(save_op)
                 break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
             else:
               continue
             break
@@ -502,13 +531,13 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
           break
         else:
           with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
+            self.evaluate(get_next_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs_1)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
@@ -517,9 +546,9 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
                   (epoch == epoch_break and f == file_break and
                    r < record_break)):
                 continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
 
   def testRestoreWithoutBuildingDatasetGraph(self):
     num_epochs = 10
@@ -531,19 +560,19 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
               if (epoch == epoch_break and f == file_break and
                   r == record_break):
-                sess.run(save_op)
+                self.evaluate(save_op)
                 break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
             else:
               continue
             break
@@ -552,12 +581,12 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
           break
         else:
           with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
+            self.evaluate(get_next_op)
 
     with ops.Graph().as_default() as g:
       restore_op, get_next_op = self._restore_iterator()
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
@@ -566,9 +595,9 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
                   (epoch == epoch_break and f == file_break and
                    r < record_break)):
                 continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
 
   def testRestoreUnusedIterator(self):
     num_epochs = 10
@@ -576,22 +605,22 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         # Save unused iterator.
-        sess.run(save_op)
+        self.evaluate(save_op)
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         for _ in range(num_epochs * self._num_files * self._num_records):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
 
   def testRestoreExhaustedIterator(self):
     num_epochs = 10
@@ -600,26 +629,26 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
         with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
+          self.evaluate(restore_op)
         for _ in range(num_epochs):
           for f in range(self._num_files):
             for r in range(self._num_records):
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
+              self.assertEqual(self._record(f, r), self.evaluate(get_next_op))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-        sess.run(save_op)
+          self.evaluate(get_next_op)
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
       with self.session(graph=g) as sess:
-        sess.run(restore_op)
+        self.evaluate(restore_op)
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
+          self.evaluate(get_next_op)
 
 
 class TFRecordDatasetTest(test_base.DatasetTestBase):
@@ -778,7 +807,7 @@ class TFRecordDatasetTest(test_base.DatasetTestBase):
     with self.cached_session() as sess:
       for j in range(self._num_files):
         for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(next_element))
+          self.assertAllEqual(self._record(j, i), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -790,7 +819,7 @@ class TFRecordDatasetTest(test_base.DatasetTestBase):
     with self.cached_session() as sess:
       for j in range(self._num_files):
         for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(next_element))
+          self.assertAllEqual(self._record(j, i), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
diff --git a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py b/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
index 11e07300b9716d60d0d96587018dd63dce3f9d24..d7f3988b1af89aa0344a93afc4abb125295432a8 100644
--- a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
@@ -36,7 +36,7 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ds = dataset_ops.Dataset.range(1, i + 1)
       result = ds.reduce(np.int64(0), lambda x, y: x + y)
       with self.cached_session() as sess:
-        self.assertEqual(((i + 1) * i) // 2, sess.run(result))
+        self.assertEqual(((i + 1) * i) // 2, self.evaluate(result))
 
   def testSumTuple(self):
 
@@ -49,7 +49,7 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ds = dataset_ops.Dataset.zip((ds, ds))
       result = ds.reduce(np.int64(0), reduce_fn)
       with self.cached_session() as sess:
-        self.assertEqual(((i + 1) * i), sess.run(result))
+        self.assertEqual(((i + 1) * i), self.evaluate(result))
 
   def testSumAndCount(self):
 
@@ -61,7 +61,7 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ds = dataset_ops.Dataset.range(1, i + 1)
       result = ds.reduce((np.int64(0), np.int64(0)), reduce_fn)
       with self.cached_session() as sess:
-        s, c = sess.run(result)
+        s, c = self.evaluate(result)
         self.assertEqual(((i + 1) * i) // 2, s)
         self.assertEqual(i, c)
 
@@ -93,7 +93,8 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ds = dataset_ops.Dataset.from_tensors(make_sparse_fn(i+1))
       result = ds.reduce(make_sparse_fn(0), reduce_fn)
       with self.cached_session() as sess:
-        self.assertSparseValuesEqual(make_sparse_fn(i+1), sess.run(result))
+        self.assertSparseValuesEqual(
+            make_sparse_fn(i + 1), self.evaluate(result))
 
   def testNested(self):
 
@@ -116,7 +117,7 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ds = dataset_ops.Dataset.range(1, i + 1).map(map_fn)
       result = ds.reduce(map_fn(0), reduce_fn)
       with self.cached_session() as sess:
-        result = sess.run(result)
+        result = self.evaluate(result)
         self.assertEqual(((i + 1) * i) // 2, result["dense"])
         self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
 
diff --git a/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
index e86356dee7c63e062c9dfe945246a0461c3e6526..946aa01f7350804b90fcc5924fbacaeea1ef8a31 100644
--- a/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
@@ -49,7 +49,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # Test a finite repetition.
       sess.run(init_op, feed_dict={count_placeholder: 3})
       for _ in range(3):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component, result_component)
 
@@ -59,7 +59,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # Test a different finite repetition.
       sess.run(init_op, feed_dict={count_placeholder: 7})
       for _ in range(7):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -75,7 +75,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # actually is infinite.
       sess.run(init_op, feed_dict={count_placeholder: -1})
       for _ in range(17):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component, result_component)
 
@@ -95,7 +95,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # Take fewer than input size
       sess.run(init_op, feed_dict={count_placeholder: 4})
       for i in range(4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual(results, components[0][i:i+1])
 
       with self.assertRaises(errors.OutOfRangeError):
@@ -104,7 +104,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # Take more than input size
       sess.run(init_op, feed_dict={count_placeholder: 25})
       for i in range(10):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual(results, components[0][i:i+1])
 
       with self.assertRaises(errors.OutOfRangeError):
@@ -113,7 +113,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # Take all of input
       sess.run(init_op, feed_dict={count_placeholder: -1})
       for i in range(10):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual(results, components[0][i:i+1])
 
       with self.assertRaises(errors.OutOfRangeError):
@@ -142,7 +142,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # the first 4 elements and then read the rest.
       sess.run(init_op, feed_dict={count_placeholder: 4})
       for i in range(4, 10):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual(results, components[0][i:i+1])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -165,7 +165,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
       # Skip nothing
       sess.run(init_op, feed_dict={count_placeholder: 0})
       for i in range(0, 10):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual(results, components[0][i:i+1])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -187,7 +187,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
     with self.cached_session() as sess:
       sess.run(init_op, feed_dict={inner_count: 7, outer_count: 14})
       for _ in range(7 * 14):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(components, results):
           self.assertAllEqual(component, result_component)
       with self.assertRaises(errors.OutOfRangeError):
@@ -201,7 +201,7 @@ class SequenceDatasetTest(test_base.DatasetTestBase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
index cad28f860e9d04647c510146f73d7d39de774d4a..990f4f212b862b34ec99fbdc1896b4dc734e9562 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
@@ -66,7 +66,7 @@ class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       # First run without shuffling to collect the "ground truth".
-      sess.run(init_fifo_op)
+      self.evaluate(init_fifo_op)
       unshuffled_elements = []
       for _ in range(20):
         unshuffled_elements.append(sess.run(get_next))
@@ -159,7 +159,7 @@ class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={seed_placeholder: 0})
       for elem in elems:
-        self.assertEqual(elem, sess.run(get_next))
+        self.assertEqual(elem, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -188,9 +188,9 @@ class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      initial_permutation = sess.run(next_element)
-      self.assertAllEqual(initial_permutation, sess.run(next_element))
-      self.assertAllEqual(initial_permutation, sess.run(next_element))
+      initial_permutation = self.evaluate(next_element)
+      self.assertAllEqual(initial_permutation, self.evaluate(next_element))
+      self.assertAllEqual(initial_permutation, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -261,7 +261,7 @@ class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.session(graph=g) as sess:
         for iterator in iterators:
           if initializable:
-            sess.run(iterator.initializer)
+            self.evaluate(iterator.initializer)
           next_element = iterator.get_next()
           run_results = []
           for _ in range(300):
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index 219a25a615eb018b759df8923ff1135b959f05be..ca853da3f61ca1b1ca4714f29861b59daf2890d9 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -38,7 +38,7 @@ class DatasetTestBase(test.TestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  def getNext(self, dataset):
+  def getNext(self, dataset, requires_initialization=False):
     """Returns a callable that returns the next element of the dataset.
 
     Example use:
@@ -50,17 +50,81 @@ class DatasetTestBase(test.TestCase):
     ```
 
     Args:
-      dataset: A dataset whose next element is returned
-
+      dataset: A dataset whose elements will be returned.
+      requires_initialization: Indicates that when the test is executed in graph
+        mode, it should use an initializable iterator to iterate through the
+        dataset (e.g. when it contains stateful nodes). Defaults to False.
     Returns:
-      A callable that returns the next element of `dataset`
+      A callable that returns the next element of `dataset`.
     """
-    it = dataset.make_one_shot_iterator()
     if context.executing_eagerly():
-      return it.get_next
+      iterator = dataset.__iter__()
+      return iterator._next_internal  # pylint: disable=protected-access
     else:
-      nxt = it.get_next()
-      return lambda: nxt
+      if requires_initialization:
+        iterator = dataset.make_initializable_iterator()
+        self.evaluate(iterator.initializer)
+      else:
+        iterator = dataset.make_one_shot_iterator()
+      return iterator.get_next
+
+  def _compareOutputToExpected(self, result_values, expected_values):
+    for i in range(len(result_values)):
+      if sparse_tensor.is_sparse(result_values[i]):
+        self.assertSparseValuesEqual(result_values[i], expected_values[i])
+      else:
+        self.assertAllEqual(result_values[i], expected_values[i])
+
+  def assertDatasetProduces(self,
+                            dataset,
+                            expected_output=None,
+                            expected_error=None,
+                            requires_initialization=False,
+                            num_test_iterations=2):
+    """Asserts that a dataset produces the expected output / error.
+
+    Args:
+      dataset: A dataset to check for the expected output / error.
+      expected_output: A list of elements that the dataset is expected to
+        produce.
+      expected_error: A tuple `(type, predicate)` identifying the expected error
+        `dataset` should raise. The `type` should match the expected exception
+        type, while `predicate` should either be 1) a unary function that inputs
+        the raised exception and returns a boolean indicator of success or 2) a
+        regular expression that is expected to match the error message
+        partially.
+      requires_initialization: Indicates that when the test is executed in graph
+        mode, it should use an initializable iterator to iterate through the
+        dataset (e.g. when it contains stateful nodes). Defaults to False.
+      num_test_iterations: Number of times `dataset` will be iterated. Defaults
+        to 2.
+    """
+    self.assertTrue(
+        expected_error is not None or expected_output is not None,
+        "Exactly one of expected_output or expected error should be provided.")
+    if expected_error:
+      self.assertTrue(
+          expected_output is None,
+          "Exactly one of expected_output or expected error should be provided."
+      )
+      with self.assertRaisesWithPredicateMatch(expected_error[0],
+                                               expected_error[1]):
+        get_next = self.getNext(
+            dataset, requires_initialization=requires_initialization)
+        self.evaluate(get_next())
+      return
+    self.assertGreater(num_test_iterations, 0)
+    for _ in range(num_test_iterations):
+      get_next = self.getNext(
+          dataset, requires_initialization=requires_initialization)
+      result = []
+      for _ in range(len(expected_output)):
+        result.append(self.evaluate(get_next()))
+      self._compareOutputToExpected(result, expected_output)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def assertDatasetsEqual(self, dataset1, dataset2):
     """Checks that datasets are equal. Supports both graph and eager mode."""
@@ -83,9 +147,7 @@ class DatasetTestBase(test.TestCase):
       op2 = nest.flatten(op2)
       assert len(op1) == len(op2)
       for i in range(len(op1)):
-        if isinstance(
-            op1[i],
-            (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
+        if sparse_tensor.is_sparse(op1[i]):
           self.assertSparseValuesEqual(op1[i], op2[i])
         elif flattened_types[i] == dtypes.string:
           self.assertAllEqual(op1[i], op2[i])
@@ -103,7 +165,7 @@ class DatasetTestBase(test.TestCase):
     try:
       self.evaluate(next1())
       raise ValueError(
-          'Expected dataset to raise an error of type %s, but it did not.' %
+          "Expected dataset to raise an error of type %s, but it did not." %
           repr(exception_class))
     except exception_class as e:
       expected_message = e.message
diff --git a/tensorflow/python/data/kernel_tests/window_dataset_op_test.py b/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
index 9d067810944c23a19418a4625dae2997d122d119..35adcddfe709bd0f681d3946d91a9b9515022f34 100644
--- a/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
@@ -102,7 +102,7 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       num_full_batches = max(
           0, (count * 7 - ((size - 1) * stride + 1)) // shift + 1)
       for i in range(num_full_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(size):
             self.assertAllEqual(component[(i * shift + j * stride) % 7]**2,
@@ -111,7 +111,7 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         num_partial_batches = (count * 7) // shift + (
             (count * 7) % shift > 0) - num_full_batches
         for i in range(num_partial_batches):
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           for component, result_component in zip(components, result):
             remaining = (count * 7) - ((num_full_batches + i) * shift)
             num_elements = remaining // stride + ((remaining % stride) > 0)
@@ -164,10 +164,10 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       num_batches = (10 - 5) // 3 + 1
       for i in range(num_batches):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected = sparse_tensor.SparseTensorValue(
             indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
             values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
@@ -193,10 +193,10 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       num_batches = (10 - 5) // 3 + 1
       for i in range(num_batches):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected_indices = []
         expected_values = []
         for j in range(5):
@@ -227,9 +227,9 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       # Slide: 1st batch.
-      actual = sess.run(get_next)
+      actual = self.evaluate(get_next)
       expected = sparse_tensor.SparseTensorValue(
           indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
                    [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
@@ -239,7 +239,7 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(sparse_tensor.is_sparse(actual))
       self.assertSparseValuesEqual(actual, expected)
       # Slide: 2nd batch.
-      actual = sess.run(get_next)
+      actual = self.evaluate(get_next)
       expected = sparse_tensor.SparseTensorValue(
           indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
                    [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
@@ -265,7 +265,7 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r"Cannot batch tensors with different shapes in component 0. "
@@ -281,8 +281,8 @@ class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = dataset.make_one_shot_iterator().get_next()
 
     with self.cached_session() as sess:
-      self.assertAllEqual(np.float32([1., 2.]), sess.run(get_next))
-      self.assertAllEqual(np.float32([2., 3.]), sess.run(get_next))
+      self.assertAllEqual(np.float32([1., 2.]), self.evaluate(get_next))
+      self.assertAllEqual(np.float32([2., 3.]), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
index 9d76387a343de6e8652dd595c08bf72680a8197e..b60ec4ecce53d6623d2268ea45db57edd8fc65a1 100644
--- a/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
@@ -55,7 +55,7 @@ class ZipDatasetTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={ph: value for ph, value in zip(
           component_placeholders, equal_length_components)})
       for i in range(4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(
             equal_length_components, results):
           self.assertAllEqual(component[i], result_component)
@@ -66,7 +66,7 @@ class ZipDatasetTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={ph: value for ph, value in zip(
           component_placeholders, variable_length_components)})
       for i in range(2):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         for component, result_component in zip(
             variable_length_components, results):
           self.assertAllEqual(component[i], result_component)
@@ -103,7 +103,7 @@ class ZipDatasetTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={ph: value for ph, value in zip(
           component_placeholders, equal_length_components)})
       for i in range(4):
-        result1, (result2, result3) = sess.run(get_next)
+        result1, (result2, result3) = self.evaluate(get_next)
         self.assertAllEqual(equal_length_components[0][i], result1)
         self.assertAllEqual(equal_length_components[1][i], result2)
         self.assertAllEqual(equal_length_components[2][i], result3)
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 5e636965a66b93ba4a852eaa73ebd5ab8f30e618..18edc0872d766291acc243581b868869d9be65d3 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -25,6 +25,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:stats_options",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/data/util:sparse",
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 18d3bc1823cd712b44246fea75315a706a308574..4a11619112b5167a1961128a08df47c8c7e5aa31 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import functools
 import threading
 import warnings
 
@@ -25,6 +26,7 @@ import numpy as np
 import six
 
 from tensorflow.python.compat import compat
+from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import random_seed
@@ -51,17 +53,16 @@ from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.Dataset")
+@tf_export("data.Dataset", v1=[])
 @six.add_metaclass(abc.ABCMeta)
-class Dataset(object):
+class DatasetV2(object):
   """Represents a potentially large set of elements.
 
   A `Dataset` can be used to represent an input pipeline as a
   collection of elements (nested structures of tensors) and a "logical
+
   plan" of transformations that act on those elements.
   """
-  def __init__(self):
-    pass
 
   def _as_serialized_graph(self):
     """Produces serialized graph representation of the dataset.
@@ -88,25 +89,33 @@ class Dataset(object):
     raise NotImplementedError("Dataset._inputs")
 
   def options(self):
-    """Returns the options for this dataset.
+    """Returns the options for this dataset and its inputs.
 
     Returns:
       A `tf.data.Options` object representing the dataset options.
     """
+    options = Options()
     for input_dataset in self._inputs():
-      options = input_dataset.options()
-      if options is not None:
-        return options
-    return Options()
+      input_options = input_dataset.options()
+      if input_options is not None:
+        options = options.merge(input_options)
+    return options
 
   def _apply_options(self):
+    """Apply options, such as optimization configuration, to the dataset."""
+
     dataset = self
     options = self.options()
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
       dataset = _OptimizeDataset(dataset, static_optimizations)
-    if options.experimental_autotune:
+    if options.experimental_autotune is not False:
       dataset = _ModelDataset(dataset)
+    if options.experimental_stats and options.experimental_stats.aggregator:  # pylint: disable=line-too-long
+      dataset = _SetStatsAggregatorDataset(  # pylint: disable=protected-access
+          dataset, options.experimental_stats.aggregator,
+          options.experimental_stats.prefix,
+          options.experimental_stats.counter_prefix)
     return dataset
 
   def make_initializable_iterator(self, shared_name=None):
@@ -192,6 +201,7 @@ class Dataset(object):
     # a 0-argument function.
     @function.Defun(capture_by_value=True)
     def _make_dataset():
+      """Factory function for a dataset."""
       # NOTE(mrry): `Defun` does not capture the graph-level seed from the
       # enclosing graph, so if a graph-level seed is present we set the local
       # graph seed based on a combination of the graph- and op-level seeds.
@@ -300,19 +310,6 @@ class Dataset(object):
     """
     return TensorSliceDataset(tensors)
 
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
-  def from_sparse_tensor_slices(sparse_tensor):
-    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
-
-    Args:
-      sparse_tensor: A `tf.SparseTensor`.
-
-    Returns:
-      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
-    """
-    return SparseTensorSliceDataset(sparse_tensor)
-
   class _GeneratorState(object):
     """Stores outstanding iterators created from a Python generator.
 
@@ -424,7 +421,7 @@ class Dataset(object):
     flattened_types = [dtypes.as_dtype(dt) for dt in nest.flatten(output_types)]
     flattened_shapes = nest.flatten(output_shapes)
 
-    generator_state = Dataset._GeneratorState(generator)
+    generator_state = DatasetV2._GeneratorState(generator)
 
     def get_iterator_id_fn(unused_dummy):
       """Creates a unique `iterator_id` for each pass over the dataset.
@@ -1206,7 +1203,7 @@ class Dataset(object):
           dataset.
     """
     dataset = transformation_func(self)
-    if not isinstance(dataset, Dataset):
+    if not isinstance(dataset, DatasetV2):
       raise TypeError("`transformation_func` must return a Dataset.")
     dataset._input_datasets = [self]  # pylint: disable=protected-access
     return dataset
@@ -1390,6 +1387,188 @@ class Dataset(object):
     return _OptionsDataset(self, options)
 
 
+@tf_export(v1=["data.Dataset"])
+class DatasetV1(DatasetV2):
+  """Represents a potentially large set of elements.
+
+  A `Dataset` can be used to represent an input pipeline as a
+  collection of elements (nested structures of tensors) and a "logical
+  plan" of transformations that act on those elements.
+  """
+
+  def __init__(self):
+    pass
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_tensors)
+  def from_tensors(tensors):
+    return DatasetV1Adapter(DatasetV2.from_tensors(tensors))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_tensor_slices)
+  def from_tensor_slices(tensors):
+    return DatasetV1Adapter(DatasetV2.from_tensor_slices(tensors))
+
+  @staticmethod
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
+  def from_sparse_tensor_slices(sparse_tensor):
+    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
+
+    Args:
+      sparse_tensor: A `tf.SparseTensor`.
+
+    Returns:
+      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
+    """
+    return DatasetV1Adapter(SparseTensorSliceDataset(sparse_tensor))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_generator)
+  def from_generator(generator, output_types, output_shapes=None, args=None):
+    return DatasetV1Adapter(DatasetV2.from_generator(
+        generator, output_types, output_shapes, args))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.range)
+  def range(*args):
+    return DatasetV1Adapter(DatasetV2.range(*args))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.zip)
+  def zip(datasets):
+    return DatasetV1Adapter(DatasetV2.zip(datasets))
+
+  @functools.wraps(DatasetV2.concatenate)
+  def concatenate(self, dataset):
+    return DatasetV1Adapter(super(DatasetV1, self).concatenate(dataset))
+
+  @functools.wraps(DatasetV2.prefetch)
+  def prefetch(self, buffer_size):
+    return DatasetV1Adapter(super(DatasetV1, self).prefetch(buffer_size))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.list_files)
+  def list_files(file_pattern, shuffle=None, seed=None):
+    return DatasetV1Adapter(DatasetV2.list_files(file_pattern, shuffle, seed))
+
+  @functools.wraps(DatasetV2.repeat)
+  def repeat(self, count=None):
+    return DatasetV1Adapter(super(DatasetV1, self).repeat(count))
+
+  @functools.wraps(DatasetV2.shuffle)
+  def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None):
+    return DatasetV1Adapter(super(DatasetV1, self).shuffle(
+        buffer_size, seed, reshuffle_each_iteration))
+
+  @functools.wraps(DatasetV2.cache)
+  def cache(self, filename=""):
+    return DatasetV1Adapter(super(DatasetV1, self).cache(filename))
+
+  @functools.wraps(DatasetV2.take)
+  def take(self, count):
+    return DatasetV1Adapter(super(DatasetV1, self).take(count))
+
+  @functools.wraps(DatasetV2.skip)
+  def skip(self, count):
+    return DatasetV1Adapter(super(DatasetV1, self).skip(count))
+
+  @functools.wraps(DatasetV2.shard)
+  def shard(self, num_shards, index):
+    return DatasetV1Adapter(super(DatasetV1, self).shard(num_shards, index))
+
+  @functools.wraps(DatasetV2.batch)
+  def batch(self, batch_size, drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).batch(
+        batch_size, drop_remainder))
+
+  @functools.wraps(DatasetV2.padded_batch)
+  def padded_batch(self,
+                   batch_size,
+                   padded_shapes,
+                   padding_values=None,
+                   drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).padded_batch(
+        batch_size, padded_shapes, padding_values, drop_remainder))
+
+  @functools.wraps(DatasetV2.map)
+  def map(self, map_func, num_parallel_calls=None):
+    return DatasetV1Adapter(super(DatasetV1, self).map(
+        map_func, num_parallel_calls))
+
+  @functools.wraps(DatasetV2.flat_map)
+  def flat_map(self, map_func):
+    return DatasetV1Adapter(super(DatasetV1, self).flat_map(map_func))
+
+  @functools.wraps(DatasetV2.interleave)
+  def interleave(self,
+                 map_func,
+                 cycle_length,
+                 block_length=1,
+                 num_parallel_calls=None):
+    return DatasetV1Adapter(super(DatasetV1, self).interleave(
+        map_func, cycle_length, block_length, num_parallel_calls))
+
+  @functools.wraps(DatasetV2.filter)
+  def filter(self, predicate):
+    return DatasetV1Adapter(super(DatasetV1, self).filter(predicate))
+
+  @functools.wraps(DatasetV2.apply)
+  def apply(self, transformation_func):
+    return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
+
+  @functools.wraps(DatasetV2.window)
+  def window(self, size, shift=None, stride=1, drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).window(
+        size, shift, stride, drop_remainder))
+
+  @functools.wraps(DatasetV2.with_options)
+  def with_options(self, options):
+    return DatasetV1Adapter(super(DatasetV1, self).with_options(options))
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+Dataset = DatasetV1
+
+
+class DatasetV1Adapter(DatasetV1):
+  """Wraps a V2 `Dataset` object in the `tf.compat.v1.data.Dataset` API."""
+
+  def __init__(self, dataset):
+    super(DatasetV1Adapter, self).__init__()
+    self._dataset = dataset
+
+  def _as_variant_tensor(self):
+    return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
+
+  def _inputs(self):
+    return self._dataset._inputs()  # pylint: disable=protected-access
+
+  def options(self):
+    return self._dataset.options()
+
+  @property
+  def output_classes(self):
+    return self._dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._dataset.output_types
+
+  def make_initializable_iterator(self, shared_name=None):
+    return self._dataset.make_initializable_iterator(shared_name)
+
+  def __iter__(self):
+    return iter(self._dataset)
+
+  def make_one_shot_iterator(self):
+    return self._dataset.make_one_shot_iterator()
+
+
 @tf_export("data.Options")
 class Options(object):
   """Represents options for tf.data.Dataset.
@@ -1410,8 +1589,8 @@ class Options(object):
       ("experimental_hoist_random_uniform", bool,
        "Whether to hoist `tf.random_uniform()` ops out of map transformations."
       ),
-      ("experimental_latency_all_edges", bool,
-       "Whether to add latency measurements on all edges."),
+      ("experimental_stats", stats_options.StatsOptions,
+       "Associate the given statistics options with the dataset pipeline."),
       ("experimental_map_and_batch_fusion", bool,
        "Whether to fuse map and batch transformations."),
       ("experimental_map_and_filter_fusion", bool,
@@ -1441,8 +1620,8 @@ class Options(object):
       def setter(self, value):
         if not isinstance(value, ty):
           raise TypeError(
-              "Attempting to set the option %s to incompatible value: %r" %
-              (name, value))
+              "Attempting to set the option %s to incompatible value: %r when "
+              "it expects  %r" % (name, value, ty))
         setattr(self, "_" + name, value)
 
       return setter
@@ -1466,10 +1645,15 @@ class Options(object):
   def _static_optimizations(self):
     """Produces the list of enabled static optimizations."""
     experimental_optimizations = [
-        "filter_fusion", "hoist_random_uniform", "latency_all_edges",
-        "map_and_batch_fusion", "map_and_filter_fusion", "map_fusion",
-        "map_parallelization", "map_vectorization", "noop_elimination",
-        "shuffle_and_repeat_fusion"
+        "filter_fusion",
+        "hoist_random_uniform",
+        "map_and_batch_fusion",
+        "map_and_filter_fusion",
+        "map_fusion",
+        "map_parallelization",
+        "map_vectorization",
+        "noop_elimination",
+        "shuffle_and_repeat_fusion",
     ]
     result = []
     for exp_opt in experimental_optimizations:
@@ -1480,6 +1664,10 @@ class Options(object):
       result.append("make_numa_aware")
     if getattr(self, "experimental_deterministic") is False:
       result.append("make_sloppy")
+    experimental_stats_options = getattr(self, "experimental_stats")
+    if experimental_stats_options and getattr(experimental_stats_options,
+                                              "latency_all_edges"):
+      result.append("latency_all_edges")
     return result
 
   def merge(self, options):
@@ -1505,7 +1693,6 @@ class Options(object):
           "experimental_deterministic",
           "experimental_filter_fusion",
           "experimental_hoist_random_uniform",
-          "experimental_latency_all_edges",
           "experimental_map_and_batch_fusion",
           "experimental_map_and_filter_fusion",
           "experimental_map_fusion",
@@ -1514,6 +1701,7 @@ class Options(object):
           "experimental_noop_elimination",
           "experimental_numa_aware",
           "experimental_shuffle_and_repeat_fusion",
+          "experimental_stats",
       ]:
         this = getattr(result, name)
         that = getattr(other, name)
@@ -1526,14 +1714,14 @@ class Options(object):
     return result
 
 
-class DatasetSource(Dataset):
+class DatasetSource(DatasetV2):
   """Abstract class representing a dataset with no inputs."""
 
   def _inputs(self):
     return []
 
 
-class UnaryDataset(Dataset):
+class UnaryDataset(DatasetV2):
   """Abstract class representing a dataset with one input."""
 
   def __init__(self, input_dataset):
@@ -1544,6 +1732,22 @@ class UnaryDataset(Dataset):
     return [self._input_dataset]
 
 
+class UnaryUnchangedStructureDataset(UnaryDataset):
+  """Represents a unary dataset with the same input and output structure."""
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes  # pylint: disable=protected-access
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes  # pylint: disable=protected-access
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types  # pylint: disable=protected-access
+
+
 class TensorDataset(DatasetSource):
   """A `Dataset` with a single element, viz. a nested structure of tensors."""
 
@@ -1672,11 +1876,6 @@ class _NestedDatasetComponent(object):
   corresponding position in the `output_classes`, `output_shapes`, and
   `output_types` properties.
 
-  NOTE(mrry): This class is not currently exposed via the public API. Support
-  for nested datasets can be enabled on a function-by-function basis by setting
-  `experimental_nested_dataset_support=True` in the `StructuredFunctionWrapper`
-  initializer.
-
   TODO(b/110122868): Add this class, or something equivalent, to the public API.
   We are considering revising the public API for accessing Dataset structure
   (`output_classes` etc.) based on experience with nested datasets and other
@@ -1720,7 +1919,7 @@ class _NestedDatasetComponent(object):
     return self._output_types
 
 
-class _VariantDataset(Dataset):
+class _VariantDataset(DatasetV2):
   """A Dataset wrapper around a `tf.variant`-typed function argument."""
 
   def __init__(self, dataset_variant, structure):
@@ -1759,7 +1958,7 @@ class StructuredFunctionWrapper(object):
                input_shapes=None,
                input_types=None,
                add_to_graph=True,
-               experimental_nested_dataset_support=False):
+               defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
 
     Args:
@@ -1778,8 +1977,9 @@ class StructuredFunctionWrapper(object):
         argument defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
-      experimental_nested_dataset_support: (Optional.) If `True`, the function
-        will support `tf.data.Dataset` objects as arguments and return values.
+      defun_kwargs: (Optional.) A dictionary mapping string argument names to
+        values. If supplied, will be passed to `function.Defun()` as keyword
+        arguments.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
@@ -1811,10 +2011,11 @@ class StructuredFunctionWrapper(object):
 
     ])
 
-    # TODO(b/110122868): Enable this support for all `tf.data` functions.
-    self._nested_dataset_support = experimental_nested_dataset_support
+    if defun_kwargs is None:
+      defun_kwargs = {}
 
-    @function.Defun(*self._defun_args(), func_name=self._func_name)
+    @function.Defun(
+        *self._defun_args(), func_name=self._func_name, **defun_kwargs)
     def tf_data_structured_function_wrapper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
       flat_args = []
@@ -1831,7 +2032,6 @@ class StructuredFunctionWrapper(object):
           arg.indices.set_shape([None, arg_shape.ndims])
           arg.dense_shape.set_shape([arg_shape.ndims])
         elif isinstance(arg_class, _NestedDatasetComponent):
-          assert self._nested_dataset_support
           arg = _VariantDataset(arg, arg_class)
         else:
           arg.set_shape(arg_shape)
@@ -1869,12 +2069,7 @@ class StructuredFunctionWrapper(object):
           flat_classes.append(sparse_tensor_lib.SparseTensor)
           flat_shapes.append(t.get_shape())
           flat_types.append(t.dtype)
-        elif isinstance(t, Dataset):
-          if not self._nested_dataset_support:
-            raise NotImplementedError(
-                "The %s transformation does not currently support nested "
-                "datasets as outputs." % self._transformation_name)
-
+        elif isinstance(t, DatasetV2):
           flat_ret.append(t._as_variant_tensor())  # pylint: disable=protected-access
           component = _NestedDatasetComponent(t)
           flat_classes.append(component)
@@ -1922,10 +2117,6 @@ class StructuredFunctionWrapper(object):
       if input_class is sparse_tensor_lib.SparseTensor:
         ret.append(dtypes.variant)
       elif isinstance(input_class, _NestedDatasetComponent):
-        if not self._nested_dataset_support:
-          raise NotImplementedError(
-              "The %s transformation does not currently support nested "
-              "datasets as inputs." % self._transformation_name)
         ret.append(dtypes.variant)
       else:
         assert isinstance(input_type, dtypes.DType)
@@ -2080,14 +2271,14 @@ class _GeneratorDataset(DatasetSource):
     return "Dataset.from_generator()"
 
 
-class ZipDataset(Dataset):
+class ZipDataset(DatasetV2):
   """A `Dataset` that zips its inputs together."""
 
   def __init__(self, datasets):
     """See `Dataset.zip()` for details."""
     super(ZipDataset, self).__init__()
     for ds in nest.flatten(datasets):
-      if not isinstance(ds, Dataset):
+      if not isinstance(ds, DatasetV2):
         if isinstance(ds, list):
           message = ("The argument to `Dataset.zip()` must be a nested "
                      "structure of `Dataset` objects. Nested structures do not "
@@ -2127,7 +2318,7 @@ class ZipDataset(Dataset):
         [ds.output_types for ds in nest.flatten(self._datasets)])
 
 
-class ConcatenateDataset(Dataset):
+class ConcatenateDataset(DatasetV2):
   """A `Dataset` that concatenates its input with given dataset."""
 
   def __init__(self, input_dataset, dataset_to_concatenate):
@@ -2182,7 +2373,7 @@ class ConcatenateDataset(Dataset):
     return self._output_types
 
 
-class RepeatDataset(UnaryDataset):
+class RepeatDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that repeats its input several times."""
 
   def __init__(self, input_dataset, count):
@@ -2201,18 +2392,6 @@ class RepeatDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class RangeDataset(DatasetSource):
   """A `Dataset` of a step separated range of values."""
@@ -2262,7 +2441,7 @@ class RangeDataset(DatasetSource):
     return dtypes.int64
 
 
-class CacheDataset(UnaryDataset):
+class CacheDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that caches elements of its input."""
 
   def __init__(self, input_dataset, filename):
@@ -2278,20 +2457,8 @@ class CacheDataset(UnaryDataset):
         filename=self._filename,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class ShuffleDataset(UnaryDataset):
+class ShuffleDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that randomly shuffles the elements of its input."""
 
   def __init__(self,
@@ -2339,20 +2506,8 @@ class ShuffleDataset(UnaryDataset):
         reshuffle_each_iteration=self._reshuffle_each_iteration,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class TakeDataset(UnaryDataset):
+class TakeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` containing the first `count` elements from its input."""
 
   def __init__(self, input_dataset, count):
@@ -2367,20 +2522,8 @@ class TakeDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-
-class SkipDataset(UnaryDataset):
+class SkipDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` skipping the first `count` elements from its input."""
 
   def __init__(self, input_dataset, count):
@@ -2395,18 +2538,6 @@ class SkipDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class BatchDataset(UnaryDataset):
   """A `Dataset` that batches contiguous elements from its input."""
@@ -2709,7 +2840,7 @@ class MapDataset(UnaryDataset):
     return "Dataset.map()"
 
 
-class MatchingFilesDataset(Dataset):
+class MatchingFilesDataset(DatasetSource):
   """A `Dataset` that list the files according to the input patterns."""
 
   def __init__(self, patterns):
@@ -2720,9 +2851,6 @@ class MatchingFilesDataset(Dataset):
   def _as_variant_tensor(self):
     return gen_dataset_ops.matching_files_dataset(self._patterns)
 
-  def _inputs(self):
-    return []
-
   @property
   def output_classes(self):
     return ops.Tensor
@@ -2773,10 +2901,7 @@ class FlatMapDataset(UnaryDataset):
     self._input_dataset = input_dataset
 
     wrapped_func = StructuredFunctionWrapper(
-        map_func,
-        self._transformation_name(),
-        dataset=input_dataset,
-        experimental_nested_dataset_support=True)
+        map_func, self._transformation_name(), dataset=input_dataset)
     if not isinstance(wrapped_func.output_classes, _NestedDatasetComponent):
       raise TypeError("`map_func` must return a `Dataset` object.")
     self._output_classes = wrapped_func.output_classes.output_classes
@@ -2862,7 +2987,7 @@ class ParallelInterleaveDataset(FlatMapDataset):
     return "Dataset.interleave()"
 
 
-class FilterDataset(UnaryDataset):
+class FilterDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
   def __init__(self, input_dataset, predicate):
@@ -2884,23 +3009,11 @@ class FilterDataset(UnaryDataset):
         predicate=self._predicate,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
   def _transformation_name(self):
     return "Dataset.filter()"
 
 
-class PrefetchDataset(UnaryDataset):
+class PrefetchDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that asynchronously prefetches its input."""
 
   def __init__(self, input_dataset, buffer_size):
@@ -2918,18 +3031,6 @@ class PrefetchDataset(UnaryDataset):
         buffer_size=self._buffer_size,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class WindowDataset(UnaryDataset):
   """A dataset that creates window datasets from the input elements."""
@@ -2981,7 +3082,7 @@ class WindowDataset(UnaryDataset):
     return self._output_types
 
 
-class _OptionsDataset(UnaryDataset):
+class _OptionsDataset(UnaryUnchangedStructureDataset):
   """An identity `Dataset` that stores options."""
 
   def __init__(self, input_dataset, options):
@@ -2999,20 +3100,8 @@ class _OptionsDataset(UnaryDataset):
   def options(self):
     return self._options
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class _ModelDataset(UnaryDataset):
+class _ModelDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and models performance."""
 
   def __init__(self, input_dataset):
@@ -3025,20 +3114,8 @@ class _ModelDataset(UnaryDataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class _OptimizeDataset(UnaryDataset):
+class _OptimizeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and applies optimizations."""
 
   def __init__(self, input_dataset, optimizations):
@@ -3056,14 +3133,21 @@ class _OptimizeDataset(UnaryDataset):
         self._optimizations,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
+class _SetStatsAggregatorDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, and sets stats aggregator."""
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def __init__(self, input_dataset, aggregator, prefix, counter_prefix):
+    super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._stats_aggregator = aggregator
+    self._prefix = prefix
+    self._counter_prefix = counter_prefix
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.set_stats_aggregator_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._stats_aggregator._resource,  # pylint: disable=protected-access
+        self._prefix,
+        self._counter_prefix,
+        **flat_structure(self))
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 0e4666287b18c7e95ff0b68639cf7686896e2be5..0f9add6461aeeb1e1d81dfb75fefb345b659c349 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -55,7 +55,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     def _init_func():
       return multi_device_iterator_string_handle
 
-    init_func_concrete = _init_func.get_concrete_function()
+    init_func_concrete = _init_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
     @function.defun()
     def _remote_init_func():
       return functional_ops.remote_call(
@@ -64,7 +65,7 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           Tout=[dtypes.string],
           f=init_func_concrete)
 
-    self._init_func = _remote_init_func.get_concrete_function()
+    self._init_func = _remote_init_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._init_captured_args = self._init_func.captured_inputs
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
@@ -81,7 +82,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           output_types=self._flat_output_types,
           output_shapes=self._flat_output_shapes)
 
-    next_func_concrete = _next_func.get_concrete_function()
+    next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
     @function.defun_with_attributes(
         input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
         attributes={"experimental_ints_on_device": True})
@@ -93,14 +95,15 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           Tout=self._flat_output_types,
           f=next_func_concrete)
 
-    self._next_func = _remote_next_func.get_concrete_function()
+    self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._next_captured_args = self._next_func.captured_inputs
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _finalize_func(unused_string_handle):
       return array_ops.constant(0, dtypes.int64)
 
-    finalize_func_concrete = _finalize_func.get_concrete_function()
+    finalize_func_concrete = _finalize_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
@@ -110,7 +113,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           Tout=[dtypes.int64],
           f=finalize_func_concrete)
 
-    self._finalize_func = _remote_finalize_func.get_concrete_function()
+    self._finalize_func = _remote_finalize_func._get_concrete_function_internal(  # pylint: disable=protected-access
+    )
     self._finalize_captured_args = self._finalize_func.captured_inputs
 
   def _as_variant_tensor(self):
@@ -216,6 +220,10 @@ class MultiDeviceIterator(object):
           self._dataset.output_types, self._dataset.output_classes)
       if prefetch_buffer_size > 0:
         ds = ds.prefetch(prefetch_buffer_size)
+      # TODO(jsimsa): Enable auto-tuning when supported for non-CPU devices.
+      options = dataset_ops.Options()
+      options.experimental_autotune = False
+      ds = ds.with_options(options)
       with ops.device(device):
         self._device_iterators.append(ds.make_initializable_iterator())
 
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index d08da6704caf8b6c3bc94b49d0fce6ecb8157a75..880e005653d2cb657860260ee6fd0e57f5e7c1c7 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
 from tensorflow.python.framework import dtypes
@@ -31,8 +32,8 @@ from tensorflow.python.util.tf_export import tf_export
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
-@tf_export("data.TextLineDataset")
-class TextLineDataset(dataset_ops.Dataset):
+@tf_export("data.TextLineDataset", v1=[])
+class TextLineDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` comprising lines from one or more text files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -46,7 +47,7 @@ class TextLineDataset(dataset_ops.Dataset):
         to buffer. A value of 0 results in the default buffering values chosen
         based on the compression type.
     """
-    super(TextLineDataset, self).__init__()
+    super(TextLineDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -61,9 +62,6 @@ class TextLineDataset(dataset_ops.Dataset):
     return gen_dataset_ops.text_line_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
-  def _inputs(self):
-    return []
-
   @property
   def output_classes(self):
     return ops.Tensor
@@ -77,7 +75,25 @@ class TextLineDataset(dataset_ops.Dataset):
     return dtypes.string
 
 
-class _TFRecordDataset(dataset_ops.Dataset):
+@tf_export(v1=["data.TextLineDataset"])
+class TextLineDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` comprising lines from one or more text files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None):
+    wrapped = TextLineDatasetV2(filenames, compression_type, buffer_size)
+    super(TextLineDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = TextLineDatasetV2.__init__.__doc__
+
+  @property
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
+
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
+
+
+class _TFRecordDataset(dataset_ops.DatasetSource):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -108,9 +124,6 @@ class _TFRecordDataset(dataset_ops.Dataset):
     return gen_dataset_ops.tf_record_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
-  def _inputs(self):
-    return []
-
   @property
   def output_classes(self):
     return ops.Tensor
@@ -161,8 +174,8 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
     return "tf.data.experimental.parallel_interleave()"
 
 
-@tf_export("data.TFRecordDataset")
-class TFRecordDataset(dataset_ops.Dataset):
+@tf_export("data.TFRecordDataset", v1=[])
+class TFRecordDatasetV2(dataset_ops.DatasetV2):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None,
@@ -187,8 +200,8 @@ class TFRecordDataset(dataset_ops.Dataset):
       TypeError: If any argument does not have the expected type.
       ValueError: If any argument does not have the expected shape.
     """
-    super(TFRecordDataset, self).__init__()
-    if isinstance(filenames, dataset_ops.Dataset):
+    super(TFRecordDatasetV2, self).__init__()
+    if isinstance(filenames, dataset_ops.DatasetV2):
       if filenames.output_types != dtypes.string:
         raise TypeError(
             "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
@@ -199,7 +212,7 @@ class TFRecordDataset(dataset_ops.Dataset):
     else:
       filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
       filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
-      filenames = dataset_ops.Dataset.from_tensor_slices(filenames)
+      filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)
 
     self._filenames = filenames
     self._compression_type = compression_type
@@ -222,10 +235,10 @@ class TFRecordDataset(dataset_ops.Dataset):
              compression_type=None,
              buffer_size=None,
              num_parallel_reads=None):
-    return TFRecordDataset(filenames or self._filenames,
-                           compression_type or self._compression_type,
-                           buffer_size or self._buffer_size,
-                           num_parallel_reads or self._num_parallel_reads)
+    return TFRecordDatasetV2(filenames or self._filenames,
+                             compression_type or self._compression_type,
+                             buffer_size or self._buffer_size,
+                             num_parallel_reads or self._num_parallel_reads)
 
   def _as_variant_tensor(self):
     return self._impl._as_variant_tensor()  # pylint: disable=protected-access
@@ -246,8 +259,40 @@ class TFRecordDataset(dataset_ops.Dataset):
     return self._impl.output_types
 
 
-@tf_export("data.FixedLengthRecordDataset")
-class FixedLengthRecordDataset(dataset_ops.Dataset):
+@tf_export(v1=["data.TFRecordDataset"])
+class TFRecordDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` comprising records from one or more TFRecord files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None,
+               num_parallel_reads=None):
+    wrapped = TFRecordDatasetV2(
+        filenames, compression_type, buffer_size, num_parallel_reads)
+    super(TFRecordDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = TFRecordDatasetV2.__init__.__doc__
+
+  def _clone(self,
+             filenames=None,
+             compression_type=None,
+             buffer_size=None,
+             num_parallel_reads=None):
+    # pylint: disable=protected-access
+    return TFRecordDatasetV1(
+        filenames or self._dataset._filenames,
+        compression_type or self._dataset._compression_type,
+        buffer_size or self._dataset._buffer_size,
+        num_parallel_reads or self._dataset._num_parallel_reads)
+
+  @property
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
+
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
+
+
+@tf_export("data.FixedLengthRecordDataset", v1=[])
+class FixedLengthRecordDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
   def __init__(self,
@@ -255,7 +300,8 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
                record_bytes,
                header_bytes=None,
                footer_bytes=None,
-               buffer_size=None):
+               buffer_size=None,
+               compression_type=None):
     """Creates a `FixedLengthRecordDataset`.
 
     Args:
@@ -268,8 +314,10 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
         bytes to ignore at the end of a file.
       buffer_size: (Optional.) A `tf.int64` scalar representing the number of
         bytes to buffer when reading.
+      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
+        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
     """
-    super(FixedLengthRecordDataset, self).__init__()
+    super(FixedLengthRecordDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._record_bytes = ops.convert_to_tensor(
@@ -281,14 +329,22 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
         "footer_bytes", footer_bytes)
     self._buffer_size = convert.optional_param_to_tensor(
         "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.fixed_length_record_dataset(
-        self._filenames, self._header_bytes, self._record_bytes,
-        self._footer_bytes, self._buffer_size)
-
-  def _inputs(self):
-    return []
+    if (self._compression_type is not None or
+        compat.forward_compatible(2018, 11, 30)):
+      return gen_dataset_ops.fixed_length_record_dataset_v2(
+          self._filenames, self._header_bytes, self._record_bytes,
+          self._footer_bytes, self._buffer_size, self._compression_type)
+    else:
+      return gen_dataset_ops.fixed_length_record_dataset(
+          self._filenames, self._header_bytes, self._record_bytes,
+          self._footer_bytes, self._buffer_size)
 
   @property
   def output_classes(self):
@@ -301,3 +357,36 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
   @property
   def output_types(self):
     return dtypes.string
+
+
+@tf_export(v1=["data.FixedLengthRecordDataset"])
+class FixedLengthRecordDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` of fixed-length records from one or more binary files."""
+
+  def __init__(self,
+               filenames,
+               record_bytes,
+               header_bytes=None,
+               footer_bytes=None,
+               buffer_size=None,
+               compression_type=None):
+    wrapped = FixedLengthRecordDatasetV2(
+        filenames, record_bytes, header_bytes, footer_bytes, buffer_size,
+        compression_type)
+    super(FixedLengthRecordDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = FixedLengthRecordDatasetV2.__init__.__doc__
+
+  @property
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
+
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+FixedLengthRecordDataset = FixedLengthRecordDatasetV1
+TFRecordDataset = TFRecordDatasetV1
+TextLineDataset = TextLineDatasetV1
diff --git a/tensorflow/python/data/util/convert_test.py b/tensorflow/python/data/util/convert_test.py
index 89c3afb29691f4f24b7cb4208b16663b616515fa..4a5b73038114b365db4603f566f48ae7f8a5b7b6 100644
--- a/tensorflow/python/data/util/convert_test.py
+++ b/tensorflow/python/data/util/convert_test.py
@@ -31,24 +31,24 @@ class ConvertTest(test.TestCase):
   def testInteger(self):
     resp = convert.optional_param_to_tensor("foo", 3)
     with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(resp))
+      self.assertEqual(3, self.evaluate(resp))
 
   def testIntegerDefault(self):
     resp = convert.optional_param_to_tensor("foo", None)
     with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(resp))
+      self.assertEqual(0, self.evaluate(resp))
 
   def testStringDefault(self):
     resp = convert.optional_param_to_tensor("bar", None, "default",
                                             dtypes.string)
     with self.cached_session() as sess:
-      self.assertEqual(compat.as_bytes("default"), sess.run(resp))
+      self.assertEqual(compat.as_bytes("default"), self.evaluate(resp))
 
   def testString(self):
     resp = convert.optional_param_to_tensor("bar", "value", "default",
                                             dtypes.string)
     with self.cached_session() as sess:
-      self.assertEqual(compat.as_bytes("value"), sess.run(resp))
+      self.assertEqual(compat.as_bytes("value"), self.evaluate(resp))
 
   def testPartialShapeToTensorKnownDimension(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py
index 056b32480f3898726940f3c228c9b9eefa28b237..4ba314f06a4bd03ff0c7a99deb49e6a2c246c107 100644
--- a/tensorflow/python/data/util/sparse_test.py
+++ b/tensorflow/python/data/util/sparse_test.py
@@ -292,9 +292,9 @@ class SparseTest(test.TestCase):
       return
     self.assertTrue(isinstance(b, sparse_tensor.SparseTensor))
     with self.cached_session():
-      self.assertAllEqual(a.eval().indices, b.eval().indices)
-      self.assertAllEqual(a.eval().values, b.eval().values)
-      self.assertAllEqual(a.eval().dense_shape, b.eval().dense_shape)
+      self.assertAllEqual(a.eval().indices, self.evaluate(b).indices)
+      self.assertAllEqual(a.eval().values, self.evaluate(b).values)
+      self.assertAllEqual(a.eval().dense_shape, self.evaluate(b).dense_shape)
 
   def testSerializeDeserialize(self):
     test_cases = (
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 242215dccb95c31ab640579486bc2234dfc6b12d..ffbdff8c47b7208279966fdfcf022865c8a09309 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Public Python API of TensorFlow Debugger (tfdbg).
 
-See the [TFDBG](https://tensorflow.org/api_guides/python/tfdbg) guide.
+See the [TFDBG](https://www.tensorflow.org/guide/debugger) guide.
 
 @@add_debug_tensor_watch
 @@watch_graph
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index f197a9e4dcefdb528a3a843effa95f7311ca007a..5aa7d1bb4c3e035302796108d88b647f6eba7051 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -1583,7 +1583,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
       x = variables.VariableV1([1, 3, 3, 7], name="x")
       _, idx = array_ops.unique(x, name="x_unique")
       idx_times_two = math_ops.multiply(idx, 2, name="idx_times_two")
-      sess.run(x.initializer)
+      self.evaluate(x.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(
diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
index 1f67f8a0d4e55c7faf8ca65af51169831e731576..34030c0adcab30647d360260741a8dcbb870cc73 100644
--- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
+++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
@@ -126,8 +126,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       u = variables.Variable([12.0], name="u")
       v = variables.Variable([30.0], name="v")
       w = math_ops.add(u, v, name="w")
-      sess.run(u.initializer)
-      sess.run(v.initializer)
+      self.evaluate(u.initializer)
+      self.evaluate(v.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, w, expected_output=[42.0])
@@ -139,7 +139,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
         b = math_ops.add(a, a, name="b")
       with ops.control_dependencies([a, b]):
         c = math_ops.multiply(b, b, name="c")
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, c, expected_output=400.0)
@@ -150,8 +150,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       y = variables.Variable(20.0, name="y")
       cond = control_flow_ops.cond(
           x > y, lambda: math_ops.add(x, 1), lambda: math_ops.add(y, 1))
-      sess.run(x.initializer)
-      sess.run(y.initializer)
+      self.evaluate(x.initializer)
+      self.evaluate(y.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, cond, expected_output=21.0)
@@ -173,8 +173,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       toy_loss = x * (u - v)
       train_op = gradient_descent.GradientDescentOptimizer(
           learning_rate=0.1).minimize(toy_loss, name="train_op")
-      sess.run(u.initializer)
-      sess.run(v.initializer)
+      self.evaluate(u.initializer)
+      self.evaluate(v.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(sess, train_op)
 
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
index 74498c8ea3dd494cd8fc6237b60b11a202497990..b78c3d16d482baea2adce80e528de0d8af9cfac2 100644
--- a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -131,8 +131,8 @@ class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
     with session.Session(
         config=self.session_config, graph=graph,
         target=self.server_target) as sess:
-      sess.run(self.a.initializer)
-      sess.run(self.b.initializer)
+      self.evaluate(self.a.initializer)
+      self.evaluate(self.b.initializer)
 
       run_options = config_pb2.RunOptions()
       debug_utils.watch_graph(
@@ -198,8 +198,8 @@ class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
     with session.Session(
         config=self.session_config, graph=graph,
         target=self.server_target) as sess:
-      sess.run(self.a.initializer)
-      sess.run(self.b.initializer)
+      self.evaluate(self.a.initializer)
+      self.evaluate(self.b.initializer)
 
       def watch_fn(feeds, fetch_keys):
         del feeds, fetch_keys
diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
index b0dc25851ca3101a48543aeca1325fa155dd29b7..8eef45392f2fb56bc57b6bd6156f9fed8a93cd1f 100644
--- a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
+++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
@@ -67,7 +67,7 @@ class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase):
         u1 = math_ops.multiply(v, v, name="u1")
       w = math_ops.subtract(u1, u0, name="w")
 
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(run_options, sess.graph,
diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py
index 4a8d4eaa99f28db26f05a00e7759c79699ca9ab4..a16d68329a3a9823da441b466a30bba5bb401394 100644
--- a/tensorflow/python/debug/lib/source_utils_test.py
+++ b/tensorflow/python/debug/lib/source_utils_test.py
@@ -109,8 +109,8 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
       self.w = math_ops.matmul(self.u, self.v, name="w")
       self.w_line_number = line_number_above()
 
-      sess.run(self.u.initializer)
-      sess.run(self.v.initializer)
+      self.evaluate(self.u.initializer)
+      self.evaluate(self.v.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 73e08ce7d5969de2ae54e2505fa7b449bfaf631a..68584b4ede46f2e61310c262d543837b71542de4 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -339,7 +339,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
     with wrapper.as_default():
       foo = constant_op.constant(42, name="foo")
-      self.assertEqual(42, foo.eval())
+      self.assertEqual(42, self.evaluate(foo))
       self.assertEqual(foo, self._observer["run_fetches"])
 
   def testWrapperShouldSupportSessionClose(self):
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 1c3d8ea67e524775e26c2fcb41af4349b0706353..999543d71fa10c6fc2b2cc8d73e9624b1d49a94b 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -7,15 +7,79 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 py_library(
-    name = "distribute",
+    name = "all_reduce",
+    srcs = [
+        "all_reduce.py",
+    ],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
     deps = [
-        ":distribute_config",
-        ":distribute_coordinator",
-        ":distribute_coordinator_context",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
+    ],
+)
+
+tf_py_test(
+    name = "all_reduce_test",
+    srcs = ["all_reduce_test.py"],
+    additional_deps = [
+        ":all_reduce",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:state_ops",
+    ],
+)
+
+py_library(
+    name = "cross_device_ops",
+    srcs = ["cross_device_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cross_device_utils",
+        ":reduce_util",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "cross_device_utils",
+    srcs = ["cross_device_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":all_reduce",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:collective_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
     ],
 )
 
@@ -45,7 +109,6 @@ py_library(
 
 py_test(
     name = "distribute_coordinator_test",
-    size = "large",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -76,6 +139,35 @@ py_library(
     deps = [],
 )
 
+py_library(
+    name = "mirrored_strategy",
+    srcs = ["mirrored_strategy.py"],
+    deps = [
+        ":cross_device_ops",
+        ":multi_worker_util",
+        ":reduce_util",
+        ":shared_variable_creator",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -88,6 +180,34 @@ py_library(
     ],
 )
 
+py_library(
+    name = "input_ops",
+    srcs = ["input_ops.py"],
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+cuda_py_test(
+    name = "input_ops_test",
+    srcs = ["input_ops_test.py"],
+    additional_deps = [
+        ":input_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python:util",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
+
 py_test(
     name = "multi_worker_util_test",
     srcs = ["multi_worker_util_test.py"],
@@ -120,3 +240,49 @@ py_library(
         "//tensorflow/python:training",
     ],
 )
+
+py_library(
+    name = "reduce_util",
+    srcs = ["reduce_util.py"],
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_library(
+    name = "shared_variable_creator",
+    srcs = ["shared_variable_creator.py"],
+)
+
+py_test(
+    name = "shared_variable_creator_test",
+    srcs = ["shared_variable_creator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":shared_variable_creator",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "values",
+    srcs = ["values.py"],
+    deps = [
+        ":input_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/python/distribute/all_reduce.py b/tensorflow/python/distribute/all_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd7c45ae27ac2d093c7feaf9d490ffa074533ddc
--- /dev/null
+++ b/tensorflow/python/distribute/all_reduce.py
@@ -0,0 +1,860 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to construct a TF subgraph implementing distributed All-Reduce."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+from tensorflow.python.framework import device as device_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops
+
+
+def _flatten_tensors(tensors):
+  """Check tensors for isomorphism and flatten.
+
+  Args:
+    tensors: list of T `tf.Tensor` which must all have the same shape.
+
+  Returns:
+    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
+    shape: the original shape of each element of input tensors
+
+  Raises:
+    ValueError: tensors are empty or non-isomorphic or have unknown shape.
+  """
+  if not tensors:
+    raise ValueError("tensors cannot be empty")
+  shape = tensors[0].shape
+  for tensor in tensors:
+    shape = shape.merge_with(tensor.shape)
+  if not shape.is_fully_defined():
+    raise ValueError("Tensors must have statically known shape.")
+  if len(shape) != 1:
+    reshaped = []
+    for t in tensors:
+      with ops.colocate_with(t):
+        reshaped.append(array_ops.reshape(t, [-1]))
+    tensors = reshaped
+  return tensors, shape
+
+
+def _reshape_tensors(tensors, shape):
+  """Reshape tensors flattened by _flatten_tensors.
+
+  Args:
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
+    shape: list of integers describing the desired shape.  Product of
+      the elements must equal the length of each tensor.
+
+  Returns:
+    list of T `tf.Tensor` which are the reshaped inputs.
+  """
+  reshaped = []
+  for t in tensors:
+    with ops.colocate_with(t):
+      reshaped.append(array_ops.reshape(t, shape))
+  return reshaped
+
+
+def _padded_split(tensor, pieces):
+  """Like split for 1D tensors but pads-out case where len % pieces != 0.
+
+  Args:
+    tensor: T `tf.Tensor` that must be 1D.
+    pieces: a positive integer specifying the number of pieces into which
+      tensor should be split.
+
+  Returns:
+    list of T `tf.Tensor` of length pieces, which hold the values of
+      thin input tensor, in order.  The final tensor may
+      be zero-padded on the end to make its size equal to those of all
+      of the other tensors.
+
+  Raises:
+    ValueError: The input tensor is not 1D.
+  """
+  shape = tensor.shape
+  if 1 != len(shape):
+    raise ValueError("input tensor must be 1D")
+  tensor_len = shape.dims[0].value
+  with ops.colocate_with(tensor):
+    if tensor_len % pieces != 0:
+      # pad to an even length
+      chunk_size = 1 + tensor_len // pieces
+      if pieces > tensor_len:
+        # This is an edge case that should not come up in practice,
+        # i.e. a different reduction algorithm would be better,
+        # but we'll make it work just for completeness.
+        pad_len = pieces - tensor_len
+        extended_whole = array_ops.concat(
+            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        parts = array_ops.split(extended_whole, pieces)
+        return parts, pad_len
+      elif (pieces - 1) * chunk_size >= tensor_len:
+        # Another edge case of limited real interest.
+        pad_len = (pieces * chunk_size) % tensor_len
+        extended_whole = array_ops.concat(
+            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        parts = array_ops.split(extended_whole, pieces)
+        return parts, pad_len
+      else:
+        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
+        pad_len = chunk_size - last_chunk_size
+        piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
+        parts = array_ops.split(tensor, piece_lens)
+        parts[-1] = array_ops.concat(
+            [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        return parts, pad_len
+    else:
+      return array_ops.split(tensor, pieces), 0
+
+
+def _strip_padding(tensors, pad_len):
+  """Strip the suffix padding added by _padded_split.
+
+  Args:
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
+    pad_len: number of elements to be stripped from the end of each tensor.
+
+  Returns:
+    list of T `tf.Tensor` which are the stripped inputs.
+
+  Raises:
+    ValueError: tensors must be a non-empty list of 1D tensors, and
+      each must be longer than pad_len.
+  """
+  if not tensors:
+    raise ValueError("tensors cannot be empty")
+  shape = tensors[0].shape
+  if len(shape) > 1:
+    raise ValueError("tensors must be 1D")
+  prefix_len = int(shape[0] - pad_len)
+  if prefix_len < 0:
+    raise ValueError("pad_len longer than tensor")
+  stripped = []
+  for t in tensors:
+    with ops.colocate_with(t):
+      stripped.append(array_ops.slice(t, [0], [prefix_len]))
+  return stripped
+
+
+def _ragged_split(tensor, pieces):
+  """Like split for 1D tensors but allows case where len % pieces != 0.
+
+  Args:
+    tensor: T `tf.Tensor` that must be 1D.
+    pieces: a positive integer specifying the number of pieces into which
+      tensor should be split.
+
+  Returns:
+    list of T `tf.Tensor` of length pieces, which hold the values of
+      the input tensor, in order.  The final tensor may be shorter
+      than the others, which will all be of equal length.
+
+  Raises:
+    ValueError: input tensor must be 1D.
+  """
+  shape = tensor.shape
+  if 1 != len(shape):
+    raise ValueError("input tensor must be 1D")
+  tensor_len = shape.dims[0].value
+  chunk_size = tensor_len // pieces
+  with ops.colocate_with(tensor):
+    if tensor_len != (pieces * chunk_size):
+      # last piece will be short
+      assert pieces > 1
+      last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
+      assert last_chunk_size > 0
+      piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
+      return array_ops.split(tensor, piece_lens)
+    else:
+      return array_ops.split(tensor, pieces)
+
+
+def _ring_permutations(num_workers, num_subchunks, gpu_perm):
+  """"Generate an array of device index arrays, one for each subchunk.
+
+  In the basic ring reduction algorithm there are size(T)/num_devices
+  data chunks and each device process one chunk per tick, i.e. sending
+  one chunk and receiving one chunk.  The idea of subchunking is that
+  each device processes num_subchunks smaller data regions per tick,
+  and the ring rank permutation is different for each subchunk index
+  so that a device is potentially sending to and receiving from
+  num_subchunks different other devices at each tick.  Where multiple
+  independent data channels exist between devices, this strategy
+  supplies a method of using them in parallel.
+
+  Args:
+    num_workers: number of worker tasks
+    num_subchunks: number of subchunks into which to divide each per-GPU chunk.
+    gpu_perm: an array of integers in [0, num_gpus-1] giving the default
+      ring order of GPUs at each worker.  Other permutations will be generated
+      by rotating this array and splicing together per-worker instances.
+
+  Raises:
+    ValueError: the number of subchunks may not exceed the number of GPUs.
+
+  Returns:
+    pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
+        preceding device in the permutation for that subchunk.  The
+        device index of GPU i at worker j is i + (j * num_gpus).
+    rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
+       local rank of device d in the permutation for that subchunk.
+  """
+  num_gpus = len(gpu_perm)
+  devices = num_workers * num_gpus
+  if devices == 0:
+    return [], []
+  if num_subchunks > num_gpus:
+    raise ValueError(
+        "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus))
+  rotation_interval = max(1, int(num_gpus / num_subchunks))
+  perms_by_s = []
+  for s in range(0, num_subchunks):
+    full_order = []
+    offset = s * rotation_interval
+    for w in range(0, num_workers):
+      default_order = [(w * num_gpus) + i for i in gpu_perm]
+      dev_order = default_order[offset:] + default_order[:offset]
+      full_order += dev_order
+    perms_by_s.append(full_order)
+  pred_by_s_d = [[-1 for d in range(0, devices)]
+                 for s in range(0, num_subchunks)]
+  rank_by_s_d = [[-1 for d in range(0, devices)]
+                 for s in range(0, num_subchunks)]
+  for s in range(0, num_subchunks):
+    for d in range(0, devices):
+      for t in range(0, devices):
+        if d == perms_by_s[s][t]:
+          rank_by_s_d[s][d] = t
+          pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices]
+          break
+  return (pred_by_s_d, rank_by_s_d)
+
+
+def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
+                          gpu_perm, red_op, un_op=None):
+  """Construct a subgraph performing a ring-style all-reduce of input_tensors.
+
+  Args:
+    input_tensors: a list of T `tf.Tensor` objects, which must all
+      have the same shape and type.
+    num_workers: number of worker tasks spanned by input_tensors.
+    num_subchunks: number of subchunks each device should process in one tick.
+    gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at
+      each worker.  All workers must have the same number of
+      GPUs with the same rank ordering.  If NVLINK is available, this should
+      be a ring order supported by NVLINK edges.
+    red_op: a binary operator for elementwise reduction.
+    un_op: an optional unary operator to apply to fully reduced values.
+
+  Raises:
+    ValueError: empty input_tensors or they don't all have same
+    size.
+
+  Returns:
+    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
+  """
+  if len(input_tensors) < 2:
+    raise ValueError("input_tensors must be length 2 or longer")
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  devices = [t.device for t in input_tensors]
+  (pred_by_s_d, rank_by_s_d) = _ring_permutations(
+      num_workers, num_subchunks, gpu_perm)
+  chunks_by_dev, pad_len = _build_ring_gather(
+      input_tensors, devices,
+      num_subchunks, pred_by_s_d, rank_by_s_d, red_op)
+  if un_op:
+    chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev)
+  output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d,
+                                       chunks_by_dev)
+  if pad_len > 0:
+    output_tensors = _strip_padding(output_tensors, pad_len)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_ring_gather(input_tensors, devices, num_subchunks,
+                       pred_by_s_d, rank_by_s_d, red_op):
+  """Construct a subgraph for the first (reduction) pass of ring all-reduce.
+
+  Args:
+    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
+      shape and type.
+    devices: array of device name strings
+    num_subchunks: number of subchunks each device should process in one tick.
+    pred_by_s_d: as produced by _ring_permutations
+    rank_by_s_d: as produced by _ring_permutations
+    red_op: a binary operator for elementwise reduction
+
+  Raises:
+    ValueError: tensors must all be one dimensional.
+
+  Returns:
+    list of list of T `tf.Tensor` of (partially) reduced values where
+    exactly num_subchunks chunks at each device are fully reduced.
+  """
+  num_devices = len(input_tensors)
+  if num_devices == 0:
+    return []
+  if num_devices == 1:
+    return input_tensors
+  shape = input_tensors[0].shape
+  if 1 != len(shape):
+    raise ValueError("input tensors must be 1D")
+  num_chunks = num_devices * num_subchunks
+  num_ticks = num_devices - 1
+  # Initialize chunks_by_dev with splits of the input tensors.
+  chunks_by_dev = []
+  split_pad_len = 0
+  for d in range(0, num_devices):
+    with ops.device(devices[d]):
+      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
+      chunks_by_dev.append(splits)
+  # Reduction phase
+  for tick in range(0, num_ticks):
+    # One new partial reduction for every chunk
+    new_partial_reductions = [None for _ in range(0, num_chunks)]
+    # Compute reductions with respect to last tick's values
+    for d in range(0, num_devices):
+      with ops.device(devices[d]):
+        for s in range(0, num_subchunks):
+          rank = rank_by_s_d[s][d]
+          seg_index = (rank + num_devices - (2 + tick)) % num_devices
+          pred_dev = pred_by_s_d[s][d]
+          chunk_index = (seg_index * num_subchunks) + s
+          new_partial_reductions[chunk_index] = red_op(
+              chunks_by_dev[pred_dev][chunk_index],
+              chunks_by_dev[d][chunk_index])
+    # Update chunks_by_dev with the new values at the end of the tick.
+    for d in range(0, num_devices):
+      for s in range(0, num_subchunks):
+        rank = rank_by_s_d[s][d]
+        seg_index = (rank + num_devices - (2 + tick)) % num_devices
+        chunk_index = (seg_index * num_subchunks) + s
+        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
+  return chunks_by_dev, split_pad_len
+
+
+def _apply_unary_to_chunks(f, chunks_by_dev):
+  """Apply a unary op to each tensor in chunks_by_dev, on same device.
+
+  Args:
+    f: a unary function over T `tf.Tensor`.
+    chunks_by_dev: list of lists of T `tf.Tensor`.
+
+  Returns:
+    new list of lists of T `tf.Tensor` with the same structure as
+    chunks_by_dev containing the derived tensors.
+  """
+  output = []
+  for x in chunks_by_dev:
+    with ops.colocate_with(x[0]):
+      output.append([f(t) for t in x])
+  return output
+
+
+def _build_ring_scatter(pred_by_s_d, rank_by_s_d,
+                        chunks_by_dev):
+  """Construct subgraph for second (scatter) pass of ring all-reduce.
+
+  Args:
+    pred_by_s_d: as produced by _ring_permutations
+    rank_by_s_d: as produced by _ring_permutations
+    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
+      (device, chunk)
+
+  Raises:
+    ValueError: chunks_by_dev is not well-formed
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors, one
+    at each device corresponding to the outer dimension of chunks_by_dev.
+  """
+  num_devices = len(chunks_by_dev)
+  num_chunks = len(chunks_by_dev[0])
+  if 0 != num_chunks % num_devices:
+    raise ValueError(
+        "Expect number of chunks per device to be divisible by num_devices")
+  num_subchunks = int(num_chunks / num_devices)
+  num_ticks = num_devices - 1
+  for tick in range(0, num_ticks):
+    passed_values = [None for _ in range(0, num_chunks)]
+    for d in range(0, num_devices):
+      with ops.colocate_with(chunks_by_dev[d][0]):
+        for s in range(0, num_subchunks):
+          rank = rank_by_s_d[s][d]
+          seg_index = (rank + num_devices - (1 + tick)) % num_devices
+          pred_dev = pred_by_s_d[s][d]
+          chunk_index = (seg_index * num_subchunks) + s
+          passed_values[chunk_index] = array_ops.identity(
+              chunks_by_dev[pred_dev][chunk_index])
+    for d in range(0, num_devices):
+      for s in range(0, num_subchunks):
+        rank = rank_by_s_d[s][d]
+        seg_index = (rank + num_devices - (1 + tick)) % num_devices
+        chunk_index = (seg_index * num_subchunks) + s
+        chunks_by_dev[d][chunk_index] = passed_values[chunk_index]
+  # Join chunks at each device.
+  output = []
+  for x in chunks_by_dev:
+    with ops.colocate_with(x[0]):
+      output.append(array_ops.concat(x, 0))
+  return output
+
+
+def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
+  """Construct a subgraph for recursive halving-doubling all-reduce.
+
+  The recursive halving-doubling algorithm is described in
+  http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf
+
+  The concept is to arrange the participating n devices in
+  a linear sequence where devices exchange data pairwise
+  with one other device in each round.  During the gather
+  phase there are lg(n) rounds where devices exchange
+  increasingly smaller sub-tensors with another device
+  at increasingly greater distances, until at the top
+  each device has 1/n of the fully reduced values.  During the
+  scatter phase each device exchanges its fully reduced
+  sub-tensor (which doubles in length at each round)
+  with one other device at increasingly smaller distances
+  until each device has all of the fully reduced values.
+
+  Note: this preliminary version requires that len(input_tensors) be a
+    power of 2.  TODO(tucker): relax this restriction.  Also, the
+    number of elements in each tensor must be divisible by 2^h where h
+    is the number of hops in each phase.  This will also be relaxed in
+    the future with edge-case specific logic.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
+    red_op: a binary elementwise reduction Op.
+    un_op: an optional unary elementwise Op to apply to reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors, one
+    at each device of input_tensors.
+
+  Raises:
+    ValueError: num_devices not a power of 2, or tensor len not divisible
+    by 2 the proper number of times.
+  """
+  devices = [t.device for t in input_tensors]
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op)
+  if un_op:
+    reduced_shards = [un_op(t) for t in reduced_shards]
+  output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_recursive_hd_gather(input_tensors, devices, red_op):
+  """Construct the gather phase of recursive halving-doubling all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
+    devices: a list of strings naming the devices hosting input_tensors,
+      which will also be used to host the (partial) reduction values.
+    red_op: a binary elementwise reduction Op.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensor shards.
+
+  Raises:
+    ValueError: num_devices not a power of 2, or tensor len not divisible
+    by 2 the proper number of times.
+  """
+  num_devices = len(devices)
+  num_hops = int(math.log(num_devices, 2))
+  if num_devices != (2 ** num_hops):
+    raise ValueError("num_devices must be a power of 2")
+  chunks = input_tensors
+  for h in range(0, num_hops):
+    span = 2 ** h
+    group_size = span * 2
+    new_chunks = [[] for _ in devices]
+    for d in range(0, num_devices):
+      if (d % group_size) >= (group_size / 2):
+        # skip right half of a pair
+        continue
+      left_dev = devices[d]
+      right_dev = devices[d + span]
+      left_split = array_ops.split(chunks[d], 2)
+      right_split = array_ops.split(chunks[d+span], 2)
+      with ops.device(left_dev):
+        new_chunks[d] = red_op(left_split[0], right_split[0])
+      with ops.device(right_dev):
+        new_chunks[d + span] = red_op(left_split[1], right_split[1])
+    chunks = new_chunks
+  return chunks
+
+
+def _build_recursive_hd_scatter(input_tensors, devices):
+  """Construct the scatter phase of recursive halving-doublng all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
+    devices: a list of strings naming the devices on which the reconstituted
+      full tensors should be placed.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors.
+  """
+  num_devices = len(devices)
+  num_hops = int(math.log(num_devices, 2))
+  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
+  chunks = input_tensors
+  for h in reversed(range(0, num_hops)):
+    span = 2 ** h
+    group_size = span * 2
+    new_chunks = [[] for _ in devices]
+    for d in range(0, num_devices):
+      if (d % group_size) >= (group_size / 2):
+        # skip right half of a pair
+        continue
+      left_idx = d
+      right_idx = d + span
+      left_dev = devices[left_idx]
+      right_dev = devices[right_idx]
+      with ops.device(left_dev):
+        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
+                                                 chunks[right_idx]], 0)
+      with ops.device(right_dev):
+        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
+                                                  chunks[right_idx]], 0)
+    chunks = new_chunks
+  return chunks
+
+
+def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
+  """Construct a subgraph for shuffle all-reduce.
+
+  Shuffle reduce is essentially the algorithm implemented when using
+  parameter servers.  Suppose tensor length is n, there are d devices
+  and g gather shards.  Each device sends a n/g length sub-tensor to
+  each gather shard.  The gather shards perform a reduction across d
+  fragments, then broadcast the result back to each device.  The
+  devices then join the g fully reduced fragments they receive from
+  the shards.  The gather shards could perform d-1 pairwise
+  reductions, or one d-way reduction.  The first is better where
+  reduction Op time is low compared to transmission time, the second
+  better in the other case.
+
+  Args:
+    input_tensors: list of T @(tf.Tensor} values to be reduced.
+    gather_devices: list of names of devices on which reduction shards
+      should be placed.
+    red_op: an n-array elementwise reduction Op
+    un_op: optional elementwise unary Op to be applied to fully-reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  dst_devices = [t.device for t in input_tensors]
+  reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
+                                         red_op, un_op)
+  output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
+  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.
+
+  Args:
+    input_tensors: list of T @(tf.Tensor} values to be reduced.
+    gather_devices: list of names of devices on which reduction shards
+      should be placed.
+    red_op: the binary reduction Op
+    un_op: optional elementwise unary Op to be applied to fully-reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced shards.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  num_source_devices = len(input_tensors)
+  num_gather_devices = len(gather_devices)
+  shape = input_tensors[0].shape
+  if len(shape) != 1:
+    raise ValueError("input_tensors must be 1D")
+  shards_by_source = []
+  for d in range(0, num_source_devices):
+    with ops.colocate_with(input_tensors[d]):
+      shards_by_source.append(
+          _ragged_split(input_tensors[d], num_gather_devices))
+  reduced_shards = []
+  for d in range(0, num_gather_devices):
+    with ops.device(gather_devices[d]):
+      values = [s[d] for s in shards_by_source]
+      red_shard = red_op(values)
+      if un_op:
+        red_shard = un_op(red_shard)
+      reduced_shards.append(red_shard)
+  return reduced_shards
+
+
+def _build_shuffle_scatter(reduced_shards, dst_devices):
+  """Build the scatter phase of shuffle all-reduce.
+
+  Args:
+    reduced_shards:  list of T @(tf.Tensor} fully reduced shards
+    dst_devices: list of names of devices at which the fully-reduced value
+      should be reconstituted.
+
+  Returns:
+    list of T `tf.Tensor` scattered tensors.
+  """
+  num_devices = len(dst_devices)
+  out_tensors = []
+  for d in range(0, num_devices):
+    with ops.device(dst_devices[d]):
+      out_tensors.append(array_ops.concat(reduced_shards, 0))
+  return out_tensors
+
+
+def _split_by_task(devices, values):
+  """Partition devices and values by common task.
+
+  Args:
+    devices: list of device name strings
+    values: list of T `tf.tensor` of same length as devices.
+
+  Returns:
+    (per_task_devices, per_task_values) where both values are
+    lists of lists with isomorphic structure: the outer list is
+    indexed by task, and the inner list has length of the number
+    of values belonging to that task.  per_task_devices contains
+    the specific devices to which the values are local, and
+    per_task_values contains the corresponding values.
+
+  Raises:
+    ValueError: devices must be same length as values.
+  """
+  num_devices = len(devices)
+  if num_devices != len(values):
+    raise ValueError("len(devices) must equal len(values)")
+  per_task_devices = collections.OrderedDict()
+  per_task_values = collections.OrderedDict()
+  for d in range(num_devices):
+    d_spec = device_lib.DeviceSpec.from_string(devices[d])
+    if not hasattr(d_spec, "task") or d_spec.task is None:
+      assert False, "failed to parse device %s" % devices[d]
+    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
+    if index not in per_task_devices:
+      per_task_devices[index] = []
+      per_task_values[index] = []
+    per_task_devices[index].append(devices[d])
+    per_task_values[index].append(values[d])
+
+  return (list(per_task_devices.values()), list(per_task_values.values()))
+
+
+def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
+  """Build a subgraph that does one full all-reduce, using NCCL.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    red_op: binary elementwise reduction operator.  Must be one of
+      {tf.add}
+    un_op: optional unary elementwise Op to apply to fully-reduce values.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: red_op not supported.
+  """
+  if red_op == math_ops.add:
+    output_tensors = nccl_ops.all_sum(input_tensors)
+  else:
+    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
+  if un_op:
+    un_op_wrapped = []
+    for t in output_tensors:
+      with ops.colocate_with(t):
+        un_op_wrapped.append(un_op(t))
+    output_tensors = un_op_wrapped
+  return output_tensors
+
+
+def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
+  """Construct a subgraph for NCCL hybrid all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    red_op: binary elementwise reduction operator.
+    upper_level_f: function for reducing one value per worker, across
+      workers.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  devices = [t.device for t in input_tensors]
+  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
+  num_workers = len(per_worker_devices)
+  up_values = [None for w in range(0, num_workers)]
+  up_devices = up_values[:]
+  down_values = up_values[:]
+  # First stage: reduce within each worker using NCCL
+  for w in range(0, num_workers):
+    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
+    # NOTE: these reductions will not run to completion unless
+    # every output value is used.  Since we only need one, we
+    # need to put control dependencies on the rest.
+    with ops.control_dependencies(worker_values):
+      with ops.device(worker_values[0].device):
+        up_values[w] = array_ops.identity(worker_values[0])
+      up_devices[w] = per_worker_devices[w][0]
+  # Second stage: Apply upper_level_f to reduce across first device at
+  # each worker
+  level_2_output = upper_level_f(up_values)
+  # Third stage: propagate within each worker using NCCL Broadcast
+  for w in range(0, num_workers):
+    dst_tensors = []
+    with ops.device(per_worker_devices[w][0]):
+      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
+    for d in per_worker_devices[w]:
+      with ops.device(d):
+        dst_tensors.append(array_ops.identity(broadcast_src))
+    down_values[w] = dst_tensors
+  output_tensors = [v for sublist in down_values for v in sublist]
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _reduce_non_singleton(input_tensors, red_f, un_op):
+  """If len(input_tensors) > 1, apply red_f, else apply un_op."""
+  if len(input_tensors) > 1:
+    return red_f(input_tensors)
+  else:
+    if not un_op:
+      return input_tensors
+    output_tensors = []
+    for t in input_tensors:
+      with ops.colocate_with(t):
+        output_tensors.append(un_op(t))
+    return output_tensors
+
+
+def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Ring across workers."""
+  def upper_builder(y):
+    return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op)
+  def upper_level_f(x):
+    return _reduce_non_singleton(x, upper_builder, un_op)
+  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
+
+
+def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Recursive-HD across workers."""
+  upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op)
+  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
+
+
+def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op,
+                            shuffle_red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Shuffle across workers."""
+  def upper_level_f(x):
+    return build_shuffle_all_reduce(x, gather_devices, shuffle_red_op, un_op)
+
+  return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f)
+
+
+def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
+  """Construct a subgraph for Shuffle hybrid all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    gather_devices: list of device names on which to host gather shards.
+    red_op: binary elementwise reduction operator.
+    upper_level_f: function for reducing one value per worker, across
+      workers.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  # First stage, reduce across each worker using gather_devices.
+  devices = [t.device for t in input_tensors]
+  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
+  num_workers = len(per_worker_devices)
+  up_values = []
+  if len(gather_devices) != num_workers:
+    raise ValueError("For shuffle hybrid, gather_devices must contain one "
+                     "device per worker. ")
+  for w in range(0, num_workers):
+    reduced_shards = _build_shuffle_gather(
+        per_worker_values[w], [gather_devices[w]], red_op)
+    up_values.append(reduced_shards[0])
+  # Second stage, apply upper_level_f.
+  level_2_output = upper_level_f(up_values)
+  # Third stage, apply shuffle scatter at each worker.
+  output_tensors = []
+  for w in range(0, num_workers):
+    output_tensors += _build_shuffle_scatter(
+        [level_2_output[w]], per_worker_devices[w])
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def build_shuffle_then_ring(input_tensors, gather_devices, subdiv,
+                            red_n_op, red_op, un_op=None):
+  """Construct hybrid of Shuffle within workers, Ring across workers."""
+  def upper_builder(tensors):
+    return build_ring_all_reduce(tensors, len(tensors), subdiv, [0],
+                                 red_op, un_op)
+  def upper_level_f(tensors):
+    return _reduce_non_singleton(tensors, upper_builder, un_op)
+  return _build_shuffle_hybrid(
+      input_tensors, gather_devices, red_n_op, upper_level_f)
+
+
+def build_shuffle_then_shuffle(input_tensors, first_gather_devices,
+                               second_gather_devices, red_op, un_op=None):
+  """Construct hybrid of Shuffle within workers, Shuffle across workers."""
+  def upper_builder(tensors):
+    return build_shuffle_all_reduce(tensors, second_gather_devices,
+                                    red_op, un_op)
+  def upper_level_f(tensors):
+    return _reduce_non_singleton(tensors, upper_builder, un_op)
+  return _build_shuffle_hybrid(
+      input_tensors, first_gather_devices, red_op, upper_level_f)
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/python/distribute/all_reduce_test.py
similarity index 98%
rename from tensorflow/contrib/all_reduce/python/all_reduce_test.py
rename to tensorflow/python/distribute/all_reduce_test.py
index 304fd7fb8a37f1aab91f47d754eb2efba81304a5..5bf983a1b20c1852e08991184be0106c9ec03e62 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/python/distribute/all_reduce_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.contrib.all_reduce.python..all_reduce."""
+"""Tests for all_reduce."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,8 +22,8 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.all_reduce.python import all_reduce as ar
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.distribute import all_reduce as ar
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -159,7 +159,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
       output_tensors = build_f(input_tensors, un_op)
       sum_reduced = math_ops.add_n(output_tensors)
       sum_reduced.op.run()
-      self.assertAllClose(sum_reduced.eval(), simple_sum.eval())
+      self.assertAllClose(sum_reduced.eval(), self.evaluate(simple_sum))
 
   def _testRingAllReduce(self, num_workers, num_gpus, shape, subdiv):
     start_time = time.time()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/python/distribute/cross_device_ops.py
similarity index 81%
rename from tensorflow/contrib/distribute/python/cross_tower_ops.py
rename to tensorflow/python/distribute/cross_device_ops.py
index bae0f474d27b3256358f8ac08cdd6b5f04be56c5..f55385eddcc462d450d072904296c80b37f84cb0 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -21,15 +21,15 @@ from __future__ import print_function
 import collections
 import six
 
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.python.client import device_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import device_util
 
@@ -62,26 +62,26 @@ def validate_destinations(destinations):
     raise ValueError("destinations can not be empty")
 
 
-def _make_tensor_into_per_device(input_tensor):
-  """Converts a single tensor into a PerDevice object."""
+def _make_tensor_into_per_replica(input_tensor):
+  """Converts a single tensor into a PerReplica object."""
   if isinstance(input_tensor, (tuple, list)):
-    raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object, "
+    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object, "
                      "got %r but expected a object that is not a tuple or list."
                      % (input_tensor,))
-  if isinstance(input_tensor, value_lib.PerDevice):
+  if isinstance(input_tensor, value_lib.PerReplica):
     return input_tensor
 
   try:
     device = input_tensor.device
   except AttributeError:
-    raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object "
+    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object "
                      "because it doesn't have device set.")
 
-  return value_lib.PerDevice({device: input_tensor})
+  return value_lib.PerReplica({device: input_tensor})
 
 
 def _normalize_value_destination_pairs(value_destination_pairs):
-  """Converts each tensor into a PerDevice object in the input list."""
+  """Converts each tensor into a PerReplica object in the input list."""
   result = []
   if not isinstance(value_destination_pairs, (list, tuple)):
     raise ValueError("`value_destination_pairs` should be a list or tuple")
@@ -93,8 +93,8 @@ def _normalize_value_destination_pairs(value_destination_pairs):
       raise ValueError("Each element of `value_destination_pairs` should be a "
                        "tuple of size 2.")
 
-    per_device = _make_tensor_into_per_device(pair[0])
-    result.append((per_device, pair[1]))
+    per_replica = _make_tensor_into_per_replica(pair[0])
+    result.append((per_replica, pair[1]))
   return result
 
 
@@ -105,7 +105,7 @@ def _validate_value_destination_pairs(value_destination_pairs):
   if not isinstance(value_destination_pairs, (list, tuple)): return False
   if not all([isinstance(pair, tuple) for pair in value_destination_pairs]):
     return False
-  if not all([isinstance(v[0], value_lib.PerDevice)
+  if not all([isinstance(v[0], value_lib.PerReplica)
               for v in value_destination_pairs]):
     return False
   return True
@@ -144,42 +144,31 @@ def _simple_broadcast(value, destinations):
   index = {}
   devices = get_devices_from(destinations)
   for d in devices:
-    index[d] = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
+    index[d] = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
         value, d)
   return value_lib.Mirrored(index)
 
 
-def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn,
-                   aggregation):
+def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
+                   reduce_op):
   # pylint: disable=g-missing-docstring
   all_values = []
   count = 0
-  for v in per_device_value._index.values():  # pylint: disable=protected-access
-    if isinstance(v, value_lib.MapOutput):
-      v_list = v.get()
-      if not v_list:
-        continue
-      count += len(v_list)
-      # Sum within each device before aggregating across devices.
-      # TODO(yuefengz): Check whether it helps to use accumulation_fn here.
-      v = cross_tower_utils.aggregate_tensors_or_indexed_slices(
-          v_list, math_ops.add_n)
-    else:
-      count += 1
+  for v in per_replica_value._index.values():  # pylint: disable=protected-access
+    count += 1
     all_values.append(v)
   if not all_values:
-    raise ValueError("`per_device_value` must be non-empty")
+    raise ValueError("`per_replica_value` must be non-empty")
 
   with ops.device(reduce_to_device):
     with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-      reduced = cross_tower_utils.aggregate_tensors_or_indexed_slices(
+      reduced = cross_device_utils.aggregate_tensors_or_indexed_slices(
           all_values, accumulation_fn)
-      if aggregation == vs.VariableAggregation.MEAN:
-        reduced = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        reduced = cross_device_utils.divide_by_n_tensors_or_indexed_slices(
             reduced, count)
-      elif aggregation != vs.VariableAggregation.SUM:
-        raise ValueError("`aggregation` must be VariableAggregation.SUM "
-                         "or VariableAggregation.MEAN.")
+      elif reduce_op != reduce_util.ReduceOp.SUM:
+        raise ValueError("`reduce_op` must be Reduce.SUM or Reduce.MEAN.")
   return reduced
 
 
@@ -189,40 +178,40 @@ class CrossDeviceOps(object):
   def __init__(self):
     pass
 
-  def reduce(self, aggregation, per_device_value, destinations):
-    """Reduce `per_device_value` to `destinations`.
+  def reduce(self, reduce_op, per_replica_value, destinations):
+    """Reduce `per_replica_value` to `destinations`.
 
-    It runs the reduction operation defined by `aggregation` and put the
+    It runs the reduction operation defined by `reduce_op` and put the
     result on `destinations`.
 
     Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
-      per_device_value: a PerDevice object or a tensor with device set.
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      per_replica_value: a PerReplica object or a tensor with device set.
       destinations: the reduction destinations.
 
     Returns:
       a Mirrored object.
 
     Raises:
-      ValueError: if per_device_value is not a PerDevice object.
+      ValueError: if per_replica_value is not a PerReplica object.
     """
-    if not isinstance(per_device_value, value_lib.PerDevice):
-      per_device_value = _make_tensor_into_per_device(per_device_value)
+    if not isinstance(per_replica_value, value_lib.PerReplica):
+      per_replica_value = _make_tensor_into_per_replica(per_replica_value)
 
     validate_destinations(destinations)
-    return self._reduce(aggregation, per_device_value, destinations)
+    return self._reduce(reduce_op, per_replica_value, destinations)
 
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """Reduce PerDevice objects in a batch.
+  def batch_reduce(self, reduce_op, value_destination_pairs):
+    """Reduce PerReplica objects in a batch.
 
     Reduce each first element in `value_destination_pairs` to each second
     element which indicates the destinations.
 
     Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
-      value_destination_pairs: a list or a tuple of tuples of PerDevice objects
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      value_destination_pairs: a list or a tuple of tuples of PerReplica objects
         (or tensors with device set if there is one device) and destinations.
 
     Returns:
@@ -230,18 +219,18 @@ class CrossDeviceOps(object):
 
     Raises:
       ValueError: if `value_destination_pairs` is not a list or a tuple of
-        tuples of PerDevice objects and destinations
+        tuples of PerReplica objects and destinations
     """
     if not _validate_value_destination_pairs(value_destination_pairs):
       # If the first element of each pair is a tensor, we try to turn it into a
-      # PerDevice object.
+      # PerReplica object.
       value_destination_pairs = _normalize_value_destination_pairs(
           value_destination_pairs)
 
     for _, d in value_destination_pairs:
       validate_destinations(d)
 
-    return self._batch_reduce(aggregation, value_destination_pairs)
+    return self._batch_reduce(reduce_op, value_destination_pairs)
 
   def broadcast(self, tensor, destinations):
     """Broadcast the `tensor` to destinations.
@@ -256,11 +245,11 @@ class CrossDeviceOps(object):
     validate_destinations(destinations)
     return self._broadcast(tensor, destinations)
 
-  def _reduce(self, aggregation, per_device_value, destinations):
+  def _reduce(self, reduce_op, per_replica_value, destinations):
     raise NotImplementedError(
         "_reduce method must be implemented in descendants.")
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
     raise NotImplementedError(
         "_batch_reduce method must be implemented in descendants.")
 
@@ -286,24 +275,24 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
     self.accumulation_fn = accumulation_fn
     super(ReductionToOneDeviceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, aggregation, per_device_value, destinations):
+  def _reduce(self, reduce_op, per_replica_value, destinations):
     if check_destinations(destinations):
       devices = get_devices_from(destinations)
     else:
-      devices = get_devices_from(per_device_value)
+      devices = get_devices_from(per_replica_value)
     reduce_to_device = self.reduce_to_device or devices[0]
-    reduced = _simple_reduce(per_device_value, reduce_to_device,
-                             self.accumulation_fn, aggregation)
+    reduced = _simple_reduce(per_replica_value, reduce_to_device,
+                             self.accumulation_fn, reduce_op)
     return self.broadcast(reduced, devices)
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
     return [
-        self._reduce(aggregation, t, destinations=v)
+        self._reduce(reduce_op, t, destinations=v)
         for t, v in value_destination_pairs
     ]
 
 
-def _group_value_by_device(per_device_values):
+def _group_value_by_device(per_replica_values):
   """Group values into sublists by their devices.
 
   This grouping is needed to call the all-reduce library because it expects a
@@ -315,38 +304,38 @@ def _group_value_by_device(per_device_values):
     ]
 
   Args:
-    per_device_values: a list of PerDevice obejcts.
+    per_replica_values: a list of PerReplica obejcts.
 
   Returns:
     a list of lists, each sublist has components for its corresponding device of
-      PerDevice objects, paired with a None.
+      PerReplica objects, paired with a None.
   """
-  destinations = per_device_values[0].devices
+  destinations = per_replica_values[0].devices
   grouped = [[] for _ in range(len(destinations))]
-  for per_device_value in per_device_values:
+  for per_replica_value in per_replica_values:
     # pylint: disable=protected-access
-    for i, v in enumerate(per_device_value._index.values()):
-      assert per_device_value.devices == destinations
+    for i, v in enumerate(per_replica_value._index.values()):
+      assert per_replica_value.devices == destinations
       grouped[i].append((v, None))
   return grouped
 
 
 def _ungroup_and_make_mirrored(grouped_reduced,
                                destinations,
-                               aggregation,
+                               reduce_op,
                                num_between_graph_workers=1):
   """Ungroup results from all-reduce and make Mirrored objects.
 
   Each all-reduce result will be divided by the number of destinations before
-  Mirrored objects are created if aggregation is "mean".
+  Mirrored objects are created if reduce_op is "mean".
 
   Args:
     grouped_reduced: a list of lists, each sublist has components for each
       device, paired with a None. It is the result from
-      cross_tower_utils.aggregate_gradients_using*.
+      cross_device_utils.aggregate_gradients_using*.
     destinations: a list of device strings for returned Mirrored objects.
-    aggregation: Indicates how a variable will be aggregated. Accepted values
-      are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
+    reduce_op: Indicates how values will be aggregated. Accepted values
+      are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
     num_between_graph_workers: number of workers in the between-graph
       replication.
 
@@ -354,9 +343,9 @@ def _ungroup_and_make_mirrored(grouped_reduced,
     a list of Mirrored objects.
   """
   index = [{} for _ in range(len(grouped_reduced[0]))]
-  for d, per_device_reduced in enumerate(grouped_reduced):
-    for i, (v, _) in enumerate(per_device_reduced):
-      if aggregation == vs.VariableAggregation.MEAN:
+  for d, per_replica_reduced in enumerate(grouped_reduced):
+    for i, (v, _) in enumerate(per_replica_reduced):
+      if reduce_op == reduce_util.ReduceOp.MEAN:
         index[i][destinations[d]] = v / (
             len(destinations) * num_between_graph_workers)
       else:
@@ -496,7 +485,7 @@ class AggregateSmallTensorPacker(object):
     """Aggregate small tensors."""
     if (self.agg_small_grads_max_bytes > 0 and
         self.agg_small_grads_max_group > 0):
-      device_grads, self.packing = cross_tower_utils.pack_small_tensors(
+      device_grads, self.packing = cross_device_utils.pack_small_tensors(
           grouped_grads_and_vars,
           max_bytes=self.agg_small_grads_max_bytes,
           max_group=self.agg_small_grads_max_group)
@@ -504,8 +493,8 @@ class AggregateSmallTensorPacker(object):
 
   def unpack(self, summed_device_grad_packs):
     """Reverse the aggregation process."""
-    return cross_tower_utils.unpack_small_tensors(summed_device_grad_packs,
-                                                  self.packing)
+    return cross_device_utils.unpack_small_tensors(summed_device_grad_packs,
+                                                   self.packing)
 
 
 def _pack_tensors(device_grads,
@@ -567,13 +556,13 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
     self._agg_small_grads_max_group = agg_small_grads_max_group
     super(AllReduceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, aggregation, per_device_value, destinations):
-    contains_indexed_slices = cross_tower_utils.contains_indexed_slices(
-        per_device_value)
-    if (_devices_match(per_device_value, destinations)
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
+        per_replica_value)
+    if (_devices_match(per_replica_value, destinations)
         and not context.executing_eagerly()
         and not contains_indexed_slices):
-      return self._batch_all_reduce(aggregation, [per_device_value])[0]
+      return self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     else:
       if contains_indexed_slices:
         logging.log_first_n(
@@ -583,19 +572,19 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
       if check_destinations(destinations):
         devices = get_devices_from(destinations)
       else:
-        devices = get_devices_from(per_device_value)
+        devices = get_devices_from(per_replica_value)
       reduce_to_device = devices[0]
-      reduced = _simple_reduce(per_device_value, reduce_to_device,
-                               math_ops.add_n, aggregation)
+      reduced = _simple_reduce(per_replica_value, reduce_to_device,
+                               math_ops.add_n, reduce_op)
       return self.broadcast(reduced, devices)
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
     all_devices_match = _all_devices_match(value_destination_pairs)
-    contains_indexed_slices = cross_tower_utils.contains_indexed_slices(
+    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
         value_destination_pairs)
     if (all_devices_match and not context.executing_eagerly()
         and not contains_indexed_slices):
-      return self._batch_all_reduce(aggregation,
+      return self._batch_all_reduce(reduce_op,
                                     [v[0] for v in value_destination_pairs])
     else:
       if not all_devices_match:
@@ -605,20 +594,20 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
                             10)
 
       return [
-          self._reduce(aggregation, t, destinations=v)
+          self._reduce(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
-  def _batch_all_reduce(self, aggregation, per_device_values):
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO, "batch_all_reduce invoked for batches size = %d with "
         "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
         "agg_small_grads_max_group = %d" %
-        (len(per_device_values), self._all_reduce_alg, self._num_packs,
+        (len(per_replica_values), self._all_reduce_alg, self._num_packs,
          self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
-    destinations = per_device_values[0].devices
-    grouped = _group_value_by_device(per_device_values)
+    destinations = per_replica_values[0].devices
+    grouped = _group_value_by_device(per_replica_values)
 
     device_grad_packs, tensor_packer = _pack_tensors(
         grouped, self._num_packs, self._agg_small_grads_max_bytes,
@@ -629,18 +618,18 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
     # the balance on num_splits.
     if self._all_reduce_alg == "nccl":
       # TODO(yuefengz): merge this into the all-reduce library.
-      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
+      reduced = cross_device_utils.aggregate_gradients_using_nccl(
           device_grad_packs)
     else:
       # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
       # order.
       reduced = (
-          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
+          cross_device_utils.aggregate_gradients_using_hierarchical_copy(
               destinations, device_grad_packs))
 
     reduced = _unpack_tensors(reduced, tensor_packer)
-    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
-                                      aggregation)
+    return _ungroup_and_make_mirrored(reduced, per_replica_values[0].devices,
+                                      reduce_op)
 
 
 # For compatibility with code using the old name of `AllReduceCrossDeviceOps`.
@@ -723,18 +712,18 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
           validate_and_complete_spec(spec) for spec in all_reduce_spec
       ]
 
-  def _batch_all_reduce(self, aggregation, per_device_values):
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO,
         "distributed batch_all_reduce invoked for batches size = %d with "
         "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d "
         "and agg_small_grads_max_group = %d" %
-        (len(per_device_values), self._all_reduce_spec, self._num_packs,
+        (len(per_replica_values), self._all_reduce_spec, self._num_packs,
          self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
 
-    destinations = sorted(per_device_values[0].devices)
-    device_grads = _group_value_by_device(per_device_values)
+    destinations = sorted(per_replica_values[0].devices)
+    device_grads = _group_value_by_device(per_replica_values)
 
     # The all reduce library requires fully defined shapes.
     # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
@@ -751,13 +740,13 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
         this_grads = remaining_grads
         remaining_grads = []
       else:
-        (this_grads, remaining_grads) = cross_tower_utils.split_grads_by_size(
+        (this_grads, remaining_grads) = cross_device_utils.split_grads_by_size(
             spec_tuple.limit, remaining_grads)
       if this_grads:
         device_grad_packs, tensor_packer = _pack_tensors(
             this_grads, self._num_packs, self._agg_small_grads_max_bytes,
             self._agg_small_grads_max_group)
-        range_agg_grads = cross_tower_utils.sum_gradients_all_reduce(
+        range_agg_grads = cross_device_utils.sum_gradients_all_reduce(
             self._worker_devices, device_grad_packs, len(self._worker_devices),
             spec_tuple.alg, spec_tuple.shards, range(self._num_gpus_per_worker))
         range_agg_grads = _unpack_tensors(range_agg_grads, tensor_packer)
@@ -771,7 +760,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
     assert not remaining_grads
 
     return _ungroup_and_make_mirrored(aggregated_grads, destinations,
-                                      aggregation)
+                                      reduce_op)
 
 
 # TODO(yuefengz): support in-graph collective all-reduce.
@@ -800,21 +789,21 @@ class CollectiveAllReduce(CrossDeviceOps):
     self._num_workers = num_workers
     self._num_gpus_per_worker = num_gpus_per_worker
     self._all_reduce_merge_scope = all_reduce_merge_scope
-    self._collective_keys = collective_keys or cross_tower_utils.CollectiveKeys(
-    )
+    self._collective_keys = (collective_keys or
+                             cross_device_utils.CollectiveKeys())
     super(CollectiveAllReduce, self).__init__()
 
   # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
-  def _reduce(self, aggregation, per_device_value, destinations):
-    if cross_tower_utils.contains_indexed_slices(per_device_value):
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
     if context.executing_eagerly():
       raise ValueError(
           "Eager execution is not supported for Collective All-Reduce")
 
-    all_reduced = self._batch_all_reduce(aggregation, [per_device_value])[0]
-    if _devices_match(per_device_value, destinations):
+    all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
+    if _devices_match(per_replica_value, destinations):
       return all_reduced
     else:
       index = {}
@@ -829,8 +818,8 @@ class CollectiveAllReduce(CrossDeviceOps):
 
       return value_lib.Mirrored(index)
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if cross_tower_utils.contains_indexed_slices(value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
+    if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
     if context.executing_eagerly():
@@ -839,7 +828,7 @@ class CollectiveAllReduce(CrossDeviceOps):
 
     all_devices_match = _all_devices_match(value_destination_pairs)
     if all_devices_match:
-      return self._batch_all_reduce(aggregation,
+      return self._batch_all_reduce(reduce_op,
                                     [v[0] for v in value_destination_pairs])
     else:
       if not all_devices_match:
@@ -848,11 +837,11 @@ class CollectiveAllReduce(CrossDeviceOps):
             "destinations are different.", 10)
 
       return [
-          self._reduce(aggregation, t, destinations=v)
+          self._reduce(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
-  def _batch_all_reduce(self, aggregation, per_device_values):
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All-reduce across all workers in a batch."""
     if context.executing_eagerly():
       raise ValueError(
@@ -860,9 +849,9 @@ class CollectiveAllReduce(CrossDeviceOps):
 
     logging.log_first_n(
         logging.INFO, "Collective All-reduce invoked with batches size = %d, "
-        "num_workers = %d" % (len(per_device_values), self._num_workers), 10)
+        "num_workers = %d" % (len(per_replica_values), self._num_workers), 10)
 
-    grouped_by_device = _group_value_by_device(per_device_values)
+    grouped_by_device = _group_value_by_device(per_replica_values)
 
     grouped_by_var = list(zip(*grouped_by_device))
     # grouped_by_var is grouped by variables and takes the following format:
@@ -881,7 +870,7 @@ class CollectiveAllReduce(CrossDeviceOps):
       with ops.name_scope("allreduce"):
         for grad_and_vars in chunk:
           scaled_grads = [g for g, _ in grad_and_vars]
-          collective_reduced = cross_tower_utils.build_collective_reduce(
+          collective_reduced = cross_device_utils.build_collective_reduce(
               scaled_grads, self._num_workers, self._collective_keys, "Add",
               "Id")
           result = []
@@ -892,8 +881,8 @@ class CollectiveAllReduce(CrossDeviceOps):
     new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
     return _ungroup_and_make_mirrored(
         new_device_grads,
-        per_device_values[0].devices,
-        aggregation,
+        per_replica_values[0].devices,
+        reduce_op,
         num_between_graph_workers=self._num_workers)
 
 
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/python/distribute/cross_device_utils.py
similarity index 98%
rename from tensorflow/contrib/distribute/python/cross_tower_utils.py
rename to tensorflow/python/distribute/cross_device_utils.py
index 35324d15d4416364698390468d65d442f442ec50..7903992ac7de71bacf377fd223285dda8e5412ab 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for cross_tower_ops."""
+"""Utilities for cross_device_ops."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,8 @@ from __future__ import print_function
 import collections as pycoll
 import threading
 
-from tensorflow.contrib.all_reduce.python import all_reduce
-from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.distribute import all_reduce
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -667,7 +667,5 @@ def contains_indexed_slices(value):
     return any(contains_indexed_slices(v) for v in value)
   elif isinstance(value, value_lib.DistributedValues):
     return contains_indexed_slices(list(value._index.values()))  # pylint: disable=protected-access
-  elif isinstance(value, value_lib.MapOutput):
-    return contains_indexed_slices(value.get())
   else:
     return False
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index 520413102bec27f762acc242a8e2a99a58ed4ce5..c0f9b8a1fdfdf8bd95375f489058cadcd63c9cb9 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -245,7 +245,7 @@ class _WorkerContext(object):
     else:
       session_config = self._session_config
 
-    if not self._strategy or self._strategy.should_init:
+    if not self._strategy or self._strategy.extended.experimental_should_init:
       logging.info("Creating chief session creator with config: %r", config)
       return monitored_session.ChiefSessionCreator(
           scaffold,
@@ -261,6 +261,10 @@ class _WorkerContext(object):
           config=session_config,
           max_wait_secs=max_wait_secs)
 
+  @property
+  def session_config(self):
+    return copy.deepcopy(self._session_config)
+
   @property
   def has_barrier(self):
     """Whether the barrier is set or not."""
@@ -301,15 +305,20 @@ class _WorkerContext(object):
     """Returns number of workers in the cluster, including chief."""
     return self._num_workers
 
+  @property
+  def experimental_should_init(self):
+    """Whether to run init ops."""
+    return self._strategy.extended.experimental_should_init
+
   @property
   def should_checkpoint(self):
     """Whether to save checkpoint."""
-    return self._strategy.should_checkpoint
+    return self._strategy.extended.should_checkpoint
 
   @property
   def should_save_summary(self):
     """Whether to save summaries."""
-    return self._strategy.should_save_summary
+    return self._strategy.extended.should_save_summary
 
 
 def _run_single_worker(worker_fn,
@@ -623,10 +632,10 @@ def run_distribute_coordinator(worker_fn,
   The `strategy` object is expected to be a DistributionStrategy object which
   has implemented methods needed by distributed coordinator such as
   `configure(session_config, cluster_spec, task_type, task_id)` which configures
-  the strategy object for a specific task and `should_init` property which
-  instructs the distribute coordinator whether to run init ops for a task. The
-  distribute coordinator will make a copy of the `strategy` object, call its
-  `configure` method and pass it to `worker_fn` as an argument.
+  the strategy object for a specific task and `experimental_should_init`
+  property which instructs the distribute coordinator whether to run init ops
+  for a task. The distribute coordinator will make a copy of the `strategy`
+  object, call its `configure` method and pass it to `worker_fn` as an argument.
 
   The `worker_fn` defines the training logic and is called under a its own
   worker context which can be accessed to via `get_current_worker_context`. A
@@ -749,7 +758,7 @@ def run_distribute_coordinator(worker_fn,
     # The client must know the cluster but servers in the cluster don't have to
     # know the client.
     if task_type in [_TaskType.CLIENT, None]:
-      if strategy.between_graph:
+      if strategy.extended.experimental_between_graph:
         return _run_between_graph_client(worker_fn, strategy, eval_fn,
                                          eval_strategy, cluster_spec,
                                          session_config, rpc_layer)
@@ -795,7 +804,7 @@ def run_distribute_coordinator(worker_fn,
         environment=environment)
 
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
-      if strategy.between_graph:
+      if strategy.extended.experimental_between_graph:
         # All jobs run `worker_fn` if between-graph.
         _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
                            task_id, session_config, rpc_layer)
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index 5d336648ce97f30dc034b1b42af994830baeffc8..f2cb950aada5a7aea7c239ec822893d56dece0bd 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -47,6 +47,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
+from tensorflow.python.training import session_manager
 
 
 CHIEF = distribute_coordinator._TaskType.CHIEF
@@ -78,46 +79,53 @@ def _strip_protocol(target):
     return target
 
 
-class MockStrategy(object):
+class MockExtended(object):
 
   def __init__(self,
                between_graph=False,
                should_init=None,
                should_checkpoint=None,
                should_save_summary=None):
-    self._between_graph = between_graph
-    self._should_init = should_init
-    self._should_checkpoint = should_checkpoint
-    self._should_save_summary = should_save_summary
+    self.experimental_between_graph = between_graph
+    self.experimental_should_init = should_init
+    self.should_checkpoint = should_checkpoint
+    self.should_save_summary = should_save_summary
 
-  @property
-  def between_graph(self):
-    return self._between_graph
+
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=None,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self.extended = MockExtended(between_graph, should_init, should_checkpoint,
+                                 should_save_summary)
 
   def configure(self,
                 session_config=None,
                 cluster_spec=None,
                 task_type=None,
                 task_id=None):
-    if self._should_init is None:
+    if self.extended.experimental_should_init is None:
       if task_id == 0:
-        self._should_init = True
+        self.extended.experimental_should_init = True
       else:
-        self._should_init = False
-    if self._should_checkpoint is None:
+        self.extended.experimental_should_init = False
+    if self.extended.should_checkpoint is None:
       if task_id == 0:
-        self._should_checkpoint = True
+        self.extended.should_checkpoint = True
       else:
-        self._should_checkpoint = False
-    if self._should_save_summary is None:
+        self.extended.should_checkpoint = False
+    if self.extended.should_save_summary is None:
       if task_id == 0:
-        self._should_save_summary = True
+        self.extended.should_save_summary = True
       else:
-        self._should_save_summary = False
+        self.extended.should_save_summary = False
 
     if session_config:
       if (cluster_spec and task_type and task_id is not None and
-          self._between_graph):
+          self.extended.experimental_between_graph):
         session_config.intra_op_parallelism_threads += 1
         if task_type in ["chief", "worker"]:
           session_config.device_filters.extend(
@@ -126,18 +134,6 @@ class MockStrategy(object):
         session_config.inter_op_parallelism_threads += 1
         session_config.device_filters.append("/job:somejob")
 
-  @property
-  def should_init(self):
-    return self._should_init
-
-  @property
-  def should_checkpoint(self):
-    return self._should_checkpoint
-
-  @property
-  def should_save_summary(self):
-    return self._should_save_summary
-
 
 class MockServer(object):
 
@@ -372,9 +368,12 @@ class DistributeCoordinatorTestBase(test.TestCase):
     context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
 
-    self.assertEqual(context._strategy.should_init, strategy.should_init)
-    self.assertEqual(context.should_checkpoint, strategy.should_checkpoint)
-    self.assertEqual(context.should_save_summary, strategy.should_save_summary)
+    self.assertEqual(context._strategy.extended.experimental_should_init,
+                     strategy.extended.experimental_should_init)
+    self.assertEqual(context.should_checkpoint,
+                     strategy.extended.should_checkpoint)
+    self.assertEqual(context.should_save_summary,
+                     strategy.extended.should_save_summary)
 
     task_type = str(context.task_type)
     task_id = context.task_id or 0
@@ -384,7 +383,8 @@ class DistributeCoordinatorTestBase(test.TestCase):
       while len(self._strategy_property[task_type]) <= task_id:
         self._strategy_property[task_type].append(None)
       self._strategy_property[task_type][task_id] = (
-          context._strategy.should_init, context.should_checkpoint,
+          context._strategy.extended.experimental_should_init,
+          context.should_checkpoint,
           context.should_save_summary)
 
   def _run_mock_std_server(self,
@@ -930,4 +930,14 @@ class RunStandardTensorflowServerTest(test.TestCase):
 if __name__ == "__main__":
   # TODO(yuefengz): find a smart way to terminite std server threads.
   with test.mock.patch.object(sys, "exit", os._exit):
+    # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly.
+    orig_init = session_manager.SessionManager.__init__
+
+    def new_init(*args, **kwargs):
+      kwargs.pop("recovery_wait_secs", None)
+      kwargs["recovery_wait_secs"] = 0.5
+      orig_init(*args, **kwargs)
+
+    session_manager.SessionManager.__init__ = new_init
+
     test.main()
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 227b00fb3e566b9d0adc9a8def9b1785a7128854..549fa8fb8aaaa047402f2bfedda9cb4c648fe861 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -308,7 +308,7 @@ def estimator_train(estimator, train_distributed_fn, hooks):
     raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
                      '`estimator.train`')
 
-  if estimator._config._train_distribute.between_graph:
+  if estimator._config._train_distribute.extended.experimental_between_graph:
     # TODO(yuefengz): remove this limitation once we figure out how to merge
     # return values from `_worker_fn`s.
     raise ValueError('`Estimator.train` API is not supported for %s with '
@@ -356,7 +356,7 @@ def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
     raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
                      '`Estimator.train`')
 
-  if estimator._config._eval_distribute.between_graph:
+  if estimator._config._eval_distribute.extended.experimental_between_graph:
     # TODO(yuefengz): remove this limitation once we figure out how to merge
     # return values from `_worker_fn`s.
     raise ValueError('`Estimator.evaluate` API is not supported for %s with '
diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/python/distribute/input_ops.py
similarity index 93%
rename from tensorflow/contrib/distribute/python/input_ops.py
rename to tensorflow/python/distribute/input_ops.py
index f07ec8234dfe87f2869cd7c2dd6a64c477712d15..c40b2bf27a8db115eae510112ee212d6c065c1be 100644
--- a/tensorflow/contrib/distribute/python/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import ops
@@ -27,9 +28,8 @@ from tensorflow.python.platform import tf_logging
 
 # TODO(priyag): Any other reader datasets to consider here?
 _READER_DATASET_OPS = [
-    "TextLineDataset",
-    "TFRecordDataset",
-    "FixedLengthRecordDataset"
+    "TextLineDataset", "TFRecordDataset", "FixedLengthRecordDataset",
+    "FixedLengthRecordDatasetV2"
 ]
 
 
@@ -75,6 +75,8 @@ def auto_shard_dataset(dataset, num_shards, index):
         # instead of updating in-place.
         return dataset._clone(
             filenames=dataset._filenames.shard(num_shards, index))
+      elif isinstance(dataset, dataset_ops.RangeDataset):
+        return dataset.shard(num_shards, index)
       elif hasattr(dataset, "_map_func"):
         # TODO(priyag): Make this check more robust by enforcing some common
         # property on all map/flatmap/interleave datasets.
@@ -100,6 +102,11 @@ def auto_shard_dataset(dataset, num_shards, index):
               dataset._input_dataset, found_reader_op)
           return dataset
 
+    if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+      dataset._dataset = _auto_shard_impl(
+          dataset._dataset, found_reader_op)
+      return dataset
+
     # TODO(priyag): Make _input_dataset(s) a common property of all datasets to
     # make this check more robust.
     if hasattr(dataset, "_input_dataset"):
diff --git a/tensorflow/contrib/distribute/python/input_ops_test.py b/tensorflow/python/distribute/input_ops_test.py
similarity index 96%
rename from tensorflow/contrib/distribute/python/input_ops_test.py
rename to tensorflow/python/distribute/input_ops_test.py
index 559de97bb1f93f990ddaf775d9203d5a2d46aa99..54f7c5d01211f5ba27f220a3b7ee1257a03e9f51 100644
--- a/tensorflow/contrib/distribute/python/input_ops_test.py
+++ b/tensorflow/python/distribute/input_ops_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.distribute.python import input_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.distribute import input_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
@@ -92,7 +92,7 @@ class AutoShardDatasetTest(test.TestCase):
     with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(record_fn(r, f), sess.run(next_element))
+          self.assertAllEqual(record_fn(r, f), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
@@ -205,10 +205,11 @@ class AutoShardDatasetTest(test.TestCase):
     with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(self._record(r, f), sess.run(next_element))
+          self.assertAllEqual(self._record(r, f), self.evaluate(next_element))
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(self._text_line(r, f), sess.run(next_element))
+          self.assertAllEqual(
+              self._text_line(r, f), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed096b8638cc1871c58a941ff3f5d6e81edf4da
--- /dev/null
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -0,0 +1,805 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class MirroredStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import functools
+import threading
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import shared_variable_creator
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.util import nest
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+@contextlib.contextmanager
+def _enter_graph(g):
+  if context.executing_eagerly():
+    with g.as_default(), context.eager_mode():
+      yield
+  else:
+    with g.as_default():
+      yield
+
+
+def _cpu_device(device):
+  cpu_device = tf_device.DeviceSpec.from_string(device)
+  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
+  return cpu_device.to_string()
+
+
+class _RequestedStop(Exception):  # pylint: disable=g-bad-exception-name
+  pass
+
+
+# _call_for_each_replica and _reduce_non_distributed_value are not members of
+# MirroredStrategy so that they are generally not allowed to use anything
+# specific to MirroredStrategy and thus can be shared with other distribution
+# strategies.
+
+
+# TODO(yuefengz): maybe create a common class for those who need to call this
+# _call_for_each_replica.
+def _call_for_each_replica(distribution, fn, args, kwargs):
+  """Run `fn` in separate threads, once per replica/worker device.
+
+  Args:
+    distribution: the DistributionStrategy object.
+    fn: function to run (will be run once per device, each in its own thread).
+    args: positional arguments for `fn`
+    kwargs: keyword arguments for `fn`.
+
+  Returns:
+    Merged return value of `fn` across all replicas.
+
+  Raises:
+    RuntimeError: If fn() calls get_replica_context().merge_call() a different
+        number of times from the available devices.
+  """
+  # TODO(josh11b): Add this option once we add synchronization to variable
+  # creation. Until then, this is pretty unsafe to use.
+  run_concurrently = False
+  if not context.executing_eagerly():
+    # Needed for per-thread device, etc. contexts in graph mode.
+    ops.get_default_graph().switch_to_thread_local()
+
+  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
+
+  shared_variable_store = {}
+
+  # TODO(isaprykin): Create these threads once instead of during every run()
+  # call.
+  threads = []
+  for index, d in enumerate(distribution.extended.worker_devices):
+    variable_creator_fn = shared_variable_creator.make_fn(
+        shared_variable_store, index)
+    t = MirroredExtended._MirroredReplicaThread(  # pylint: disable=protected-access
+        distribution, coord, d, variable_creator_fn, fn,
+        *values.select_device(d, args), **values.select_device(d, kwargs))
+    threads.append(t)
+
+  for t in threads:
+    t.start()
+
+  # When `fn` starts `should_run` event is set on _MirroredReplicaThread
+  # (`MRT`) threads. The execution waits until
+  # `MRT.has_paused` is set, which indicates that either `fn` is
+  # complete or a `get_replica_context().merge_call()` is called.  If `fn` is
+  # complete, then `MRT.done` is set to True.  Otherwise, arguments
+  # of `get_replica_context().merge_call` from all paused threads are grouped
+  # and the `merge_fn` is performed.  Results of the
+  # `get_replica_context().merge_call` are then set to `MRT.merge_result`.
+  # Each such `get_replica_context().merge_call` call returns the
+  # `MRT.merge_result` for that thread when `MRT.should_run` event
+  # is reset again. Execution of `fn` resumes.
+
+  try:
+    with coord.stop_on_exception():
+      all_done = False
+      while not all_done and not coord.should_stop():
+        done = []
+        if run_concurrently:
+          for t in threads:
+            t.should_run.set()
+          for t in threads:
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        else:
+          for t in threads:
+            t.should_run.set()
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        if coord.should_stop():
+          return None
+        all_done = all(done)
+        if not all_done:
+          if any(done):
+            raise RuntimeError("Some replicas made a different number of "
+                               "replica_context().merge_call() calls.")
+          # get_replica_context().merge_call() case
+          merge_args = values.regroup({t.device: t.merge_args for t in threads})
+          merge_kwargs = values.regroup(
+              {t.device: t.merge_kwargs for t in threads})
+          # We capture the name_scope of the MRT when we call merge_fn
+          # to ensure that if we have opened a name scope in the MRT,
+          # it will be respected when executing the merge function. We only
+          # capture the name_scope from the first MRT and assume it is
+          # the same for all other MRTs.
+          mtt_captured_name_scope = threads[0].captured_name_scope
+          with ops.name_scope(mtt_captured_name_scope):
+            merge_result = threads[0].merge_fn(distribution, *merge_args,
+                                               **merge_kwargs)
+          for t in threads:
+            t.merge_result = values.select_device(t.device, merge_result)
+  finally:
+    for t in threads:
+      t.should_run.set()
+    coord.join(threads)
+
+  return values.regroup({t.device: t.main_result for t in threads})
+
+
+def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
+  """Reduce a non-DistributedValue `value` to `destinations`."""
+  if isinstance(value, values.DistributedValues):
+    raise ValueError("You are passing a `DistributedValue` to "
+                     "`_reduce_non_distributed_value`, which is not allowed.")
+
+  # If the same value is present on all replicas then the PerReplica value will
+  # be a single value. We also handle the case when `value` is a single value
+  # and equal to 0.
+  if value == 0:
+    return 0
+  # If there is only a single value and the reduce op is MEAN,
+  # that value should be on all destinations.
+  if reduce_op == reduce_util.ReduceOp.MEAN:
+    return value
+
+  cross_device_ops_lib.validate_destinations(destinations)
+  # We do not support a reduce op of SUM if the value is the same across
+  # all replicas. We call this as part of assign functions for MirroredVariables
+  # and summing up identical values across replicas is not clearly defined.
+  if (len(extended.worker_devices) != 1 or
+      not cross_device_ops_lib.check_destinations(destinations)):
+    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
+                     "the given reduce op %s." % (value, reduce_op))
+  # TODO(anjalisridhar): Moves these methods to a device utility file?
+  devices = cross_device_ops_lib.get_devices_from(destinations)
+  if len(devices) == 1:
+    with ops.device(devices[0]):
+      return array_ops.identity(value)
+  else:
+    value_updates = {}
+    for d in devices:
+      with ops.device(d):
+        value_updates[d] = array_ops.identity(value)
+    return values.Mirrored(value_updates)
+
+
+def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
+  # Figure out what collections this variable should be added to.
+  # We'll add the MirroredVariable to those collections instead.
+  collections = kwargs.pop("collections", None)
+  if collections is None:
+    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  kwargs["collections"] = []
+
+  # Get synchronization value
+  synchronization = kwargs.get("synchronization",
+                               variable_scope.VariableSynchronization.ON_WRITE)
+  if synchronization == variable_scope.VariableSynchronization.NONE:
+    raise ValueError("`NONE` variable synchronization mode is not "
+                     "supported with `Mirrored` distribution strategy. Please"
+                     " change the `synchronization` for variable: " +
+                     kwargs["name"])
+  elif synchronization == variable_scope.VariableSynchronization.ON_READ:
+    # Variables that are to be synced on read are replica local.
+    is_replica_local = True
+    kwargs["trainable"] = False
+  elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
+        synchronization == variable_scope.VariableSynchronization.AUTO):
+    # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
+    is_replica_local = False
+  else:
+    raise ValueError("Invalid variable synchronization mode: " +
+                     synchronization + " for variable: " + kwargs["name"])
+
+  # Get aggregation value
+  aggregation = kwargs.pop("aggregation",
+                           variable_scope.VariableAggregation.NONE)
+  if aggregation not in (
+      variable_scope.VariableAggregation.NONE,
+      variable_scope.VariableAggregation.SUM,
+      variable_scope.VariableAggregation.MEAN,
+      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
+  ):
+    raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                     " for variable: " + kwargs["name"])
+
+  # Ignore user-specified caching device, not needed for mirrored variables.
+  kwargs.pop("caching_device", None)
+
+  # TODO(josh11b,apassos): It would be better if variable initialization
+  # was never recorded on the tape instead of having to do this manually
+  # here.
+  with tape.stop_recording():
+    index = real_mirrored_creator(devices, *args, **kwargs)
+
+    if is_replica_local:
+      result = values.ReplicaLocalVariable(
+          index, index[devices[0]], aggregation)
+    else:
+      result = values.MirroredVariable(index, index[devices[0]], aggregation)
+
+  # Add the wrapped variable to the requested collections.
+  # The handling of eager mode and the global step matches
+  # ResourceVariable._init_from_args().
+  if not context.executing_eagerly():
+    g = ops.get_default_graph()
+    # If "trainable" is True, next_creator() will add the member variables
+    # to the TRAINABLE_VARIABLES collection, so we manually remove
+    # them and replace with the MirroredVariable. We can't set
+    # "trainable" to False for next_creator() since that causes functions
+    # like implicit_gradients to skip those variables.
+    if kwargs.get("trainable", True):
+      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+      for v in index.values():
+        if v in l:
+          l.remove(v)
+    g.add_to_collections(collections, result)
+  elif ops.GraphKeys.GLOBAL_STEP in collections:
+    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
+
+  return result
+
+
+class MirroredStrategy(distribute_lib.DistributionStrategy):
+  """Mirrors vars to distribute across multiple devices and machines.
+
+  This strategy uses one replica per device and sync replication for its
+  multi-GPU version.
+
+  The multi-worker version will be added in the fture.
+
+  Args:
+    devices: a list of device strings.
+    num_gpus_per_worker: number of GPUs per worker.
+    cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
+      set, nccl will be use by default.
+  """
+
+  def __init__(self,
+               devices=None,
+               num_gpus_per_worker=None,
+               cross_device_ops=None):
+    extended = MirroredExtended(self, devices, num_gpus_per_worker,
+                                cross_device_ops)
+    super(MirroredStrategy, self).__init__(extended)
+
+
+class MirroredExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of MirroredStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               devices=None,
+               num_gpus_per_worker=None,
+               cross_device_ops=None):
+    super(MirroredExtended, self).__init__(container_strategy)
+    self._cross_device_ops = cross_device_ops
+    # Remember num GPUs which might be needed by `configure` method.
+    self._num_gpus = num_gpus_per_worker
+
+    self._initialize_local(self._num_gpus, devices)
+
+  def _initialize_local(self, num_gpus, devices):
+    """Initializes the object for local training."""
+    self._cluster_spec = None
+    # Convert `num_gpus` into `devices`, shouldn't specify both.
+    if devices is None:
+      if num_gpus is None:
+        num_gpus = context.num_gpus()
+      if num_gpus == 0:
+        devices = ["/device:CPU:0"]
+      else:
+        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
+    elif num_gpus is not None:
+      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+    self._num_gpus = num_gpus
+    # TODO(yuefengz): consider setting the default device.
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = [device_util.resolve(d) for d in devices]
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
+
+  def _initialize_multi_worker(self, num_gpus, cluster_spec):
+    """Initializes the object for multi-worker training."""
+    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+    self._cluster_spec = cluster_spec
+
+    self._workers = []
+    for job in ["chief", "worker"]:
+      for task in range(len(cluster_spec.as_dict().get(job, []))):
+        self._workers.append("/job:%s/task:%d" % (job, task))
+
+    if num_gpus is None:
+      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
+    if num_gpus > 0:
+      self._worker_devices = [
+          (worker, [
+              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
+              for gpu in range(num_gpus)
+          ]) for worker in self._workers
+      ]
+    else:
+      self._worker_devices = [
+          (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
+          for worker in self._workers
+      ]
+
+    devices = nest.flatten([l for _, l in self._worker_devices])
+
+    # Setting `_default_device` will add a device scope in the
+    # distribution.scope. We set the default device to the first worker. When
+    # users specify device under distribution.scope by
+    #   with tf.device("/cpu:0"):
+    #     ...
+    # their ops will end up on the cpu device of its first worker, e.g.
+    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
+    self._default_device = self._workers[0]
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = [device_util.resolve(d) for d in devices]
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a mirrored variable. See `DistributionStrategy.scope`."""
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            # We append a / to variable names created on replicas with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+            # Initialize replicas with the same value:
+            def initial_value_fn(device=d):
+              if context.executing_eagerly():
+                init_value = index[devices[0]].value()
+                return array_ops.identity(init_value)
+              else:
+                with ops.device(device):
+                  init_value = index[devices[0]].initial_value
+                  return array_ops.identity(init_value)
+            kwargs["initial_value"] = initial_value_fn
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.DistributedVariable)
+          index[d] = v
+      return index
+
+    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
+                                     **kwargs)
+
+  def _distribute_dataset(self, dataset_fn):
+    if self._cluster_spec:
+      return values.MultiWorkerDataset(
+          functools.partial(self._call_dataset_fn, dataset_fn),
+          self._worker_devices,
+          auto_shard=False)
+    else:
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
+
+  def _make_dataset_iterator(self, dataset):
+    if self._cluster_spec:
+      worker_device_pairs = self._worker_devices
+    else:
+      worker_device_pairs = [("/job:localhost", self._devices)]
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    input_contexts = []
+    if self._cluster_spec:
+      num_workers = len(self._worker_devices)
+      worker_device_pairs = self._worker_devices
+    else:
+      num_workers = 1
+      worker_device_pairs = [("/job:localhost", self._devices)]
+    for i in range(num_workers):
+      input_contexts.append(distribute_lib.InputContext(
+          num_input_pipelines=num_workers,
+          input_pipeline_id=i,
+          num_replicas_in_sync=self._num_replicas_in_sync))
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs, input_contexts)
+
+  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = values.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, fn_inputs)
+      for (name, output) in ctx.last_step_outputs.items():
+        # Convert all outputs to tensors, potentially from `DistributedValues`.
+        ctx.last_step_outputs[name] = self._unwrap(output)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
+      output = last_step_tensor_outputs_dict[name]
+      # For outputs that have already been reduced, wrap them in a Mirrored
+      # container, else in a PerReplica container.
+      if reduce_op is None:
+        last_step_tensor_outputs_dict[name] = values.regroup(
+            {d: t for d, t in zip(self._devices, output)}, values.PerReplica)
+      else:
+        assert len(output) == 1
+        last_step_tensor_outputs_dict[name] = output[0]
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
+    return self._get_cross_device_ops().broadcast(
+        tensor, destinations or self._devices)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    return _call_for_each_replica(self._container_strategy(), fn, args, kwargs)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    del task_type, task_id
+
+    if session_config:
+      session_config.isolate_session_state = True
+
+    if cluster_spec:
+      self._initialize_multi_worker(self._num_gpus, cluster_spec)
+
+    if self._cross_device_ops is None:
+      if self._cluster_spec:
+        # It currently cannot detect the toplogy of remote workers. So we
+        # hard-code the multi-worker all-reduce algorithm for now.
+        if len(self._workers) == 1:
+          # The default is "nccl".
+          self._cross_device_ops = (
+              cross_device_ops_lib.AllReduceCrossDeviceOps())
+        else:
+          # The default is hierarchical reduce and broadcast.
+          self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
+              self._workers, self._num_gpus)
+      else:
+        self._cross_device_ops = cross_device_ops_lib.choose_the_best(
+            self._devices, session_config=session_config)
+
+  def _get_cross_device_ops(self):
+    if self._cross_device_ops is None:
+      self._cross_device_ops = (
+          cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps())
+    return self._cross_device_ops
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    assert not isinstance(value, values.Mirrored)
+    if not isinstance(value, values.DistributedValues):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return _reduce_non_distributed_value(self, reduce_op, value,
+                                           destinations)
+    return self._get_cross_device_ops().reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    return self._get_cross_device_ops().batch_reduce(reduce_op,
+                                                     value_destination_pairs)
+
+  def _update(self, var, fn, args, kwargs, group):
+    # TODO(josh11b): In eager mode, use one thread per device.
+    assert isinstance(var, values.DistributedVariable)
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        # If args and kwargs are not mirrored, the value is returned as is.
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    assert isinstance(colocate_with, list)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d in colocate_with:
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(*values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, group)
+
+  def read_var(self, replica_local_var):
+    """Read the aggregate value of a replica-local variable."""
+    if isinstance(replica_local_var, values.ReplicaLocalVariable):
+      return replica_local_var._get_cross_replica()  # pylint: disable=protected-access
+    assert isinstance(replica_local_var, values.Mirrored)
+    return array_ops.identity(replica_local_var.get())
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_device_set:
+        return [val.get(device=d) for d in self._devices]
+      return [val.get(device=d) for d in sorted(val.devices)]
+    return [val]
+
+  def value_container(self, val):
+    return values.value_container(val)
+
+  @property
+  def _num_replicas_in_sync(self):
+    return len(self._devices)
+
+  @property
+  def worker_devices(self):
+    # Make a copy to prevent users from accidentally mutating our copy.
+    return list(self._devices)
+
+  @property
+  def parameter_devices(self):
+    return list(self._devices)
+
+  @property
+  def experimental_between_graph(self):
+    return False
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return list(self._devices)
+
+  def _get_devices_from(self, colocate_with=None):
+    if colocate_with is None:
+      return self._devices
+    else:
+      return cross_device_ops_lib.get_devices_from(colocate_with)
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
+  class _MirroredReplicaThread(threading.Thread):
+    """A thread that runs() a function on a device."""
+
+    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
+                 **kwargs):
+      super(MirroredExtended._MirroredReplicaThread, self).__init__()  # pylint: disable=protected-access
+      self.coord = coord
+      self.distribution = dist
+      self.device = device
+      self.replica_id = dist.extended.worker_devices.index(device)
+      self.variable_creator_fn = variable_creator_fn
+      # State needed to run and return the results of `fn`.
+      self.main_fn = fn
+      self.main_args = args
+      self.main_kwargs = kwargs
+      self.main_result = None
+      self.done = False
+      # State needed to run the next merge_call() (if any) requested via
+      # ReplicaContext.
+      self.merge_fn = None
+      self.merge_args = None
+      self.merge_kwargs = None
+      self.merge_result = None
+      self.captured_name_scope = None
+      # We use a thread.Event for the main thread to signal when this
+      # thread should start running (`should_run`), and another for
+      # this thread to transfer control back to the main thread
+      # (`has_paused`, either when it gets to a
+      # `get_replica_context().merge_call` or when `fn` returns). In
+      # either case the event starts cleared, is signaled by calling
+      # set(). The receiving thread waits for the signal by calling
+      # wait() and then immediately clearing the event using clear().
+      self.should_run = threading.Event()
+      self.has_paused = threading.Event()
+      # These fields have to do with inheriting various contexts from the
+      # parent thread:
+      # pylint: disable=protected-access
+      self.context_mode = context.context()._eager_context.mode
+      if not context.context()._context_handle:
+        context.context()._initialize_handle_and_devices()
+      self.context_device_policy = (
+          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+              context.context()._context_handle))
+      self.graph = ops.get_default_graph()
+      self._variable_creator_stack = self.graph._variable_creator_stack[:]
+      self._captured_var_scope = variable_scope.get_variable_scope()
+      # Adding a "/" at end lets us re-enter this scope later.
+      self._name_scope = self.graph.get_name_scope()
+      if self._name_scope:
+        self._name_scope += "/"
+      if self.replica_id > 0:
+        if not self._name_scope:
+          self._name_scope = ""
+        self._name_scope += "replica_%d/" % self.replica_id
+
+    def run(self):
+      # pylint: disable=protected-access
+      self.graph._variable_creator_stack = self._variable_creator_stack
+      self.should_run.wait()
+      self.should_run.clear()
+      try:
+        if self.coord.should_stop():
+          return
+        with self.coord.stop_on_exception(), \
+            context.context()._mode(self.context_mode), \
+            context.context().device_policy(self.context_device_policy), \
+            _enter_graph(self.graph), \
+            MirroredReplicaContext(self.distribution, constant_op.constant(
+                self.replica_id, dtypes.int32)), \
+            ops.device(self.device), \
+            ops.name_scope(self._name_scope), \
+            variable_scope.variable_scope(
+                self._captured_var_scope, reuse=self.replica_id > 0), \
+            variable_scope.variable_creator_scope(self.variable_creator_fn):
+          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+          self.done = True
+      finally:
+        self.has_paused.set()
+
+
+class MirroredReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
+
+  Opened in `_MirroredReplicaThread`, to allow the user to invoke
+  `MirroredStrategy`'s specific implementation of `merge_call()`,
+  which works by delegating the function and its arguments to
+  the main thread (the one that invoked
+  `MirroredStrategy.call_for_each_replica()`).
+  """
+
+  def _merge_call(self, fn, args, kwargs):
+    """Delegate to the main thread to actually perform merge_call()."""
+    t = threading.current_thread()  # a _MirroredReplicaThread
+    t.merge_fn = fn
+    t.merge_args = args
+    t.merge_kwargs = kwargs
+    t.captured_name_scope = t.graph.get_name_scope()
+    # Adding a "/" at end lets us re-enter this scope later.
+    if t.captured_name_scope:
+      t.captured_name_scope += "/"
+    t.has_paused.set()
+    t.should_run.wait()
+    t.should_run.clear()
+    if t.coord.should_stop():
+      raise _RequestedStop()
+    return t.merge_result
+
+  @property
+  def devices(self):
+    distribute_lib.require_replica_context(self)
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+    return [self._distribution_strategy.extended.worker_devices[replica_id]]
diff --git a/tensorflow/python/distribute/multi_worker_util.py b/tensorflow/python/distribute/multi_worker_util.py
index 360733eff64606db2c4bde1a83351fb414ff2068..2986a6726a5bc2c837a554892f5aebd09da43c91 100644
--- a/tensorflow/python/distribute/multi_worker_util.py
+++ b/tensorflow/python/distribute/multi_worker_util.py
@@ -45,6 +45,33 @@ def normalize_cluster_spec(cluster_spec):
   return cluster_spec
 
 
+# TODO(yuefengz): add more validations.
+def _validate_cluster_spec(cluster_spec, task_type, task_id):
+  """Validates `cluster_spec`.
+
+  It checks
+  1) whether there is such a task type as `task_type` in the
+  `cluster_spec`.
+  2) whether there is at most one "chief" job.
+  3) whether the `task_id` is smaller than the number of `task_type`.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object to be validated.
+    task_type: string indicating the type of the task.
+    task_id: task_id: the id of the `task_type` in this cluster.
+  Throws:
+    ValueError: if `cluster_spec` fails any check.
+  """
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+  if task_type and task_type not in cluster_spec:
+    raise ValueError("`task_type` %r not found in cluster_spec." % task_type)
+  if len(cluster_spec.get("chief", [])) > 1:
+    raise ValueError("There must be at most one 'chief' job.")
+  if task_id >= len(cluster_spec[task_type]):
+    raise ValueError(
+        "The `task_id` %d exceeds the maximum id of %s." % (task_id, task_type))
+
+
 def is_chief(cluster_spec, task_type, task_id):
   """Returns whether the given task is chief in the cluster.
 
@@ -61,20 +88,73 @@ def is_chief(cluster_spec, task_type, task_id):
     ValueError: if `task_type` is not in the `cluster_spec` or `task_id` exceeds
       the maximum id of the `task_type`.
   """
-  cluster_spec = normalize_cluster_spec(cluster_spec)
-  if task_type not in cluster_spec.jobs:
-    raise ValueError(
-        "The task_type \"%s\" is not in the `cluster_spec`." % task_type)
-  if task_id >= cluster_spec.num_tasks(task_type):
-    raise ValueError("The `task_id` %d exceeds the maximum id of %s." % (
-        task_id, task_type))
+  _validate_cluster_spec(cluster_spec, task_type, task_id)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
 
   if task_type == "chief":
     return True
 
   # If chief not in the cluster_spec, use the first worker as chief. This is
   # common in CollectiveAllReduceStrategy.
-  if ("chief" not in cluster_spec.jobs and task_type == "worker" and
-      task_id == 0):
+  if ("chief" not in cluster_spec and task_type == "worker" and task_id == 0):
     return True
   return False
+
+
+def worker_count(cluster_spec, task_type):
+  """Returns the number of workers in the cluster."""
+  _validate_cluster_spec(cluster_spec, task_type, task_id=0)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+
+  # Other jobs such as "ps" shouldn't call this function.
+  if task_type not in ["chief", "worker", "evaluator"]:
+    raise ValueError("Unexpected `task_type` %r" % task_type)
+
+  if task_type == "evaluator":
+    # The "evaluator" is in its own cluster or its own partition of a cluster.
+    # So we don't have to count "chief" or "worker" if the current task is an
+    # "evaluator".
+    return len(cluster_spec["evaluator"])
+  else:
+    # In the non-evaluator case, we return the total number of "chief" and
+    # "worker" tasks as the "chief" is also a worker.
+    return (len(cluster_spec.get("chief", [])) + len(
+        cluster_spec.get("worker", [])))
+
+
+def id_in_cluster(cluster_spec, task_type, task_id):
+  """Returns a unique id for the task in the `task_type`'s cluster.
+
+  It returns an id ranging from [0, `worker_count(task_type, task_id)`).
+
+  Note: this function assumes that "evaluate" job is in its own cluster or its
+  own partition of a cluster.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object to be validated.
+    task_type: string indicating the type of the task.
+    task_id: the id of the `task_type` in this cluster.
+
+  Returns:
+    an int indicating the unique id.
+
+  Throws:
+    ValueError: if `task_type` is not "chief", "worker" or "evaluator".
+  """
+  _validate_cluster_spec(cluster_spec, task_type, task_id)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+
+  # The "chief" job has always id 0 and there is at most one and "worker" jobs
+  # come after it.
+  if task_type == "chief":
+    return 0
+
+  if task_type == "worker":
+    return task_id + len(cluster_spec.get("chief", []))
+
+  # The "evaluator" is in its own cluster or its own partition of a cluster.
+  if task_type == "evaluator":
+    return task_id
+
+  # We currently don't assign ids to other tasks.
+  raise ValueError("There is no id for task_type %r" % task_type)
diff --git a/tensorflow/python/distribute/multi_worker_util_test.py b/tensorflow/python/distribute/multi_worker_util_test.py
index bdc49725c7751873bed665abd3b24b1722b00525..9e1596eefdf6ee83c3b31ef2ccbf1d0637a6027e 100644
--- a/tensorflow/python/distribute/multi_worker_util_test.py
+++ b/tensorflow/python/distribute/multi_worker_util_test.py
@@ -95,7 +95,7 @@ class IsChiefTest(test.TestCase):
     self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 1))
 
     with self.assertRaisesRegexp(
-        ValueError, "The task_type \"chief\" is not in the `cluster_spec`."):
+        ValueError, "`task_type` 'chief' not found in cluster_spec."):
       multi_worker_util.is_chief(cluster_spec, "chief", 0)
 
     with self.assertRaisesRegexp(
@@ -103,5 +103,94 @@ class IsChiefTest(test.TestCase):
       multi_worker_util.is_chief(cluster_spec, "worker", 2)
 
 
+class NumWorkersTest(test.TestCase):
+
+  def testCountWorker(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="chief"), 3)
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="worker"), 3)
+
+  def testCountEvaluator(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "evaluator": ["127.0.0.1:7566"]
+    }
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="evaluator"), 1)
+
+  def testTaskTypeNotFound(self):
+    cluster_spec = {}
+    with self.assertRaisesRegexp(
+        ValueError, "`task_type` 'worker' not found in cluster_spec."):
+      multi_worker_util.worker_count(cluster_spec, task_type="worker")
+
+  def testCountPs(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    # A "ps" job shouldn't call this method.
+    with self.assertRaisesRegexp(ValueError, "Unexpected `task_type` 'ps'"):
+      multi_worker_util.worker_count(cluster_spec, task_type="ps")
+
+
+class IdInClusterTest(test.TestCase):
+
+  def testChiefId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "chief", 0), 0)
+
+  def testWorkerId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "worker", 1), 2)
+
+    cluster_spec = {
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "worker", 1), 1)
+
+  def testEvaluatorId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "evaluator": ["127.0.0.1:7566"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "evaluator", 0), 0)
+
+  def testPsId(self):
+    cluster_spec = {"chief": ["127.0.0.1:1234"], "ps": ["127.0.0.1:7566"]}
+    with self.assertRaisesRegexp(ValueError,
+                                 "There is no id for task_type 'ps'"):
+      multi_worker_util.id_in_cluster(cluster_spec, "ps", 0)
+
+  def testMultipleChiefs(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:8258", "127.0.0.1:7566"],
+    }
+    with self.assertRaisesRegexp(ValueError,
+                                 "There must be at most one 'chief' job."):
+      multi_worker_util.id_in_cluster(cluster_spec, "chief", 0)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/reduce_util.py b/tensorflow/python/distribute/reduce_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b2a4e9dba81e38e6bb3ea970e390628fe3cb540
--- /dev/null
+++ b/tensorflow/python/distribute/reduce_util.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilites for reduce operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import enum
+
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("distribute.ReduceOp")
+class ReduceOp(enum.Enum):
+  """Indicates how a set of values should be reduced.
+
+  * `SUM`: Add all the values.
+  * `MEAN`: Take the arithmetic mean ("average") of the values.
+
+  TODO(priyag): Add the following types:
+  * `MIN`: Return the minimum of all values.
+  * `MAX`: Return the maximum of all values.
+  """
+
+  SUM = "SUM"
+  MEAN = "MEAN"
+
+  @staticmethod
+  def from_variable_aggregation(aggregation):
+    mapping = {
+        variable_scope.VariableAggregation.SUM: ReduceOp.SUM,
+        variable_scope.VariableAggregation.MEAN: ReduceOp.MEAN,
+    }
+
+    reduce_op = mapping.get(aggregation)
+    if not reduce_op:
+      raise ValueError("Could not convert from `tf.VariableAggregation` %s to"
+                       "`tf.distribute.ReduceOp` type" % aggregation)
+    return reduce_op
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/python/distribute/shared_variable_creator.py
similarity index 100%
rename from tensorflow/contrib/distribute/python/shared_variable_creator.py
rename to tensorflow/python/distribute/shared_variable_creator.py
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/python/distribute/shared_variable_creator_test.py
similarity index 97%
rename from tensorflow/contrib/distribute/python/shared_variable_creator_test.py
rename to tensorflow/python/distribute/shared_variable_creator_test.py
index 2a9ab51fcfd29a8ae5b37b5c513415af29b277dc..4ddc29f256761c2359f0a49415932b53eda066f4 100644
--- a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
+++ b/tensorflow/python/distribute/shared_variable_creator_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variable_scope
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/python/distribute/values.py
similarity index 74%
rename from tensorflow/contrib/distribute/python/values.py
rename to tensorflow/python/distribute/values.py
index 42fb92014a08001d9ed2b6833dac6b1b4efad434..5f69323bffb1ffd62313698455f25c28379f8410 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -23,11 +23,15 @@ from __future__ import print_function
 
 import collections
 import contextlib
+import operator
 import weakref
 import six
 
-from tensorflow.contrib.distribute.python import input_ops
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.distribute import input_ops
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as tf_device
@@ -38,7 +42,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import distribution_strategy_context
@@ -51,7 +54,7 @@ from tensorflow.python.util import nest
 # TODO(josh11b): Should device values be strings or DeviceSpec objects?
 # Not sure DeviceSpec objects are usable as a dict key.
 class DistributedValues(object):
-  """Holds a map from device to values. Either PerDevice or Mirrored."""
+  """Holds a map from device to values. Either PerReplica or Mirrored."""
 
   def __init__(self, index):
     self._index = {device_util.canonicalize(key): value
@@ -62,7 +65,8 @@ class DistributedValues(object):
     if device is None:
       replica_context = distribution_strategy_context.get_replica_context()
       if replica_context:
-        device = replica_context.device
+        # TODO(josh11b): support model parallelism better here
+        device = replica_context.devices[0]
       else:
         device = distribute_lib.get_update_device()
         if device is None:
@@ -75,10 +79,6 @@ class DistributedValues(object):
           ValueError("Device %s not found in %s (current device %s)" %
                      (device, self._index.keys(), device_util.current())), e)
 
-  def on_device(self, device):
-    device = device_util.canonicalize(device)
-    return device in self._index
-
   @property
   def devices(self):
     return list(self._index.keys())
@@ -167,12 +167,12 @@ class DistributedDelegate(DistributedValues):
   # TODO(josh11b): Even more operator overloads.
 
 
-class PerDevice(DistributedValues):
+class PerReplica(DistributedValues):
   """Holds a map from device to unsynchronized values."""
   pass
 
 
-# Note that unlike PerDevice, Mirrored values inherit from
+# Note that unlike PerReplica, Mirrored values inherit from
 # DistributedDelegate and so can be used directly in cross-replica mode.
 class Mirrored(DistributedDelegate):
   """Holds a map from device to values which are kept in sync."""
@@ -319,6 +319,14 @@ class DistributedVariable(DistributedDelegate):
 ops.register_dense_tensor_like_type(DistributedVariable)
 
 
+def _apply_aggregation(strategy, value, aggregation, destinations):
+  if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
+    return strategy.broadcast(strategy.unwrap(value)[0],
+                              destinations=destinations)
+  reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation)
+  return strategy.reduce(reduce_op, value=value, destinations=destinations)
+
+
 class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
   """Class for defining how to restore a MirroredVariable."""
 
@@ -376,14 +384,11 @@ class MirroredVariable(DistributedVariable, Mirrored,
                          "MirroredVariable in Replica Context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, *args, **kwargs)
+          merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -482,7 +487,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     if device is None:
       replica_context = distribution_strategy_context.get_replica_context()
       if replica_context:
-        device = replica_context.device
+        # TODO(josh11b): support model parallelism better here
+        device = replica_context.devices[0]
       else:
         device = distribute_lib.get_update_device()
         if device is None:
@@ -583,7 +589,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
-    if distribution_strategy_context.get_distribution_strategy().__class__.__name__ != "TPUStrategy":
+    strategy = distribution_strategy_context.get_distribution_strategy()
+    if strategy.__class__.__name__ != "TPUStrategy":
       raise ValueError("You may only assign to a TPUMirroredVariable within a "
                        "TPUStrategy.")
     f = kwargs.pop("f")
@@ -615,14 +622,11 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
                          "TPUMirroredVariable in Replica Context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, *args, **kwargs)
+          merge_fn, args=args, kwargs=kwargs)
 
   @contextlib.contextmanager
   def _handle_graph(self, handle):
@@ -776,6 +780,18 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
   def op(self):
     return self._primary_var.op
 
+  # pylint: disable=protected-access
+  @property
+  def _save_slice_info(self):
+    return self._primary_var._save_slice_info
+
+  def _get_save_slice_info(self):
+    return self._primary_var._get_save_slice_info()
+
+  def _set_save_slice_info(self, save_slice_info):
+    return self._primary_var._set_save_slice_info(save_slice_info)
+  # pylint: enable=protected-access
+
   @property
   def _in_graph_mode(self):
     return self._primary_var._in_graph_mode   # pylint: disable=protected-access
@@ -861,7 +877,7 @@ def _assert_replica_context():
         "Replica-local variables may only be assigned in a replica context.")
 
 
-class ReplicaLocalVariable(DistributedVariable, PerDevice,
+class ReplicaLocalVariable(DistributedVariable, PerReplica,
                            checkpointable.CheckpointableBase):
   """Holds a map from device to variables whose values are reduced on save."""
 
@@ -942,9 +958,9 @@ def _devices_match(d1, d2):
   return device_util.canonicalize(d1) == device_util.canonicalize(d2)
 
 
-def regroup(per_device, wrap_class=PerDevice):
-  """Makes device->nest map into a nest of PerDevice/Mirrored values."""
-  items = list(per_device.items())
+def regroup(per_replica, wrap_class=PerReplica):
+  """Makes device->nest map into a nest of PerReplica/Mirrored values."""
+  items = list(per_replica.items())
   assert items
   v0 = items[0][1]  # First value
 
@@ -1005,7 +1021,7 @@ def regroup(per_device, wrap_class=PerDevice):
   # want to return the containing MirroredVariable, after a bunch of
   # sanity checking. In particular, each component should have the
   # same container, and the devices of the variables should match the
-  # keys of the per-device dictionary.
+  # keys of the per-replica dictionary.
   if hasattr(v0, "_distributed_container"):
     # pylint: disable=protected-access
     assert not isinstance(v0, MirroredVariable), (
@@ -1021,11 +1037,11 @@ def regroup(per_device, wrap_class=PerDevice):
     return distributed_container
   # pylint: enable=protected-access
 
-  return wrap_class(per_device)
+  return wrap_class(per_replica)
 
 
 def select_device(device, structured):
-  """Specialize a nest of regular & per-device values for one device."""
+  """Specialize a nest of regular & per-replica values for one device."""
   def _get(x):
     return x.get(device) if isinstance(x, DistributedValues) else x
 
@@ -1047,18 +1063,18 @@ def select_device_mirrored(device, structured):
   return nest.map_structure(_get_mirrored, structured)
 
 
-def update_regroup(strategy, updates, should_group):
+def update_regroup(extended, updates, group):
   """Regroup for an update, with dependencies to ensure all updates execute."""
   regrouped = regroup(updates, Mirrored)
-  if not should_group:
-    return nest.map_structure(strategy.unwrap, regrouped)
+  if not group:
+    return nest.map_structure(extended._unwrap, regrouped)  # pylint: disable=protected-access
   grouped_flat = []
   for u in nest.flatten(regrouped):
     if isinstance(u, DistributedValues):
-      g = strategy.group(u)
+      g = extended._group(u)  # pylint: disable=protected-access
       if u.is_tensor_like:
         # Make sure we run all updates. Without this, something like
-        # session.run(strategy.update(...)) may only update one replica.
+        # session.run(extended.update(...)) may only update one replica.
         index = {}
         for d in u.devices:
           with ops.device(d), ops.control_dependencies([g]):
@@ -1070,8 +1086,8 @@ def update_regroup(strategy, updates, should_group):
   return nest.pack_sequence_as(regrouped, grouped_flat)
 
 
-class PerDeviceDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`."""
+class PerReplicaDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
 
   def __init__(self, iterator, devices, prefetch_on_device=None):
     self._iterator = iterator
@@ -1114,8 +1130,8 @@ class PerDeviceDataIterator(object):
     return self._iterator.output_types
 
 
-class PerDeviceDataset(object):
-  """Like `tf.data.Dataset` split devices, producing `PerDevice` data."""
+class PerReplicaDataset(object):
+  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
 
   def __init__(self, dataset, devices, prefetch_on_device=None):
     self._devices = devices
@@ -1136,20 +1152,20 @@ class PerDeviceDataset(object):
       self._dataset = dataset.batch(len(devices), drop_remainder=True)
 
   def make_one_shot_iterator(self):
-    """Get a one time use iterator for the distributed PerDeviceDataset."""
+    """Get a one time use iterator for the distributed PerReplicaDataset."""
     # Graph mode with one shot iterator is disabled.
     if not context.executing_eagerly():
       raise ValueError("Cannot create a one shot iterator. Please use "
                        "`make_initializable_iterator()` instead.")
     # Eager mode prefetching would error out in constructor. Only remaining
     # case is non-prefetching in eager mode. We delegate to
-    # PerDeviceDataIterator to handle that case.
+    # PerReplicaDataIterator to handle that case.
     dataset_iterator = self._dataset.make_one_shot_iterator()
-    return PerDeviceDataIterator(
+    return PerReplicaDataIterator(
         dataset_iterator, self._devices, prefetch_on_device=False)
 
   def make_initializable_iterator(self):
-    """Get an initializable iterator for the distributed PerDeviceDataset."""
+    """Get an initializable iterator for the distributed PerReplicaDataset."""
     # Eager mode generates already initialized iterators. Hence we cannot create
     # an initializable iterator.
     if context.executing_eagerly():
@@ -1160,7 +1176,7 @@ class PerDeviceDataset(object):
           self._dataset, self._devices)
     else:
       dataset_iterator = self._dataset.make_initializable_iterator()
-    return PerDeviceDataIterator(
+    return PerReplicaDataIterator(
         dataset_iterator,
         self._devices,
         prefetch_on_device=self._prefetch_on_device)
@@ -1169,43 +1185,47 @@ class PerDeviceDataset(object):
 class MultiWorkerDataIterator(object):
   """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
 
-  def __init__(self, iterators, worker_device_map):
+  def __init__(self, iterators, worker_device_pairs):
     """Initialize the MultiWorkerDataIterator object.
 
     Args:
-      iterators: a dict mapping from each worker to an iterator for
-        that worker.
-      worker_device_map: a dict mapping from each worker's devices to a list of
-        devices that belong to this worker.
+      iterators: a list of worker, iterator pairs.
+      worker_device_pairs: a list of (worker's devices, a list of
+        devices that belong to this worker) pairs.
 
     Raises:
-      ValueError: if iterators and worker_device_map are not compatible.
+      ValueError: if iterators and worker_device_pairs are not compatible.
     """
-    self._iterators = iterators
-    self._worker_device_map = worker_device_map
-    if set(self._iterators) != set(self._worker_device_map):
-      raise ValueError("iterators and worker_device_map are not compatible.")
+    if [d for d, _ in iterators] != [d for d, _ in worker_device_pairs]:
+      raise ValueError("iterators and worker_device_pairs are not compatible.")
+    self._workers = [d for d, _ in iterators]
+    self._iterators = [i for _, i in iterators]
+    self._worker_devices = [l for _, l in worker_device_pairs]
 
   @property
   def initializer(self):
     return control_flow_ops.group(
-        [iterator.initializer for iterator in self._iterators.values()])
+        [iterator.initializer for iterator in self._iterators])
 
   def get_iterator(self, worker):
-    return self._iterators.get(worker)
+    for i, w in enumerate(self._workers):
+      if worker == w:
+        return self._iterators[i]
+    return None
 
   @property
   def output_shapes(self):
-    return self._iterators.values()[0].output_shapes
+    return self._iterators[0].output_shapes
 
   @property
   def output_types(self):
-    return self._iterators.values()[0].output_types
+    return self._iterators[0].output_types
 
   def get_next(self, name=None):
     """Scatter the input across hosts and devices."""
     index = {}
-    for worker, iterator in six.iteritems(self._iterators):
+    worker_info = zip(self._workers, self._iterators, self._worker_devices)
+    for worker, iterator, worker_devices in worker_info:
       if name is not None:
         d = tf_device.DeviceSpec.from_string(worker)
         new_name = "%s_%s_%d" % (name, d.job, d.task)
@@ -1214,13 +1234,12 @@ class MultiWorkerDataIterator(object):
       with ops.device(worker):
         data_per_worker = iterator.get_next(name=new_name)
 
-      worker_devices = self._worker_device_map[worker]
-      # Ungroup these per-device value so as to get a flat map from devices to
+      # Ungroup these per-replica value so as to get a flat map from devices to
       # values.
       for d in worker_devices:
         v = select_device(d, data_per_worker)
         if d in index:
-          raise ValueError("Duplicated devices in worker_device_map: %r" % v)
+          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
         index[d] = v
 
     return regroup(index)
@@ -1229,174 +1248,381 @@ class MultiWorkerDataIterator(object):
 class MultiWorkerDataset(object):
   """Like a `tf.data.Dataset` that distributes data to different workers.
 
-  Each worker gets one shard of the input dataset. It is currently not working
-  in
-  eager mode.
+  Each worker gets one shard of the input dataset. This currently does not work
+  in eager mode.
   """
 
-  def __init__(self, dataset_fn, worker_device_map, prefetch_on_device=None,
+  def __init__(self, dataset_fn, worker_device_pairs, prefetch_on_device=None,
                auto_shard=False):
     """Initialize the MultiWorkerDataset object.
 
     Args:
-      dataset_fn: a function that returns a `tf.data.Dataset`.
-      worker_device_map: a dict mapping from each worker to a list of devices
-        that belong to this worker.
+      dataset_fn: a function or a list of functions that returns a
+        `tf.data.Dataset`.
+      worker_device_pairs: a list of (worker, list of devices on that worker)
+        pairs; it must have same length with `dataset_fn` if `dataset_fn` is a
+        list.
       prefetch_on_device: whether to prefetch to devices.
       auto_shard: whether to auto-shard the dataset.
     """
-    self._worker_device_map = worker_device_map
-    self._datasets = {}
+    if isinstance(dataset_fn, list):
+      if len(dataset_fn) != len(worker_device_pairs):
+        raise ValueError("If `dataset_fn` is a list, it must have same length "
+                         "as `worker_device_pairs`")
+      if auto_shard:
+        raise ValueError(
+            "If `dataset_fn` is a list, `auto_shard` is not supported.")
+    self._worker_device_pairs = worker_device_pairs
+    self._datasets = []
     # TODO(yuefengz, priyag): support different set of jobs for input
     # processing.
-    for i, (worker, worker_devices) in enumerate(
-        six.iteritems(worker_device_map)):
+    for i, (worker, worker_devices) in enumerate(worker_device_pairs):
       with ops.device(worker):
-        worker_input = dataset_fn()
-        if auto_shard:
-          worker_input = input_ops.auto_shard_dataset(
-              worker_input, len(worker_device_map), i)
-        self._datasets[worker] = PerDeviceDataset(
+        if isinstance(dataset_fn, list):
+          worker_input = dataset_fn[i]()
+        else:
+          worker_input = dataset_fn()
+          if auto_shard:
+            worker_input = input_ops.auto_shard_dataset(
+                worker_input, len(worker_device_pairs), i)
+        dataset = PerReplicaDataset(
             worker_input, worker_devices, prefetch_on_device=prefetch_on_device)
+        self._datasets.append((worker, dataset))
 
   def make_one_shot_iterator(self):
-    iterators = {}
-    for worker, dataset in six.iteritems(self._datasets):
+    iterators = []
+    for worker, dataset in self._datasets:
       with ops.device(worker):
-        iterators[worker] = dataset.make_one_shot_iterator()
-    return MultiWorkerDataIterator(iterators, self._worker_device_map)
+        iterators.append((worker, dataset.make_one_shot_iterator()))
+    return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
 
   def make_initializable_iterator(self):
-    iterators = {}
-    for worker, dataset in six.iteritems(self._datasets):
+    iterators = []
+    for worker, dataset in self._datasets:
       with ops.device(worker):
-        iterators[worker] = dataset.make_initializable_iterator()
-    return MultiWorkerDataIterator(iterators, self._worker_device_map)
+        iterators.append((worker, dataset.make_initializable_iterator()))
+    return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
 
 
-class _PerKey(object):
-  """Holds data associated by keys."""
+class InputIterator(object):
+  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
 
-  def __init__(self, *index):
-    # pylint: disable=protected-access
-    self._index = list(index)
+  def get_next(self):
+    """Returns the next inputs for all replicas."""
+    raise NotImplementedError("must be implemented in descendants")
 
-  def get(self, iteration):
-    return array_ops.gather(self._index, iteration)
+  def initialize(self):
+    """Initialize the underlying input dataset, when applicable.
 
-  def get_shape(self):
-    return self._index[-1][-1].get_shape()
+    In eager mode, this will create a new iterator and return it.
+    In graph mode, this will initialize the same underlying iterator(s).
 
-  def get_dtype(self):
-    return self._index[-1][-1].dtype
+    Users are required to call this if
+    - This iterator was returned from a call to `make_input_fn_iterator` with an
+      input function that returns a dataset.
+    - Or this iterator was returned from a call to `make_dataset_iterator`.
 
-  def __str__(self):
-    return "%s:%s" % (self.__class__.__name__, self._index)
+    Returns:
+      A list of initialization ops to be executed.
+    """
+    raise NotImplementedError("must be implemented in descendants")
 
-  def __repr__(self):
-    return "%s(%r)" % (self.__class__.__name__, self._index)
 
+class InputIteratorImpl(InputIterator):
+  """Common implementation for all input iterators."""
 
-class PerIteration(_PerKey):
-  """Holds input for multiple iterations at once."""
+  def __init__(self, worker_device_pairs, iterators):
+    if not worker_device_pairs:
+      raise ValueError("Should have at least one worker for input iterator.")
 
-  def __init__(self, *index):
-    # pylint: disable=protected-access
-    super(PerIteration, self).__init__(*[batch._index for batch in index])
+    self._iterators = iterators
+    self._worker_device_pairs = worker_device_pairs
+    self._is_eager = context.executing_eagerly()
 
+  def get_next(self, name=None):
+    """Returns the next input from the iterator for all replicas."""
+    assert self._is_eager == context.executing_eagerly(), (
+        "Iterator should be created and used in same execution mode.")
 
-class Batches(_PerKey):
-  pass
+    index = {}
+    for i, (worker, worker_devices) in enumerate(self._worker_device_pairs):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        data_per_worker = self._iterators[i].get_next(new_name)
 
+      # Ungroup these per-replica value so as to get a flat map from devices to
+      # values.
+      for d in worker_devices:
+        v = select_device(d, data_per_worker)
+        if d in index:
+          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
+        index[d] = v
 
-class MultiIterator(object):
-  """Iterator that returns results of multiple get_next()s."""
+    return regroup(index)
 
-  def __init__(self, dataset_iterator, iterations, batches_per_iteration):
-    self._dataset_iterator = dataset_iterator
-    self._iterations = iterations
-    self._batches_per_iteration = batches_per_iteration
+  def initialize(self):
+    """Initialze underlying iterators.
 
-  def get_next(self, name=None):
-    """Return PerIteration with `iterations x batches_per_iteration` inputs."""
-    data = []
-    for _ in range(self._batches_per_iteration):
-      batch = []
-      for _ in range(self._iterations):
-        batch.append(self._dataset_iterator.get_next(name=name))
-      data.append(batch)
-
-    # Here is an example.  Suppose each get_next returns a tuple of two tensors.
-    # For 3 `iterations` and 2 `batches_per_iteration`, the `data` is:
-    # [[(a,z), (b,y), (c,x)], [(A,Z), (B,Y), (C,X)]]
-    #
-    # After the first `map_structure` it gets transformed to:
-    #  [(Batches(a, A), Batches(z, Z)),
-    #   (Batches(b, B), Batches(y, Y)),
-    #   (Batches(c, C), Batches(x, X))]
-    #
-    # After the second `map_structure` it gets transformed to a tuple of:
-    # (PerIteration([Batches(a, A), Batches(b, B), Batches(c, C)]),
-    #  PerIteration([Batches(z, Z), Batches(y, Y), Batches(x, X)]))
-
-    data = nest.map_structure(Batches, *data)
-    data = nest.map_structure(PerIteration, *data)
-
-    return data
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    assert self._is_eager == context.executing_eagerly(), (
+        "Iterator should be created and used in same execution mode.")
 
+    init_ops = []
+    for it in self._iterators:
+      init_ops.extend(it.initialize())
+    return init_ops
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
   @property
-  def initializer(self):
-    return self._dataset_iterator.initializer
+  def output_classes(self):
+    return self._iterators[0].output_classes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  def get_iterator(self, worker):
+    for i, (w, _) in enumerate(self._worker_device_pairs):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
 
+class InputFunctionIterator(InputIteratorImpl):
+  """Iterator created from input function."""
 
-class PerIterationDataset(object):
-  """A dataset that returns MultiIterators."""
+  def __init__(self, input_fn, worker_device_pairs, input_contexts):
+    """Make an iterator for input provided via an input function.
 
-  def __init__(self, dataset, iterations, batches_per_iteration):
+    Currently implements PER_WORKER mode, in which the `input_fn` is called
+    once on each worker.
+
+    TODO(priyag): Add other replication modes.
+    TODO(priyag): Allow taking input function that returns a callable that
+    returns nest of tensors.
+
+    Args:
+      input_fn: Input function that returns a `tf.data.Dataset` object.
+      worker_device_pairs: A list of (worker, list of devices on that worker)
+        pairs.
+      input_contexts: A list of `InputContext` instances to be passed to call(s)
+        to `input_fn`. Length and order should match worker order in
+        `worker_device_pairs`.
+    """
+    if len(worker_device_pairs) != len(input_contexts):
+      raise ValueError(
+          "Number of worker_device_pairs (%d) is not same as number of"
+          "input_contexts (%d)" % (
+              len(worker_device_pairs), len(input_contexts)))
+
+    iterators = []
+    for (worker, devices), ctx in zip(worker_device_pairs, input_contexts):
+      # TODO(priyag): We should probably explicitly specify CPU device on worker.
+      with ops.device(worker):
+        result = input_fn(ctx)
+        if not isinstance(result, dataset_ops.Dataset):
+          raise ValueError("input_fn must return a tf.data.Dataset.")
+        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        iterators.append(iterator)
+
+    super(InputFunctionIterator, self).__init__(
+        worker_device_pairs, iterators)
+
+
+class DatasetIterator(InputIteratorImpl):
+  """Iterator created from input dataset."""
+
+  def __init__(self, dataset, worker_device_pairs, split_batch_by=None):
+    """Make an iterator for the dataset on given devices.
+
+    If `split_batch_by` is not None, we "split" each batch of the
+    dataset by `split_batch_by` value. To achieve this, we first unbatch the
+    input dataset and then rebatch it with the per replica batch size that is
+    calculated using `global_batch_size // split_batch_by`.
+    The currently supported datasets are as follows:
+    `dataset.batch()` is the last operation on the dataset OR
+    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
+    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
+    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
+
+    TODO(priyag): Support multi worker / host cases properly by cloning
+    and sharding the dataset on each worker. Current setup will only work in
+    some cases, such as in-graph multi worker GPU case. If the input pipeline
+    has random shuffling (with a different seed on each worker), each worker
+    will see random input from the same overall dataset in each step. Otherwise,
+    each worker will see the same input in each step.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be used as the input source.
+      worker_device_pairs: A list of (worker, list of devices on that worker)
+        pairs.
+      split_batch_by: Optional integer. If present, we "split" each batch of the
+        dataset by `split_batch_by` value.
+    """
+    if split_batch_by:
+      dataset = _split_dataset_batch(dataset, split_batch_by)
+
+    iterators = []
+    for worker, worker_devices in worker_device_pairs:
+      with ops.device(worker):
+        iterator = _SingleWorkerDatasetIterator(dataset, worker, worker_devices)
+        iterators.append(iterator)
+
+    super(DatasetIterator, self).__init__(worker_device_pairs, iterators)
+
+
+class _SingleWorkerDatasetIterator(object):
+  """Iterator for a single `tf.data.Dataset`."""
+
+  def __init__(self, dataset, worker, devices):
+    """Create iterator for the `dataset` to fetch data to worker's `devices` .
+
+    `MultiDeviceIterator` is used to prefetch input to the devices on the
+    given worker. `MultiDeviceIterator` doesn't work in eager mode yet.
+
+    Args:
+      dataset: A `tf.data.Dataset` instance.
+      worker: Worker on which ops should be created.
+      devices: Distribute data from `dataset` to these devices.
+    """
     self._dataset = dataset
-    self._iterations = iterations
-    self._batches_per_iteration = batches_per_iteration
+    self._worker = worker
+    self._devices = devices
+    self._is_eager = context.executing_eagerly()
+    self._make_iterator()
+
+  def _make_iterator(self):
+    """Make appropriate iterator on the dataset."""
+    with ops.device(self._worker):
+      if self._is_eager:
+        # TODO(rohanj): Enable prefetching in eager mode.
+        # TODO(priyag): Measure the performance of this approach vs calling
+        # get_next on the original dataset N times.
+        dataset = self._dataset.batch(len(self._devices), drop_remainder=True)
+        iterator = dataset.make_one_shot_iterator()
+      else:
+        iterator = multi_device_iterator_ops.MultiDeviceIterator(
+            self._dataset, self._devices)
+    self._iterator = iterator
 
-  def make_one_shot_iterator(self):
-    iterator = self._dataset.make_one_shot_iterator()
-    return MultiIterator(iterator, self._iterations,
-                         self._batches_per_iteration)
+  def get_next(self, name=None):
+    """Get next element from the underlying iterator."""
+    with ops.device(self._worker):
+      if self._is_eager:
+        # Batched dataset case.
+        batch = self._iterator.get_next(name=name)
+        index = {}
+        for i, d in enumerate(self._devices):
+          index[d] = nest.map_structure(operator.itemgetter(i), batch)
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+      else:
+        # MultiDeviceIterator case.
+        data_list = self._iterator.get_next()
+        index = dict(zip(self._devices, data_list))
 
-  def make_initializable_iterator(self):
-    iterator = self._dataset.make_initializable_iterator()
-    return MultiIterator(iterator, self._iterations,
-                         self._batches_per_iteration)
+      return regroup(index)
 
+  def initialize(self):
+    """Initialze underlying iterator.
 
-class MapOutput(object):
-  """Map can result in multiple outputs per device."""
+    In eager execution, this simply recreates the underlying iterator.
+    In graph execution, it returns the initializer ops for the underlying
+    iterator.
 
-  def __init__(self, l):
-    self._l = l
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    if self._is_eager:
+      self._make_iterator()
+      return []
+    else:
+      return [self._iterator.initializer]
 
-  def get(self):
-    return self._l
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+def _split_dataset_batch(dataset, split_batch_by):
+  """Divide a batch-ed dataset's batches into smaller batches."""
+  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+  # pylint: disable=protected-access
+  def _get_batch_dataset(d):
+    """Get the underlying batch dataset from the dataset object."""
+    if isinstance(d, dataset_ops.DatasetV1Adapter):
+      d = d._dataset
+
+    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+      return d
+    elif isinstance(d, dataset_ops.PrefetchDataset):
+      return _get_batch_dataset(d._input_dataset)
+    raise ValueError(
+        "Unable to get batched dataset from the input dataset. `batch` "
+        "`map_and_batch` need to be the last operations on the dataset. "
+        "The batch operations can be followed by a prefetch.")
+
+  batched_dataset = _get_batch_dataset(dataset)
+  batch_size = batched_dataset._batch_size
+  drop_remainder = batched_dataset._drop_remainder
+  # pylint: enable=protected-access
+
+  if tensor_util.is_tensor(batch_size):
+    batch_size = tensor_util.constant_value(batch_size)
+
+  if tensor_util.is_tensor(drop_remainder):
+    drop_remainder = tensor_util.constant_value(drop_remainder)
+
+  if batch_size % split_batch_by:
+    raise ValueError(
+        "Batch size %s cannot be sharded evenly across replicas %s" % (
+            batch_size, split_batch_by))
+  new_batch_size = batch_size // split_batch_by
+
+  dataset = dataset.apply(batching.unbatch())
+  return dataset.batch(new_batch_size, drop_remainder=drop_remainder)
 
 
 class MultiStepContext(object):
   """A context object that can be used to capture things when running steps.
 
   This context object is useful when running multiple steps at a time using the
-  `run_steps_on_dataset` API. For e.g. it allows the user's step function to
-  specify which outputs to emit at what frequency. Currently it supports
-  capturing output from the last step, as well as capturing non tensor outputs.
-  In the future it will be augmented to support other use cases such as output
-  each N steps.
+  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
+  function to specify which outputs to emit at what frequency. Currently it
+  supports capturing output from the last step, as well as capturing non tensor
+  outputs.  In the future it will be augmented to support other use cases such
+  as output each N steps.
   """
 
   def __init__(self):
-    """Initializes an output context.
+    """Initialize an output context.
 
     Returns:
       A context object.
     """
     self._last_step_outputs = {}
-    self._last_step_outputs_aggregations = {}
+    self._last_step_outputs_reduce_ops = {}
     self._non_tensor_outputs = {}
 
   @property
@@ -1406,8 +1632,8 @@ class MultiStepContext(object):
     Keys in the dictionary are names of tensors to be captured, as specified
     when `set_last_step_output` is called.
     Values in the dictionary are the tensors themselves. If
-    `set_last_step_output` was called with an `aggregation` for this output,
-    then the value is the aggregated value.
+    `set_last_step_output` was called with a `reduce_op` for this output,
+    then the value is the reduced value.
 
     Returns:
       A dictionary with last step outputs.
@@ -1420,8 +1646,7 @@ class MultiStepContext(object):
       raise ValueError("Need a dictionary to set last_step_outputs.")
     self._last_step_outputs = outputs
 
-  def set_last_step_output(self, name, output,
-                           aggregation=variables_lib.VariableAggregation.NONE):
+  def set_last_step_output(self, name, output, reduce_op=None):
     """Set `output` with `name` to be outputted from the last step.
 
     Args:
@@ -1429,39 +1654,38 @@ class MultiStepContext(object):
         name.
       output: The tensors that should be outputted with `name`. See below for
         actual types supported.
-      aggregation: Aggregation method to use to aggregate outputs from multiple
-        replicas. Required if `set_last_step_output` is called in a replica context.
-        Optional in cross_replica_context.
-        When present, the outputs from all the replicas are aggregated using the
+      reduce_op: Reduction method to use to reduce outputs from multiple
+        replicas. Required if `set_last_step_output` is called in a replica
+        context. Optional in cross_replica_context.
+        When present, the outputs from all the replicas are reduced using the
         current distribution strategy's `reduce` method. Hence, the type of
         `output` must be what's supported by the corresponding `reduce` method.
-        For e.g. if using MirroredStrategy and aggregation is set, output
-        must be a `PerDevice` value.
-        The aggregation method is also recorded in a dictionary
-        `_last_step_outputs_aggregations` for later interpreting of the
+        For e.g. if using MirroredStrategy and reduction is set, output
+        must be a `PerReplica` value.
+        The reduce method is also recorded in a dictionary
+        `_last_step_outputs_reduce_ops` for later interpreting of the
         outputs as already reduced or not.
-
     """
     if distribution_strategy_context.get_cross_replica_context():
-      self._last_step_outputs_aggregations[name] = aggregation
-      if aggregation is variables_lib.VariableAggregation.NONE:
+      self._last_step_outputs_reduce_ops[name] = reduce_op
+      if reduce_op is None:
         self._last_step_outputs[name] = output
       else:
         distribution = distribution_strategy_context.get_distribution_strategy()
         self._last_step_outputs[name] = distribution.reduce(
-            aggregation, output, destinations="/device:CPU:0")
+            reduce_op, output, destinations="/device:CPU:0")
     else:
-      assert aggregation is not variables_lib.VariableAggregation.NONE
+      assert reduce_op is not None
       def merge_fn(distribution, value):
         self._last_step_outputs[name] = distribution.reduce(
-            aggregation, value, destinations="/device:CPU:0")
+            reduce_op, value, destinations="/device:CPU:0")
         # Setting this inside the `merge_fn` because all replicas share the same
         # context object, so it's more robust to set it only once (even if all
         # the replicas are trying to set the same value).
-        self._last_step_outputs_aggregations[name] = aggregation
+        self._last_step_outputs_reduce_ops[name] = reduce_op
 
       distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, output)
+          merge_fn, args=(output,))
 
   @property
   def non_tensor_outputs(self):
@@ -1475,14 +1699,14 @@ class MultiStepContext(object):
     else:
       def merge_fn(distribution, value):
         # NOTE(priyag): For non tensor outputs, we simply return all the values
-        # in a list as aggregation doesn't make sense on non tensors.
+        # in a list as reduction doesn't make sense on non tensors.
         self._non_tensor_outputs[name] = distribution.unwrap(value)
       distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, output)
+          merge_fn, args=(output,))
 
 
 def value_container(val):
-  """Returns the container that this per-device `value` belongs to.
+  """Returns the container that this per-replica `value` belongs to.
 
   Args:
     val: A value returned by `call_for_each_replica()` or a variable
@@ -1528,8 +1752,8 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
         # We are calling an assign function in an update context.
         return f(self._v, *args, **kwargs)
 
-      # We are calling an assign function in cross replica context, wrap it in an
-      # update call.
+      # We are calling an assign function in cross replica context, wrap it in
+      # an update call.
       return distribution_strategy_context.get_distribution_strategy().update(
           self, f, *args, **kwargs)
     else:
@@ -1543,14 +1767,11 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
                          "a variable in Replica Context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, *args, **kwargs)
+          merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 99e3dc0de1861965a23e64903d844c6d87482a1f..d3457ed2417c98fc2b45a770ff1ce24bcf97d78a 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -18,7 +18,7 @@ cc_library(
         "pywrap_tfe.h",
     ],
     visibility = [
-        "//learning/deepmind/courier:__pkg__",
+        "//learning/deepmind/courier:__subpackages__",
         "//tensorflow:internal",
     ],
     deps = [
@@ -114,9 +114,11 @@ cuda_py_test(
         ":backprop",
         ":context",
         ":test",
+        "//third_party/py/numpy",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:resource_variable_ops",
@@ -143,6 +145,55 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "function_argument_naming_test",
+    size = "medium",
+    srcs = ["function_argument_naming_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "function_defun_collection_test",
+    size = "medium",
+    srcs = ["function_defun_collection_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "function_gradients_test",
+    size = "medium",
+    srcs = ["function_gradients_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":context",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+    ],
+    shard_count = 5,
+    tags = [
+        "no_windows",
+    ],
+)
+
 cuda_py_test(
     name = "function_test",
     size = "medium",
@@ -152,7 +203,6 @@ cuda_py_test(
         ":context",
         ":def_function",
         ":function",
-        ":tape",
         ":test",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:test_ops",
@@ -163,7 +213,10 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
-    shard_count = 20,
+    shard_count = 15,
+    tags = [
+        "no_windows",
+    ],
 )
 
 py_library(
@@ -288,6 +341,7 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:execute",
         "//tensorflow/python/eager:tape",
+        "//tensorflow/python/ops/parallel_for:control_flow_ops",
         "@six_archive//:six",
     ],
 )
@@ -408,16 +462,29 @@ py_library(
     deps = [
         ":context",
         ":function",
+        ":lift_to_graph",
         "//tensorflow/python:cond_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:while_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
         "//tensorflow/python/training/checkpointable:base",
     ],
 )
 
+py_library(
+    name = "lift_to_graph",
+    srcs = ["lift_to_graph.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
 py_test(
     name = "def_function_test",
     srcs = ["def_function_test.py"],
@@ -438,6 +505,7 @@ py_library(
     deps = [
         ":context",
         ":function",
+        ":lift_to_graph",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:template",
         "//tensorflow/python:variable_scope",
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 844c9b52e7fda6e6582201448ee576d9de752223..84b61f47c12b0dc654152be232d70b8ce86ac909 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -42,9 +42,20 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
 
+# Note that we need to lazy load the following two modules to avoid creating
+# circular dependencies.
+# TODO(b/119775953): fix the circular dependencies.
+pfor_ops = LazyLoader(
+    "pfor_ops", globals(),
+    "tensorflow.python.ops.parallel_for.control_flow_ops")
+
+function = LazyLoader("function", globals(),
+                      "tensorflow.python.eager.function")
+
 _op_attr_type_cache = {}
 
 
@@ -776,6 +787,8 @@ class GradientTape(object):
         context.context().end_step()
       except AttributeError:
         pass
+      except TypeError:
+        pass
 
   def watch(self, tensor):
     """Ensures that `tensor` is being traced by this tape.
@@ -935,3 +948,101 @@ class GradientTape(object):
 
     grad = nest.pack_sequence_as(sources, flat_grad)
     return grad
+
+  def jacobian(self,
+               target,
+               sources,
+               unconnected_gradients=UnconnectedGradients.NONE,
+               experimental_use_pfor=True):
+    """Computes the jacobian using operations recorded in context of this tape.
+
+    See http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant for the
+    definition of a Jacobian.
+
+    Example usage:
+
+    with tf.GradientTape() as g:
+      x  = tf.constant([1.0, 2.0])
+      g.watch(x)
+      y = x * x
+    jacobian = g.jacobian(y, x)
+    # jacobian value is [[2., 0.], [0., 4.]]
+
+    Args:
+      target: Tensor to be differentiated.
+      sources: a list or nested structure of Tensors or Variables. `target`
+        will be differentiated against elements in `sources`.
+      unconnected_gradients: a value which can either hold 'none' or 'zero' and
+        alters the value which will be returned if the target and sources are
+        unconnected. The possible values and effects are detailed in
+        'UnconnectedGradients' and it defaults to 'none'.
+      experimental_use_pfor: If true, vectorizes the jacobian computation. Else
+        falls back to a sequential while_loop. Vectorization can sometimes fail
+        or lead to excessive memory usage. This option can be used to disable
+        vectorization in such cases.
+
+    Returns:
+      a list or nested structure of Tensors (or IndexedSlices, or None),
+      one for each element in `sources`. Returned structure is the same as
+      the structure of `sources`.
+
+    Raises:
+      RuntimeError: If called on a non-persistent tape with eager execution
+        enabled and without enabling experimental_use_pfor.
+      ValueError: If vectorization of jacobian computation fails.
+    """
+    flat_sources = nest.flatten(sources)
+    target_static_shape = target.shape
+    target_shape = array_ops.shape(target)
+    # Note that we push and pop the tape here and below. This is needed since we
+    # need gradients through the enclosed operations.
+    self._push_tape()
+    target = array_ops.reshape(target, [-1])
+    self._pop_tape()
+
+    def loop_fn(i):
+      self._push_tape()
+      y = array_ops.gather(target, i)
+      self._pop_tape()
+      grad = self.gradient(y, flat_sources,
+                           unconnected_gradients=unconnected_gradients)
+      return grad
+
+    try:
+      target_size = int(target.shape[0])
+    except TypeError:
+      target_size = array_ops.shape(target)[0]
+
+    if experimental_use_pfor:
+      def f():
+        return pfor_ops.pfor(loop_fn, target_size)
+      if context.executing_eagerly():
+        f = function.defun(f)
+      try:
+        output = f()
+      except ValueError as err:
+        # TODO(agarwal): Fold this error message into err.
+        logging.error("Encountered an exception while vectorizing the jacobian "
+                      "computation. Vectorization can be disabled by setting "
+                      "experimental_use_pfor to False.")
+        raise err
+    else:
+      if context.executing_eagerly():
+        if not self._persistent:
+          raise RuntimeError(
+              "GradientTape must be created with persistent=True"
+              " to compute the jacobian with eager execution enabled and with "
+              " experimental_use_pfor set to False.")
+      output = pfor_ops.for_loop(
+          loop_fn, [target.dtype] * len(flat_sources), target_size)
+
+    for i, out in enumerate(output):
+      if out is not None:
+        new_shape = array_ops.concat(
+            [target_shape, array_ops.shape(out)[1:]], axis=0)
+        out = array_ops.reshape(out, new_shape)
+        if context.executing_eagerly():
+          out.set_shape(target_static_shape.concatenate(flat_sources[i].shape))
+      output[i] = out
+
+    return nest.pack_sequence_as(sources, output)
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 31ba6ad9b8d246b7d671953c6ac398a37a99de3b..237b7f304e0cf4b305956e0e2b2925e97de45478 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers.pooling import max_pooling3d
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
@@ -214,7 +215,7 @@ class BackpropTest(test.TestCase):
       self.assertAllClose(tf_grad.values.eval(), grad.values)
 
       tf_opt.apply_gradients([(tf_grad, tf_embedding)]).run()
-      expected = tf_embedding.eval()
+      expected = self.evaluate(tf_embedding)
     opt.apply_gradients([(grad, embedding)])
     self.assertAllClose(expected, embedding.read_value())
 
@@ -232,6 +233,68 @@ class BackpropTest(test.TestCase):
     self.assertTrue(ordered_variables[0] is v0)
     self.assertTrue(ordered_variables[1] is v1)
 
+  def testTapeNoOpGradient(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x
+    self.assertEqual(t.gradient(y, x).numpy(), 1.0)
+
+  def testTapeIdentityGradientIsIdentity(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = array_ops.identity(x)
+    self.assertEqual(t.gradient(y, x).numpy(), 1.0)
+
+  def testTapeGradientMultiTargetOneIsSource(self):
+    x = constant_op.constant(2.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x*x
+    self.assertEqual(t.gradient([x, y], x).numpy(), 5.0)
+
+  def testTapeNoOpGradientWithMultiTargetAllSource(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x
+    self.assertEqual(t.gradient([y, y], x).numpy(), 2.0)
+
+  def testTapeNoOpGradientWithMultiTargetMultiSource(self):
+    x = constant_op.constant(3.0)
+    y = constant_op.constant(5.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      t.watch(y)
+      z = y * y
+    self.assertAllEqual(t.gradient([x, y, z], [x, y]), [1.0, 11.0])
+
+  def testTapeNoOpOnVariableIsIdentity(self):
+    v0 = resource_variable_ops.ResourceVariable(1.0)
+    with backprop.GradientTape() as t:
+      y = v0.read_value()
+    self.assertEqual(t.gradient(y, v0).numpy(), 1.0)
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testTapeNoOpGradient2By2(self):
+    a_2_by_2 = constant_op.constant(2.0, shape=[2, 2])
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+    dy_dy = tape.gradient(a_2_by_2, [a_2_by_2])[0]
+    self.assertAllEqual(dy_dy.numpy(),
+                        constant_op.constant(1.0, shape=[2, 2]).numpy())
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testTapeNoOpGradientMultiTarget2By2(self):
+    a_2_by_2 = constant_op.constant(2.0, shape=[2, 2])
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+    dy_dy = tape.gradient([a_2_by_2, a_2_by_2], [a_2_by_2])[0]
+    self.assertAllEqual(dy_dy.numpy(),
+                        constant_op.constant(2.0, shape=[2, 2]).numpy())
+
   def testTapeStopRecording(self):
     with backprop.GradientTape() as t:
       x = resource_variable_ops.ResourceVariable(1.0)
@@ -1140,5 +1203,105 @@ class BackpropTest(test.TestCase):
       g = f(c)
     self.assertAllEqual(self.evaluate(t.gradient(g, c)), 4.0)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testMaxPooling3DGradient(self):
+
+    def forward(a):
+      r = max_pooling3d(a, pool_size=pool_size, strides=strides, padding='SAME')
+      return r
+
+    input_sizes = [1, 3, 2, 4, 1]
+    pool_size = (2, 2, 1)
+    strides = (1, 1, 1)
+
+    total_size = np.prod(input_sizes)
+    x = np.arange(1, total_size + 1, dtype=np.float32)
+    aa = constant_op.constant(x, shape=input_sizes, dtype=dtypes.float32)
+    da = backprop.gradients_function(forward)(aa)
+
+    if not context.executing_eagerly():
+      tf_aa = constant_op.constant(x, shape=input_sizes, dtype=dtypes.float32)
+      tf_max = max_pooling3d(
+          tf_aa, pool_size=pool_size, strides=strides, padding='SAME')
+      tf_da = gradients.gradients(tf_max, [tf_aa])
+      self.assertAllEqual(da[0], tf_da[0].eval())
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class JacobianTest(test.TestCase):
+
+  def _jacobian(self, experimental_use_pfor):
+    persistent = context.executing_eagerly and not experimental_use_pfor
+    with backprop.GradientTape(persistent=persistent) as g:
+      x = constant_op.constant([1., 2.])
+      y = constant_op.constant([3., 4.])
+      g.watch(x)
+      g.watch(y)
+      z = x * x * y
+    jacobian = g.jacobian(z, [x, y],
+                          experimental_use_pfor=experimental_use_pfor)
+    answer = [array_ops.diag(2 * x * y), array_ops.diag(x * x)]
+    return jacobian, answer
+
+  def testPfor(self):
+    jacobian, answer = self._jacobian(experimental_use_pfor=True)
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  def testWhileLoop(self):
+    jacobian, answer = self._jacobian(experimental_use_pfor=False)
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  def testPforDefun(self):
+
+    @function.defun
+    def _f():
+      return self._jacobian(experimental_use_pfor=True)
+
+    jacobian, answer = _f()
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  def testWhileLoopDefun(self):
+
+    @function.defun
+    def _f():
+      return self._jacobian(experimental_use_pfor=False)
+
+    jacobian, answer = _f()
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  def testPersistentTape(self):
+    if not context.executing_eagerly():
+      return
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([1.0, 2.0])
+      g.watch(x)
+      y = x * x
+    with self.assertRaisesRegexp(RuntimeError, 'persistent'):
+      g.jacobian(y, x, experimental_use_pfor=False)
+
+  def testPforException(self):
+    var = variables.Variable([1.])
+
+    @custom_gradient.custom_gradient
+    def op(x):
+      def grad(_):
+        # Note that we perform a stateful operation here that will not be
+        # compatible with parallel for construct.
+        with ops.control_dependencies(
+            [var.assign(random_ops.random_uniform([1]))]):
+          return constant_op.constant(1.)
+      return x, grad
+
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([1., 2.])
+      g.watch(x)
+      y = op(x)
+    with self.assertRaisesRegexp(ValueError, 'No converter'):
+      g.jacobian(y, x, experimental_use_pfor=True)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 7913ccf969e1bcea23ca08f80dad6e957eb7c5a4..886715867c8312283811f28e748b14296f668954 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -224,6 +224,18 @@ class MicroBenchmarks(test.Benchmark):
     self._benchmark_create_tensor(
         np.array([[3]], dtype=np.int32), dtypes.int32.as_datatype_enum, GPU)
 
+  def benchmark_index_tensor_with_literal(self):
+    func = lambda: constant_op.constant([3.0])[0]
+    self._run(func, 30000)
+
+  def benchmark_index_tensor_with_tensor(self):
+    func = lambda idx=constant_op.constant(0): constant_op.constant([3.0])[idx]
+    self._run(func, 30000)
+
+  def benchmark_index_tensor_with_np_array(self):
+    func = lambda idx=np.array(0): constant_op.constant([3.0])[idx]
+    self._run(func, 30000)
+
   def _benchmark_np_multiply(self, m, num_iters):
     a = m.cpu().numpy()
     func = lambda: a * a
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 0986c4b9a6cfa7de55a01eaa3c3431a83438c568..2f6b038dda945f20fa610a94e02b0dfb59dcab25 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -25,7 +25,6 @@ import random
 import threading
 
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python import tf2
 from tensorflow.python.framework import c_api_util
@@ -81,6 +80,55 @@ class _EagerTensorCache(object):
     self._data = {}
 
 
+class FunctionCallOptions(object):
+  """Options applied at call sites of eager functions.
+  Eager functions are functions decorated with tf.contrib.eager.defun.
+  """
+
+  def __init__(self, executor_type=None, config_proto=None):
+    """Constructor.
+
+    Args:
+      executor_type: (optional) name of the executor to be used to execute the
+        eager function. If None or an empty string, the default Tensorflow
+        executor will be used.
+      config_proto: (optional) a `config_pb2.ConfigProto` proto or
+        a serialized string of that proto.
+        The config used by Grappler when optimizing the function graph.
+        Each concrete function is optimized the first time is called. Changing
+        config_proto after the first call has no effect.
+        If config_proto is None, an empty RewriterConfig will be used.
+    """
+    self.config_proto_serialized = config_proto
+    self.executor_type = executor_type
+
+  @property
+  def executor_type(self):
+    return self._executor_type
+
+  @executor_type.setter
+  def executor_type(self, executor_type):
+    self._executor_type = executor_type
+
+  @property
+  def config_proto_serialized(self):
+    return self._config_proto_serialized
+
+  @config_proto_serialized.setter
+  def config_proto_serialized(self, config):
+    if isinstance(config, config_pb2.ConfigProto):
+      self._config_proto_serialized = config.SerializeToString()
+    elif isinstance(config, str):
+      self._config_proto_serialized = config
+    elif config is None:
+      self._config_proto_serialized = (
+          config_pb2.ConfigProto().SerializeToString())
+    else:
+      raise ValueError("the rewriter config must be either a "
+                       "config_pb2.ConfigProto, or a serialized string of that "
+                       "proto or None. got: {}".format(type(config)))
+
+
 # TODO(agarwal): better name ?
 class _EagerContext(threading.local):
   """Thread local eager context."""
@@ -101,13 +149,12 @@ class _EagerContext(threading.local):
 
     # Default rewriter config corresponds to turning all default grappler
     # optimizations on.
-    base_config = rewriter_config_pb2.RewriterConfig()
+    base_config = config_pb2.ConfigProto()
 
-    if config is not None and config.HasField(
-        "graph_options") and config.graph_options.HasField("rewrite_options"):
-      base_config.Merge(config.graph_options.rewrite_options)
+    if config is not None:
+      base_config.MergeFrom(config)
 
-    self.rewriter_config = base_config.SerializeToString()
+    self.function_call_options = FunctionCallOptions(config_proto=base_config)
 
 
 ContextSwitch = collections.namedtuple(
@@ -372,36 +419,6 @@ class Context(object):
       if mode == EAGER_MODE:
         self.context_switches.pop()
 
-  @tf_contextlib.contextmanager
-  def rewriter_config(self, rewriter_config_=None):
-    """A context manager to allow setting the grappler rewrite options.
-
-    Args:
-      rewriter_config_: A tensorflow.RewriterConfig proto object.
-
-    Yields:
-      Nothing.
-
-    Raises:
-      ValueError: if rewriter_config is not a tensorflow.RewriterConfig proto.
-    """
-    if rewriter_config_ is None or not isinstance(
-        rewriter_config_, rewriter_config_pb2.RewriterConfig):
-      raise ValueError("Must pass a rewriter_config proto")
-
-    ctx = self._eager_context
-    old_rewriter_config = ctx.rewriter_config
-    ctx.rewriter_config = rewriter_config_.SerializeToString()
-    try:
-      yield
-    finally:
-      ctx.rewriter_config = old_rewriter_config
-
-  @property
-  def rewriter_config_string(self):
-    """Returns the serialized rewriter_config for the current thread."""
-    return self._eager_context.rewriter_config
-
   def executing_eagerly(self):
     """Returns True if current thread has eager executing enabled."""
     return self._eager_context.is_eager
@@ -530,6 +547,35 @@ class Context(object):
     finally:
       self.set_execution_mode(old_mode)
 
+  def get_function_call_options(self):
+    """Returns function call options for current thread.
+
+    Note that the returned object is still referenced by the eager context.
+
+    Returns: the FunctionCallOptions for current thread.
+    """
+    return self._eager_context.function_call_options
+
+  @tf_contextlib.contextmanager
+  def function_call_options(self, set_options_func):
+    """Context manager for setting function call options of current thread.
+
+    Args:
+      set_options_func: A callable that takes one argument of type
+        FunctionCallOptions. It should set the properties of that
+        FunctionCallOptions.
+
+    Yields:
+      Nothing.
+    """
+    current_options = self.get_function_call_options()
+    old_options = copy.copy(current_options)
+    try:
+      set_options_func(current_options)
+      yield
+    finally:
+      self._eager_context.function_call_options = old_options
+
   def async_wait(self):
     """Waits for ops dispatched in ASYNC mode to finish."""
     pywrap_tensorflow.TFE_ContextAsyncWait(self._handle)
@@ -782,6 +828,25 @@ def execution_mode(mode):
   return context().execution_mode(mode)
 
 
+@tf_export("experimental.function_executor_type")
+def function_executor_type(executor_type):
+  """Context manager for setting the executor of eagar defined functions.
+
+  Eager defined functions are functions decorated by tf.contrib.eager.defun.
+
+  Args:
+    executor_type: a string for the name of the executor to be used
+    to execute functions defined by tf.contrib.eager.defun.
+
+  Returns:
+    Context manager for setting the executor of eager defined functions.
+  """
+  def _set_options_func(options):
+    options.executor_type = executor_type
+
+  return context().function_call_options(_set_options_func)
+
+
 def async_wait():
   """Waits for ops dispatched in ASYNC mode to finish."""
   return context().async_wait()
@@ -827,9 +892,23 @@ def export_run_metadata():
   return context().export_run_metadata()
 
 
-def rewriter_config(rewriter_config_):
-  """Context manager for setting the grappler rewrite config."""
-  return context().rewriter_config(rewriter_config_)
+def function_config_proto(config_proto):
+  """Context manager for setting the grappler rewrite config.
+
+  This config is used by Grappler when optimizing the function graph.
+
+  Args:
+    config_proto: a `config_pb2.ConfigProto` proto or
+      a serialized string of that proto or None. If None, the default instance
+      of `config_pb2.ConfigProto` will be used.
+
+  Returns:
+    A context manager.
+  """
+  def _set_options_func(options):
+    options.config_proto_serialized = config_proto
+
+  return context().function_call_options(_set_options_func)
 
 
 def set_server_def(server_def):
diff --git a/tensorflow/python/eager/core.py b/tensorflow/python/eager/core.py
index 8fb69300209d74a164c38654d737432cdfb7884a..e168b4bd5ffedc0b7a244a9c190f2c50726105e4 100644
--- a/tensorflow/python/eager/core.py
+++ b/tensorflow/python/eager/core.py
@@ -60,4 +60,15 @@ class _FallbackException(Exception):
   pass
 
 
+class _SymbolicException(Exception):
+  """Exception class to handle use of symbolic tensors when executing eagerly.
+
+  `keras.Input()` creates symbolic tensors (in a FuncGraph managed by the
+  Keras backend) while in eager execution. This exception is used to
+  identify this case (raised in `convert_to_tensor` cause generated functions
+  for ops to construct graphs instead of executing the kernel).
+  """
+  pass
+
+
 pywrap_tensorflow.TFE_Py_RegisterFallbackExceptionClass(_FallbackException)
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index a56ddeea81b48a3e74f8e145e24a90cf53ca848f..52830d41bf3731527e61c90e668b92030c65750c 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -19,12 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import functools
 import weakref
 
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as function_lib
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -32,65 +32,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import tf_decorator
-
-
-def _graph_inputs(op):
-  return [x.op for x in op.inputs] + list(op.control_inputs)
-
-
-def _lift_to_graph(init_tensor, graph):
-  """Copies the tensor and all its inputs recursively to the outer graph."""
-  # Check that the initializer does not depend on any placeholders.
-  visited_ops = set([])
-  ops_to_visit = [init_tensor.op]
-  op_outputs = collections.defaultdict(set)
-  while ops_to_visit:
-    op = ops_to_visit.pop()
-    if op in visited_ops:
-      continue
-    visited_ops.add(op)
-    # TODO(apassos) distinguish arg placeholders, capture placeholders,
-    # and placeholders the user might directly use to initialize
-    # variables.
-    if op.type == "Placeholder":
-      raise ValueError(
-          "Unable to lift tensor", init_tensor,
-          "because it depends transitively on placeholder ", op)
-    for inp in _graph_inputs(op):
-      op_outputs[inp].add(op)
-      if inp not in visited_ops:
-        ops_to_visit.append(inp)
-  # Topologically sort the nodes we've extracted. Now we know how many of their
-  # outputs are part of this subgraph.
-  ops_to_copy = []
-  marked_ops = set([])
-  ops_to_visit = [init_tensor.op]
-  while ops_to_visit:
-    op = ops_to_visit.pop()
-    if op in marked_ops:
-      continue
-    marked_ops.add(op)
-    ops_to_copy.append(op)
-    for inp in _graph_inputs(op):
-      if all(x in marked_ops for x in op_outputs[inp]):
-        ops_to_visit.append(inp)
-  assert len(ops_to_copy) == len(visited_ops)
-  # ops_to_copy now holds a reverse topologically sorted list of ops which
-  # ends in the initializer. We copy those to the outermost graph and
-  # build the initialization op there.
-  with graph.as_default():
-    op_map = {}
-    for op in reversed(ops_to_copy):
-      copied_inputs = [op_map[x] for x in op.inputs]
-      copied_control_inputs = [op_map[x] for x in op.control_inputs]
-      with ops.control_dependencies(copied_control_inputs):
-        copied_op = graph.create_op(
-            op.type, copied_inputs, [x.dtype for x in op.outputs],
-            attrs=op.node_def.attr)
-      op_map[op] = copied_op
-      for i, o in enumerate(op.outputs):
-        op_map[o] = copied_op.outputs[i]
-    return op_map[init_tensor]
+from tensorflow.python.util.tf_export import tf_export
 
 
 class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
@@ -110,6 +52,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                name=None,
                dtype=None,
                constraint=None,
+               add_initializers_to=None,
                **unused_kwargs):
     """Creates a variable.
 
@@ -140,6 +83,9 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      add_initializers_to: if not None and not in legacy graph mode, the
+        initializer tensor will be added to this map instead of adding the
+        assignment to the function.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
@@ -206,7 +152,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       if self._in_graph_mode:
         with ops.init_scope():
           outer_graph = ops.get_default_graph()
-        lifted_initializer = _lift_to_graph(initial_value, outer_graph)
+        lifted_initializer = lift_to_graph.lift_to_graph(
+            initial_value, outer_graph)[initial_value]
         with ops.init_scope():
           self._initial_value = lifted_initializer
           with ops.name_scope("IsInitialized"):
@@ -224,21 +171,24 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
             self._graph_element = value
           ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self)
       else:
-        def assign_fn():
-          with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
-            resource_variable_ops.assign_variable_op(
-                self._handle,
-                initial_value,
-                name=n)
-            # Returning values to keep tf.cond happy.
-          return ops.convert_to_tensor(1)
-        def not_assign_fn():
-          return ops.convert_to_tensor(0)
-        # Note: this cond is always guaranteed to run because we're inside a
-        # defun which will insert automatic control dependencies.
-        control_flow_ops.cond(
-            resource_variable_ops.var_is_initialized_op(self._handle),
-            not_assign_fn, assign_fn)
+        if add_initializers_to is not None:
+          add_initializers_to[self] = initial_value
+        else:
+          def assign_fn():
+            with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+              resource_variable_ops.assign_variable_op(
+                  self._handle,
+                  initial_value,
+                  name=n)
+              # Returning values to keep tf.cond happy.
+            return ops.convert_to_tensor(1)
+          def not_assign_fn():
+            return ops.convert_to_tensor(0)
+          # Note: this cond is always guaranteed to run because we're inside a
+          # defun which will insert automatic control dependencies.
+          control_flow_ops.cond(
+              resource_variable_ops.var_is_initialized_op(self._handle),
+              not_assign_fn, assign_fn)
 
     # After the handle has been created, set up a way to clean it up when
     # executing eagerly. We'll hold the only reference to the deleter, so that
@@ -264,7 +214,7 @@ class PolymorphicFunction(object):
                python_function,
                name,
                input_signature=None,
-               autograph=False,
+               autograph=True,
                experimental_autograph_options=None):
     """Initializes a polymorphic function.
 
@@ -298,26 +248,26 @@ class PolymorphicFunction(object):
   def _defun_with_scope(self, scope):
     """Creates a defun wrapped inside a variable creator scope."""
 
-    fn = self._python_function
-
     def wrapped_fn(*args, **kwds):
       with variable_scope.variable_creator_scope(scope):
-        return fn(*args, **kwds)
+        # __wrapped__ allows AutoGraph to swap in a converted function.
+        return wrapped_fn.__wrapped__(*args, **kwds)
 
     # TODO(mdan): Pipe self._experimental_autograph_options through.
     return function_lib.defun(
-        tf_decorator.make_decorator(fn, wrapped_fn),
+        tf_decorator.make_decorator(self._python_function, wrapped_fn),
         input_signature=self._input_signature,
-        experimental_autograph=self._autograph)
+        autograph=self._autograph)
 
-  def _initialize(self, args, kwds):
+  def _initialize(self, args, kwds, add_initializers_to=None):
     """Initializes, on the first call."""
 
     self._created_variables = []
 
     def variable_capturing_scope(unused_next_creator, **kwds):
       """Creates UnliftedInitializerVariables and saves references to them."""
-      v = UnliftedInitializerVariable(**kwds)
+      v = UnliftedInitializerVariable(
+          add_initializers_to=add_initializers_to, **kwds)
       self._created_variables.append(weakref.ref(v))
       return v
 
@@ -336,7 +286,12 @@ class PolymorphicFunction(object):
 
     self._stateless_fn = self._defun_with_scope(invalid_creator_scope)
     self._stateless_fn._name = self._name  # pylint: disable=protected-access
-    return self._stateful_fn._canonicalize_function_inputs(*args, **kwds)  # pylint: disable=protected-access
+    if self._input_signature is None or args or kwds:
+      return self._stateful_fn._canonicalize_function_inputs(*args, **kwds)  # pylint: disable=protected-access
+    # If an input signature is defined, we may need to fetch a concrete function
+    # without any inputs specified. In this case args and kwds should be ignored
+    # but running _canonicalize_function_inputs would raise an exception.
+    return (), {}
 
   def __call__(self, *args, **kwds):
     """Calls the graph function."""
@@ -387,6 +342,42 @@ class PolymorphicFunction(object):
     """The python function wrapped in this tf.function."""
     return self._python_function
 
+  def get_initialization_function(self, *args, **kwargs):
+    """Returns a `Function` object which initializes this function's variables.
+
+    Requires that this function hasn't been accessed yet through either calling
+    it or calling get_concrete_function. Fails if we cannot build an initializer
+    function which does not depend on the concrete values of the inputs to this
+    function.
+
+    Args:
+      *args: arguments to the underlying python callable.
+      **kwargs: keyword arguments to the python callable.
+
+    Returns:
+      A `Function` object which initializes the variables of this function.
+
+    Raises:
+      RuntimeError: if called after the variables have been initialized.
+    """
+    if self._stateful_fn is not None:
+      raise RuntimeError(
+          "get_initialization_function cannot be called after the function "
+          "has been used")
+    # Here we trace the function, collect the initializers, and attempt to
+    # extract them and run them eagerly. Fail only if we cannot do so.
+    initializer_map = {}
+    self._initialize(args, kwargs, add_initializers_to=initializer_map)
+
+    # Note: using defun here avoids an infinite recursion.
+    @function_lib.defun
+    def initialize_variables():
+      for v, init in initializer_map.items():
+        v.assign(lift_to_graph.lift_to_graph(
+            init, ops.get_default_graph())[init])
+
+    return initialize_variables.get_concrete_function()
+
   def get_concrete_function(self, *args, **kwargs):
     """Returns a `Function` object specialized to inputs and execution context.
 
@@ -463,14 +454,10 @@ class PolymorphicFunction(object):
     Raises:
       ValueError: if this object has not yet been called on concrete values.
     """
-    # TODO(apassos) figure out how to handle this case (what should we return
-    # here?)
+    assert context.executing_eagerly()
     if self._stateful_fn is None:
-      raise ValueError(
-          "Call this function with concrete values before asking for a"
-          " concrete function. Calling the function will ensure that, in"
-          " case this function creates variables, that those are properly"
-          " initialized.")
+      self.get_initialization_function(*args, **kwargs)()
+
     if self._created_variables:
       # In this case we have created variables on the first call, so we run the
       # defunned version which is guaranteed to never create variables.
@@ -512,11 +499,207 @@ class PolymorphicFunction(object):
     return self._descriptor_cache[instance]
 
 
+# In TensorFlow 1.x, exported as tf.contrib.eager.function
+@tf_export("function", v1=[])
 def function(func=None,
              input_signature=None,
-             autograph=False,
+             autograph=True,
              experimental_autograph_options=None):
-  """Defines a function as per the "functions, not sessions" document."""
+  """Creates a callable TensorFlow graph from a Python function.
+
+  `function` constructs a callable that executes a TensorFlow graph
+  (`tf.Graph`) created by tracing the TensorFlow operations in `func`.
+  This allows the TensorFlow runtime to apply optimizations and exploit
+  parallelism in the computation defined by `func`.
+
+  _Example Usage_
+
+  ```python
+  def f(x, y):
+    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+
+  g = tf.function(f)
+
+  x = tf.constant([[2.0, 3.0]])
+  y = tf.constant([[3.0, -2.0]])
+
+  # `f` and `g` will return the same value, but `g` will be executed as a
+  # TensorFlow graph.
+  assert f(x, y).numpy() == g(x, y).numpy()
+
+  # Tensors and tf.Variables used by the Python function are captured in the
+  # traced graph.
+  @tf.function
+  def h():
+    return f(x, y)
+
+  assert (h().numpy() == f(x, y).numpy()).all()
+  ```
+
+  _Referencing `tf.Variable`s_
+
+  The Python function `func` may reference stateful objects (such as
+  `tf.Variable`).
+  These are captured as implicit inputs to the callable returned by `function`.
+  For example:
+
+  ```python
+  c = tf.Variable(0)
+
+  @tf.function
+  def f(x):
+    c.assign_add(1)
+    return x + tf.to_float(c)
+
+  assert int(c) == 0
+  assert f(1.0) == 3.0
+  assert int(c) == 1
+  assert f(1.0) == 4.0
+  assert int(c) == 2
+  ```
+
+  `function` can be applied to methods of an object. For example:
+
+  ```python
+  class Dense(object):
+    def __init__(self):
+      self.W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
+      self.b = tf.Variable(tf.zeros(10))
+
+    @tf.function
+    def compute(self, x):
+      return tf.matmul(x, self.W) + self.b
+
+  d1 = Dense()
+  d2 = Dense()
+  x = tf.random_uniform((10, 10))
+  # d1 and d2 are using distinct variables
+  assert not (d1.compute(x).numpy() == d2.compute(x).numpy()).all()
+  ```
+
+  _Usage with `tf.keras`_
+
+  The `call` methods of a `tf.keras.Model` subclass can be decorated with
+  `function` in order to apply graph execution optimizations on it.
+  For example:
+
+  ```python
+  class MyModel(tf.keras.Model):
+    def __init__(self, keep_probability=0.2):
+      super(MyModel, self).__init__()
+      self.dense1 = tf.keras.layers.Dense(4)
+      self.dense2 = tf.keras.layers.Dense(5)
+      self.keep_probability = keep_probability
+
+    @tf.function
+    def call(self, inputs, training=True):
+      y = self.dense2(self.dense1(inputs))
+      if training:
+        return tf.nn.dropout(y, self.keep_probability)
+      else:
+        return y
+
+  model = MyModel()
+  model(x, training=True)  # executes a graph, with dropout
+  model(x, training=False) # executes a graph, without dropout
+  ```
+
+  _Input Signatures_
+  `function` instantiates a separate graph for every unique set of input
+  shapes and datatypes. For example, the following code snippet will result
+  in three distinct graphs being traced, as each input has a different
+  shape.
+
+  ```python
+  @tf.function
+  def f(x): return tf.add(x, 1.)
+
+  scalar = tf.constant(1.0)
+  vector = tf.constant([1.0, 1.0])
+  matrix = tf.constant([[3.0]])
+
+  f(scalar)
+  f(vector)
+  f(matrix)
+  ```
+
+  An "input signature" can be optionally provided to `function` to control
+  the graphs traced. The input signature specifies the shape and type of each
+  `Tensor` argument to the function using a `tf.TensorSpec` object. For example,
+  the following code snippet ensures that a single graph is created where the
+  input `Tensor` is required to be a floating point tensor with no restrictions
+  on shape.
+
+  ```python
+  @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+  def f(x): return tf.add(x, 1.)
+  ```
+
+  When an `input_signature` is specified, the callable will only accept `Tensor`
+  (or NumPy `ndarray`) objects as arguments.
+
+  _Tracing_
+  Note that `function` only traces TensorFlow operations, all the other
+  Python code that `func` executes will shape the _construction_ of the graph.
+  For example, consider the following:
+
+  ```python
+  import numpy as np
+
+  def add_noise():
+    return tf.eye(5) + np.random.randn(5, 5)
+
+  traced = tf.function(add_noise)
+  ```
+
+  `add_noise()` will return a different output every time it is invoked.
+  However, `traced` will return the same value every time it is called, since a
+  particular random value generated by the `np.random.randn` call will be
+  inserted in the traced TensorFlow graph as a constant. In this particular
+  example, replacing `np.random.randn(5, 5)` with `tf.random_normal((5, 5))`
+  will result in the same behavior for `add_noise()` and `traced()`.
+
+  _Python Side-Effects_
+  A corollary of the previous discussion on tracing is the following: If a
+  Python function `func` has Python side-effects, then executing `func` multiple
+  times
+  may not be semantically equivalent to executing `F = tf.function(func)`
+  multiple times; this difference is due to the fact that `function` only
+  captures the subgraph of TensorFlow operations that is constructed when `func`
+  is invoked to trace a graph.
+
+  Args:
+    func: function to be compiled. If `func` is None, returns a decorator that
+      can be invoked with a single argument - `func`. The end result is
+      equivalent to providing all the arguments up front. In other words,
+      `tf.function(input_signature=...)(func)` is equivalent to
+      `tf.function(func, input_signature=...)`. The former can be used to
+      decorate Python functions, for example:
+        @tf.function(input_signature=...)
+        def foo(...): ...
+    input_signature: A possibly nested sequence of `tf.TensorSpec` objects
+      specifying the shapes and dtypes of the Tensors that will be supplied to
+      this function. If `None`, a separate function is instantiated for each
+      inferred input signature.  If input_signature is specified, every input to
+      `func` must be a `Tensor`, and `func` cannot accept `**kwargs`.
+    autograph: Whether autograph should be applied on `func` before tracing a
+      graph. This allows for dynamic control flow (Python if's, loops etc.)
+      in the traced graph. See https://www.tensorflow.org/guide/autograph for
+        more information.
+    experimental_autograph_options: Experimental knobs (in the form of a tuple
+      of tensorflow.autograph.Feature values) to control behavior when
+      autograph=True.
+
+  Returns:
+     If `func` is not None, returns a callable that will execute the compiled
+     function (and return zero or more `tf.Tensor` objects).
+     If `func` is None, returns a decorator that, when invoked with a single
+     `func` argument, returns a callable equivalent to the case above.
+
+  Raises:
+    TypeError: If `input_signature` is neither `None` nor a sequence of
+      `TensorSpec` objects.
+  """
   if input_signature is not None:
     function_lib.validate_signature(input_signature)
 
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 543dcd19ae8450f8c680a351ef7c935625a9a4cf..54991344b75832655b3113dbb823dee5f93458e1 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import tensor_spec
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
@@ -106,6 +107,23 @@ class DefFunctionTest(test.TestCase):
 
     self.assertAllEqual(fn(constant_op.constant(1.0)), 2.0)
 
+  def testFunctionInitializationFunction(self):
+
+    state = []
+
+    @def_function.function
+    def fn(x):
+      if not state:
+        state.append(variables.Variable(2.0))
+      return state[0] * x
+
+    init_fn = fn.get_initialization_function(constant_op.constant(1.0))
+    self.assertEqual(len(state), 1)
+    self.assertFalse(
+        resource_variable_ops.var_is_initialized_op(state[0].handle))
+    init_fn()
+    self.assertEqual(state[0].numpy(), 2.0)
+
   def testVariableInitializerNotConstant(self):
 
     state = []
@@ -131,9 +149,9 @@ class DefFunctionTest(test.TestCase):
 
       result = fn(3.0)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(sess.run(state[0]), 2.0)
-      self.assertAllEqual(sess.run(result), 6.0)
+      self.assertAllEqual(self.evaluate(result), 6.0)
 
   def testLegacyGraphModeVariablesNonTrivialInitializer(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -150,9 +168,9 @@ class DefFunctionTest(test.TestCase):
 
       result = fn(3.0)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(sess.run(state[0]), 6.0)
-      self.assertAllEqual(sess.run(result), 18.0)
+      self.assertAllEqual(self.evaluate(result), 18.0)
 
   def testLegacyGraphModeInputDependentInitializerFails(self):
     with ops.Graph().as_default():
@@ -195,6 +213,19 @@ class DefFunctionTest(test.TestCase):
     model = _ModelWithOptimizer()
     model(x, y)
 
+  def test_concrete_function_from_signature(self):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    def compute(x):
+      return 2. * x
+
+    concrete = compute.get_concrete_function()
+    self.assertAllClose(1., concrete(constant_op.constant(0.5)))
+    concrete = compute.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.float32))
+    self.assertAllClose(4., concrete(constant_op.constant(2.)))
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index f9b8d2cb5db9aedcd834afcde00dac3afa4008bb..6f8c780170cc8e3bfe5aa23603c0448e70b5e49c 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -64,6 +64,16 @@ def quick_execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
     else:
       message = e.message
     six.raise_from(core._status_to_exception(e.code, message), None)
+  except TypeError as e:
+    if any(ops._is_keras_symbolic_tensor(x) for x in inputs):
+      if any(isinstance(x, ops.EagerTensor) for x in inputs):
+        raise TypeError("You are attempting to mix computation of symbolic "
+                        "Tensors (computation rooted at tf.keras.Input()) "
+                        "and concrete values. This is not supported. "
+                        "If you need this support, file an issue on the "
+                        "TensorFlow GitHub repository.")
+      raise core._SymbolicException
+    raise e
   # pylint: enable=protected-access
   return tensors
 
@@ -188,7 +198,10 @@ def args_to_matching_eager(l, ctx, default_dtype=None):
     ret = []
     for t in l:
       ret.append(internal_convert_to_tensor(
-          t, dtype, preferred_dtype=default_dtype, ctx=ctx))
+          t, dtype,
+          preferred_dtype=default_dtype,
+          ctx=ctx,
+          accept_symbolic_tensors=False))
       if dtype is None:
         dtype = ret[-1].dtype
   else:
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index a12bbb792f9b64a729301cc5ed18957a7ee59381..bc92a0c97454684c2e71fe1d9d289a2da3640e0f 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -48,6 +48,7 @@ from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -66,6 +67,11 @@ WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
     BACKWARD_FUNCTION_ATTRIBUTE_NAME
 ]
 
+CacheKey = collections.namedtuple("CacheKey", [
+    "input_signature", "parent_graph", "device_functions", "colocation_stack",
+    "uses_xla"
+])
+
 
 def _parse_func_attrs(attributes):
   """Convert the keyword arguments into function_def attributes.
@@ -254,12 +260,14 @@ class _EagerDefinedFunction(object):
         raise ValueError(
             "Arguments and signature arguments do not match: %s %s " %
             (len(args), len(list(self.signature.input_arg))))
+      function_call_options = ctx.get_function_call_options()
       outputs = functional_ops.partitioned_call(
           args=args,
           f=self,
           tout=self._output_types,
           executing_eagerly=executing_eagerly,
-          config=ctx.rewriter_config_string)  # pylint: disable=protected-access
+          config=function_call_options.config_proto_serialized,
+          executor_type=function_call_options.executor_type)
 
     if executing_eagerly:
       return outputs
@@ -339,7 +347,7 @@ class Function(object):
               "wrap_function-decorated function.")
         return self._call_flat(args)
       raise AssertionError(
-          "Tried to call a concrete function obtained from an interal API "
+          "Tried to call a concrete function obtained from an internal API "
           "through the public interface. Use get_concrete_function instead.")
     if len(args) > self._num_positional_args:
       raise TypeError(
@@ -537,31 +545,37 @@ class Function(object):
                               self._func_graph.structured_outputs)
 
   def add_to_graph(self, g=None, register_gradient_functions=False):
-    """Registers the function into the graph g."""
+    """Registers the function, adds it to the graph g or default graph."""
+    # If we are not executing eagerly, adds the function to default graph if no
+    # graph is specified.
+    # In case of eager execution, function definition gets added to context
+    # during construction itself.
+
     # TODO(allel/shivaniagrawal): rename this to register to reflect the
     # method's functionality better. Remove register_gradient_functions argument
     # and figure out if these needs to be registered.
 
-    if not g:
-      g = ops.get_default_graph()
-    self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
+    if not context.executing_eagerly() or g:
+      if not g:
+        g = ops.get_default_graph()
+      self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
 
-    # pylint: disable=protected-access
-    if register_gradient_functions:
-      # There are two situations for the actual call of a defun:
-      # 1. If none of the input args are resource variables or watch by any
-      #   tape, and it will run the _inference_function of concrete_func for
-      #   forward pass, the gradient will be generated by standard mechanism.
-      # 2. Otherwise, defun will create two functions, one for forward pass, and
-      #   the backward pass will be created via tape.
-      #   When registering the function, we register both cases.
-      if self._backward_graph_function is None:
-        self._construct_backprop_function()
-      forward_function = self._forward_function
-      backward_function = self._backward_graph_function._inference_function
-      # pylint: enable=protected-access
-      forward_function.add_to_graph(g)
-      backward_function.add_to_graph(g)
+      # pylint: disable=protected-access
+      if register_gradient_functions:
+        # There are two situations for the actual call of a defun:
+        # 1. If none of the input args are resource variables or watch by any
+        #   tape, and it will run the _inference_function of concrete_func for
+        #   forward pass, the gradient will be generated by standard mechanism.
+        # 2. Otherwise, defun will create two functions, one for forward pass,
+        #   and the backward pass will be created via tape.
+        #   When registering the function, we register both cases.
+        if self._backward_graph_function is None:
+          self._construct_backprop_function()
+        forward_function = self._forward_function
+        backward_function = self._backward_graph_function._inference_function
+        # pylint: enable=protected-access
+        forward_function.add_to_graph(g)
+        backward_function.add_to_graph(g)
 
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
@@ -716,7 +730,7 @@ class PolymorphicFunction(object):
                name,
                input_signature=None,
                attributes=None,
-               experimental_autograph=False):
+               autograph=True):
     """Initializes a polymorphic function.
 
     Args:
@@ -727,7 +741,7 @@ class PolymorphicFunction(object):
         function is instantiated for each inferred input signature.
       attributes: dict, extra keyword arguments that will be added as attribute
         of the function.
-      experimental_autograph: whether to use autograph to compile
+      autograph: whether to use autograph to compile
         `python_function`. See https://www.tensorflow.org/guide/autograph for
         more information.
 
@@ -745,7 +759,7 @@ class PolymorphicFunction(object):
       self._args_to_prepend = tuple()
       self._kwargs_to_include = {}
     self._name = name
-    self._experimental_autograph = experimental_autograph
+    self._autograph = autograph
     self._function_cache = collections.OrderedDict()
     self._function_attributes = attributes or {}
 
@@ -919,17 +933,17 @@ class PolymorphicFunction(object):
     """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      cache_key = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
+      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
     else:
       del args, kwargs
-      cache_key = self._flat_input_signature
+      input_signature = self._flat_input_signature
 
     ctx = context.context()
     with ops.init_scope():
       # The graph, or whether we're executing eagerly, should be a part of the
       # cache key so we don't improperly capture tensors such as variables.
       executing_eagerly = ctx.executing_eagerly()
-      execution_context = executing_eagerly or ops.get_default_graph()
+      parent_graph = None if executing_eagerly else ops.get_default_graph()
 
     # pylint: disable=protected-access
     default_graph = ops.get_default_graph()
@@ -958,8 +972,8 @@ class PolymorphicFunction(object):
       else:
         device_functions = ()
     # pylint: enable=protected-access
-    return (cache_key, execution_context, device_functions, colocation_stack,
-            uses_xla)
+    return CacheKey(input_signature, parent_graph, device_functions,
+                    colocation_stack, uses_xla)
 
   def _canonicalize_function_inputs(self, *args, **kwargs):
     """Canonicalizes `args` and `kwargs`.
@@ -1075,6 +1089,9 @@ class PolymorphicFunction(object):
                         "must be hashable.")
 
       if graph_function is None:
+        logging.vlog(1,
+                     "Creating new FuncGraph for Python function %r (key: %r)",
+                     self._python_function, cache_key)
         if self._input_signature is None:
           arglen = len(args)
         else:
@@ -1089,7 +1106,7 @@ class PolymorphicFunction(object):
                 args,
                 kwargs,
                 self._input_signature,
-                experimental_autograph=self._experimental_autograph,
+                autograph=self._autograph,
                 arg_names=arg_names),
             self._function_attributes)
         self._function_cache[cache_key] = graph_function
@@ -1129,7 +1146,7 @@ def validate_signature(signature):
                     "a possibly nested sequence of TensorSpec objects.")
 
 
-def defun(func=None, input_signature=None, experimental_autograph=False):
+def defun(func=None, input_signature=None, autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") trace-compiles a Python function
@@ -1438,7 +1455,7 @@ def defun(func=None, input_signature=None, experimental_autograph=False):
       function is instantiated for each inferred input signature.  If a
       signature is specified, every input to `func` must be a `Tensor`, and
       `func` cannot accept `**kwargs`.
-    experimental_autograph: Whether `func` should be compiled before
+    autograph: Whether `func` should be compiled before
       constructing the graph. See https://www.tensorflow.org/guide/autograph
       for more information.
 
@@ -1456,13 +1473,13 @@ def defun(func=None, input_signature=None, experimental_autograph=False):
   return defun_with_attributes(
       func=func,
       input_signature=input_signature,
-      experimental_autograph=experimental_autograph)
+      autograph=autograph)
 
 
 def defun_with_attributes(func=None,
                           input_signature=None,
                           attributes=None,
-                          experimental_autograph=False):
+                          autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
   This function supports adding extra function attributes. See detailed
@@ -1479,7 +1496,7 @@ def defun_with_attributes(func=None,
       unsupported value will result into ValueError. `func_name` is also one of
       the whitelisted argument which is a python string, and sets the name for
       this `Function` in the graph.
-    experimental_autograph: same as defun()'s experimental_autograph.
+    autograph: same as defun()'s autograph.
 
   Returns:
     Same as the return value of defun, with attributes added to the function in
@@ -1504,7 +1521,7 @@ def defun_with_attributes(func=None,
             name,
             input_signature=input_signature,
             attributes=attributes,
-            experimental_autograph=experimental_autograph))
+            autograph=autograph))
 
   # This code path is for the `foo = tfe.defun(foo, ...)` use case
   if func is not None:
@@ -1520,22 +1537,49 @@ def defun_with_attributes(func=None,
   return decorated
 
 
+# When a method is bound to objects of this type, it allows AutoGraph to
+# recover a weak reference the original method's self pointer. This uses the
+# mechanism from pyct.inspect_utils.getmethodclass.
+# TODO(b/119246461): This is not pretty. Use a descriptor instead?
+class _WeakrefSelf(object):
+
+  def __init__(self, target):
+    self.ag_self_weakref__ = target
+
+
 def class_method_to_instance_method(original_function, instance):
   """Constructs a new PolymorphicFunction with `self` bound."""
-  def make_partial_py_func(py_func, weak_instance):
-    return lambda *args, **kwargs: py_func(weak_instance(), *args, **kwargs)
   weak_instance = weakref.ref(instance)
 
+  # Note: while we could bind to a weakref proxy instead, that causes the
+  # bound method to be unhashable.
+  bound_method = types_lib.MethodType(original_function.python_function,
+                                      _WeakrefSelf(weak_instance))
+
+  # original_function is expected to be of one of the two PolymorphicFunction
+  # types (defined either in function.py or def_function.py).
+  assert hasattr(original_function, "_name")
+  assert hasattr(original_function, "_autograph")
+  assert hasattr(original_function, "_input_signature")
+  assert hasattr(original_function, "python_function")
+
+  def bound_method_wrapper(*args, **kwargs):
+    # __wrapped__ allows AutoGraph to swap in a converted function.
+    wrapped_fn = bound_method_wrapper.__wrapped__
+    # If __wrapped__ was not replaced, then call original_function.
+    # TODO(b/119246461): This needs to be simplified.
+    if tf_inspect.ismethod(wrapped_fn):
+      wrapped_fn = original_function.python_function
+    return wrapped_fn(weak_instance(), *args, **kwargs)
+
   # pylint: disable=protected-access
   # We make a dummy MethodType object to generate the correct bound method
   # signature. The actual call is to a function with a weak reference to
   # `instance`.
   instance_func = type(original_function)(
-      tf_decorator.make_decorator(
-          types_lib.MethodType(original_function.python_function, False),
-          make_partial_py_func(original_function.python_function,
-                               weak_instance)),
+      tf_decorator.make_decorator(bound_method, bound_method_wrapper),
       name=original_function._name,
+      autograph=original_function._autograph,
       input_signature=original_function._input_signature)
   # pylint: enable=protected-access
 
diff --git a/tensorflow/python/eager/function_argument_naming_test.py b/tensorflow/python/eager/function_argument_naming_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9358c4fd07111f7adfbf60241727215f978b2a36
--- /dev/null
+++ b/tensorflow/python/eager/function_argument_naming_test.py
@@ -0,0 +1,258 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@parameterized.named_parameters(
+    dict(testcase_name='Defun', function_decorator=function.defun),
+    dict(testcase_name='DefFunction', function_decorator=def_function.function))
+class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
+  """Tests for recognizable export signatures from concrete functions."""
+
+  def testBasic(self, function_decorator):
+    @function_decorator
+    def fn(a, b):
+      return a + b, a * b
+    # Call the function to make def_function happy
+    fn(array_ops.ones([]), array_ops.ones([]))
+
+    fn_op = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['a', 'b'],
+        [inp.op.name for inp in fn_op.inputs])
+    self.assertEqual(
+        [b'a', b'b'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
+    self.assertEqual(2, len(fn_op.graph.structured_outputs))
+    self.assertAllClose(
+        [3., 2.],
+        fn_op(constant_op.constant(1.), constant_op.constant(2.)))
+    self.assertAllClose(
+        [3., 2.],
+        fn_op(a=constant_op.constant(1.), b=constant_op.constant(2.)))
+
+  def testVariable(self, function_decorator):
+    @function_decorator
+    def fn(a, b):
+      return a + b, a * b
+    # Call the function to make def_function happy
+    fn(array_ops.ones([]), array_ops.ones([]))
+
+    fn_op = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+        variables.Variable(1.))
+    self.assertEqual(
+        ['a', 'b'],
+        [inp.op.name for inp in fn_op.inputs])
+    self.assertEqual(
+        [b'a', b'b'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
+    self.assertEqual(2, len(fn_op.graph.structured_outputs))
+
+  def testDictReturned(self, function_decorator):
+    @function_decorator
+    def fn(x, z=(1., 2.), y=3.):
+      z1, z2 = z
+      return {'alpha': x + y + z1, 'beta': x * y + z2}
+    # Call the function to make def_function happy
+    fn(array_ops.ones([]))
+
+    fn_op = fn.get_concrete_function(
+        x=tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['x', 'y'],
+        [inp.op.name for inp in fn_op.inputs])
+    self.assertEqual(
+        [b'x', b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
+    self.assertEqual({'alpha', 'beta'},
+                     set(fn_op.graph.structured_outputs.keys()))
+
+    with self.assertRaisesRegexp(ValueError, "two arguments named 'z'"):
+      fn.get_concrete_function(
+          z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+             tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32)),
+          y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
+                                   name='custom'),
+          x=4.)
+    fn_op2 = fn.get_concrete_function(
+        z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32,
+                                  name='z_first'),
+           tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
+                                  name='z_second')),
+        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='custom'),
+        x=4.)
+    self.assertEqual(
+        ['z_first', 'z_second', 'custom'],
+        [inp.op.name for inp in fn_op2.inputs])
+    self.assertEqual(
+        [b'z_first', b'z_second', b'custom'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op2.inputs])
+
+    fn_op3 = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='custom'),
+        z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32,
+                                  name='z1'),
+           tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='z2')),
+        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['custom', 'z1', 'z2', 'y'],
+        [inp.op.name for inp in fn_op3.inputs])
+    self.assertEqual(
+        [b'custom', b'z1', b'z2', b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op3.inputs])
+
+  def testMethod(self, function_decorator):
+    class HasMethod(object):
+
+      @function_decorator
+      def method(self, x):
+        return x
+
+    has_method = HasMethod()
+    # Call the function to make def_function happy
+    HasMethod.method(has_method, array_ops.ones([]))
+    class_op = HasMethod.method.get_concrete_function(
+        has_method, tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['x'],
+        [inp.op.name for inp in class_op.inputs])
+    self.assertEqual(
+        [b'x'],
+        [inp.op.get_attr('_user_specified_name') for inp in class_op.inputs])
+    # Call the function to make def_function happy
+    has_method.method(array_ops.ones([]))
+    method_op = has_method.method.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['x'],
+        [inp.op.name for inp in method_op.inputs])
+    self.assertEqual(
+        [b'x'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
+    # TODO(allenl): It should be possible to override names when exporting. Do
+    # TensorSpec names need to go in cache keys? Or maybe get_concrete_function
+    # should always retrace?
+    self.skipTest('Not working')
+    method_op = has_method.method.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='y'))
+    self.assertEqual(
+        ['y'],
+        [inp.op.name for inp in method_op.inputs])
+    self.assertEqual(
+        [b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
+
+  def testMethodSignature(self, function_decorator):
+
+    class HasMethod(object):
+
+      @function_decorator(
+          input_signature=(tensor_spec.TensorSpec(
+              shape=None, dtype=dtypes.float64, name='y'),))
+      def method(self, x):
+        hash(self)  # No weak proxies passed as `self`
+        return x
+
+    has_method = HasMethod()
+    # Call the function to make def_function happy
+    has_method.method(array_ops.ones([], dtype=dtypes.float64))
+    method_op = has_method.method.get_concrete_function()
+    self.assertEqual(
+        ['y'],
+        [inp.op.name for inp in method_op.inputs])
+    self.assertEqual(
+        [b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
+    method_op2 = has_method.method.get_concrete_function()
+    self.assertEqual(
+        ['y'],
+        [inp.op.name for inp in method_op2.inputs])
+    self.assertEqual(
+        [b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op2.inputs])
+
+  def testVariadic(self, function_decorator):
+    @function_decorator
+    def variadic_fn(x, *args, **kwargs):
+      return x + math_ops.add_n(list(args) + list(kwargs.values()))
+
+    # Call the function to make def_function happy
+    variadic_fn(array_ops.ones([]), array_ops.ones([]))
+    variadic_op = variadic_fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+        tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32, name='y'),
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
+                               name='second_variadic'),
+        z=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+        zz=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='cust'))
+    self.assertEqual(
+        ['x', 'y', 'args', 'second_variadic', 'z', 'cust'],
+        [inp.op.name for inp in variadic_op.inputs])
+    self.assertEqual(
+        [b'x', b'y', b'args', b'second_variadic', b'z', b'cust'],
+        [inp.op.get_attr('_user_specified_name')
+         for inp in variadic_op.inputs])
+
+  def testVariadicInputSignature(self, function_decorator):
+    @function_decorator(
+        input_signature=(
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32),
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32, name='y'),
+            tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+            tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='z'),
+        ))
+    def variadic_fn(x, *args):
+      return x + math_ops.add_n(list(args))
+
+    # Call the function to make def_function happy
+    variadic_fn(array_ops.ones([]), array_ops.ones([]),
+                array_ops.ones([]), array_ops.ones([]))
+    variadic_op = variadic_fn.get_concrete_function()
+    self.assertIn(b'variadic_fn', variadic_op.name)
+    self.assertEqual(
+        ['x', 'y', 'args', 'z'],
+        [inp.op.name for inp in variadic_op.inputs])
+    self.assertEqual(
+        [b'x', b'y', b'args', b'z'],
+        [inp.op.get_attr('_user_specified_name')
+         for inp in variadic_op.inputs])
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_defun_collection_test.py b/tensorflow/python/eager/function_defun_collection_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..53478ad121ce689650a9ef9e81215817af605be5
--- /dev/null
+++ b/tensorflow/python/eager/function_defun_collection_test.py
@@ -0,0 +1,102 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class DefunCollectionTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun', function_decorator=function.defun),
+      dict(
+          testcase_name='DefFunction',
+          function_decorator=def_function.function))
+  def testCollectionValueAccess(self, function_decorator):
+    """Read values from graph collections inside of defun."""
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g):
+        x = 2
+        y = 5
+        ops.add_to_collection('x', x)
+        ops.add_to_collection('y', y)
+
+        @function_decorator
+        def fn():
+          x_const = constant_op.constant(ops.get_collection('x')[0])
+          y_const = constant_op.constant(ops.get_collection('y')[0])
+          z = math_ops.add(x_const, y_const)
+          ops.add_to_collection('z', 7)
+          return z
+
+        self.assertEqual(7, int(self.evaluate(fn())))
+        self.assertEquals(ops.get_collection('x'), [2])
+        self.assertEquals(ops.get_collection('y'), [5])
+        self.assertEquals(ops.get_collection('z'), [])
+
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun', function_decorator=function.defun),
+      dict(
+          testcase_name='DefFunction',
+          function_decorator=def_function.function))
+  def testCollectionVariableValueAccess(self, function_decorator):
+    """Read variable value from graph collections inside of defun."""
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g):
+        v = resource_variable_ops.ResourceVariable(1.0)
+
+        @function_decorator
+        def f():
+          return v.read_value()
+
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(1.0, float(self.evaluate(f())))
+        self.assertEquals(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 1)
+
+  def testCollectionVariableValueWrite(self):
+    """Write variable value inside defun."""
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g):
+
+        @function.defun
+        def f():
+          v = resource_variable_ops.ResourceVariable(2.0)
+          return v
+
+        _ = f.get_concrete_function()
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(2.0, float(self.evaluate(f())))
+        self.assertEquals(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 1)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_gradients_test.py b/tensorflow/python/eager/function_gradients_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ba596573f992289ce4fe68678b221349d6c5155
--- /dev/null
+++ b/tensorflow/python/eager/function_gradients_test.py
@@ -0,0 +1,755 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class FunctionGradientsTest(test.TestCase, parameterized.TestCase):
+
+  def testGraphModeWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    @def_function.function
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    self.assertAllEqual(step(), 2.0)
+
+  def testGraphGradientVariable(self):
+    with ops.Graph().as_default(), self.cached_session():
+      v = variables.Variable(1.0)
+
+      @def_function.function
+      def f():
+        return 2.0 * v
+
+      node = f()
+      grads, = gradients_impl.gradients(node, v)
+      v.initializer.run()
+      self.assertAllEqual(grads.eval(), 2.0)
+      self.assertEqual(grads.shape, v.shape)
+
+  def testSymGradGatherNd(self):
+    with ops.Graph().as_default(), self.cached_session() as sess:
+
+      @def_function.function
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertAllEqual(self.evaluate(g).values, [[1.0]])
+
+  def testNoSymGradNestedDefun(self):
+
+    @def_function.function
+    def outer():
+
+      @def_function.function
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertIsInstance(g, ops.IndexedSlices)
+
+    outer()
+
+  def testGraphFunctionWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    @def_function.function
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    step_op = step.get_concrete_function()
+    self.assertEqual(step_op.output_dtypes, dtypes.float32)
+    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
+    self.assertAllEqual(step_op(), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDefunCondGradient(self):
+
+    @def_function.function
+    def f(x):
+      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphLoopGradient(self):
+
+    @def_function.function
+    def f(x):
+      return control_flow_ops.while_loop(lambda _, i: i < 2,
+                                         lambda x, i: (2*x, i + 1),
+                                         [x, 0])[0]
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 4.0)
+
+  def testDefunDifferentiable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testDefunCanBeDifferentiatedTwice(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+    # Ensure that v is watched again.
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testSymbolicGradientVariableNoneNotZerosLike(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1.0)
+
+      @def_function.function
+      def f(x, v):
+        v.read_value()
+        return x * x
+
+      x = constant_op.constant(1.0)
+      l = f(x, v)
+      _, dv = gradients_impl.gradients(l, [x, v])
+      with self.cached_session():
+        v.initializer.run()
+        self.assertEqual(dv, None)
+
+  def testDefunCallBackprop(self):
+
+    @def_function.function
+    def f(x):
+      return math_ops.add(x, x)
+
+    @def_function.function
+    def g(x):
+      return backprop.gradients_function(f, [0])(x)[0]
+
+    self.assertAllEqual(2, g(constant_op.constant(2.)))
+
+  def testGraphModeEagerGradError(self):
+    with context.graph_mode():
+      def f():
+        x = variable_scope.get_variable(
+            'v', initializer=constant_op.constant(1.0))
+        return x * constant_op.constant(2.0)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'No trainable variables were accessed'):
+        backprop.implicit_val_and_grad(f)()
+
+  def testDefunCallBackpropUsingSameObjectForMultipleArguments(self):
+
+    @def_function.function
+    def g(x):
+      return backprop.gradients_function(math_ops.multiply, [0, 1])(x, x)
+
+    def np_g(x):
+      return [d.numpy() for d in g(x)]
+
+    x = constant_op.constant(1.)
+    self.assertAllEqual([1., 1.], np_g(x))
+    self.assertAllEqual([1., 1.], np_g(1.))
+
+  def testGradientTensorConversionWithDefun(self):
+    three = resource_variable_ops.ResourceVariable(3.0, name='v')
+
+    @def_function.function
+    def f(x):
+      return math_ops.add(x, three)
+
+    def g(x):
+      return f(x)
+
+    g = backprop.implicit_grad(g)(constant_op.constant(1.0))[0][0]
+    self.assertAllEqual(g, 1.0)
+
+  def testGradient(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    def sq(x):
+      return matmul(x, x, transpose_a=True)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    grad_t, = backprop.gradients_function(sq, [0])(t)
+    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
+
+  def testGradientInFunction(self):
+
+    @def_function.function
+    def f(x):
+      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
+
+    self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
+
+  def testGradientOfGatherWithDefun(self):
+    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+
+    def sum_gather():
+      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+
+    grad_fn = backprop.implicit_grad(sum_gather)
+    gradient = grad_fn()
+    defun_grad_fn = backprop.implicit_grad(def_function.function(sum_gather))
+    defun_gradient = defun_grad_fn()
+    self.assertEqual(len(gradient), len(defun_gradient))
+
+    gradient = gradient[0][0]
+    defun_gradient = defun_gradient[0][0]
+    self.assertAllEqual(gradient.values, defun_gradient.values)
+    self.assertAllEqual(gradient.indices, defun_gradient.indices)
+    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+
+  def testDifferentiableFunctionNoneOutputs(self):
+
+    @def_function.function
+    def my_function(x):
+      return x, None
+
+    def wrapper(x):
+      return my_function(x)[0]
+
+    g = backprop.gradients_function(wrapper, [0])(constant_op.constant(0.0))
+    self.assertAllEqual(g[0], 1.)
+
+    @def_function.function
+    def foo(a):
+      return None, a * a
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      none, r = foo(x)
+    g = tp.gradient(r, x)
+
+    self.assertIs(none, None)
+    self.assertAllEqual(r, 25.0)
+    self.assertAllEqual(g, 2 * 5.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNestedDifferentiableFunction(self):
+    @def_function.function
+    def inner_fn(a, b):
+      return a * math_ops.add(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return inner_fn(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunction(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionWithMultipleGradCalls(self):
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return math_ops.mul(a, inner_fn(a, b))
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, 3.0)
+
+    x = constant_op.constant(5.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+    self.assertAllEqual(middle_fn(3.0, x), 3.0 * (3.0 + 5.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = inner_fn(y, y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInMultNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    @def_function.function
+    def outer_outer_fn(x):
+      return outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInMultNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    @def_function.function
+    def outer_outer_fn(x):
+      return outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariable(self):
+    var = variables.Variable(constant_op.constant(1.0))
+
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, var)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleGradCalls(self):
+    v = variables.Variable(constant_op.constant(3.0))
+
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return math_ops.mul(a, inner_fn(a, b))
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, v)
+
+    x = constant_op.constant(5.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+    self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+    v.assign(constant_op.constant(1.5))
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 1.5)
+
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = inner_fn(y, v)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleTFGrads(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(3.0)
+      v.initializer.run()
+
+      @def_function.function
+      def inner_fn(a, b):
+        return math_ops.add(a, b)
+
+      @def_function.function
+      def middle_fn(a, b):
+        return math_ops.mul(a, inner_fn(a, b))
+
+      @def_function.function
+      def outer_fn(x):
+        return middle_fn(x, v)
+
+      x = constant_op.constant(5.0)
+      self.assertAllEqual(outer_fn(x).eval(), 5.0 * (5.0 + 3.0))
+
+      grad, = gradients_impl.gradients(outer_fn(x), x)
+
+      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+      self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+      self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
+
+      grad, = gradients_impl.gradients(outer_fn(x), x)
+
+      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+      y = constant_op.constant(4.0)
+      grad, = gradients_impl.gradients(outer_fn(y), y)
+      self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+      self.evaluate(v.assign(constant_op.constant(1.5)))
+      grad, = gradients_impl.gradients(outer_fn(y), y)
+
+      self.assertAllEqual(grad, 2 * 4.0 + 1.5)
+
+      grad, = gradients_impl.gradients(inner_fn(y, v), y)
+      self.assertAllEqual(grad, 1.0)
+
+  def testNestedDifferentiableFunctionNoneOutputs(self):
+    @def_function.function
+    def foo(a, b):
+      return None, a * math_ops.add(a, b), None, 2*a
+
+    @def_function.function
+    def bar(x):
+      return foo(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape(persistent=True) as tp:
+      tp.watch(x)
+      none1, r1, none2, r2 = bar(x)
+    g1 = tp.gradient(r1, x)
+    g2 = tp.gradient(r2, x)
+
+    self.assertAllEqual(r1, 30.0)
+    self.assertAllEqual(r2, 10.0)
+    self.assertIs(none1, None)
+    self.assertIs(none2, None)
+    self.assertAllEqual(g1, 2 * 5.0 + 1.0)
+    self.assertAllEqual(g2, 2.0)
+
+  def testGradientWithKeywordArguments(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    def sq(x):
+      return matmul(a=x, b=x, transpose_a=True)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    grad_t, = backprop.gradients_function(sq, [0])(t)
+    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
+
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(t)
+      one = matmul(t, b=t, transpose_a=True)
+      two = matmul(b=t, a=t, transpose_a=True)
+      three = matmul(a=t, b=t, transpose_a=True)
+
+    for output in [one, two, three]:
+      self.assertAllEqual(tape.gradient(output, t), [[6, 6], [14, 14]])
+
+  def testGradientInFunctionWithKeywordArguments(self):
+
+    @def_function.function
+    def f(x):
+      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
+
+    self.assertAllEqual(f(x=constant_op.constant(1.0)), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBackwardNone(self):
+    model = variables.Variable(1.0, name='model')
+    count = variables.Variable(0)
+
+    @function.defun
+    def forward_pass(value):
+      count.assign_add(1)
+      residuals = value - model
+      loss = 0.5 * math_ops.reduce_mean(math_ops.pow(residuals, 2))
+      # Note: count is an integer, so its doutput will be None
+      return loss, count
+
+    def reduce_fn(x):
+      if context.executing_eagerly():
+        with backprop.GradientTape() as t:
+          loss, count = forward_pass(x)
+        return t.gradient(loss, model), count
+      loss, count = forward_pass(x)
+      grad_only = gradients_impl.gradients(loss, model)
+      return grad_only, count
+
+    g, _ = reduce_fn(constant_op.constant([7.0]))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(nest.flatten(self.evaluate(g)), [-6.0])
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 651d6cec7247bc79568f79615b7546f9cfe95eff..a206b1f791126dcbf30ca9bbb7df167ba6a3abe4 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -29,7 +29,6 @@ import numpy
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
-from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
@@ -48,7 +47,6 @@ from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -102,10 +100,10 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       _ = x * y
       return x + y
 
-    # The default config allows everything.
-    rewrites = rewriter_config_pb2.RewriterConfig()
+    # The default config allows all rewrites.
+    config_proto = config_pb2.ConfigProto()
 
-    with context.rewriter_config(rewrites):
+    with context.function_config_proto(config_proto):
       t = constant_op.constant(1.0)
       self.assertAllEqual(add(t, t).numpy(), 2.0)
 
@@ -149,32 +147,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     out = a_times_b(pair({'a': t}, {'b': t}))
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
-  def testGraphModeWithGradients(self):
-    v = resource_variable_ops.ResourceVariable(1.0, name='v')
-
-    @def_function.function
-    def step():
-      def inner():
-        return v * v
-
-      return backprop.implicit_grad(inner)()[0][0]
-
-    self.assertAllEqual(step(), 2.0)
-
-  def testGraphGradientVariable(self):
-    with ops.Graph().as_default(), self.cached_session():
-      v = variables.Variable(1.0)
-
-      @def_function.function
-      def f():
-        return 2.0 * v
-
-      node = f()
-      grads, = gradients_impl.gradients(node, v)
-      v.initializer.run()
-      self.assertAllEqual(grads.eval(), 2.0)
-      self.assertEqual(grads.shape, v.shape)
-
   def testGraphEagerIsolation(self):
 
     @function.defun
@@ -190,7 +162,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
   def testBasicGraphFunction(self):
     matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
+    @def_function.function
     def sq(a):
       return matmul(a, a)
 
@@ -204,7 +176,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
   def testInputSpecGraphFunction(self):
     matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
+    @def_function.function
     def sq(a):
       return matmul(a, a)
 
@@ -223,7 +195,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
   def testNestedInputSpecGraphFunction(self):
     matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
+    @def_function.function
     def sq(mats):
       ((a, b),) = mats
       return matmul(a, b)
@@ -314,40 +286,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     random_seed.set_random_seed(1)
     self.assertAllEqual(f(), x)
 
-  def testSymGradGatherNd(self):
-    with ops.Graph().as_default(), self.cached_session() as sess:
-
-      @def_function.function
-      def f(x):
-        return array_ops.gather_nd(x, [[0]])
-
-      c = constant_op.constant([[2.]])
-      f_c = f(c)
-      g, = gradients_impl.gradients(f_c, c)
-      self.assertAllEqual(sess.run(g).values, [[1.0]])
-
-  def testNoSymGradNestedDefun(self):
-
-    @def_function.function
-    def outer():
-
-      @def_function.function
-      def f(x):
-        return array_ops.gather_nd(x, [[0]])
-
-      c = constant_op.constant([[2.]])
-      f_c = f(c)
-      g, = gradients_impl.gradients(f_c, c)
-      self.assertIsInstance(g, ops.IndexedSlices)
-
-    outer()
-
   def testNestedInputsGraphFunction(self):
     matmul = def_function.function(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
-    @function.defun
+    @def_function.function
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
@@ -362,7 +306,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
   def testNestedOutputGraphFunction(self):
     matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
+    @def_function.function
     def sq(a):
       return (matmul(a, a), {'b': constant_op.constant(1.0)})
 
@@ -378,23 +322,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(a, math_ops.matmul(t, t).numpy())
     self.assertAllEqual(b['b'].numpy(), 1.0)
 
-  def testGraphFunctionWithGradients(self):
-    v = resource_variable_ops.ResourceVariable(1.0, name='v')
-
-    @function.defun
-    def step():
-      def inner():
-        return v * v
-
-      return backprop.implicit_grad(inner)()[0][0]
-
-    step_op = step.get_concrete_function()
-    self.assertEqual(step_op.output_dtypes, dtypes.float32)
-    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
-    self.assertAllEqual(step_op(), 2.0)
-
   def testGraphFunctionNoneOutput(self):
-    @function.defun
+    @def_function.function
     def fn(unused_a, unused_b):
       return None
 
@@ -404,34 +333,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(fn_op.output_shapes, None)
     self.assertAllEqual(fn_op(x, x), None)
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testDefunCondGradient(self):
-
-    @def_function.function
-    def f(x):
-      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
-
-    with backprop.GradientTape() as t:
-      x = constant_op.constant(1.0)
-      t.watch(x)
-      y = f(x)
-    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testGraphLoopGradient(self):
-
-    @def_function.function
-    def f(x):
-      return control_flow_ops.while_loop(lambda _, i: i < 2,
-                                         lambda x, i: (2*x, i + 1),
-                                         [x, 0])[0]
-
-    with backprop.GradientTape() as t:
-      x = constant_op.constant(1.0)
-      t.watch(x)
-      y = f(x)
-    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 4.0)
-
   def testDefunNumpyArraysConvertedToTensors(self):
 
     def f(x):
@@ -625,27 +526,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertIsInstance(
         self.v, resource_variable_ops.ResourceVariable)
 
-  def testDefunDifferentiable(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
-
-    @def_function.function
-    def f():
-      return v * v
-
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-
-  def testDefunCanBeDifferentiatedTwice(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
-
-    @def_function.function
-    def f():
-      return v * v
-
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-    # Ensure that v is watched again.
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-
-  def testRunMetadata(self):
+  def disabled_testRunMetadata(self):
 
     @def_function.function
     def f(x):
@@ -683,23 +564,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       variables.global_variables_initializer().run()
       call = def_function.function(o.call)
       op = call()
-      self.assertAllEqual(sess.run(op), 2.0)
-
-  def testSymbolicGradientVariableNoneNotZerosLike(self):
-    with ops.Graph().as_default():
-      v = variables.Variable(1.0)
-
-      @def_function.function
-      def f(x, v):
-        v.read_value()
-        return x * x
-
-      x = constant_op.constant(1.0)
-      l = f(x, v)
-      _, dv = gradients_impl.gradients(l, [x, v])
-      with self.cached_session():
-        v.initializer.run()
-        self.assertEqual(dv, None)
+      self.assertAllEqual(self.evaluate(op), 2.0)
 
   def testGraphModeManyFunctions(self):
     with ops.Graph().as_default(), self.cached_session():
@@ -742,42 +607,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllEqual(8, g(constant_op.constant(2)))
 
-  def testDefunCallBackprop(self):
-
-    @def_function.function
-    def f(x):
-      return math_ops.add(x, x)
-
-    @def_function.function
-    def g(x):
-      return backprop.gradients_function(f, [0])(x)[0]
-
-    self.assertAllEqual(2, g(constant_op.constant(2.)))
-
-  def testGraphModeEagerGradError(self):
-    with context.graph_mode():
-      def f():
-        x = variable_scope.get_variable(
-            'v', initializer=constant_op.constant(1.0))
-        return x * constant_op.constant(2.0)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   'No trainable variables were accessed'):
-        backprop.implicit_val_and_grad(f)()
-
-  def testDefunCallBackpropUsingSameObjectForMultipleArguments(self):
-
-    @def_function.function
-    def g(x):
-      return backprop.gradients_function(math_ops.multiply, [0, 1])(x, x)
-
-    def np_g(x):
-      return [d.numpy() for d in g(x)]
-
-    x = constant_op.constant(1.)
-    self.assertAllEqual([1., 1.], np_g(x))
-    self.assertAllEqual([1., 1.], np_g(1.))
-
   def testCallShape(self):
 
     @def_function.function
@@ -808,37 +637,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     g(three)
 
-  def testGradientTensorConversionWithDefun(self):
-    three = resource_variable_ops.ResourceVariable(3.0, name='v')
-
-    @def_function.function
-    def f(x):
-      return math_ops.add(x, three)
-
-    def g(x):
-      return f(x)
-
-    g = backprop.implicit_grad(g)(constant_op.constant(1.0))[0][0]
-    self.assertAllEqual(g, 1.0)
-
-  def testGradient(self):
-    matmul = def_function.function(math_ops.matmul)
-
-    def sq(x):
-      return matmul(x, x, transpose_a=True)
-
-    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    grad_t, = backprop.gradients_function(sq, [0])(t)
-    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
-
-  def testGradientInFunction(self):
-
-    @def_function.function
-    def f(x):
-      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
-
-    self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
-
   def testGatherResourceWithDefun(self):
     with ops.device('cpu:0'):
       v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
@@ -849,24 +647,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = def_function.function(sum_gather)
     self.assertAllEqual(sum_gather(), defined())
 
-  def testGradientOfGatherWithDefun(self):
-    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
-
-    def sum_gather():
-      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
-
-    grad_fn = backprop.implicit_grad(sum_gather)
-    gradient = grad_fn()
-    defun_grad_fn = backprop.implicit_grad(def_function.function(sum_gather))
-    defun_gradient = defun_grad_fn()
-    self.assertEqual(len(gradient), len(defun_gradient))
-
-    gradient = gradient[0][0]
-    defun_gradient = defun_gradient[0][0]
-    self.assertAllEqual(gradient.values, defun_gradient.values)
-    self.assertAllEqual(gradient.indices, defun_gradient.indices)
-    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
-
   def testReturningIndexedSlicesWithDefun(self):
 
     def validate(indexed_slice):
@@ -968,7 +748,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       v_gpu = resource_variable_ops.ResourceVariable(
           [0.0, 1.0, 2.0], name='gpu')
 
-    @function.defun
+    @def_function.function
     def resource_apply_adam():
       training_ops.resource_apply_adam(
           v_cpu.handle,
@@ -1012,440 +792,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     shape = constant_op.constant([2, 1]).gpu()
     reshape(value, shape)  # No error is raised
 
-  def testDifferentiableFunctionNoneOutputs(self):
-
-    @def_function.function
-    def my_function(x):
-      return x, None
-
-    def wrapper(x):
-      return my_function(x)[0]
-
-    g = backprop.gradients_function(wrapper, [0])(constant_op.constant(0.0))
-    self.assertAllEqual(g[0], 1.)
-
-    @def_function.function
-    def foo(a):
-      return None, a * a
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      none, r = foo(x)
-    g = tp.gradient(r, x)
-
-    self.assertIs(none, None)
-    self.assertAllEqual(r, 25.0)
-    self.assertAllEqual(g, 2 * 5.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNestedDifferentiableFunction(self):
-    @function.defun
-    def inner_fn(a, b):
-      return a * math_ops.add(a, b)
-
-    @function.defun
-    def outer_fn(x):
-      return inner_fn(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunction(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def outer_fn(x):
-      return middle_fn(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionWithMultipleGradCalls(self):
-    @function.defun
-    def inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return math_ops.mul(a, inner_fn(a, b))
-
-    @function.defun
-    def outer_fn(x):
-      return middle_fn(x, 3.0)
-
-    x = constant_op.constant(5.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-    self.assertAllEqual(middle_fn(3.0, x), 3.0 * (3.0 + 5.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-
-    y = constant_op.constant(4.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = outer_fn(y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
-
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = inner_fn(y, y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionGradientTapeInDefun(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def outer_fn(x):
-      with backprop.GradientTape() as tp:
-        tp.watch(x)
-        result = middle_fn(x, 1.0)
-      grad = tp.gradient(result, x)
-      return grad
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionGradientTapeInNestedDefun(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def almost_outer_fn(x):
-      with backprop.GradientTape() as tp:
-        tp.watch(x)
-        result = middle_fn(x, 1.0)
-      grad = tp.gradient(result, x)
-      return grad
-
-    @function.defun
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionGradientTapeInMultNestedDefun(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def almost_outer_fn(x):
-      with backprop.GradientTape() as tp:
-        tp.watch(x)
-        result = middle_fn(x, 1.0)
-      grad = tp.gradient(result, x)
-      return grad
-
-    @function.defun
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    @function.defun
-    def outer_outer_fn(x):
-      return outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionTFGradientInDefun(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def outer_fn(x):
-      result = middle_fn(x, 1.0)
-      return gradients_impl.gradients(result, [x])[0]
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionTFGradientInNestedDefun(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def almost_outer_fn(x):
-      result = middle_fn(x, 1.0)
-      return gradients_impl.gradients(result, [x])[0]
-
-    @function.defun
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionTFGradientInMultNestedDefun(self):
-    @function.defun
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @function.defun
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @function.defun
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @function.defun
-    def almost_outer_fn(x):
-      result = middle_fn(x, 1.0)
-      return gradients_impl.gradients(result, [x])[0]
-
-    @function.defun
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    @function.defun
-    def outer_outer_fn(x):
-      return outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  def testDeeplyNestedDifferentiableFunctionWithVariable(self):
-    var = variables.Variable(constant_op.constant(1.0))
-
-    @def_function.function
-    def inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def outer_fn(x):
-      return middle_fn(x, var)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleGradCalls(self):
-    v = variables.Variable(constant_op.constant(3.0))
-
-    @def_function.function
-    def inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return math_ops.mul(a, inner_fn(a, b))
-
-    @def_function.function
-    def outer_fn(x):
-      return middle_fn(x, v)
-
-    x = constant_op.constant(5.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-    self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-
-    y = constant_op.constant(4.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = outer_fn(y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
-
-    v.assign(constant_op.constant(1.5))
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = outer_fn(y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2 * 4.0 + 1.5)
-
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = inner_fn(y, v)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 1.0)
-
-  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleTFGrads(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(3.0)
-      v.initializer.run()
-
-      @def_function.function
-      def inner_fn(a, b):
-        return math_ops.add(a, b)
-
-      @def_function.function
-      def middle_fn(a, b):
-        return math_ops.mul(a, inner_fn(a, b))
-
-      @def_function.function
-      def outer_fn(x):
-        return middle_fn(x, v)
-
-      x = constant_op.constant(5.0)
-      self.assertAllEqual(outer_fn(x).eval(), 5.0 * (5.0 + 3.0))
-
-      grad, = gradients_impl.gradients(outer_fn(x), x)
-
-      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-      self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-      self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
-
-      grad, = gradients_impl.gradients(outer_fn(x), x)
-
-      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-
-      y = constant_op.constant(4.0)
-      grad, = gradients_impl.gradients(outer_fn(y), y)
-      self.assertAllEqual(grad, 2 * 4.0 + 3.0)
-
-      self.evaluate(v.assign(constant_op.constant(1.5)))
-      grad, = gradients_impl.gradients(outer_fn(y), y)
-
-      self.assertAllEqual(grad, 2 * 4.0 + 1.5)
-
-      grad, = gradients_impl.gradients(inner_fn(y, v), y)
-      self.assertAllEqual(grad, 1.0)
-
-  def testNestedDifferentiableFunctionNoneOutputs(self):
-    @def_function.function
-    def foo(a, b):
-      return None, a * math_ops.add(a, b), None, 2*a
-
-    @def_function.function
-    def bar(x):
-      return foo(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape(persistent=True) as tp:
-      tp.watch(x)
-      none1, r1, none2, r2 = bar(x)
-    g1 = tp.gradient(r1, x)
-    g2 = tp.gradient(r2, x)
-
-    self.assertAllEqual(r1, 30.0)
-    self.assertAllEqual(r2, 10.0)
-    self.assertIs(none1, None)
-    self.assertIs(none2, None)
-    self.assertAllEqual(g1, 2 * 5.0 + 1.0)
-    self.assertAllEqual(g2, 2.0)
-
   def testNoneOutput(self):
 
     @def_function.function
@@ -1461,7 +807,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     def add(a, b):
       return math_ops.add(a, b)
 
-    @function.defun
+    @def_function.function
     def add_one(x):
       return add(x, 1)
 
@@ -1570,7 +916,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     if not context.executing_eagerly():
       self.evaluate(variables.global_variables_initializer())
 
-    self.assertAllEqual([[[[4.0]]]], self.evaluate(y))
+    self.assertAllClose([[[[4.0]]]], self.evaluate(y))
 
     # Remove reference cycles in model
     test_util.dismantle_polymorphic_function(model)
@@ -1675,7 +1021,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with ops.device('gpu:0'):
       y = constant_op.constant(1.0)
 
-    @function.defun
+    @def_function.function
     def foo():
       return test_ops.device_placement_op()
 
@@ -2039,33 +1385,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(six, 2.0)
     self.assertAllEqual(seven, 2.0)
 
-  def testGradientWithKeywordArguments(self):
-    matmul = def_function.function(math_ops.matmul)
-
-    def sq(x):
-      return matmul(a=x, b=x, transpose_a=True)
-
-    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    grad_t, = backprop.gradients_function(sq, [0])(t)
-    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
-
-    with backprop.GradientTape(persistent=True) as tape:
-      tape.watch(t)
-      one = matmul(t, b=t, transpose_a=True)
-      two = matmul(b=t, a=t, transpose_a=True)
-      three = matmul(a=t, b=t, transpose_a=True)
-
-    for output in [one, two, three]:
-      self.assertAllEqual(tape.gradient(output, t), [[6, 6], [14, 14]])
-
-  def testGradientInFunctionWithKeywordArguments(self):
-
-    @def_function.function
-    def f(x):
-      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
-
-    self.assertAllEqual(f(x=constant_op.constant(1.0)), 2.0)
-
   def testDefuningInstanceMethod(self):
 
     integer = constant_op.constant(2, dtypes.int64)
@@ -2339,33 +1658,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         # pylint: disable=protected-access
         self.assertEqual(len(graph._functions), 3)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testBackwardNone(self):
-    model = variables.Variable(1.0, name='model')
-    count = variables.Variable(0)
-
-    @function.defun
-    def forward_pass(value):
-      count.assign_add(1)
-      residuals = value - model
-      loss = 0.5 * math_ops.reduce_mean(math_ops.pow(residuals, 2))
-      # Note: count is an integer, so its doutput will be None
-      return loss, count
-
-    def reduce_fn(x):
-      if context.executing_eagerly():
-        with backprop.GradientTape() as t:
-          loss, count = forward_pass(x)
-        return t.gradient(loss, model), count
-      loss, count = forward_pass(x)
-      grad_only = gradients_impl.gradients(loss, model)
-      return grad_only, count
-
-    g, _ = reduce_fn(constant_op.constant([7.0]))
-
-    self.evaluate(variables.global_variables_initializer())
-    self.assertAllEqual(nest.flatten(self.evaluate(g)), [-6.0])
-
   def testCallingFunctionWithDifferentVariables(self):
 
     @function.defun
@@ -2403,8 +1695,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                  'be Tensors;.*'):
       graph_function('Not a Tensor.')
 
-  # TODO(scottzhu): Revive the test once the grappler plugin is updated.
-  def disabled_testSwapImplementationWithGrapplerPlugin(self):
+  def testSwapImplementationWithGrapplerPlugin(self):
     rewrites = rewriter_config_pb2.RewriterConfig()
     # function_optimizer has to be turn off, otherwise it will delete the
     # registered function if it does not get called.
@@ -2441,7 +1732,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
       function.register(cpu_boost, x)
       y = gpu_boost(x)
-      y_value = sess.run(y)
+      y_value = self.evaluate(y)
 
       if test.is_gpu_available():
         self.assertEqual(y_value, 5.0)
@@ -2703,291 +1994,24 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     del m
     self.assertEqual([], list(weak_variables))
 
+  def testExecutorType(self):
+    @function.defun
+    def add_five(x):
+      return x + 5
 
-@parameterized.named_parameters(
-    dict(testcase_name='Defun', function_decorator=function.defun),
-    dict(testcase_name='DefFunction', function_decorator=def_function.function))
-class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
-  """Tests for recognizable export signatures from concrete functions."""
-
-  def testBasic(self, function_decorator):
-    @function_decorator
-    def fn(a, b):
-      return a + b, a * b
-    # Call the function to make def_function happy
-    fn(array_ops.ones([]), array_ops.ones([]))
-
-    fn_op = fn.get_concrete_function(
-        tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
-    self.assertEqual(
-        ['a', 'b'],
-        [inp.op.name for inp in fn_op.inputs])
-    self.assertEqual(
-        [b'a', b'b'],
-        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
-    self.assertEqual(2, len(fn_op.graph.structured_outputs))
-    self.assertAllClose(
-        [3., 2.],
-        fn_op(constant_op.constant(1.), constant_op.constant(2.)))
-    self.assertAllClose(
-        [3., 2.],
-        fn_op(a=constant_op.constant(1.), b=constant_op.constant(2.)))
-
-  def testVariable(self, function_decorator):
-    @function_decorator
-    def fn(a, b):
-      return a + b, a * b
-    # Call the function to make def_function happy
-    fn(array_ops.ones([]), array_ops.ones([]))
-
-    fn_op = fn.get_concrete_function(
-        tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
-        variables.Variable(1.))
-    self.assertEqual(
-        ['a', 'b'],
-        [inp.op.name for inp in fn_op.inputs])
-    self.assertEqual(
-        [b'a', b'b'],
-        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
-    self.assertEqual(2, len(fn_op.graph.structured_outputs))
-
-  def testDictReturned(self, function_decorator):
-    @function_decorator
-    def fn(x, z=(1., 2.), y=3.):
-      z1, z2 = z
-      return {'alpha': x + y + z1, 'beta': x * y + z2}
-    # Call the function to make def_function happy
-    fn(array_ops.ones([]))
-
-    fn_op = fn.get_concrete_function(
-        x=tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
-        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
-    self.assertEqual(
-        ['x', 'y'],
-        [inp.op.name for inp in fn_op.inputs])
-    self.assertEqual(
-        [b'x', b'y'],
-        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
-    self.assertEqual({'alpha', 'beta'},
-                     set(fn_op.graph.structured_outputs.keys()))
-
-    with self.assertRaisesRegexp(ValueError, "two arguments named 'z'"):
-      fn.get_concrete_function(
-          z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
-             tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32)),
-          y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
-                                   name='custom'),
-          x=4.)
-    fn_op2 = fn.get_concrete_function(
-        z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32,
-                                  name='z_first'),
-           tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
-                                  name='z_second')),
-        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='custom'),
-        x=4.)
-    self.assertEqual(
-        ['z_first', 'z_second', 'custom'],
-        [inp.op.name for inp in fn_op2.inputs])
-    self.assertEqual(
-        [b'z_first', b'z_second', b'custom'],
-        [inp.op.get_attr('_user_specified_name') for inp in fn_op2.inputs])
-
-    fn_op3 = fn.get_concrete_function(
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='custom'),
-        z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32,
-                                  name='z1'),
-           tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='z2')),
-        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
-    self.assertEqual(
-        ['custom', 'z1', 'z2', 'y'],
-        [inp.op.name for inp in fn_op3.inputs])
-    self.assertEqual(
-        [b'custom', b'z1', b'z2', b'y'],
-        [inp.op.get_attr('_user_specified_name') for inp in fn_op3.inputs])
-
-  def testMethod(self, function_decorator):
-    class HasMethod(object):
-
-      @function_decorator
-      def method(self, x):
-        return x
-
-    has_method = HasMethod()
-    # Call the function to make def_function happy
-    HasMethod.method(has_method, array_ops.ones([]))
-    class_op = HasMethod.method.get_concrete_function(
-        has_method, tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
-    self.assertEqual(
-        ['x'],
-        [inp.op.name for inp in class_op.inputs])
-    self.assertEqual(
-        [b'x'],
-        [inp.op.get_attr('_user_specified_name') for inp in class_op.inputs])
-    # Call the function to make def_function happy
-    has_method.method(array_ops.ones([]))
-    method_op = has_method.method.get_concrete_function(
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
-    self.assertEqual(
-        ['x'],
-        [inp.op.name for inp in method_op.inputs])
-    self.assertEqual(
-        [b'x'],
-        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
-    # TODO(allenl): It should be possible to override names when exporting. Do
-    # TensorSpec names need to go in cache keys? Or maybe get_concrete_function
-    # should always retrace?
-    self.skipTest('Not working')
-    method_op = has_method.method.get_concrete_function(
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='y'))
-    self.assertEqual(
-        ['y'],
-        [inp.op.name for inp in method_op.inputs])
-    self.assertEqual(
-        [b'y'],
-        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
-
-  def testMethodSignature(self, function_decorator):
-
-    class HasMethod(object):
-
-      @function_decorator(
-          input_signature=(tensor_spec.TensorSpec(
-              shape=None, dtype=dtypes.float64, name='y'),))
-      def method(self, x):
-        hash(self)  # No weak proxies passed as `self`
-        return x
-
-    has_method = HasMethod()
-    # Call the function to make def_function happy
-    has_method.method(array_ops.ones([], dtype=dtypes.float64))
-    method_op = has_method.method.get_concrete_function()
-    self.assertEqual(
-        ['y'],
-        [inp.op.name for inp in method_op.inputs])
-    self.assertEqual(
-        [b'y'],
-        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
-    method_op2 = has_method.method.get_concrete_function()
     self.assertEqual(
-        ['y'],
-        [inp.op.name for inp in method_op2.inputs])
-    self.assertEqual(
-        [b'y'],
-        [inp.op.get_attr('_user_specified_name') for inp in method_op2.inputs])
-
-  def testVariadic(self, function_decorator):
-    @function_decorator
-    def variadic_fn(x, *args, **kwargs):
-      return x + math_ops.add_n(list(args) + list(kwargs.values()))
-
-    # Call the function to make def_function happy
-    variadic_fn(array_ops.ones([]), array_ops.ones([]))
-    variadic_op = variadic_fn.get_concrete_function(
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
-        tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32, name='y'),
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
-        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
-                               name='second_variadic'),
-        z=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
-        zz=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='cust'))
-    self.assertEqual(
-        ['x', 'y', 'args', 'second_variadic', 'z', 'cust'],
-        [inp.op.name for inp in variadic_op.inputs])
-    self.assertEqual(
-        [b'x', b'y', b'args', b'second_variadic', b'z', b'cust'],
-        [inp.op.get_attr('_user_specified_name')
-         for inp in variadic_op.inputs])
-
-  def testVariadicInputSignature(self, function_decorator):
-    @function_decorator(
-        input_signature=(
-            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32),
-            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32, name='y'),
-            tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
-            tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='z'),
-        ))
-    def variadic_fn(x, *args):
-      return x + math_ops.add_n(list(args))
-
-    # Call the function to make def_function happy
-    variadic_fn(array_ops.ones([]), array_ops.ones([]),
-                array_ops.ones([]), array_ops.ones([]))
-    variadic_op = variadic_fn.get_concrete_function()
-    self.assertIn(b'variadic_fn', variadic_op.name)
-    self.assertEqual(
-        ['x', 'y', 'args', 'z'],
-        [inp.op.name for inp in variadic_op.inputs])
-    self.assertEqual(
-        [b'x', b'y', b'args', b'z'],
-        [inp.op.get_attr('_user_specified_name')
-         for inp in variadic_op.inputs])
-
-
-class DefunCollectionTest(test.TestCase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      dict(testcase_name='Defun', function_decorator=function.defun),
-      dict(
-          testcase_name='DefFunction',
-          function_decorator=def_function.function))
-  def testCollectionValueAccess(self, function_decorator):
-    """Read values from graph collections inside of defun."""
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g):
-        x = 2
-        y = 5
-        ops.add_to_collection('x', x)
-        ops.add_to_collection('y', y)
-
-        @function_decorator
-        def fn():
-          x_const = constant_op.constant(ops.get_collection('x')[0])
-          y_const = constant_op.constant(ops.get_collection('y')[0])
-          z = math_ops.add(x_const, y_const)
-          ops.add_to_collection('z', 7)
-          return z
-
-        self.assertEqual(7, int(self.evaluate(fn())))
-        self.assertEquals(ops.get_collection('x'), [2])
-        self.assertEquals(ops.get_collection('y'), [5])
-        self.assertEquals(ops.get_collection('z'), [])
-
-  @parameterized.named_parameters(
-      dict(testcase_name='Defun', function_decorator=function.defun),
-      dict(
-          testcase_name='DefFunction',
-          function_decorator=def_function.function))
-  def testCollectionVariableValueAccess(self, function_decorator):
-    """Read variable value from graph collections inside of defun."""
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g):
-        v = resource_variable_ops.ResourceVariable(1.0)
-
-        @function_decorator
-        def f():
-          return v.read_value()
-
-        self.evaluate(variables.global_variables_initializer())
-        self.assertEqual(1.0, float(self.evaluate(f())))
-        self.assertEquals(
-            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 1)
-
-  def testCollectionVariableValueWrite(self):
-    """Write variable value inside defun."""
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g):
-
-        @function.defun
-        def f():
-          v = resource_variable_ops.ResourceVariable(2.0)
-          return v
-
-        _ = f.get_concrete_function()
-        self.evaluate(variables.global_variables_initializer())
-        self.assertEqual(2.0, float(self.evaluate(f())))
-        self.assertEquals(
-            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 1)
+        5,
+        add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy())
+
+    with self.assertRaisesRegexp(errors.NotFoundError, 'NON_EXISTENT_EXECUTOR'):
+      with context.function_executor_type('NON_EXISTENT_EXECUTOR'):
+        add_five(constant_op.constant(0, dtype=dtypes.int32))
+
+    for executor_type in ('', 'DEFAULT', None):
+      with context.function_executor_type(executor_type):
+        self.assertAllEqual(
+            5,
+            add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy())
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/eager/graph_only_ops_test.py b/tensorflow/python/eager/graph_only_ops_test.py
index 3cf3a61a62b1b22f092ad505017fd54f278b3f95..3aedf5fee1c282aac12b612ccc8e399f720cd993 100644
--- a/tensorflow/python/eager/graph_only_ops_test.py
+++ b/tensorflow/python/eager/graph_only_ops_test.py
@@ -33,7 +33,7 @@ class GraphOnlyOpsTest(test_util.TensorFlowTestCase):
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     z_tf = graph_only_ops.graph_zeros_like(x)
     with self.cached_session():
-      self.assertAllClose(np.zeros((2, 3)), z_tf.eval())
+      self.assertAllClose(np.zeros((2, 3)), self.evaluate(z_tf))
 
   def testGraphPlaceholder(self):
     x_tf = graph_only_ops.graph_placeholder(dtypes.int32, shape=(1,))
diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..c231264047bedccbb11abf996ff9ac93f15964f9
--- /dev/null
+++ b/tensorflow/python/eager/lift_to_graph.py
@@ -0,0 +1,88 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=unidiomatic-typecheck
+"""Utility to lift subgraphs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+
+
+def _graph_inputs(op):
+  return [x.op for x in op.inputs] + list(op.control_inputs)
+
+
+def lift_to_graph(init_tensor, graph, sources=None):
+  """Copies the tensor and all its inputs recursively to the outer graph."""
+  # Check that the initializer does not depend on any placeholders.
+  if sources is None:
+    sources = set([])
+  visited_ops = set([x.op for x in sources])
+  ops_to_visit = [init_tensor.op]
+  op_outputs = collections.defaultdict(set)
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in visited_ops:
+      continue
+    visited_ops.add(op)
+    # TODO(apassos) distinguish arg placeholders, capture placeholders,
+    # and placeholders the user might directly use to initialize
+    # variables.
+    if op.type == "Placeholder":
+      raise ValueError(
+          "Unable to lift tensor", init_tensor,
+          "because it depends transitively on placeholder ", op)
+    for inp in _graph_inputs(op):
+      op_outputs[inp].add(op)
+      if inp not in visited_ops and inp not in sources:
+        ops_to_visit.append(inp)
+  # Topologically sort the nodes we've extracted. Now we know how many of their
+  # outputs are part of this subgraph.
+  ops_to_copy = []
+  marked_ops = set([])
+  ops_to_visit = [init_tensor.op]
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in marked_ops:
+      continue
+    marked_ops.add(op)
+    ops_to_copy.append(op)
+    for inp in _graph_inputs(op):
+      if all(x in marked_ops for x in op_outputs[inp]) and inp not in sources:
+        ops_to_visit.append(inp)
+  assert len(ops_to_copy) == len(visited_ops)
+  # ops_to_copy now holds a reverse topologically sorted list of ops which
+  # ends in the initializer. We copy those to the outermost graph and
+  # build the initialization op there.
+  with graph.as_default():
+    op_map = {}
+    for s in sources:
+      op_map[s] = array_ops.placeholder(dtype=s.dtype, shape=s.shape)
+    for op in reversed(ops_to_copy):
+      copied_inputs = [op_map[x] for x in op.inputs]
+      copied_control_inputs = [op_map[x] for x in op.control_inputs]
+      with ops.control_dependencies(copied_control_inputs):
+        copied_op = graph.create_op(
+            op.type, copied_inputs, [x.dtype for x in op.outputs],
+            attrs=op.node_def.attr)
+      op_map[op] = copied_op
+      for i, o in enumerate(op.outputs):
+        op_map[o] = copied_op.outputs[i]
+    return op_map
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index a2407854fd74831d3fb619930a90e5c67d15f1c7..ed19047f0954923d2ab16b656aa13613cefb047e 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -420,9 +420,14 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       if (TF_GetCode(self->status) != TF_OK) {
         PyErr_SetString(
             PyExc_TypeError,
-            tensorflow::strings::StrCat("Error while casting from DataType ",
-                                        handle_dtype, " to ", desired_dtype,
-                                        ". ", TF_Message(self->status))
+            tensorflow::strings::StrCat(
+                "Error while casting from DataType ",
+                tensorflow::DataTypeString(
+                    static_cast<tensorflow::DataType>(handle_dtype)),
+                " to ",
+                tensorflow::DataTypeString(
+                    static_cast<tensorflow::DataType>(desired_dtype)),
+                ". ", TF_Message(self->status))
                 .c_str());
         // Cleanup self->status before returning.
         TF_SetStatus(self->status, TF_OK, "");
@@ -434,8 +439,10 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       PyErr_SetString(
           PyExc_TypeError,
           tensorflow::strings::StrCat(
-              "Cannot convert value ", TFE_GetPythonString(value_str.get()),
-              " to EagerTensor with requested dtype: ", desired_dtype)
+              "Cannot convert provided value to EagerTensor. Provided value: ",
+              TFE_GetPythonString(value_str.get()), " Requested dtype: ",
+              tensorflow::DataTypeString(
+                  static_cast<tensorflow::DataType>(desired_dtype)))
               .c_str());
       return -1;
     }
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 70de5e0c03ec01584563c2c8a2388cb8a5c2201b..9ce500bc08e478815f2dbe1d5d5353eefa4f17a8 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1645,6 +1645,29 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
   if (PyErr_Occurred()) {
     return nullptr;
   }
+  tensorflow::gtl::FlatSet<tensorflow::int64> sources_set(sources_vec.begin(),
+                                                          sources_vec.end());
+
+  tensorflow::Safe_PyObjectPtr seq =
+      tensorflow::make_safe(PySequence_Fast(target, "expected a sequence"));
+  int len = PySequence_Fast_GET_SIZE(seq.get());
+  tensorflow::gtl::FlatMap<tensorflow::int64, PyTapeTensor>
+      source_tensors_that_are_targets;
+  for (int i = 0; i < len; ++i) {
+    tensorflow::int64 target_id = target_vec[i];
+    if (sources_set.find(target_id) != sources_set.end()) {
+      auto tensor = PySequence_Fast_GET_ITEM(seq.get(), i);
+      source_tensors_that_are_targets.insert(
+          std::make_pair(target_id, TapeTensorFromTensor(tensor)));
+    }
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
+  }
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
+
   std::vector<PyObject*> outgrad_vec;
   if (output_gradients != Py_None) {
     outgrad_vec = MakeTensorList(output_gradients);
@@ -1659,7 +1682,8 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
   }
   std::vector<PyObject*> result;
   status->status = tape_obj->tape->ComputeGradient(
-      *py_vspace, target_vec, sources_vec, outgrad_vec, &result);
+      *py_vspace, target_vec, sources_vec, source_tensors_that_are_targets,
+      outgrad_vec, &result);
   if (!status->status.ok()) {
     if (PyErr_Occurred()) {
       // Do not propagate the erroneous status as that would swallow the
@@ -1853,7 +1877,7 @@ bool OpGradientDoesntRequireOutputIndices(
           {"Conv3DBackpropInputV2", {true, {}}},
           {"AvgPool3D", {true, {}}},
           {"AvgPool3DGrad", {true, {}}},
-          {"MaxPool3D", {true, {}}},
+          {"MaxPool3D", {false, {}}},
           {"MaxPool3DGrad", {true, {}}},
           {"MaxPool3DGradGrad", {true, {}}},
           {"BiasAdd", {true, {}}},
@@ -2279,8 +2303,10 @@ bool ConvertToTensor(
       PyErr_SetString(
           PyExc_TypeError,
           tensorflow::strings::StrCat(
-              "Cannot convert value ", TFE_GetPythonString(input_str.get()),
-              " to EagerTensor with requested dtype: ", desired_dtype)
+              "Cannot convert provided value to EagerTensor. Provided value: ",
+              TFE_GetPythonString(input_str.get()), " Requested dtype: ",
+              tensorflow::DataTypeString(
+                  static_cast<tensorflow::DataType>(desired_dtype)))
               .c_str());
       return false;
     }
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 6282a6c4595c96f3fbcd71fcf666553ea2965c54..669fa084888a52da1601984fa11791f84add6170 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -22,7 +22,6 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
-from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -35,11 +34,6 @@ from tensorflow.python.ops import resource_variable_ops
 
 class Tests(test.TestCase):
 
-  def setUp(self):
-    # Force-load `distribution_strategy_context` to prevent GC at
-    # test time. See discussion in cl//219478951.
-    tape.distribution_strategy_context.get_distribution_strategy()
-
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
   def testFastpathExecute_MatMulCorrectResponse(self):
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index 1326f09713065503b2bb359c6c997a0801680dc0..e501b403a39144a673e8ac5155edf0498425bcd6 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -63,7 +63,7 @@ def watch_variable(tape, variable):
   """Marks this variable to be watched by the given tape."""
   strategy = distribution_strategy_context.get_distribution_strategy()
   if distribution_strategy_context.get_replica_context():
-    variables = [strategy.value_container(variable)]
+    variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
   for var in variables:
@@ -78,7 +78,7 @@ def variable_accessed(variable):
   """
   strategy = distribution_strategy_context.get_distribution_strategy()
   if distribution_strategy_context.get_replica_context():
-    variables = [strategy.value_container(variable)]
+    variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
   for var in variables:
diff --git a/tensorflow/python/eager/tape_test.py b/tensorflow/python/eager/tape_test.py
index acd0e569f11a90e2cc53e113f59df6f072a6de42..48d3b8ac6ee0fb5b747caf32b034f82959611292 100644
--- a/tensorflow/python/eager/tape_test.py
+++ b/tensorflow/python/eager/tape_test.py
@@ -80,8 +80,8 @@ class TapeTest(test.TestCase):
       tf_e = tf_d + tf_f
       tf_da, tf_db = gradients_impl.gradients(tf_e, [tf_a, tf_b])
 
-      self.assertAllEqual(da, tf_da.eval())
-      self.assertAllEqual(db, tf_db.eval())
+      self.assertAllEqual(da, self.evaluate(tf_da))
+      self.assertAllEqual(db, self.evaluate(tf_db))
 
   def testBasicFunctional(self):
 
@@ -142,8 +142,8 @@ class TapeTest(test.TestCase):
       tf_rr = 2 * math_ops.reduce_sum(tf_mm)
       tf_da, tf_db = gradients_impl.gradients(tf_rr, [tf_a, tf_b])
 
-      self.assertAllEqual(da, tf_da.eval())
-      self.assertAllEqual(db, tf_db.eval())
+      self.assertAllEqual(da, self.evaluate(tf_da))
+      self.assertAllEqual(db, self.evaluate(tf_db))
 
   def testGcTwoOutputs(self):
 
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index ddd46c167bf49baed4520f56e20b0cb3a77a5fb6..8c9d5dabe79b4190ae86e30dfd0cde013f4cc0fa 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -128,6 +128,23 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     tensor = constant_op.constant(numpy_tensor)
     self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
 
+  def testLenAgreesWithNumpy(self):
+    numpy_tensor = np.asarray(1.0)
+    tensor = constant_op.constant(numpy_tensor)
+    with self.assertRaises(TypeError):
+      len(numpy_tensor)
+    with self.assertRaisesRegexp(
+        TypeError, r"Scalar tensor has no `len[(][)]`"):
+      len(tensor)
+
+    numpy_tensor = np.asarray([1.0, 2.0, 3.0])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(len(numpy_tensor), len(tensor))
+
+    numpy_tensor = np.asarray([[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(len(numpy_tensor), len(tensor))
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
@@ -261,9 +278,8 @@ class TFETensorTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testCompatibility(self):
-    # TODO(nareshmodi): uint32, uint64 are not correctly handled in graph mode.
     integer_types = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
-                     dtypes.uint8, dtypes.uint16]
+                     dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
 
     # Floats are not compatible with ints
     for t in integer_types:
@@ -307,6 +323,14 @@ class TFETensorTest(test_util.TensorFlowTestCase):
   def testConvertToTensorAllowsOverflow(self):
     _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8)
 
+  def testEagerTensorError(self):
+    with self.assertRaisesRegexp(
+        TypeError,
+        "Cannot convert provided value to EagerTensor. "
+        "Provided value.*Requested dtype.*"):
+      _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
+
+
 
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/eager/test.py b/tensorflow/python/eager/test.py
index 33ee797678ed73c52ebb17723f688cec4feca402..a45deac962de931ebd8a8804cea7fef2b3f97629 100644
--- a/tensorflow/python/eager/test.py
+++ b/tensorflow/python/eager/test.py
@@ -24,6 +24,6 @@ from tensorflow.python.platform.test import *  # pylint: disable=wildcard-import
 
 
 # TODO(akshayka): Do away with this file.
-def main(argv=None):
+def main(argv=None):  # pylint: disable=function-redefined
   _ops.enable_eager_execution()
   _test.main(argv)
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 7f9c896adee842fc409b21ce44e00e35f855995f..2b39e99a4ea5d145f9bb8cef5c5931c306bcaeea 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -20,8 +20,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import function
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class VariableHolder(object):
@@ -41,6 +46,40 @@ class VariableHolder(object):
       return self._fn(*args, **kwargs)
 
 
+# TODO(allenl): make this checkpointable
+class WrappedFunction(function.Function):
+  """Wraps a tf V1 piece of code in a function."""
+
+  def __init__(self, fn_graph, variable_holder, attrs=None, signature=None):
+    super(WrappedFunction, self).__init__(
+        fn_graph, attrs=attrs, signature=signature)
+    self._variable_holder = variable_holder
+
+  def prune(self, feeds, fetches):
+    flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches)
+    for f in flat_feeds + flat_fetches:
+      if not isinstance(f, ops.Tensor):
+        raise ValueError("Feeds and fetches must be tensors.")
+      if f.graph is not self._func_graph:
+        raise ValueError(
+            "Can only prune function whose feeds and fetches "
+            "are from this graph (%s). Tensor %s from graph %s" % (
+                self._func_graph, f, f.graph))
+    with self._func_graph.as_default():
+      pruned_graph = func_graph.FuncGraph("pruned")
+      sink_tensor = array_ops.identity_n(flat_fetches)[0]
+    lift_map = lift_to_graph.lift_to_graph(
+        sink_tensor, pruned_graph, sources=flat_feeds)
+    pruned_graph.outputs.extend(lift_map[x] for x in flat_fetches)
+    pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
+    pruned_fn = WrappedFunction(
+        pruned_graph, variable_holder=self._variable_holder)
+    pruned_fn._num_positional_args = len(flat_feeds)  # pylint: disable=protected-access
+    pruned_fn._arg_keywords = []  # pylint: disable=protected-access
+    return pruned_fn
+
+
+@tf_export(v1=["wrap_function"])
 def wrap_function(fn, signature, name=None):
   """Wraps the TF 1.x function fn into a graph function.
 
@@ -73,6 +112,21 @@ def wrap_function(fn, signature, name=None):
   assert float(f_sub(1.0)) == 3.0
   ```
 
+  Both `tf.compat.v1.wrap_function` and `tf.function` create a callable
+  TensorFlow graph. But while `tf.function` runs all stateful operations
+  (e.g. `tf.print`) and sequences operations to provide the same semantics as
+  eager execution, `wrap_function` is closer to the behavior of `session.run` in
+  TensorFlow 1.x. It will not run any operations unless they are required to
+  compute the function's outputs, either through a data dependency or a control
+  dependency. Nor will it sequence operations.
+
+  Unlike `tf.function`, `wrap_function` will only trace the Python function
+  once. As with placeholders in TF 1.x, shapes and dtypes must be provided to
+  `wrap_function`'s `signature` argument.
+
+  Since it is only traced once, variables and state may be created inside the
+  function and owned by the function wrapper object.
+
   Args:
     fn: python function to be wrapped
     signature: the placeholder and python arguments to be passed to the
@@ -83,12 +137,11 @@ def wrap_function(fn, signature, name=None):
     the wrapped graph function.
   """
   holder = VariableHolder(fn)
-  fn = function.Function(
+  return WrappedFunction(
       func_graph.func_graph_from_py_func(
           name,
           holder,
           args=None, kwargs=None, signature=signature,
           add_control_dependencies=False),
+      variable_holder=holder,
       signature=signature)
-  fn._variable_holder = holder
-  return fn
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index 0690358491dce88182deb0e3087419bf987517cb..b32b6ca42691a6261576da6b105a0afc97e0ec63 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -53,6 +53,23 @@ class WrapFunctionTest(test.TestCase):
     self.assertAllEqual(f_sub(1.0), 4.0)
     self.assertAllEqual(f_sub(1.0), 3.0)
 
+  def testPrune(self):
+
+    x_in = []
+    x_out = []
+
+    def f(x, y):
+      x_in.append(x)
+      xx = x * x
+      x_out.append(xx)
+      return xx, 2 * y*y
+
+    f_wrapped = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtypes.float32)] * 2)
+
+    f_pruned = f_wrapped.prune(x_in[0], [x_out[0]])
+    self.assertAllEqual(f_pruned(ops.convert_to_tensor(2.0)), [4.0])
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index fe4fd21eaa45304c3892a814a6a19b2f5bd8d96b..d24a7ae80c86d407ae3bb60ca55fff98be9f27a1 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -133,6 +133,7 @@ py_test(
     name = "feature_column_v2_test",
     srcs = ["feature_column_v2_test.py"],
     data = [":vocabulary_testdata"],
+    shard_count = 5,
     srcs_version = "PY2AND3",
     tags = [
         "no_cuda_on_cpu_tap",
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index cb0a340c06a81124da6e4799a17d0d51aadc721d..a858d92608db1a0d9d00b34f91860b7d4be01d68 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -230,7 +230,7 @@ def _internal_input_layer(features,
       return _get_logits()
 
 
-@tf_export('feature_column.input_layer')
+@tf_export(v1=['feature_column.input_layer'])
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -365,7 +365,7 @@ class InputLayer(object):
     return self._input_layer_template.weights
 
 
-@tf_export('feature_column.linear_model')
+@tf_export(v1=['feature_column.linear_model'])
 def linear_model(features,
                  feature_columns,
                  units=1,
@@ -445,15 +445,16 @@ def linear_model(features,
             [0, 0]: "d"
             [1, 0]: "e"
             [1, 1]: "f"
-            [1, 2]: "g"
+            [1, 2]: "f"
         }
       ```
-      with `sparse_combiner` as "mean", the linear model outputs conceptly are:
+      with `sparse_combiner` as "mean", the linear model outputs consequently
+      are:
       ```
-        y_0 = 1.0 / 2.0 * ( w_a + w_ b) + w_c + b_0
-        y_1 = w_d + 1.0 / 3.0 * ( w_e + w_ f + w_g) + b_1
+        y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
+        y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
       ```
-      where `y_i` is the output, `b_i` is the bias, and `w_x` is the weight
+      where `y_i` is the output, `b` is the bias, and `w_x` is the weight
       assigned to the presence of `x` in the input features.
     weight_collections: A list of collection names to which the Variable will be
       added. Note that, variables will also be added to collections
@@ -745,7 +746,7 @@ def _transform_features(features, feature_columns):
   return outputs
 
 
-@tf_export('feature_column.make_parse_example_spec')
+@tf_export(v1=['feature_column.make_parse_example_spec'])
 def make_parse_example_spec(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
@@ -806,11 +807,14 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
-@tf_export('feature_column.embedding_column')
-def embedding_column(
-    categorical_column, dimension, combiner='mean', initializer=None,
-    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
-    trainable=True):
+def _embedding_column(categorical_column,
+                      dimension,
+                      combiner='mean',
+                      initializer=None,
+                      ckpt_to_load_from=None,
+                      tensor_name_in_ckpt=None,
+                      max_norm=None,
+                      trainable=True):
   """`_DenseColumn` that converts from sparse, categorical input.
 
   Use this when your inputs are sparse, but you want to convert them to a dense
@@ -918,178 +922,11 @@ def embedding_column(
       trainable=trainable)
 
 
-@tf_export('feature_column.shared_embedding_columns')
-def shared_embedding_columns(
-    categorical_columns, dimension, combiner='mean', initializer=None,
-    shared_embedding_collection_name=None, ckpt_to_load_from=None,
-    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
-  """List of dense columns that convert from sparse, categorical input.
-
-  This is similar to `embedding_column`, except that it produces a list of
-  embedding columns that share the same embedding weights.
-
-  Use this when your inputs are sparse and of the same type (e.g. watched and
-  impression video IDs that share the same vocabulary), and you want to convert
-  them to a dense representation (e.g., to feed to a DNN).
-
-  Inputs must be a list of categorical columns created by any of the
-  `categorical_column_*` function. They must all be of the same type and have
-  the same arguments except `key`. E.g. they can be
-  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
-  all columns could also be weighted_categorical_column.
-
-  Here is an example embedding of two features for a DNNClassifier model:
-
-  ```python
-  watched_video_id = categorical_column_with_vocabulary_file(
-      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
-  impression_video_id = categorical_column_with_vocabulary_file(
-      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
-  columns = shared_embedding_columns(
-      [watched_video_id, impression_video_id], dimension=10)
-
-  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
-
-  label_column = ...
-  def input_fn():
-    features = tf.parse_example(
-        ..., features=make_parse_example_spec(columns + [label_column]))
-    labels = features.pop(label_column.name)
-    return features, labels
-
-  estimator.train(input_fn=input_fn, steps=100)
-  ```
-
-  Here is an example using `shared_embedding_columns` with model_fn:
-
-  ```python
-  def model_fn(features, ...):
-    watched_video_id = categorical_column_with_vocabulary_file(
-        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
-    impression_video_id = categorical_column_with_vocabulary_file(
-        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
-    columns = shared_embedding_columns(
-        [watched_video_id, impression_video_id], dimension=10)
-    dense_tensor = input_layer(features, columns)
-    # Form DNN layers, calculate loss, and return EstimatorSpec.
-    ...
-  ```
-
-  Args:
-    categorical_columns: List of categorical columns created by a
-      `categorical_column_with_*` function. These columns produce the sparse IDs
-      that are inputs to the embedding lookup. All columns must be of the same
-      type and have the same arguments except `key`. E.g. they can be
-      categorical_column_with_vocabulary_file with the same vocabulary_file.
-      Some or all columns could also be weighted_categorical_column.
-    dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
-      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
-      with bag-of-words columns. Each of this can be thought as example level
-      normalizations on the column. For more information, see
-      `tf.embedding_lookup_sparse`.
-    initializer: A variable initializer function to be used in embedding
-      variable initialization. If not specified, defaults to
-      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-      `1/sqrt(dimension)`.
-    shared_embedding_collection_name: Optional name of the collection where
-      shared embedding weights are added. If not given, a reasonable name will
-      be chosen based on the names of `categorical_columns`. This is also used
-      in `variable_scope` when creating shared embedding weights.
-    ckpt_to_load_from: String representing checkpoint name/pattern from which to
-      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
-    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
-      which to restore the column weights. Required if `ckpt_to_load_from` is
-      not `None`.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value, before combining.
-    trainable: Whether or not the embedding is trainable. Default is True.
-
-  Returns:
-    A list of dense columns that converts from sparse input. The order of
-    results follows the ordering of `categorical_columns`.
-
-  Raises:
-    ValueError: if `dimension` not > 0.
-    ValueError: if any of the given `categorical_columns` is of different type
-      or has different arguments than the others.
-    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
-      is specified.
-    ValueError: if `initializer` is specified and is not callable.
-    RuntimeError: if eager execution is enabled.
-  """
-  if context.executing_eagerly():
-    raise RuntimeError('shared_embedding_columns are not supported when eager '
-                       'execution is enabled.')
-
-  if (dimension is None) or (dimension < 1):
-    raise ValueError('Invalid dimension {}.'.format(dimension))
-  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
-    raise ValueError('Must specify both `ckpt_to_load_from` and '
-                     '`tensor_name_in_ckpt` or none of them.')
-
-  if (initializer is not None) and (not callable(initializer)):
-    raise ValueError('initializer must be callable if specified.')
-  if initializer is None:
-    initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1. / math.sqrt(dimension))
-
-  # Sort the columns so the default collection name is deterministic even if the
-  # user passes columns from an unsorted collection, such as dict.values().
-  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
-
-  c0 = sorted_columns[0]
-  num_buckets = c0._num_buckets  # pylint: disable=protected-access
-  if not isinstance(c0, _CategoricalColumn):
-    raise ValueError(
-        'All categorical_columns must be subclasses of _CategoricalColumn. '
-        'Given: {}, of type: {}'.format(c0, type(c0)))
-  if isinstance(c0, _WeightedCategoricalColumn):
-    c0 = c0.categorical_column
-  for c in sorted_columns[1:]:
-    if isinstance(c, _WeightedCategoricalColumn):
-      c = c.categorical_column
-    if not isinstance(c, type(c0)):
-      raise ValueError(
-          'To use shared_embedding_column, all categorical_columns must have '
-          'the same type, or be weighted_categorical_column of the same type. '
-          'Given column: {} of type: {} does not match given column: {} of '
-          'type: {}'.format(c0, type(c0), c, type(c)))
-    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
-      raise ValueError(
-          'To use shared_embedding_column, all categorical_columns must have '
-          'the same number of buckets. Given column: {} with buckets: {} does  '
-          'not match column: {} with buckets: {}'.format(
-              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
-
-  if not shared_embedding_collection_name:
-    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
-    shared_embedding_collection_name += '_shared_embedding'
-
-  result = []
-  for column in categorical_columns:
-    result.append(
-        _SharedEmbeddingColumn(
-            categorical_column=column,
-            initializer=initializer,
-            dimension=dimension,
-            combiner=combiner,
-            shared_embedding_collection_name=shared_embedding_collection_name,
-            ckpt_to_load_from=ckpt_to_load_from,
-            tensor_name_in_ckpt=tensor_name_in_ckpt,
-            max_norm=max_norm,
-            trainable=trainable))
-
-  return result
-
-
-@tf_export('feature_column.numeric_column')
-def numeric_column(key,
-                   shape=(1,),
-                   default_value=None,
-                   dtype=dtypes.float32,
-                   normalizer_fn=None):
+def _numeric_column(key,
+                    shape=(1,),
+                    default_value=None,
+                    dtype=dtypes.float32,
+                    normalizer_fn=None):
   """Represents real valued or numerical features.
 
   Example:
@@ -1160,8 +997,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
-@tf_export('feature_column.bucketized_column')
-def bucketized_column(source_column, boundaries):
+def _bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
   Buckets include the left boundary, and exclude the right boundary. Namely,
@@ -1257,10 +1093,9 @@ def _assert_key_is_string(key):
             type(key), key))
 
 
-@tf_export('feature_column.categorical_column_with_hash_bucket')
-def categorical_column_with_hash_bucket(key,
-                                        hash_bucket_size,
-                                        dtype=dtypes.string):
+def _categorical_column_with_hash_bucket(key,
+                                         hash_bucket_size,
+                                         dtype=dtypes.string):
   """Represents sparse feature where ids are set by hashing.
 
   Use this when your sparse features are in string or integer format, and you
@@ -1316,13 +1151,12 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
-@tf_export('feature_column.categorical_column_with_vocabulary_file')
-def categorical_column_with_vocabulary_file(key,
-                                            vocabulary_file,
-                                            vocabulary_size=None,
-                                            num_oov_buckets=0,
-                                            default_value=None,
-                                            dtype=dtypes.string):
+def _categorical_column_with_vocabulary_file(key,
+                                             vocabulary_file,
+                                             vocabulary_size=None,
+                                             num_oov_buckets=0,
+                                             default_value=None,
+                                             dtype=dtypes.string):
   """A `_CategoricalColumn` with a vocabulary file.
 
   Use this when your inputs are in string or integer format, and you have a
@@ -1436,9 +1270,11 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
-@tf_export('feature_column.categorical_column_with_vocabulary_list')
-def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+def _categorical_column_with_vocabulary_list(key,
+                                             vocabulary_list,
+                                             dtype=None,
+                                             default_value=-1,
+                                             num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
 
   Use this when your inputs are in string or integer format, and you have an
@@ -1547,8 +1383,7 @@ def categorical_column_with_vocabulary_list(
       default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
-@tf_export('feature_column.categorical_column_with_identity')
-def categorical_column_with_identity(key, num_buckets, default_value=None):
+def _categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `_CategoricalColumn` that returns identity values.
 
   Use this when your inputs are integers in the range `[0, num_buckets)`, and
@@ -1615,8 +1450,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, num_buckets=num_buckets, default_value=default_value)
 
 
-@tf_export('feature_column.indicator_column')
-def indicator_column(categorical_column):
+def _indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
   - For DNN model, `indicator_column` can be used to wrap any
@@ -1650,9 +1484,9 @@ def indicator_column(categorical_column):
   return _IndicatorColumn(categorical_column)
 
 
-@tf_export('feature_column.weighted_categorical_column')
-def weighted_categorical_column(
-    categorical_column, weight_feature_key, dtype=dtypes.float32):
+def _weighted_categorical_column(categorical_column,
+                                 weight_feature_key,
+                                 dtype=dtypes.float32):
   """Applies weight values to a `_CategoricalColumn`.
 
   Use this when each of your sparse inputs has both an ID and a value. For
@@ -1725,8 +1559,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
-@tf_export('feature_column.crossed_column')
-def crossed_column(keys, hash_bucket_size, hash_key=None):
+def _crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
   Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 3b818f18b5b0fce99b81e51ce89e58c72cab0b91..68a2712425c56ae4b3e42c6bd7ae497c0358a074 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -20,4 +20,5 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
+from tensorflow.python.feature_column.feature_column_v2 import *
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 1ae510250cfd030d965d0480599d4e333fe30b50..2c70d66810395eab34f4b300e089e85d6216a7bf 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -30,7 +30,8 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_v2 as fc_new
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
@@ -185,7 +186,7 @@ class LazyColumnTest(test.TestCase):
 class NumericColumnTest(test.TestCase):
 
   def test_defaults(self):
-    a = fc.numeric_column('aaa')
+    a = fc._numeric_column('aaa')
     self.assertEqual('aaa', a.key)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a._var_scope_name)
@@ -196,53 +197,53 @@ class NumericColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.numeric_column(key=('aaa',))
+      fc._numeric_column(key=('aaa',))
 
   def test_shape_saved_as_tuple(self):
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual((1, 2), a.shape)
 
   def test_default_value_saved_as_tuple(self):
-    a = fc.numeric_column('aaa', default_value=4.)
+    a = fc._numeric_column('aaa', default_value=4.)
     self.assertEqual((4.,), a.default_value)
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual(((3., 2.),), a.default_value)
 
   def test_shape_and_default_value_compatibility(self):
-    fc.numeric_column('aaa', shape=[2], default_value=[1, 2.])
+    fc._numeric_column('aaa', shape=[2], default_value=[1, 2.])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
-    fc.numeric_column(
+      fc._numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
+    fc._numeric_column(
         'aaa', shape=[3, 2], default_value=[[2, 3], [1, 2], [2, 3.]])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[3, 1], default_value=[[2, 3], [1, 2], [2, 3.]])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[3, 3], default_value=[[2, 3], [1, 2], [2, 3.]])
 
   def test_default_value_type_check(self):
-    fc.numeric_column(
+    fc._numeric_column(
         'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.float32)
-    fc.numeric_column(
+    fc._numeric_column(
         'aaa', shape=[2], default_value=[1, 2], dtype=dtypes.int32)
     with self.assertRaisesRegexp(TypeError, 'must be compatible with dtype'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.int32)
     with self.assertRaisesRegexp(TypeError,
                                  'default_value must be compatible with dtype'):
-      fc.numeric_column('aaa', default_value=['string'])
+      fc._numeric_column('aaa', default_value=['string'])
 
   def test_shape_must_be_positive_integer(self):
     with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[
               1.0,
           ])
 
     with self.assertRaisesRegexp(ValueError,
                                  'shape dimensions must be greater than 0'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[
               0,
           ])
@@ -250,20 +251,20 @@ class NumericColumnTest(test.TestCase):
   def test_dtype_is_convertible_to_float(self):
     with self.assertRaisesRegexp(ValueError,
                                  'dtype must be convertible to float'):
-      fc.numeric_column('aaa', dtype=dtypes.string)
+      fc._numeric_column('aaa', dtype=dtypes.string)
 
   def test_scalar_default_value_fills_the_shape(self):
-    a = fc.numeric_column('aaa', shape=[2, 3], default_value=2.)
+    a = fc._numeric_column('aaa', shape=[2, 3], default_value=2.)
     self.assertEqual(((2., 2., 2.), (2., 2., 2.)), a.default_value)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
+    a = fc._numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
     self.assertEqual({
         'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
     }, a._parse_example_spec)
 
   def test_parse_example_no_default_value(self):
-    price = fc.numeric_column('price', shape=[2])
+    price = fc._numeric_column('price', shape=[2])
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -278,7 +279,7 @@ class NumericColumnTest(test.TestCase):
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
   def test_parse_example_with_default_value(self):
-    price = fc.numeric_column('price', shape=[2], default_value=11.)
+    price = fc._numeric_column('price', shape=[2], default_value=11.)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -301,14 +302,14 @@ class NumericColumnTest(test.TestCase):
 
   def test_normalizer_fn_must_be_callable(self):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
-      fc.numeric_column('price', normalizer_fn='NotACallable')
+      fc._numeric_column('price', normalizer_fn='NotACallable')
 
   def test_normalizer_fn_transform_feature(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
-    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    price = fc._numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price])
     with self.cached_session():
       self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
@@ -318,12 +319,12 @@ class NumericColumnTest(test.TestCase):
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
-    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    price = fc._numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     builder = _LazyBuilder({'price': [[1., 2.], [5., 6.]]})
     self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
 
   def test_sparse_tensor_not_supported(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     builder = _LazyBuilder({
         'price':
             sparse_tensor.SparseTensor(
@@ -333,108 +334,108 @@ class NumericColumnTest(test.TestCase):
       price._transform_feature(builder)
 
   def test_deep_copy(self):
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
     a_copy = copy.deepcopy(a)
     self.assertEqual(a_copy.name, 'aaa')
     self.assertEqual(a_copy.shape, (1, 2))
     self.assertEqual(a_copy.default_value, ((3., 2.),))
 
   def test_numpy_default_value(self):
-    a = fc.numeric_column(
+    a = fc._numeric_column(
         'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
     self.assertEqual(a.default_value, ((3., 2.),))
 
   def test_linear_model(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
   def test_keras_linear_model(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
 
 class BucketizedColumnTest(test.TestCase):
 
   def test_invalid_source_column_type(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
+    a = fc._categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
     with self.assertRaisesRegexp(
         ValueError,
         'source_column must be a column generated with numeric_column'):
-      fc.bucketized_column(a, boundaries=[0, 1])
+      fc._bucketized_column(a, boundaries=[0, 1])
 
   def test_invalid_source_column_shape(self):
-    a = fc.numeric_column('aaa', shape=[2, 3])
+    a = fc._numeric_column('aaa', shape=[2, 3])
     with self.assertRaisesRegexp(
         ValueError, 'source_column must be one-dimensional column'):
-      fc.bucketized_column(a, boundaries=[0, 1])
+      fc._bucketized_column(a, boundaries=[0, 1])
 
   def test_invalid_boundaries(self):
-    a = fc.numeric_column('aaa')
+    a = fc._numeric_column('aaa')
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=None)
+      fc._bucketized_column(a, boundaries=None)
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=1.)
+      fc._bucketized_column(a, boundaries=1.)
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=[1, 0])
+      fc._bucketized_column(a, boundaries=[1, 0])
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=[1, 1])
+      fc._bucketized_column(a, boundaries=[1, 1])
 
   def test_name(self):
-    a = fc.numeric_column('aaa', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b.name)
 
   def test_var_scope_name(self):
-    a = fc.numeric_column('aaa', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b._var_scope_name)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual({
         'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32)
     }, b._parse_example_spec)
 
   def test_variable_shape(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3].
     self.assertAllEqual((2, 3), b._variable_shape)
 
   def test_num_buckets(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
     self.assertEqual(6, b._num_buckets)
 
   def test_parse_example(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -449,8 +450,8 @@ class BucketizedColumnTest(test.TestCase):
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
   def test_transform_feature(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       transformed_tensor = _transform_features({
           'price': [[-1., 1.], [5., 6.]]
@@ -461,24 +462,22 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_get_dense_tensor_one_input_value(self):
     """Tests _get_dense_tensor() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
       with _initialized_session():
         bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
         self.assertAllClose(
             # One-hot tensor.
-            [[[1., 0., 0., 0., 0.]],
-             [[0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.]],
-             [[0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            [[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 1.]]],
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_dense_tensor_two_input_values(self):
     """Tests _get_dense_tensor() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
       with _initialized_session():
@@ -487,12 +486,12 @@ class BucketizedColumnTest(test.TestCase):
             # One-hot tensor.
             [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
              [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_sparse_tensors_one_input_value(self):
     """Tests _get_sparse_tensors() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
       with _initialized_session() as sess:
@@ -506,8 +505,8 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_two_input_values(self):
     """Tests _get_sparse_tensors() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
       with _initialized_session() as sess:
@@ -522,8 +521,8 @@ class BucketizedColumnTest(test.TestCase):
         self.assertAllEqual([2, 2], id_tensor_value.dense_shape)
 
   def test_sparse_tensor_input_not_supported(self):
-    price = fc.numeric_column('price')
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+    price = fc._numeric_column('price')
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 1])
     builder = _LazyBuilder({
         'price':
             sparse_tensor.SparseTensor(
@@ -533,8 +532,8 @@ class BucketizedColumnTest(test.TestCase):
       bucketized_price._transform_feature(builder)
 
   def test_deep_copy(self):
-    a = fc.numeric_column('aaa', shape=[2])
-    a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2])
+    a_bucketized = fc._bucketized_column(a, boundaries=[0, 1])
     a_bucketized_copy = copy.deepcopy(a_bucketized)
     self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized')
     self.assertAllEqual(a_bucketized_copy._variable_shape, (2, 3))
@@ -542,45 +541,48 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_linear_model_one_input_value(self):
     """Tests linear_model() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = fc.linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = fc.linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.],
              [60.], [70.], [80.], [90.], [100.]]))
@@ -590,14 +592,14 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_keras_linear_model_one_input_value(self):
     """Tests _LinearModel for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -605,25 +607,28 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_keras_linear_model_two_input_values(self):
     """Tests _LinearModel for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -631,12 +636,12 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
                                          [60.], [70.], [80.], [90.], [100.]]))
@@ -646,15 +651,15 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
 
 class HashedCategoricalColumnTest(test.TestCase):
 
   def test_defaults(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a._var_scope_name)
     self.assertEqual('aaa', a.key)
@@ -663,25 +668,25 @@ class HashedCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_hash_bucket(('key',), 10)
+      fc._categorical_column_with_hash_bucket(('key',), 10)
 
   def test_bucket_size_should_be_given(self):
     with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'):
-      fc.categorical_column_with_hash_bucket('aaa', None)
+      fc._categorical_column_with_hash_bucket('aaa', None)
 
   def test_bucket_size_should_be_positive(self):
     with self.assertRaisesRegexp(ValueError,
                                  'hash_bucket_size must be at least 1'):
-      fc.categorical_column_with_hash_bucket('aaa', 0)
+      fc._categorical_column_with_hash_bucket('aaa', 0)
 
   def test_dtype_should_be_string_or_integer(self):
-    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
-    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
+    fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
+      fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
 
   def test_deep_copy(self):
-    original = fc.categorical_column_with_hash_bucket('aaa', 10)
+    original = fc._categorical_column_with_hash_bucket('aaa', 10)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(10, column.hash_bucket_size)
@@ -689,19 +694,19 @@ class HashedCategoricalColumnTest(test.TestCase):
       self.assertEqual(dtypes.string, column.dtype)
 
   def test_parse_spec_string(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, a._parse_example_spec)
 
   def test_parse_spec_int(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, a._parse_example_spec)
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -722,7 +727,7 @@ class HashedCategoricalColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_strings_should_be_hashed(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -739,11 +744,11 @@ class HashedCategoricalColumnTest(test.TestCase):
                           output.dense_shape.eval())
 
   def test_tensor_dtype_should_be_string_or_integer(self):
-    string_fc = fc.categorical_column_with_hash_bucket(
+    string_fc = fc._categorical_column_with_hash_bucket(
         'a_string', 10, dtype=dtypes.string)
-    int_fc = fc.categorical_column_with_hash_bucket(
+    int_fc = fc._categorical_column_with_hash_bucket(
         'a_int', 10, dtype=dtypes.int32)
-    float_fc = fc.categorical_column_with_hash_bucket(
+    float_fc = fc._categorical_column_with_hash_bucket(
         'a_float', 10, dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(
         values=[101],
@@ -768,7 +773,7 @@ class HashedCategoricalColumnTest(test.TestCase):
       builder.get(float_fc)
 
   def test_dtype_should_match_with_tensor(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -777,7 +782,7 @@ class HashedCategoricalColumnTest(test.TestCase):
       builder.get(hashed_sparse)
 
   def test_ints_should_be_hashed(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=[101, 201, 301],
@@ -791,7 +796,7 @@ class HashedCategoricalColumnTest(test.TestCase):
       self.assertAllEqual(expected_values, output.values.eval())
 
   def test_int32_64_is_compatible(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
@@ -805,7 +810,7 @@ class HashedCategoricalColumnTest(test.TestCase):
       self.assertAllEqual(expected_values, output.values.eval())
 
   def test_get_sparse_tensors(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     builder = _LazyBuilder({
         'wire':
             sparse_tensor.SparseTensor(
@@ -818,7 +823,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_hash_bucket('aaa', 10)
+    column = fc._categorical_column_with_hash_bucket('aaa', 10)
     inputs = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -833,14 +838,14 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
   def test_get_sparse_tensors_dense_input(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     builder = _LazyBuilder({'wire': (('omar', ''), ('stringer', 'marlo'))})
     id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
     self.assertIsNone(id_weight_pair.weight_tensor)
     self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
 
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    wire_column = fc._categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -852,16 +857,17 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    wire_column = fc._categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
@@ -874,13 +880,14 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
 
 class CrossedColumnTest(test.TestCase):
@@ -888,100 +895,100 @@ class CrossedColumnTest(test.TestCase):
   def test_keys_empty(self):
     with self.assertRaisesRegexp(
         ValueError, 'keys must be a list with length > 1'):
-      fc.crossed_column([], 10)
+      fc._crossed_column([], 10)
 
   def test_keys_length_one(self):
     with self.assertRaisesRegexp(
         ValueError, 'keys must be a list with length > 1'):
-      fc.crossed_column(['a'], 10)
+      fc._crossed_column(['a'], 10)
 
   def test_key_type_unsupported(self):
     with self.assertRaisesRegexp(ValueError, 'Unsupported key type'):
-      fc.crossed_column(['a', fc.numeric_column('c')], 10)
+      fc._crossed_column(['a', fc._numeric_column('c')], 10)
 
     with self.assertRaisesRegexp(
         ValueError, 'categorical_column_with_hash_bucket is not supported'):
-      fc.crossed_column(
-          ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
+      fc._crossed_column(
+          ['a', fc._categorical_column_with_hash_bucket('c', 10)], 10)
 
   def test_hash_bucket_size_negative(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], -1)
+      fc._crossed_column(['a', 'c'], -1)
 
   def test_hash_bucket_size_zero(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], 0)
+      fc._crossed_column(['a', 'c'], 0)
 
   def test_hash_bucket_size_none(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], None)
+      fc._crossed_column(['a', 'c'], None)
 
   def test_name(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_ordered_alphabetically(self):
     """Tests that the name does not depend on the order of given columns."""
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([crossed1, 'c', b], 10)
+    crossed2 = fc._crossed_column([crossed1, 'c', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_leaf_keys_ordered_alphabetically(self):
     """Tests that the name does not depend on the order of given columns."""
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d2', 'c'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d2', 'c'], 10)
 
-    crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
+    crossed2 = fc._crossed_column([crossed1, 'd1', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_var_scope_name(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2._var_scope_name)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed = fc.crossed_column([b, 'c'], 10)
+    a = fc._numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed = fc._crossed_column([b, 'c'], 10)
     self.assertEqual({
         'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32),
         'c': parsing_ops.VarLenFeature(dtypes.string),
     }, crossed._parse_example_spec)
 
   def test_num_buckets(self):
-    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed = fc.crossed_column([b, 'c'], 15)
+    a = fc._numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed = fc._crossed_column([b, 'c'], 15)
     self.assertEqual(15, crossed._num_buckets)
 
   def test_deep_copy(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 15, hash_key=5)
     crossed2_copy = copy.deepcopy(crossed2)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
     self.assertEqual(15, crossed2_copy.hash_bucket_size)
     self.assertEqual(5, crossed2_copy.hash_key)
 
   def test_parse_example(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
-    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
+    price_cross_wire = fc._crossed_column([bucketized_price, 'wire'], 10)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -1005,11 +1012,11 @@ class CrossedColumnTest(test.TestCase):
       self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
 
   def test_transform_feature(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
     hash_bucket_size = 10
-    price_cross_wire = fc.crossed_column(
-        [bucketized_price, 'wire'], hash_bucket_size)
+    price_cross_wire = fc._crossed_column([bucketized_price, 'wire'],
+                                          hash_bucket_size)
     features = {
         'price': constant_op.constant([[1., 2.], [5., 6.]]),
         'wire': sparse_tensor.SparseTensor(
@@ -1020,7 +1027,7 @@ class CrossedColumnTest(test.TestCase):
     outputs = _transform_features(features, [price_cross_wire])
     output = outputs[price_cross_wire]
     with self.cached_session() as sess:
-      output_val = sess.run(output)
+      output_val = self.evaluate(output)
       self.assertAllEqual(
           [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
       for val in output_val.values:
@@ -1028,10 +1035,10 @@ class CrossedColumnTest(test.TestCase):
       self.assertAllEqual([2, 4], output_val.dense_shape)
 
   def test_get_sparse_tensors(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 15, hash_key=5)
     with ops.Graph().as_default():
       builder = _LazyBuilder({
           'a':
@@ -1069,9 +1076,9 @@ class CrossedColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_simple(self):
     """Same as test_get_sparse_tensors, but with simpler values."""
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       builder = _LazyBuilder({
           'a':
@@ -1099,9 +1106,9 @@ class CrossedColumnTest(test.TestCase):
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
           'a': constant_op.constant(((-1., .5), (.5, 1.))),
@@ -1113,15 +1120,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_linear_model_with_weights(self):
     class _TestColumnWithWeights(_CategoricalColumn):
@@ -1155,7 +1162,7 @@ class CrossedColumnTest(test.TestCase):
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
     t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    crossed = fc._crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError,
@@ -1180,9 +1187,9 @@ class CrossedColumnTest(test.TestCase):
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
           'a':
@@ -1196,15 +1203,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_keras_linear_model_with_weights(self):
 
@@ -1242,7 +1249,7 @@ class CrossedColumnTest(test.TestCase):
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
     t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    crossed = fc._crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError,
@@ -1331,31 +1338,31 @@ class LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.linear_model(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       fc.linear_model(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1366,15 +1373,16 @@ class LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc.numeric_column('price')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1389,7 +1397,7 @@ class LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -1442,25 +1450,25 @@ class LinearModelTest(test.TestCase):
         sess.run(dense_and_sparse_column_var.assign(
             [[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1471,29 +1479,29 @@ class LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
                 1000., 1100., 1200.
             ], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.linear_model(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -1504,7 +1512,7 @@ class LinearModelTest(test.TestCase):
       predictions = fc.linear_model(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -1514,7 +1522,7 @@ class LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1528,11 +1536,11 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc._weighted_categorical_column(wire_cast, 'weights')
 
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
@@ -1550,25 +1558,25 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -1577,22 +1585,22 @@ class LinearModelTest(test.TestCase):
         fc.linear_model(features, [price])
 
   def test_dense_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
@@ -1603,18 +1611,18 @@ class LinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
@@ -1627,8 +1635,8 @@ class LinearModelTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[price2], [price2_var])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2', shape=3)
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [6., 7.]],
@@ -1653,13 +1661,13 @@ class LinearModelTest(test.TestCase):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
     # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
     # creates a Variable.
-    apple_numeric_column = fc.numeric_column('apple_numeric_column')
-    banana_dense_feature = fc.numeric_column('banana_dense_feature')
-    banana_dense_feature_bucketized = fc.bucketized_column(
+    apple_numeric_column = fc._numeric_column('apple_numeric_column')
+    banana_dense_feature = fc._numeric_column('banana_dense_feature')
+    banana_dense_feature_bucketized = fc._bucketized_column(
         banana_dense_feature, boundaries=[0.])
-    cherry_sparse_column = fc.categorical_column_with_hash_bucket(
+    cherry_sparse_column = fc._categorical_column_with_hash_bucket(
         'cherry_sparse_feature', hash_bucket_size=5)
-    dragonfruit_embedding_column = fc.embedding_column(
+    dragonfruit_embedding_column = fc._embedding_column(
         cherry_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -1684,7 +1692,7 @@ class LinearModelTest(test.TestCase):
       self.assertItemsEqual(input_layer_inputs, output_tensors)
 
   def test_dense_collection(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price], weight_collections=['my-vars'])
@@ -1695,7 +1703,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(price_var, my_vars)
 
   def test_sparse_collection(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1709,7 +1717,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, my_vars)
 
   def test_dense_trainable_default(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price])
@@ -1720,7 +1728,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1733,7 +1741,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price], trainable=False)
@@ -1741,7 +1749,7 @@ class LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1751,9 +1759,9 @@ class LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       features = {
           'price_a': [[1.]],
@@ -1787,8 +1795,8 @@ class LinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -1800,9 +1808,9 @@ class LinearModelTest(test.TestCase):
       fc.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -1815,8 +1823,8 @@ class LinearModelTest(test.TestCase):
         fc.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -1830,8 +1838,8 @@ class LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -1847,9 +1855,14 @@ class LinearModelTest(test.TestCase):
             })
 
   def test_with_1d_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -1873,14 +1886,20 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -1918,7 +1937,7 @@ class LinearModelTest(test.TestCase):
                               }))
 
   def test_with_rank_0_feature(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -1939,7 +1958,7 @@ class LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
   def test_multiple_linear_models(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features1 = {'price': [[1.], [5.]]}
       features2 = {'price': [[2.], [10.]]}
@@ -1950,14 +1969,14 @@ class LinearModelTest(test.TestCase):
       price_var1 = get_linear_model_column_var(price, name='linear_model')
       price_var2 = get_linear_model_column_var(price, name='linear_model_1')
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
 
 class _LinearModelTest(test.TestCase):
@@ -1996,31 +2015,31 @@ class _LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.linear_model(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       get_keras_linear_model_predictions(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2031,15 +2050,16 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc.numeric_column('price')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2055,7 +2075,7 @@ class _LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -2114,10 +2134,10 @@ class _LinearModelTest(test.TestCase):
             dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
                                                 [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(
@@ -2125,15 +2145,15 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2145,29 +2165,29 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
                                   [1000., 1100.,
                                    1200.], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -2178,7 +2198,7 @@ class _LinearModelTest(test.TestCase):
       predictions = get_keras_linear_model_predictions(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -2188,7 +2208,7 @@ class _LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2202,10 +2222,10 @@ class _LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(
@@ -2213,15 +2233,15 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -2230,22 +2250,22 @@ class _LinearModelTest(test.TestCase):
         get_keras_linear_model_predictions(features, [price])
 
   def test_dense_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -2254,18 +2274,18 @@ class _LinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
@@ -2279,8 +2299,8 @@ class _LinearModelTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[price2], [price2_var])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2', shape=3)
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [6., 7.]],
@@ -2303,7 +2323,7 @@ class _LinearModelTest(test.TestCase):
         self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
 
   def test_dense_collection(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(
@@ -2315,7 +2335,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(price_var, my_vars)
 
   def test_sparse_collection(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2329,7 +2349,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, my_vars)
 
   def test_dense_trainable_default(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price])
@@ -2340,7 +2360,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2353,7 +2373,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price], trainable=False)
@@ -2361,7 +2381,7 @@ class _LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2371,9 +2391,9 @@ class _LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       features = {
           'price_a': [[1.]],
@@ -2407,8 +2427,8 @@ class _LinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2420,9 +2440,9 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2435,8 +2455,8 @@ class _LinearModelTest(test.TestCase):
         get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2451,8 +2471,8 @@ class _LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -2469,14 +2489,14 @@ class _LinearModelTest(test.TestCase):
             })
 
   def test_with_1d_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2506,19 +2526,20 @@ class _LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2555,7 +2576,7 @@ class _LinearModelTest(test.TestCase):
                               }))
 
   def test_with_rank_0_feature(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -2581,7 +2602,7 @@ class InputLayerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_retrieving_input(self):
     features = {'a': [0.]}
-    input_layer = InputLayer(fc.numeric_column('a'))
+    input_layer = InputLayer(fc._numeric_column('a'))
     inputs = self.evaluate(input_layer(features))
     self.assertAllClose([[0.]], inputs)
 
@@ -2593,8 +2614,8 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
+      categorical_column = fc._categorical_column_with_identity(
+          key='a', num_buckets=3)
       embedding_dimension = 2
       def _embedding_column_initializer(shape, dtype, partition_info):
         del shape  # unused
@@ -2605,7 +2626,8 @@ class InputLayerTest(test.TestCase):
             (0, 1),  # id 1
             (1, 1))  # id 2
         return embedding_values
-      embedding_column = fc.embedding_column(
+
+      embedding_column = fc._embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
@@ -2636,8 +2658,8 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
+      categorical_column = fc._categorical_column_with_identity(
+          key='a', num_buckets=3)
       embedding_dimension = 2
 
       def _embedding_column_initializer(shape, dtype, partition_info):
@@ -2650,7 +2672,7 @@ class InputLayerTest(test.TestCase):
             (1, 1))  # id 2
         return embedding_values
 
-      embedding_column = fc.embedding_column(
+      embedding_column = fc._embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
@@ -2687,56 +2709,56 @@ class FunctionalInputLayerTest(test.TestCase):
       fc.input_layer(
           features={'a': [[0]]},
           feature_columns=[
-              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+              fc._categorical_column_with_hash_bucket('wire_cast', 4)
           ])
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.input_layer(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_bare_column(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
-      net = fc.input_layer(features, fc.numeric_column('a'))
+      net = fc.input_layer(features, fc._numeric_column('a'))
       with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+        self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
-      columns = (fc.numeric_column(key) for key in features)
+      columns = (fc._numeric_column(key) for key in features)
       net = fc.input_layer(features, columns)
       with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+        self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       fc.input_layer(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_one_column(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+        self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -2745,16 +2767,16 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price])
 
   def test_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
@@ -2762,19 +2784,19 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       net = fc.input_layer(features, [price1, price2])
       with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_fills_cols_to_vars(self):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
     # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
     # creates a Variable.
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -2798,19 +2820,19 @@ class FunctionalInputLayerTest(test.TestCase):
     # BucketizedColumn, an EmbeddingColumn, two SharedEmbeddingColumns. The
     # EmbeddingColumn creates a Variable and the two SharedEmbeddingColumns
     # shared one variable.
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
     with ops.Graph().as_default():
       features = {
@@ -2850,13 +2872,13 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[shared_embedding_a][0].shape, [3, 2])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -2883,8 +2905,8 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10])
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
     with ops.Graph().as_default():
       features = {
           'price_a': [[1.]],
@@ -2893,11 +2915,11 @@ class FunctionalInputLayerTest(test.TestCase):
       net1 = fc.input_layer(features, [price_a, price_b])
       net2 = fc.input_layer(features, [price_b, price_a])
       with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+        self.assertAllClose([[1., 3.]], self.evaluate(net1))
+        self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
-    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
+    animal = fc._categorical_column_with_identity('animal', num_buckets=4)
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -2908,8 +2930,8 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [animal])
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2921,9 +2943,9 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2936,8 +2958,8 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2950,8 +2972,8 @@ class FunctionalInputLayerTest(test.TestCase):
           sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -2967,9 +2989,9 @@ class FunctionalInputLayerTest(test.TestCase):
             })
 
   def test_multiple_layers_with_same_embedding_column(self):
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
 
     with ops.Graph().as_default():
@@ -2991,12 +3013,12 @@ class FunctionalInputLayerTest(test.TestCase):
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
   def test_multiple_layers_with_same_shared_embedding_column(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
 
@@ -3024,12 +3046,12 @@ class FunctionalInputLayerTest(test.TestCase):
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
   def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     all_cols = [embedding_column_a, embedding_column_b]
@@ -3085,18 +3107,18 @@ class FunctionalInputLayerTest(test.TestCase):
       return embedding_values
 
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
 
     # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc.indicator_column(body_style)
+    one_hot_body_style = fc._indicator_column(body_style)
 
     # embedded_body_style has 5 dims in input_layer.
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(country, dimension=5,
-                                           initializer=_initializer)
+    embedded_country = fc._embedding_column(
+        country, dimension=5, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
     features = {
@@ -3135,17 +3157,17 @@ class FunctionalInputLayerTest(test.TestCase):
       return embedding_values
 
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
 
     # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc.indicator_column(body_style)
+    one_hot_body_style = fc._indicator_column(body_style)
 
     # embedded_body_style has 5 dims in input_layer.
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(
+    embedded_country = fc._embedding_column(
         country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
@@ -3185,7 +3207,7 @@ class FunctionalInputLayerTest(test.TestCase):
 
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -3314,7 +3336,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     self._wire_vocabulary_size = 3
 
   def test_defaults(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column._var_scope_name)
@@ -3326,22 +3348,28 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3)
 
   def test_all_constructor_args(self):
-    column = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+    column = fc._categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     self.assertEqual(7, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column._parse_example_spec)
 
   def test_deep_copy(self):
-    original = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+    original = fc._categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(7, column._num_buckets)
@@ -3351,16 +3379,16 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_vocabulary_file_none(self):
     with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file=None, vocabulary_size=3)
 
   def test_vocabulary_file_empty_string(self):
     with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file='', vocabulary_size=3)
 
   def test_invalid_vocabulary_file(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -3373,16 +3401,18 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_invalid_vocabulary_size(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=-1)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=0)
 
   def test_too_large_vocabulary_size(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size + 1)
@@ -3397,20 +3427,24 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           num_oov_buckets=-1)
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           dtype=dtypes.float64)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
         ValueError, 'both num_oov_buckets and default_value'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa',
           vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=self._wire_vocabulary_size,
@@ -3418,7 +3452,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           default_value=2)
 
   def test_invalid_input_dtype_int32(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3431,7 +3465,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_invalid_input_dtype_string(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3444,7 +3478,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_file(
+    a = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3466,7 +3500,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3486,7 +3520,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_none_vocabulary_size(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -3504,7 +3538,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                                   id_weight_pair.id_tensor.eval())
 
   def test_transform_feature(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3514,16 +3548,15 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     id_tensor = _transform_features({'aaa': inputs}, [column])[column]
     with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3541,7 +3574,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3560,7 +3593,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3581,7 +3614,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3605,7 +3638,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     # 'marlo' is the last entry in our vocabulary file, so be setting
     # `vocabulary_size` to 1 less than number of entries in file, we take
     # 'marlo' out of the vocabulary.
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size - 1)
@@ -3625,7 +3658,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_int32(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3647,7 +3680,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3668,7 +3701,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3690,7 +3723,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_file(
+    wire_column = fc._categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3706,16 +3739,17 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_file(
+    wire_column = fc._categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3732,19 +3766,20 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_defaults_string(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
@@ -3756,11 +3791,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key=('aaa',), vocabulary_list=('omar', 'stringer', 'marlo'))
 
   def test_defaults_int(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36))
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
@@ -3771,8 +3806,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     }, column._parse_example_spec)
 
   def test_all_constructor_args(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=(12, 24, 36),
+        dtype=dtypes.int32,
         default_value=-99)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
@@ -3780,7 +3817,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     }, column._parse_example_spec)
 
   def test_deep_copy(self):
-    original = fc.categorical_column_with_vocabulary_list(
+    original = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
@@ -3791,65 +3828,65 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.float32)
 
   def test_invalid_mapping_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12., 24., 36.))
 
   def test_mismatched_int_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.int32)
 
   def test_mismatched_string_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
 
   def test_none_mapping(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary_list.*must be non-empty'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=None)
 
   def test_empty_mapping(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary_list.*must be non-empty'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=tuple([]))
 
   def test_duplicate_mapping(self):
     with self.assertRaisesRegexp(ValueError, 'Duplicate keys'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 12))
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=(12, 24, 36),
-          num_oov_buckets=-1)
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 36), num_oov_buckets=-1)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
         ValueError, 'both num_oov_buckets and default_value'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa',
           vocabulary_list=(12, 24, 36),
           num_oov_buckets=100,
           default_value=2)
 
   def test_invalid_input_dtype_int32(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(12, 24, 36),
@@ -3858,9 +3895,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_invalid_input_dtype_string(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=(12, 24, 36))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -3869,7 +3905,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_parse_example_string(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3891,7 +3927,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_parse_example_int(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(11, 21, 31))
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3913,9 +3949,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -3932,9 +3967,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_transform_feature(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -3946,13 +3980,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -3967,9 +3999,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     id_weight_pair = column._get_sparse_tensors(
         _LazyBuilder({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
@@ -3985,7 +4016,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         default_value=2)
@@ -4005,7 +4036,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=100)
@@ -4025,7 +4056,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_int32(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32)
@@ -4046,7 +4077,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32,
@@ -4068,7 +4099,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32,
@@ -4089,7 +4120,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_list(
+    wire_column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
@@ -4104,16 +4135,17 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_list(
+    wire_column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
@@ -4129,19 +4161,20 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_constructor(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
     self.assertEqual('aaa', column._var_scope_name)
@@ -4152,10 +4185,10 @@ class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_identity(key=('aaa',), num_buckets=3)
+      fc._categorical_column_with_identity(key=('aaa',), num_buckets=3)
 
   def test_deep_copy(self):
-    original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    original = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(3, column._num_buckets)
@@ -4165,24 +4198,24 @@ class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_invalid_num_buckets_zero(self):
     with self.assertRaisesRegexp(ValueError, 'num_buckets 0 < 1'):
-      fc.categorical_column_with_identity(key='aaa', num_buckets=0)
+      fc._categorical_column_with_identity(key='aaa', num_buckets=0)
 
   def test_invalid_num_buckets_negative(self):
     with self.assertRaisesRegexp(ValueError, 'num_buckets -1 < 1'):
-      fc.categorical_column_with_identity(key='aaa', num_buckets=-1)
+      fc._categorical_column_with_identity(key='aaa', num_buckets=-1)
 
   def test_invalid_default_value_too_small(self):
     with self.assertRaisesRegexp(ValueError, 'default_value -1 not in range'):
-      fc.categorical_column_with_identity(
+      fc._categorical_column_with_identity(
           key='aaa', num_buckets=3, default_value=-1)
 
   def test_invalid_default_value_too_big(self):
     with self.assertRaisesRegexp(ValueError, 'default_value 3 not in range'):
-      fc.categorical_column_with_identity(
+      fc._categorical_column_with_identity(
           key='aaa', num_buckets=3, default_value=3)
 
   def test_invalid_input_dtype(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -4191,7 +4224,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=30)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4212,7 +4245,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4229,7 +4262,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_transform_feature(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4241,11 +4274,10 @@ class IdentityCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4260,7 +4292,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     id_weight_pair = column._get_sparse_tensors(
         _LazyBuilder({
             'aaa': ((0, -1), (1, 0))
@@ -4276,7 +4308,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_with_inputs_too_small(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(1, -1, 0),
@@ -4289,7 +4321,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
         id_weight_pair.id_tensor.eval()
 
   def test_get_sparse_tensors_with_inputs_too_big(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(1, 99, 0),
@@ -4302,7 +4334,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
         id_weight_pair.id_tensor.eval()
 
   def test_get_sparse_tensors_with_default_value(self):
-    column = fc.categorical_column_with_identity(
+    column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -4320,7 +4352,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
           id_weight_pair.id_tensor.eval())
 
   def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
-    column = fc.categorical_column_with_identity(
+    column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
     input_indices = array_ops.placeholder(dtype=dtypes.int64)
     input_values = array_ops.placeholder(dtype=dtypes.int32)
@@ -4345,7 +4377,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
           }))
 
   def test_linear_model(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -4357,16 +4389,16 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
   def test_keras_linear_model(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
@@ -4379,13 +4411,13 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
 
 class TransformFeaturesTest(test.TestCase):
@@ -4393,9 +4425,9 @@ class TransformFeaturesTest(test.TestCase):
   # All transform tests are distributed in column test.
   # Here we only test multi column case and naming
   def transform_multi_column(self):
-    bucketized_price = fc.bucketized_column(
-        fc.numeric_column('price'), boundaries=[0, 2, 4, 6])
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    bucketized_price = fc._bucketized_column(
+        fc._numeric_column('price'), boundaries=[0, 2, 4, 6])
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     with ops.Graph().as_default():
       features = {
           'price': [[-1.], [5.]],
@@ -4452,32 +4484,33 @@ class TransformFeaturesTest(test.TestCase):
 class IndicatorColumnTest(test.TestCase):
 
   def test_indicator_column(self):
-    a = fc.categorical_column_with_hash_bucket('a', 4)
-    indicator_a = fc.indicator_column(a)
+    a = fc._categorical_column_with_hash_bucket('a', 4)
+    indicator_a = fc._indicator_column(a)
     self.assertEqual(indicator_a.categorical_column.name, 'a')
     self.assertEqual(indicator_a.name, 'a_indicator')
     self.assertEqual(indicator_a._var_scope_name, 'a_indicator')
     self.assertEqual(indicator_a._variable_shape, [1, 4])
 
-    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
-    indicator_b = fc.indicator_column(b)
+    b = fc._categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    indicator_b = fc._indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
     self.assertEqual(indicator_b.name, 'b_indicator')
     self.assertEqual(indicator_b._var_scope_name, 'b_indicator')
     self.assertEqual(indicator_b._variable_shape, [1, 100])
 
   def test_1D_shape_succeeds(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_hash_bucket('animal', 4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({'animal': ['fox', 'fox']})
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_2D_shape_succeeds(self):
     # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
-    animal = fc.indicator_column(
-        fc.categorical_column_with_hash_bucket('animal', 4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({
         'animal':
             sparse_tensor.SparseTensor(
@@ -4487,11 +4520,12 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_multi_hot(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
 
     builder = _LazyBuilder({
         'animal':
@@ -4500,11 +4534,11 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+      self.assertAllEqual([[0., 2., 0., 0.]], self.evaluate(output))
 
   def test_multi_hot2(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     builder = _LazyBuilder({
         'animal':
             sparse_tensor.SparseTensor(
@@ -4512,20 +4546,20 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 1., 1., 0.]], self.evaluate(output))
 
   def test_deep_copy(self):
-    a = fc.categorical_column_with_hash_bucket('a', 4)
-    column = fc.indicator_column(a)
+    a = fc._categorical_column_with_hash_bucket('a', 4)
+    column = fc._indicator_column(a)
     column_copy = copy.deepcopy(column)
     self.assertEqual(column_copy.categorical_column.name, 'a')
     self.assertEqual(column.name, 'a_indicator')
     self.assertEqual(column._variable_shape, [1, 4])
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_indicator = fc.indicator_column(a)
+    a_indicator = fc._indicator_column(a)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4546,9 +4580,9 @@ class IndicatorColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_transform(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_indicator = fc.indicator_column(a)
+    a_indicator = fc._indicator_column(a)
     features = {
         'aaa': sparse_tensor.SparseTensorValue(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -4557,51 +4591,52 @@ class IndicatorColumnTest(test.TestCase):
     }
     indicator_tensor = _transform_features(features, [a_indicator])[a_indicator]
     with _initialized_session():
-      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]],
+                          self.evaluate(indicator_tensor))
 
   def test_transform_with_weighted_column(self):
     # Github issue 12557
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    weights = fc.weighted_categorical_column(ids, 'weights')
-    indicator = fc.indicator_column(weights)
+    weights = fc._weighted_categorical_column(ids, 'weights')
+    indicator = fc._indicator_column(weights)
     features = {
         'ids': constant_op.constant([['c', 'b', 'a', 'c']]),
         'weights': constant_op.constant([[2., 4., 6., 1.]])
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[6., 4., 3.]], indicator_tensor.eval())
+      self.assertAllEqual([[6., 4., 3.]], self.evaluate(indicator_tensor))
 
   def test_transform_with_missing_value_in_weighted_column(self):
     # Github issue 12583
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    weights = fc.weighted_categorical_column(ids, 'weights')
-    indicator = fc.indicator_column(weights)
+    weights = fc._weighted_categorical_column(ids, 'weights')
+    indicator = fc._indicator_column(weights)
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
         'weights': constant_op.constant([[2., 4., 6.]])
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 4., 2.]], self.evaluate(indicator_tensor))
 
   def test_transform_with_missing_value_in_categorical_column(self):
     # Github issue 12583
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    indicator = fc.indicator_column(ids)
+    indicator = fc._indicator_column(ids)
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor))
 
   def test_linear_model(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4613,14 +4648,14 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
   def test_keras_linear_model(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4632,14 +4667,14 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
   def test_input_layer(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4648,16 +4683,16 @@ class IndicatorColumnTest(test.TestCase):
       }
       net = fc.input_layer(features, [animal])
       with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+        self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
 
 
 class EmbeddingColumnTest(test.TestCase):
 
   def test_defaults(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    embedding_column = fc.embedding_column(
+    embedding_column = fc._embedding_column(
         categorical_column, dimension=embedding_dimension)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
@@ -4675,14 +4710,18 @@ class EmbeddingColumnTest(test.TestCase):
     }, embedding_column._parse_example_spec)
 
   def test_all_constructor_args(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
@@ -4699,14 +4738,18 @@ class EmbeddingColumnTest(test.TestCase):
     }, embedding_column._parse_example_spec)
 
   def test_deep_copy(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    original = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+    original = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     for embedding_column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', embedding_column.categorical_column.name)
       self.assertEqual(3, embedding_column.categorical_column._num_buckets)
@@ -4728,15 +4771,16 @@ class EmbeddingColumnTest(test.TestCase):
       }, embedding_column._parse_example_spec)
 
   def test_invalid_initializer(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+      fc._embedding_column(
+          categorical_column, dimension=2, initializer='not_fn')
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded = fc.embedding_column(a, dimension=2)
+    a_embedded = fc._embedding_column(a, dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4757,8 +4801,8 @@ class EmbeddingColumnTest(test.TestCase):
           features['aaa'].eval())
 
   def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    a_embedded = fc.embedding_column(a, dimension=2)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
+    a_embedded = fc._embedding_column(a, dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -4769,8 +4813,8 @@ class EmbeddingColumnTest(test.TestCase):
     output_a = outputs[a]
     output_embedded = outputs[a_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_embedded))
 
   def test_get_dense_tensor(self):
     # Inputs.
@@ -4810,10 +4854,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4828,7 +4873,7 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_get_dense_tensor_3d(self):
     # Inputs.
@@ -4870,10 +4915,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4888,7 +4934,7 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_get_dense_tensor_weight_collections(self):
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -4901,9 +4947,9 @@ class EmbeddingColumnTest(test.TestCase):
         dense_shape=(4, 5))
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     # Provide sparse input and get dense result.
     embedding_column._get_dense_tensor(
@@ -4957,10 +5003,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -5025,10 +5072,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
 
@@ -5044,7 +5092,7 @@ class EmbeddingColumnTest(test.TestCase):
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_linear_model(self):
     # Inputs.
@@ -5070,10 +5118,11 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
@@ -5100,11 +5149,13 @@ class EmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5119,7 +5170,8 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
   def test_keras_linear_model(self):
     # Inputs.
@@ -5146,9 +5198,9 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
+    embedding_column = fc._embedding_column(
         categorical_column,
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5176,11 +5228,13 @@ class EmbeddingColumnTest(test.TestCase):
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5195,7 +5249,8 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
   def test_input_layer(self):
     # Inputs.
@@ -5235,10 +5290,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -5255,7 +5311,7 @@ class EmbeddingColumnTest(test.TestCase):
         tuple([v.name for v in trainable_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
   def test_input_layer_not_trainable(self):
     # Inputs.
@@ -5295,11 +5351,13 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        initializer=_initializer, trainable=False)
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer,
+        trainable=False)
 
     # Provide sparse input and get dense result.
     input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
@@ -5313,18 +5371,18 @@ class EmbeddingColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
 
 class SharedEmbeddingColumnTest(test.TestCase):
 
   def test_defaults(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
@@ -5363,12 +5421,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     }, embedding_column_b._parse_example_spec)
 
   def test_all_constructor_args(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5414,12 +5472,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     }, embedding_column_b._parse_example_spec)
 
   def test_deep_copy(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    original_a, _ = fc.shared_embedding_columns(
+    original_a, _ = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5427,7 +5485,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         shared_embedding_collection_name='shared_embedding_collection_name',
         ckpt_to_load_from='my_ckpt',
         tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        max_norm=42.,
+        trainable=False)
     for embedding_column_a in (original_a, copy.deepcopy(original_a)):
       self.assertEqual('aaa', embedding_column_a.categorical_column.name)
       self.assertEqual(3, embedding_column_a.categorical_column._num_buckets)
@@ -5451,54 +5510,55 @@ class SharedEmbeddingColumnTest(test.TestCase):
       }, embedding_column_a._parse_example_spec)
 
   def test_invalid_initializer(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.shared_embedding_columns(
-          [categorical_column_a, categorical_column_b], dimension=2,
+      fc_new.shared_embedding_columns(
+          [categorical_column_a, categorical_column_b],
+          dimension=2,
           initializer='not_fn')
 
   def test_incompatible_column_type(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    categorical_column_c = fc.categorical_column_with_hash_bucket(
+    categorical_column_c = fc._categorical_column_with_hash_bucket(
         key='ccc', hash_bucket_size=3)
     with self.assertRaisesRegexp(
         ValueError,
         'all categorical_columns must have the same type.*'
         '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
-      fc.shared_embedding_columns(
+      fc_new.shared_embedding_columns(
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
   def test_weighted_categorical_column_ok(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    weighted_categorical_column_a = fc.weighted_categorical_column(
+    weighted_categorical_column_a = fc._weighted_categorical_column(
         categorical_column_a, weight_feature_key='aaa_weights')
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    weighted_categorical_column_b = fc.weighted_categorical_column(
+    weighted_categorical_column_b = fc._weighted_categorical_column(
         categorical_column_b, weight_feature_key='bbb_weights')
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [weighted_categorical_column_a, categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [categorical_column_a, weighted_categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    b = fc.categorical_column_with_vocabulary_list(
+    b = fc._categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a_embedded, b_embedded = fc_new.shared_embedding_columns([a, b],
+                                                             dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -5530,10 +5590,10 @@ class SharedEmbeddingColumnTest(test.TestCase):
           features['bbb'].eval())
 
   def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
+    b = fc._categorical_column_with_identity(key='bbb', num_buckets=3)
+    a_embedded, b_embedded = fc_new.shared_embedding_columns([a, b],
+                                                             dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -5550,10 +5610,10 @@ class SharedEmbeddingColumnTest(test.TestCase):
     output_b = outputs[b]
     output_b_embedded = outputs[b_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_a_embedded.eval())
-      _assert_sparse_tensor_value(
-          self, output_b.eval(), output_b_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_a_embedded))
+      _assert_sparse_tensor_value(self, self.evaluate(output_b),
+                                  self.evaluate(output_b_embedded))
 
   def test_get_dense_tensor(self):
     # Inputs.
@@ -5598,13 +5658,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a._get_dense_tensor(
@@ -5618,9 +5679,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+      self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
+      self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
+      self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
   def test_get_dense_tensor_weight_collections(self):
     # Inputs.
@@ -5651,11 +5712,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5712,13 +5773,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a._get_dense_tensor(
@@ -5752,13 +5814,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -5790,13 +5853,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_bbb_shared_embedding_1/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5814,7 +5879,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
   def test_keras_linear_model(self):
     # Inputs.
@@ -5842,11 +5907,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5881,13 +5946,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_bbb_shared_embedding_1/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5905,7 +5972,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
   def _test_input_layer(self, trainable=True):
     # Inputs.
@@ -5949,13 +6016,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer,
+        dimension=embedding_dimension,
+        initializer=_initializer,
         trainable=trainable)
 
     # Provide sparse input and get dense result.
@@ -5978,7 +6046,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
   def test_input_layer(self):
     self._test_input_layer()
@@ -5990,8 +6058,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
 class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_defaults(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     self.assertEqual('ids_weighted_by_values', column.name)
@@ -6004,8 +6072,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_deep_copy(self):
     """Tests deepcopy of categorical_column_with_hash_bucket."""
-    original = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    original = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     for column in (original, copy.deepcopy(original)):
@@ -6018,23 +6086,23 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_invalid_dtype_none(self):
     with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='ids', num_buckets=3),
           weight_feature_key='values',
           dtype=None)
 
   def test_invalid_dtype_string(self):
     with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='ids', num_buckets=3),
           weight_feature_key='values',
           dtype=dtypes.string)
 
   def test_invalid_input_dtype(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     strings = sparse_tensor.SparseTensorValue(
@@ -6046,14 +6114,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_column_name_collision(self):
     with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='aaa', num_buckets=3),
           weight_feature_key='aaa')._parse_example_spec()
 
   def test_missing_weights(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6065,9 +6133,10 @@ class WeightedCategoricalColumnTest(test.TestCase):
       _transform_features({'ids': inputs}, (column,))
 
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
+    a_weighted = fc._weighted_categorical_column(
+        a, weight_feature_key='weights')
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -6099,8 +6168,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
           features['weights'].eval())
 
   def test_transform_features(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6121,19 +6190,17 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
   def test_transform_features_dense_input(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     weights = sparse_tensor.SparseTensorValue(
@@ -6150,19 +6217,17 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
   def test_transform_features_dense_weights(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6179,19 +6244,17 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((.5, 1., .1), dtype=np.float32),
-              dense_shape=(2, 2)),
-          weight_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(weight_tensor))
 
   def test_keras_linear_model(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6210,18 +6273,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_keras_linear_model_mismatched_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6241,8 +6304,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }, (column,))
 
   def test_keras_linear_model_mismatched_dense_values(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6263,11 +6326,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_keras_linear_model_mismatched_dense_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6282,18 +6345,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6310,18 +6373,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model_mismatched_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6339,8 +6402,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }, (column,))
 
   def test_linear_model_mismatched_dense_values(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6361,11 +6424,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_linear_model_mismatched_dense_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6379,14 +6442,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   # TODO(ptucker): Add test with embedding of weighted categorical.
 
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index d97d41dd830f57c91bbd2b38df5bd897a16a32a5..2af2b9f254abcb4a2e7a4b655a581338a9622ad3 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -141,7 +141,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine.base_layer import Layer
+# TODO(b/118385027): Dependency on keras can be problematic if Keras moves out
+# of the main repo.
+from tensorflow.python.keras import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -161,6 +165,7 @@ from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 _FEATURE_COLUMN_DEPRECATION_DATE = '2018-11-30'
@@ -184,6 +189,7 @@ class StateManager(object):
                       shape,
                       dtype=None,
                       trainable=True,
+                      use_resource=True,
                       initializer=None):
     """Creates a new variable.
 
@@ -193,12 +199,14 @@ class StateManager(object):
       shape: variable shape.
       dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
       trainable: Whether this variable is trainable or not.
+      use_resource: If true, we use resource variables. Otherwise we use
+        RefVariable.
       initializer: initializer instance (callable).
 
     Returns:
       The created variable.
     """
-    del feature_column, name, shape, dtype, trainable, initializer
+    del feature_column, name, shape, dtype, trainable, use_resource, initializer
     raise NotImplementedError('StateManager.create_variable')
 
   def add_variable(self, feature_column, var):
@@ -251,7 +259,7 @@ class StateManager(object):
 
 
 class _StateManagerImpl(StateManager):
-  """Manages the state of FeatureLayer and LinearModel."""
+  """Manages the state of DenseFeatures and LinearLayer."""
 
   def __init__(self, layer, trainable):
     """Creates an _StateManagerImpl object.
@@ -270,6 +278,7 @@ class _StateManagerImpl(StateManager):
                       shape,
                       dtype=None,
                       trainable=True,
+                      use_resource=True,
                       initializer=None):
     if name in self._cols_to_vars_map[feature_column]:
       raise ValueError('Variable already exists.')
@@ -280,7 +289,7 @@ class _StateManagerImpl(StateManager):
         dtype=dtype,
         initializer=initializer,
         trainable=self._trainable and trainable,
-        use_resource=True,
+        use_resource=use_resource,
         # TODO(rohanj): Get rid of this hack once we have a mechanism for
         # specifying a default partitioner for an entire layer. In that case,
         # the default getter for Layers should work.
@@ -294,7 +303,8 @@ class _StateManagerImpl(StateManager):
     raise ValueError('Variable does not exist.')
 
 
-class FeatureLayer(Layer):
+@tf_export('keras.layers.DenseFeatures', v1=[])
+class DenseFeatures(Layer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -310,7 +320,7 @@ class FeatureLayer(Layer):
   keywords_embedded = embedding_column(
       categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
   columns = [price, keywords_embedded, ...]
-  feature_layer = FeatureLayer(columns)
+  feature_layer = DenseFeatures(columns)
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   dense_tensor = feature_layer(features)
@@ -325,7 +335,7 @@ class FeatureLayer(Layer):
                trainable=True,
                name=None,
                **kwargs):
-    """Constructs a FeatureLayer.
+    """Constructs a DenseFeatures.
 
     Args:
       feature_columns: An iterable containing the FeatureColumns to use as
@@ -336,13 +346,14 @@ class FeatureLayer(Layer):
         `indicator_column`.
       trainable: If `True` also add the variable to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-      name: Name to give to the FeatureLayer.
+      name: Name to give to the DenseFeatures.
       **kwargs: Keyword arguments to construct a layer.
 
     Raises:
       ValueError: if an item in `feature_columns` is not a `DenseColumn`.
     """
-    super(FeatureLayer, self).__init__(name=name, trainable=trainable, **kwargs)
+    super(DenseFeatures, self).__init__(
+        name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
     self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
@@ -363,7 +374,7 @@ class FeatureLayer(Layer):
       with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
         with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
           column.create_state(self._state_manager)
-      super(FeatureLayer, self).build(None)
+      super(DenseFeatures, self).build(None)
 
   def call(self, features, cols_to_output_tensors=None):
     """Returns a dense tensor corresponding to the `feature_columns`.
@@ -412,14 +423,110 @@ class FeatureLayer(Layer):
     return (input_shape[0], total_elements)
 
 
-class LinearModel(Layer):
+class _LinearModelLayer(Layer):
+  """Layer that contains logic for `LinearModel`."""
+
+  def __init__(self,
+               feature_columns,
+               units=1,
+               sparse_combiner='sum',
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_LinearModelLayer, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
+    for column in self._feature_columns:
+      if not isinstance(column, (DenseColumn, CategoricalColumn)):
+        raise ValueError(
+            'Items of feature_columns must be either a '
+            'DenseColumn or CategoricalColumn. Given: {}'.format(column))
+
+    self._units = units
+    self._sparse_combiner = sparse_combiner
+
+    self._state_manager = _StateManagerImpl(self, self.trainable)
+    self.bias = None
+
+  def build(self, _):
+    # We need variable scopes for now because we want the variable partitioning
+    # information to percolate down. We also use _pure_variable_scope's here
+    # since we want to open up a name_scope in the `call` method while creating
+    # the ops.
+    with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+      for column in self._feature_columns:
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          # Create the state for each feature column
+          column.create_state(self._state_manager)
+
+          # Create a weight variable for each column.
+          if isinstance(column, CategoricalColumn):
+            first_dim = column.num_buckets
+          else:
+            first_dim = column.variable_shape.num_elements()
+          self._state_manager.create_variable(
+              column,
+              name='weights',
+              dtype=dtypes.float32,
+              shape=(first_dim, self._units),
+              initializer=init_ops.zeros_initializer(),
+              trainable=self.trainable)
+
+      # Create a bias variable.
+      self.bias = self.add_variable(
+          name='bias_weights',
+          dtype=dtypes.float32,
+          shape=[self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable,
+          use_resource=True,
+          # TODO(rohanj): Get rid of this hack once we have a mechanism for
+          # specifying a default partitioner for an entire layer. In that case,
+          # the default getter for Layers should work.
+          getter=variable_scope.get_variable)
+
+    super(_LinearModelLayer, self).build(None)
+
+  def call(self, features):
+    if not isinstance(features, dict):
+      raise ValueError('We expected a dictionary here. Instead we got: {}'
+                       .format(features))
+    with ops.name_scope(self.name):
+      transformation_cache = FeatureTransformationCache(features)
+      weighted_sums = []
+      for column in self._feature_columns:
+        with ops.name_scope(column.name):
+          # All the weights used in the linear model are owned by the state
+          # manager associated with this Linear Model.
+          weight_var = self._state_manager.get_variable(column, 'weights')
+
+          weighted_sum = _create_weighted_sum(
+              column=column,
+              transformation_cache=transformation_cache,
+              state_manager=self._state_manager,
+              sparse_combiner=self._sparse_combiner,
+              weight_var=weight_var)
+          weighted_sums.append(weighted_sum)
+
+      _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias, self.bias, name='weighted_sum')
+      return predictions
+
+
+@tf_export('keras.layers.LinearModel', v1=[])
+class LinearModel(training.Model):
   """Produces a linear prediction `Tensor` based on given `feature_columns`.
 
   This layer generates a weighted sum based on output dimension `units`.
   Weighted sum refers to logits in classification problems. It refers to the
   prediction itself for linear regression problems.
 
-  Note on supported columns: `LinearModel` treats categorical columns as
+  Note on supported columns: `LinearLayer` treats categorical columns as
   `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
   like:
 
@@ -444,7 +551,7 @@ class LinearModel(Layer):
   keywords = categorical_column_with_hash_bucket("keywords", 10K)
   keywords_price = crossed_column('keywords', price_buckets, ...)
   columns = [price_buckets, keywords, keywords_price ...]
-  linear_model = LinearModel(columns)
+  linear_model = LinearLayer(columns)
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   prediction = linear_model(features)
@@ -458,7 +565,7 @@ class LinearModel(Layer):
                trainable=True,
                name=None,
                **kwargs):
-    """Constructs a LinearModel.
+    """Constructs a LinearLayer.
 
     Args:
       feature_columns: An iterable containing the FeatureColumns to use as
@@ -514,60 +621,15 @@ class LinearModel(Layer):
       ValueError: if an item in `feature_columns` is neither a `DenseColumn`
         nor `CategoricalColumn`.
     """
-    super(LinearModel, self).__init__(name=name, trainable=trainable, **kwargs)
-
-    self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
-    for column in self._feature_columns:
-      if not isinstance(column, (DenseColumn, CategoricalColumn)):
-        raise ValueError(
-            'Items of feature_columns must be either a '
-            'DenseColumn or CategoricalColumn. Given: {}'.format(column))
-
-    self._units = units
-    self._sparse_combiner = sparse_combiner
 
-    self._state_manager = _StateManagerImpl(self, self.trainable)
-    self._bias_variable = None
-
-  def build(self, _):
-    # We need variable scopes for now because we want the variable partitioning
-    # information to percolate down. We also use _pure_variable_scope's here
-    # since we want to open up a name_scope in the `call` method while creating
-    # the ops.
-    with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
-      for column in self._feature_columns:
-        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
-          # Create the state for each feature column
-          column.create_state(self._state_manager)
-
-          # Create a weight variable for each column.
-          if isinstance(column, CategoricalColumn):
-            first_dim = column.num_buckets
-          else:
-            first_dim = column.variable_shape.num_elements()
-          self._state_manager.create_variable(
-              column,
-              name='weights',
-              dtype=dtypes.float32,
-              shape=(first_dim, self._units),
-              initializer=init_ops.zeros_initializer(),
-              trainable=self.trainable)
-
-      # Create a bias variable.
-      self._bias_variable = self.add_variable(
-          name='bias_weights',
-          dtype=dtypes.float32,
-          shape=[self._units],
-          initializer=init_ops.zeros_initializer(),
-          trainable=self.trainable,
-          use_resource=True,
-          # TODO(rohanj): Get rid of this hack once we have a mechanism for
-          # specifying a default partitioner for an entire layer. In that case,
-          # the default getter for Layers should work.
-          getter=variable_scope.get_variable)
-
-    super(LinearModel, self).build(None)
+    super(LinearModel, self).__init__(name=name, **kwargs)
+    self.layer = _LinearModelLayer(
+        feature_columns,
+        units,
+        sparse_combiner,
+        trainable,
+        name=self.name,
+        **kwargs)
 
   def call(self, features):
     """Returns a `Tensor` the represents the predictions of a linear model.
@@ -585,39 +647,14 @@ class LinearModel(Layer):
     Raises:
       ValueError: If features are not a dictionary.
     """
-    if not isinstance(features, dict):
-      raise ValueError('We expected a dictionary here. Instead we got: ',
-                       features)
-    with ops.name_scope(self.name):
-      transformation_cache = FeatureTransformationCache(features)
-      weighted_sums = []
-      for column in self._feature_columns:
-        with ops.name_scope(column.name):
-          # All the weights used in the linear model are owned by the state
-          # manager associated with this Linear Model.
-          weight_var = self._state_manager.get_variable(column, 'weights')
-
-          weighted_sum = _create_weighted_sum(
-              column=column,
-              transformation_cache=transformation_cache,
-              state_manager=self._state_manager,
-              sparse_combiner=self._sparse_combiner,
-              weight_var=weight_var)
-          weighted_sums.append(weighted_sum)
-
-      _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
-      predictions_no_bias = math_ops.add_n(
-          weighted_sums, name='weighted_sum_no_bias')
-      predictions = nn_ops.bias_add(
-          predictions_no_bias, self._bias_variable, name='weighted_sum')
-      return predictions
+    return self.layer(features)
 
   @property
-  def bias_variable(self):
-    return self._bias_variable
+  def bias(self):
+    return self.layer.bias
 
 
-def _transform_features(features, feature_columns, state_manager):
+def _transform_features_v2(features, feature_columns, state_manager):
   """Returns transformed features based on features columns passed in.
 
   Please note that most probably you would not need to use this function. Please
@@ -662,7 +699,8 @@ def _transform_features(features, feature_columns, state_manager):
   return outputs
 
 
-def make_parse_example_spec(feature_columns):
+@tf_export('feature_column.make_parse_example_spec', v1=[])
+def make_parse_example_spec_v2(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
   The returned dictionary can be used as arg 'features' in `tf.parse_example`.
@@ -721,10 +759,15 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
-def embedding_column(
-    categorical_column, dimension, combiner='mean', initializer=None,
-    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
-    trainable=True):
+@tf_export('feature_column.embedding_column')
+def embedding_column(categorical_column,
+                     dimension,
+                     combiner='mean',
+                     initializer=None,
+                     ckpt_to_load_from=None,
+                     tensor_name_in_ckpt=None,
+                     max_norm=None,
+                     trainable=True):
   """`DenseColumn` that converts from sparse, categorical input.
 
   Use this when your inputs are sparse, but you want to convert them to a dense
@@ -802,25 +845,199 @@ def embedding_column(
     raise ValueError('Must specify both `ckpt_to_load_from` and '
                      '`tensor_name_in_ckpt` or none of them.')
 
-  if (initializer is not None) and (not callable(initializer)):
-    raise ValueError('initializer must be callable if specified. '
-                     'Embedding of column_name: {}'.format(
-                         categorical_column.name))
-  if initializer is None:
-    initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1 / math.sqrt(dimension))
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. '
+                     'Embedding of column_name: {}'.format(
+                         categorical_column.name))
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  return EmbeddingColumn(
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      initializer=initializer,
+      ckpt_to_load_from=ckpt_to_load_from,
+      tensor_name_in_ckpt=tensor_name_in_ckpt,
+      max_norm=max_norm,
+      trainable=trainable)
+
+
+@tf_export(v1=['feature_column.shared_embedding_columns'])
+def shared_embedding_columns(categorical_columns,
+                             dimension,
+                             combiner='mean',
+                             initializer=None,
+                             shared_embedding_collection_name=None,
+                             ckpt_to_load_from=None,
+                             tensor_name_in_ckpt=None,
+                             max_norm=None,
+                             trainable=True):
+  """List of dense columns that convert from sparse, categorical input.
+
+  This is similar to `embedding_column`, except that it produces a list of
+  embedding columns that share the same embedding weights.
+
+  Use this when your inputs are sparse and of the same type (e.g. watched and
+  impression video IDs that share the same vocabulary), and you want to convert
+  them to a dense representation (e.g., to feed to a DNN).
+
+  Inputs must be a list of categorical columns created by any of the
+  `categorical_column_*` function. They must all be of the same type and have
+  the same arguments except `key`. E.g. they can be
+  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
+  all columns could also be weighted_categorical_column.
+
+  Here is an example embedding of two features for a DNNClassifier model:
+
+  ```python
+  watched_video_id = categorical_column_with_vocabulary_file(
+      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+  impression_video_id = categorical_column_with_vocabulary_file(
+      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+  columns = shared_embedding_columns(
+      [watched_video_id, impression_video_id], dimension=10)
+
+  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
+
+  label_column = ...
+  def input_fn():
+    features = tf.parse_example(
+        ..., features=make_parse_example_spec(columns + [label_column]))
+    labels = features.pop(label_column.name)
+    return features, labels
+
+  estimator.train(input_fn=input_fn, steps=100)
+  ```
+
+  Here is an example using `shared_embedding_columns` with model_fn:
+
+  ```python
+  def model_fn(features, ...):
+    watched_video_id = categorical_column_with_vocabulary_file(
+        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+    impression_video_id = categorical_column_with_vocabulary_file(
+        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+    columns = shared_embedding_columns(
+        [watched_video_id, impression_video_id], dimension=10)
+    dense_tensor = input_layer(features, columns)
+    # Form DNN layers, calculate loss, and return EstimatorSpec.
+    ...
+  ```
+
+  Args:
+    categorical_columns: List of categorical columns created by a
+      `categorical_column_with_*` function. These columns produce the sparse IDs
+      that are inputs to the embedding lookup. All columns must be of the same
+      type and have the same arguments except `key`. E.g. they can be
+      categorical_column_with_vocabulary_file with the same vocabulary_file.
+      Some or all columns could also be weighted_categorical_column.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries in
+      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+      with bag-of-words columns. Each of this can be thought as example level
+      normalizations on the column. For more information, see
+      `tf.embedding_lookup_sparse`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    shared_embedding_collection_name: Optional name of the collection where
+      shared embedding weights are added. If not given, a reasonable name will
+      be chosen based on the names of `categorical_columns`. This is also used
+      in `variable_scope` when creating shared embedding weights.
+    ckpt_to_load_from: String representing checkpoint name/pattern from which to
+      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
+      to restore the column weights. Required if `ckpt_to_load_from` is not
+      `None`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value, before combining.
+    trainable: Whether or not the embedding is trainable. Default is True.
+
+  Returns:
+    A list of dense columns that converts from sparse input. The order of
+    results follows the ordering of `categorical_columns`.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if any of the given `categorical_columns` is of different type
+      or has different arguments than the others.
+    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
+      is specified.
+    ValueError: if `initializer` is specified and is not callable.
+    RuntimeError: if eager execution is enabled.
+  """
+  if context.executing_eagerly():
+    raise RuntimeError('shared_embedding_columns are not supported when eager '
+                       'execution is enabled.')
+
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
+    raise ValueError('Must specify both `ckpt_to_load_from` and '
+                     '`tensor_name_in_ckpt` or none of them.')
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified.')
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1. / math.sqrt(dimension))
+
+  # Sort the columns so the default collection name is deterministic even if the
+  # user passes columns from an unsorted collection, such as dict.values().
+  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
+
+  c0 = sorted_columns[0]
+  num_buckets = c0._num_buckets  # pylint: disable=protected-access
+  if not isinstance(c0, fc_old._CategoricalColumn):  # pylint: disable=protected-access
+    raise ValueError(
+        'All categorical_columns must be subclasses of _CategoricalColumn. '
+        'Given: {}, of type: {}'.format(c0, type(c0)))
+  if isinstance(c0,
+                (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
+    c0 = c0.categorical_column
+  for c in sorted_columns[1:]:
+    if isinstance(
+        c, (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
+      c = c.categorical_column
+    if not isinstance(c, type(c0)):
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same type, or be weighted_categorical_column of the same type. '
+          'Given column: {} of type: {} does not match given column: {} of '
+          'type: {}'.format(c0, type(c0), c, type(c)))
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
+
+  if not shared_embedding_collection_name:
+    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
+    shared_embedding_collection_name += '_shared_embedding'
 
-  return EmbeddingColumn(
-      categorical_column=categorical_column,
-      dimension=dimension,
-      combiner=combiner,
-      initializer=initializer,
-      ckpt_to_load_from=ckpt_to_load_from,
-      tensor_name_in_ckpt=tensor_name_in_ckpt,
-      max_norm=max_norm,
-      trainable=trainable)
+  result = []
+  for column in categorical_columns:
+    result.append(
+        fc_old._SharedEmbeddingColumn(  # pylint: disable=protected-access
+            categorical_column=column,
+            initializer=initializer,
+            dimension=dimension,
+            combiner=combiner,
+            shared_embedding_collection_name=shared_embedding_collection_name,
+            ckpt_to_load_from=ckpt_to_load_from,
+            tensor_name_in_ckpt=tensor_name_in_ckpt,
+            max_norm=max_norm,
+            trainable=trainable))
+
+  return result
 
 
+@tf_export('feature_column.shared_embedding_columns', v1=[])
 def shared_embedding_columns_v2(categorical_columns,
                                 dimension,
                                 combiner='mean',
@@ -986,6 +1203,7 @@ def shared_embedding_columns_v2(categorical_columns,
   return result
 
 
+@tf_export('feature_column.numeric_column')
 def numeric_column(key,
                    shape=(1,),
                    default_value=None,
@@ -1061,6 +1279,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+@tf_export('feature_column.bucketized_column')
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -1157,6 +1376,7 @@ def _assert_key_is_string(key):
             type(key), key))
 
 
+@tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
@@ -1215,6 +1435,7 @@ def categorical_column_with_hash_bucket(key,
   return HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
+@tf_export(v1=['feature_column.categorical_column_with_vocabulary_file'])
 def categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
@@ -1292,6 +1513,97 @@ def categorical_column_with_vocabulary_file(key,
   Returns:
     A `CategoricalColumn` with a vocabulary file.
 
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return categorical_column_with_vocabulary_file_v2(
+      key, vocabulary_file, vocabulary_size,
+      dtype, default_value,
+      num_oov_buckets)
+
+
+@tf_export('feature_column.categorical_column_with_vocabulary_file', v1=[])
+def categorical_column_with_vocabulary_file_v2(key,
+                                               vocabulary_file,
+                                               vocabulary_size=None,
+                                               dtype=dtypes.string,
+                                               default_value=None,
+                                               num_oov_buckets=0):
+  """A `CategoricalColumn` with a vocabulary file.
+
+  Use this when your inputs are in string or integer format, and you have a
+  vocabulary file that maps each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use either (but not both) of
+  `num_oov_buckets` and `default_value` to specify how to include
+  out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string, which will be dropped by this feature column.
+
+  Example with `num_oov_buckets`:
+  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
+  abbreviation. All inputs with values in that file are assigned an ID 0-49,
+  corresponding to its line number. All other values are hashed and assigned an
+  ID 50-54.
+
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Example with `default_value`:
+  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
+  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
+  in input, and other values missing from the file, will be assigned ID 0. All
+  others are assigned the corresponding line number 1-50.
+
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
+      default_value=0)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  And to make an embedding with either:
+
+  ```python
+  columns = [embedding_column(states, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
+    dtype: The type of features. Only string and integer types are supported.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+
+  Returns:
+    A `CategoricalColumn` with a vocabulary file.
+
   Raises:
     ValueError: `vocabulary_file` is missing or cannot be opened.
     ValueError: `vocabulary_size` is missing or < 1.
@@ -1334,8 +1646,12 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
-def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+@tf_export('feature_column.categorical_column_with_vocabulary_list')
+def categorical_column_with_vocabulary_list(key,
+                                            vocabulary_list,
+                                            dtype=None,
+                                            default_value=-1,
+                                            num_oov_buckets=0):
   """A `CategoricalColumn` with in-memory vocabulary.
 
   Use this when your inputs are in string or integer format, and you have an
@@ -1447,6 +1763,7 @@ def categorical_column_with_vocabulary_list(
       num_oov_buckets=num_oov_buckets)
 
 
+@tf_export('feature_column.categorical_column_with_identity')
 def categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `CategoricalColumn` that returns identity values.
 
@@ -1514,6 +1831,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, number_buckets=num_buckets, default_value=default_value)
 
 
+@tf_export('feature_column.indicator_column')
 def indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
@@ -1548,8 +1866,10 @@ def indicator_column(categorical_column):
   return IndicatorColumn(categorical_column)
 
 
-def weighted_categorical_column(
-    categorical_column, weight_feature_key, dtype=dtypes.float32):
+@tf_export('feature_column.weighted_categorical_column')
+def weighted_categorical_column(categorical_column,
+                                weight_feature_key,
+                                dtype=dtypes.float32):
   """Applies weight values to a `CategoricalColumn`.
 
   Use this when each of your sparse inputs has both an ID and a value. For
@@ -1622,6 +1942,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
+@tf_export('feature_column.crossed_column')
 def crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
@@ -1839,6 +2160,101 @@ class FeatureColumn(object):
     """
     pass
 
+  @abc.abstractproperty
+  def parents(self):
+    """Returns a list of immediate raw feature and FeatureColumn dependencies.
+
+    For example:
+    # For the following feature columns
+    a = numeric_column('f1')
+    c = crossed_column(a, 'f2')
+    # The expected parents are:
+    a.parents = ['f1']
+    c.parents = [a, 'f2']
+    """
+    pass
+
+  @abc.abstractmethod
+  def _get_config(self):
+    """Returns the config of the feature column.
+
+    A FeatureColumn config is a Python dictionary (serializable) containing the
+    configuration of a FeatureColumn. The same FeatureColumn can be
+    reinstantiated later from this configuration.
+
+    The config of a feature column does not include information about feature
+    columns depending on it nor the FeatureColumn class name.
+
+    Example with (de)serialization practices followed in this file:
+    ```python
+    class SerializationExampleFeatureColumn(
+        FeatureColumn, collections.namedtuple(
+            'SerializationExampleFeatureColumn',
+            ('dimension', 'parent', 'dtype', 'normalizer_fn'))):
+
+      def _get_config(self):
+        # Create a dict from the namedtuple.
+        # Python attribute literals can be directly copied from / to the config.
+        # For example 'dimension', assuming it is an integer literal.
+        config = dict(zip(self._fields, self))
+
+        # (De)serialization of parent FeatureColumns should use the provided
+        # (de)serialize_feature_column() methods that take care of de-duping.
+        config['parent'] = serialize_feature_column(self.parent)
+
+        # Many objects provide custom (de)serialization e.g: for tf.DType
+        # tf.DType.name, tf.as_dtype() can be used.
+        config['dtype'] = self.dtype.name
+
+        # Non-trivial dependencies should be Keras-(de)serializable.
+        config['normalizer_fn'] = utils.serialize_keras_object(
+            self.normalizer_fn)
+
+        return config
+
+      @classmethod
+      def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+        # This should do the inverse transform from `_get_config` and construct
+        # the namedtuple.
+        kwargs = config.copy()
+        kwargs['parent'] = deserialize_feature_column(
+            config['parent'], custom_objects, columns_by_name)
+        kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+        kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+          config['normalizer_fn'], custom_objects=custom_objects)
+        return cls(**kwargs)
+
+    ```
+    Returns:
+      A serializable Dict that can be used to deserialize the object with
+      from_config.
+    """
+    pass
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """Creates a FeatureColumn from its config.
+
+    This method should be the reverse of `_get_config`, capable of instantiating
+    the same FeatureColumn from the config dictionary. See `_get_config` for an
+    example of common (de)serialization practices followed in this file.
+
+    TODO(b/118939620): This is a private method until consensus is reached on
+    supporting object deserialization deduping within Keras.
+
+    Args:
+      config: A Dict config acquired with `_get_config`.
+      custom_objects: Optional dictionary mapping names (strings) to custom
+        classes or functions to be considered during deserialization.
+      columns_by_name: A Dict[String, FeatureColumn] of existing columns in
+        order to avoid duplication. Should be passed to any calls to
+        deserialize_feature_column().
+
+    Returns:
+      A FeatureColumn for the input config.
+    """
+    pass
+
 
 class DenseColumn(FeatureColumn):
   """Represents a column which can be represented as `Tensor`.
@@ -1992,7 +2408,7 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  return _safe_embedding_lookup_sparse(
+  return embedding_ops.safe_embedding_lookup_sparse(
       weight_var,
       id_tensor,
       sparse_weights=weight_tensor,
@@ -2345,6 +2761,32 @@ class NumericColumn(
     del trainable
     return inputs.get(self)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['normalizer_fn'] = utils.serialize_keras_object(self.normalizer_fn)
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
+    if config['normalizer_fn']:
+      kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+          config['normalizer_fn'], custom_objects=custom_objects)
+    else:
+      kwargs['normalizer_fn'] = None
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class BucketizedColumn(
     DenseColumn,
@@ -2477,6 +2919,26 @@ class BucketizedColumn(
     input_tensor = inputs.get(self)
     return self._get_sparse_tensors_for_input_tensor(input_tensor)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.source_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['source_column'] = serialize_feature_column(self.source_column)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['source_column'] = deserialize_feature_column(
+        config['source_column'], custom_objects, columns_by_name)
+    return cls(**kwargs)
+
 
 class EmbeddingColumn(
     DenseColumn,
@@ -2539,6 +3001,8 @@ class EmbeddingColumn(
         shape=embedding_shape,
         dtype=dtypes.float32,
         trainable=self.trainable,
+        # TODO(rohanj): Make this True when b/118500434 is fixed.
+        use_resource=False,
         initializer=self.initializer)
 
   def _get_dense_tensor_internal_helper(self, sparse_tensors,
@@ -2555,7 +3019,7 @@ class EmbeddingColumn(
       })
 
     # Return embedding lookup result.
-    return _safe_embedding_lookup_sparse(
+    return embedding_ops.safe_embedding_lookup_sparse(
         embedding_weights=embedding_weights,
         sparse_ids=sparse_ids,
         sparse_weights=sparse_weights,
@@ -2682,11 +3146,39 @@ class EmbeddingColumn(
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    config['initializer'] = utils.serialize_keras_object(self.initializer)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
+    if config['initializer']:
+      kwargs['initializer'] = utils.deserialize_keras_object(
+          config['initializer'], custom_objects=custom_objects)
+    else:
+      kwargs['initializer'] = None
+    return cls(**kwargs)
+
 
 def _raise_shared_embedding_column_error():
   raise ValueError('SharedEmbeddingColumns are not supported in '
                    '`linear_model` or `input_layer`. Please use '
-                   '`FeatureLayer` or `LinearModel` instead.')
+                   '`DenseFeatures` or `LinearModel` instead.')
 
 
 class SharedEmbeddingColumnCreator(tracking.Checkpointable):
@@ -2798,7 +3290,7 @@ class SharedEmbeddingColumn(
       embedding_weights = self.shared_embedding_column_creator.embedding_weights
 
       # Return embedding lookup result.
-      return _safe_embedding_lookup_sparse(
+      return embedding_ops.safe_embedding_lookup_sparse(
           embedding_weights=embedding_weights,
           sparse_ids=sparse_ids,
           sparse_weights=sparse_weights,
@@ -2848,6 +3340,20 @@ class SharedEmbeddingColumn(
                                  trainable=None):
     return _raise_shared_embedding_column_error()
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
 
 def _create_tuple(shape, value):
   """Returns a tuple with given shape and filled with value."""
@@ -3042,6 +3548,25 @@ class HashedCategoricalColumn(
     del trainable
     return CategoricalColumn.IdWeightPair(inputs.get(self), None)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class VocabularyFileCategoricalColumn(
     CategoricalColumn,
@@ -3134,6 +3659,25 @@ class VocabularyFileCategoricalColumn(
     del trainable
     return CategoricalColumn.IdWeightPair(inputs.get(self), None)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class VocabularyListCategoricalColumn(
     CategoricalColumn,
@@ -3226,6 +3770,25 @@ class VocabularyListCategoricalColumn(
     del trainable
     return CategoricalColumn.IdWeightPair(inputs.get(self), None)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class IdentityCategoricalColumn(
     CategoricalColumn,
@@ -3327,6 +3890,21 @@ class IdentityCategoricalColumn(
     del trainable
     return CategoricalColumn.IdWeightPair(inputs.get(self), None)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    return dict(zip(self._fields, self))
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    return cls(**config)
+
 
 class WeightedCategoricalColumn(
     CategoricalColumn,
@@ -3397,9 +3975,13 @@ class WeightedCategoricalColumn(
 
   def transform_feature(self, transformation_cache, state_manager):
     """Applies weights to tensor generated from `categorical_column`'."""
+    print('WeightedCategoricalColumn.transform_feature: ', self.name)
+    print('Weight feature key: ', self.weight_feature_key)
     weight_tensor = transformation_cache.get(self.weight_feature_key,
                                              state_manager)
+    print('Weight tensor before: ', weight_tensor)
     weight_tensor = self._transform_weight_tensor(weight_tensor)
+    print('Weight tensor after: ', weight_tensor)
     return (transformation_cache.get(self.categorical_column, state_manager),
             weight_tensor)
 
@@ -3413,7 +3995,9 @@ class WeightedCategoricalColumn(
 
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
+    print('WeightedCategoricalColumn.get_sparse_tensors: ', self.name)
     tensors = transformation_cache.get(self, state_manager)
+    print('tensors[1]: ', tensors[1])
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
@@ -3425,6 +4009,29 @@ class WeightedCategoricalColumn(
     tensors = inputs.get(self)
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column, self.weight_feature_key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class CrossedColumn(
     CategoricalColumn,
@@ -3544,6 +4151,28 @@ class CrossedColumn(
     del trainable
     return CategoricalColumn.IdWeightPair(inputs.get(self), None)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return list(self.keys)
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['keys'] = tuple([serialize_feature_column(fc) for fc in self.keys])
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['keys'] = tuple([
+        deserialize_feature_column(c, custom_objects, columns_by_name)
+        for c in config['keys']
+    ])
+    return cls(**kwargs)
+
 
 def _collect_leaf_level_keys(cross):
   """Collects base keys by expanding all nested crosses.
@@ -3563,142 +4192,6 @@ def _collect_leaf_level_keys(cross):
   return leaf_level_keys
 
 
-# TODO(zakaria): Move this to embedding_ops and make it public.
-def _safe_embedding_lookup_sparse(embedding_weights,
-                                  sparse_ids,
-                                  sparse_weights=None,
-                                  combiner='mean',
-                                  default_id=None,
-                                  name=None,
-                                  partition_strategy='div',
-                                  max_norm=None):
-  """Lookup embedding results, accounting for invalid IDs and empty features.
-
-  The partitioned embedding in `embedding_weights` must all be the same shape
-  except for the first dimension. The first dimension is allowed to vary as the
-  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
-  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
-  partitioner.
-
-  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
-  with non-positive weight. For an entry with no features, the embedding vector
-  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
-
-  The ids and weights may be multi-dimensional. Embeddings are always aggregated
-  along the last dimension.
-
-  Args:
-    embedding_weights:  A list of `P` float `Tensor`s or values representing
-        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
-        created by partitioning along dimension 0.  The total unpartitioned
-        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
-        vocab size and `e_1, ..., e_m` are the embedding dimensions.
-    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
-        ids. `d_0` is typically batch size.
-    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
-        float weights corresponding to `sparse_ids`, or `None` if all weights
-        are be assumed to be 1.0.
-    combiner: A string specifying how to combine embedding results for each
-        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
-        the default.
-    default_id: The id to use for an entry with no features.
-    name: A name for this operation (optional).
-    partition_strategy: A string specifying the partitioning strategy.
-        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
-    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
-        combining.
-
-
-  Returns:
-    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
-
-  Raises:
-    ValueError: if `embedding_weights` is empty.
-  """
-  if embedding_weights is None:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
-  if isinstance(embedding_weights, variables.PartitionedVariable):
-    embedding_weights = list(embedding_weights)  # get underlying Variables.
-  if not isinstance(embedding_weights, list):
-    embedding_weights = [embedding_weights]
-  if len(embedding_weights) < 1:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
-
-  dtype = sparse_weights.dtype if sparse_weights is not None else None
-  # TODO(rohanj): Look into removing this convert_to_tensor call.
-  embedding_weights = [
-      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
-  ]
-
-  with ops.name_scope(name, 'embedding_lookup',
-                      embedding_weights + [sparse_ids,
-                                           sparse_weights]) as scope:
-    # Reshape higher-rank sparse ids and weights to linear segment ids.
-    original_shape = sparse_ids.dense_shape
-    original_rank_dim = tensor_shape.dimension_value(
-        sparse_ids.dense_shape.get_shape()[0])
-    original_rank = (
-        array_ops.size(original_shape)
-        if original_rank_dim is None
-        else original_rank_dim)
-    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
-        math_ops.reduce_prod(
-            array_ops.slice(original_shape, [0], [original_rank - 1])),
-        array_ops.gather(original_shape, original_rank - 1)])
-    if sparse_weights is not None:
-      sparse_weights = sparse_tensor_lib.SparseTensor(
-          sparse_ids.indices,
-          sparse_weights.values, sparse_ids.dense_shape)
-
-    # Prune invalid ids and weights.
-    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
-    if combiner != 'sum':
-      sparse_ids, sparse_weights = _prune_invalid_weights(
-          sparse_ids, sparse_weights)
-
-    # Fill in dummy values for empty features, if necessary.
-    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
-                                                                 default_id or
-                                                                 0)
-    if sparse_weights is not None:
-      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)
-
-    result = embedding_ops.embedding_lookup_sparse(
-        embedding_weights,
-        sparse_ids,
-        sparse_weights,
-        combiner=combiner,
-        partition_strategy=partition_strategy,
-        name=None if default_id is None else scope,
-        max_norm=max_norm)
-
-    if default_id is None:
-      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
-      # for use in Select.
-      is_row_empty = array_ops.tile(
-          array_ops.reshape(is_row_empty, [-1, 1]),
-          array_ops.stack([1, array_ops.shape(result)[1]]))
-
-      result = array_ops.where(is_row_empty,
-                               array_ops.zeros_like(result),
-                               result,
-                               name=scope)
-
-    # Reshape back from linear ids back into higher-dimensional dense result.
-    final_result = array_ops.reshape(
-        result,
-        array_ops.concat([
-            array_ops.slice(
-                math_ops.cast(original_shape, dtypes.int32), [0],
-                [original_rank - 1]),
-            array_ops.slice(array_ops.shape(result), [1], [-1])
-        ], 0))
-    final_result.set_shape(tensor_shape.unknown_shape(
-        (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
-            result.get_shape()[1:]))
-    return final_result
-
-
 def _prune_invalid_ids(sparse_ids, sparse_weights):
   """Prune invalid IDs (< 0) from the input ids and weights."""
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
@@ -3754,10 +4247,14 @@ class IndicatorColumn(
           sp_ids=id_tensor,
           sp_values=weight_tensor,
           vocab_size=int(self._variable_shape[-1]))
-      # Remove (?, -1) index
+      # Remove (?, -1) index.
       weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                 weighted_column.dense_shape)
-      return sparse_ops.sparse_tensor_to_dense(weighted_column)
+      # Use scatter_nd to merge duplicated indices if existed,
+      # instead of sparse_tensor_to_dense.
+      return array_ops.scatter_nd(weighted_column.indices,
+                                  weighted_column.values,
+                                  weighted_column.dense_shape)
 
     dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
         id_tensor, default_value=-1)
@@ -3922,6 +4419,27 @@ class IndicatorColumn(
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    return cls(**kwargs)
+
 
 def _verify_static_batch_size_equality(tensors, columns):
   """Verify equality between static batch sizes.
@@ -4044,3 +4562,186 @@ class SequenceCategoricalColumn(
                           trainable=None):
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     return self._get_sparse_tensors_helper(sparse_tensors)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    return cls(**kwargs)
+
+
+# FeatureColumn serialization, deserialization logic.
+
+
+def _check_config_keys(config, expected_keys):
+  """Checks that a config has all expected_keys."""
+  if set(config.keys()) != set(expected_keys):
+    raise ValueError('Invalid config: {}, expected keys: {}'.format(
+        config, expected_keys))
+
+
+def serialize_feature_column(fc):
+  """Serializes a FeatureColumn or a raw string key.
+
+  This method should only be used to serialize parent FeatureColumns when
+  implementing FeatureColumn._get_config(), else serialize_feature_columns()
+  is preferable.
+
+  This serialization also keeps information of the FeatureColumn class, so
+  deserialization is possible without knowing the class type. For example:
+
+  a = numeric_column('x')
+  a._get_config() gives:
+  {
+      'key': 'price',
+      'shape': (1,),
+      'default_value': None,
+      'dtype': 'float32',
+      'normalizer_fn': None
+  }
+  While serialize_feature_column(a) gives:
+  {
+      'class_name': 'NumericColumn',
+      'config': {
+          'key': 'price',
+          'shape': (1,),
+          'default_value': None,
+          'dtype': 'float32',
+          'normalizer_fn': None
+      }
+  }
+
+  Args:
+    fc: A FeatureColumn or raw feature key string.
+
+  Returns:
+    Keras serialization for FeatureColumns, leaves string keys unaffected.
+
+  Raises:
+    ValueError if called with input that is not string or FeatureColumn.
+  """
+  if isinstance(fc, six.string_types):
+    return fc
+  elif isinstance(fc, FeatureColumn):
+    return utils.serialize_keras_class_and_config(fc.__class__.__name__,
+                                                  fc._get_config())
+  else:
+    raise ValueError('Instance: {} is not a FeatureColumn'.format(fc))
+
+
+def deserialize_feature_column(config,
+                               custom_objects=None,
+                               columns_by_name=None):
+  """Deserializes a `config` generated with `serialize_feature_column`.
+
+  This method should only be used to deserialize parent FeatureColumns when
+  implementing FeatureColumn._from_config(), else deserialize_feature_columns()
+  is preferable. Returns a FeatureColumn for this config.
+  TODO(b/118939620): Simplify code if Keras utils support object deduping.
+
+  Args:
+    config: A Dict with the serialization of feature columns acquired by
+      `serialize_feature_column`, or a string representing a raw column.
+    custom_objects: A Dict from custom_object name to the associated keras
+      serializable objects (FeatureColumns, classes or functions).
+    columns_by_name: A Dict[String, FeatureColumn] of existing columns in order
+      to avoid duplication.
+
+  Raises:
+    ValueError if `config` has invalid format (e.g: expected keys missing,
+    or refers to unknown classes).
+
+  Returns:
+    A FeatureColumn corresponding to the input `config`.
+  """
+  if isinstance(config, six.string_types):
+    return config
+  # A dict from class_name to class for all FeatureColumns in this module.
+  # FeatureColumns not part of the module can be passed as custom_objects.
+  module_feature_column_classes = {
+      cls.__name__: cls for cls in [
+          BucketizedColumn, EmbeddingColumn, HashedCategoricalColumn,
+          IdentityCategoricalColumn, IndicatorColumn, NumericColumn,
+          SequenceCategoricalColumn, SequenceDenseColumn, SharedEmbeddingColumn,
+          VocabularyFileCategoricalColumn, VocabularyListCategoricalColumn,
+          WeightedCategoricalColumn
+      ]
+  }
+  if columns_by_name is None:
+    columns_by_name = {}
+
+  (cls, cls_config) = utils.class_and_config_for_serialized_keras_object(
+      config,
+      module_objects=module_feature_column_classes,
+      custom_objects=custom_objects,
+      printable_module_name='feature_column_v2')
+
+  if not issubclass(cls, FeatureColumn):
+    raise ValueError(
+        'Expected FeatureColumn class, instead found: {}'.format(cls))
+
+  # Always deserialize the FeatureColumn, in order to get the name.
+  new_instance = cls._from_config(cls_config, columns_by_name=columns_by_name)  # pylint: disable=protected-access
+
+  # If the name already exists, re-use the column from columns_by_name,
+  # (new_instance remains unused).
+  return columns_by_name.setdefault(new_instance.name, new_instance)
+
+
+def serialize_feature_columns(feature_columns):
+  """Serializes a list of FeatureColumns.
+
+  Returns a list of Keras-style config dicts that represent the input
+  FeatureColumns and can be used with `deserialize_feature_columns` for
+  reconstructing the original columns.
+
+  Args:
+    feature_columns: A list of FeatureColumns.
+
+  Returns:
+    Keras serialization for the list of FeatureColumns.
+
+  Raises:
+    ValueError if called with input that is not a list of FeatureColumns.
+  """
+  return [serialize_feature_column(fc) for fc in feature_columns]
+
+
+def deserialize_feature_columns(configs, custom_objects=None):
+  """Deserializes a list of FeatureColumns configs.
+
+  Returns a list of FeatureColumns given a list of config dicts acquired by
+  `serialize_feature_columns`.
+
+  Args:
+    configs: A list of Dicts with the serialization of feature columns acquired
+      by `serialize_feature_columns`.
+    custom_objects: A Dict from custom_object name to the associated keras
+      serializable objects (FeatureColumns, classes or functions).
+
+  Returns:
+    FeatureColumn objects corresponding to the input configs.
+
+  Raises:
+    ValueError if called with input that is not a list of FeatureColumns.
+  """
+  columns_by_name = {}
+  return [
+      deserialize_feature_column(c, custom_objects, columns_by_name)
+      for c in configs
+  ]
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index ab727752b49c911ffbaf5c69da5d9607e898b230..23131e22edef78a6e0970f20bad4dd3918610b3c 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -27,6 +27,7 @@ from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -48,6 +49,7 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import queue_runner_impl
+from tensorflow.python.training import rmsprop
 
 
 def _initialized_session(config=None):
@@ -67,11 +69,30 @@ def get_linear_model_column_var(column, name='linear_model'):
                             name + '/' + column.name)[0]
 
 
+class BaseFeatureColumnForTests(fc.FeatureColumn):
+  """A base FeatureColumn useful to avoid boiler-plate in tests.
+
+  Provides dummy implementations for abstract methods that raise ValueError in
+  order to avoid re-defining all abstract methods for each test sub-class.
+  """
+
+  @property
+  def parents(self):
+    raise ValueError('Should not use this method.')
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    raise ValueError('Should not use this method.')
+
+  def _get_config(self):
+    raise ValueError('Should not use this method.')
+
+
 class LazyColumnTest(test.TestCase):
 
   def test_transformations_called_once(self):
 
-    class TransformCounter(fc.FeatureColumn):
+    class TransformCounter(BaseFeatureColumnForTests):
 
       def __init__(self):
         self.num_transform = 0
@@ -103,7 +124,7 @@ class LazyColumnTest(test.TestCase):
 
   def test_returns_transform_output(self):
 
-    class Transformer(fc.FeatureColumn):
+    class Transformer(BaseFeatureColumnForTests):
 
       @property
       def _is_v2_column(self):
@@ -128,7 +149,7 @@ class LazyColumnTest(test.TestCase):
 
   def test_does_not_pollute_given_features_dict(self):
 
-    class Transformer(fc.FeatureColumn):
+    class Transformer(BaseFeatureColumnForTests):
 
       @property
       def _is_v2_column(self):
@@ -162,7 +183,7 @@ class LazyColumnTest(test.TestCase):
 
   def test_not_supported_feature_column(self):
 
-    class NotAProperColumn(fc.FeatureColumn):
+    class NotAProperColumn(BaseFeatureColumnForTests):
 
       @property
       def _is_v2_column(self):
@@ -304,7 +325,7 @@ class NumericColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([price]))
+        features=fc.make_parse_example_spec_v2([price]))
     self.assertIn('price', features)
     with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
@@ -326,7 +347,7 @@ class NumericColumnTest(test.TestCase):
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString(),
                     no_data.SerializeToString()],
-        features=fc.make_parse_example_spec([price]))
+        features=fc.make_parse_example_spec_v2([price]))
     self.assertIn('price', features)
     with self.cached_session():
       self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
@@ -341,7 +362,7 @@ class NumericColumnTest(test.TestCase):
       return input_tensor + 2.
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
-    output = fc._transform_features({
+    output = fc._transform_features_v2({
         'price': [[1., 2.], [5., 6.]]
     }, [price], None)
     with self.cached_session():
@@ -390,11 +411,11 @@ class NumericColumnTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
   def test_old_linear_model(self):
     price = fc.numeric_column('price')
@@ -404,11 +425,33 @@ class NumericColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
+
+  def test_serialization(self):
+
+    def _increment_two(input_tensor):
+      return input_tensor + 2.
+
+    price = fc.numeric_column('price', normalizer_fn=_increment_two)
+    self.assertEqual(['price'], price.parents)
+
+    config = price._get_config()
+    self.assertEqual({
+        'key': 'price',
+        'shape': (1,),
+        'default_value': None,
+        'dtype': 'float32',
+        'normalizer_fn': '_increment_two'
+    }, config)
+
+    self.assertEqual(
+        price,
+        fc.NumericColumn._from_config(
+            config, custom_objects={'_increment_two': _increment_two}))
 
 
 class BucketizedColumnTest(test.TestCase):
@@ -448,7 +491,7 @@ class BucketizedColumnTest(test.TestCase):
     self.assertEqual('aaa_bucketized', b.name)
 
   def test_is_v2_column_old_numeric(self):
-    a = fc_old.numeric_column('aaa', dtype=dtypes.int32)
+    a = fc_old._numeric_column('aaa', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     self.assertFalse(b._is_v2_column)
     self.assertEqual('aaa_bucketized', b.name)
@@ -483,7 +526,7 @@ class BucketizedColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([bucketized_price]))
+        features=fc.make_parse_example_spec_v2([bucketized_price]))
     self.assertIn('price', features)
     with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
@@ -492,7 +535,7 @@ class BucketizedColumnTest(test.TestCase):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformed_tensor = fc._transform_features({
+      transformed_tensor = fc._transform_features_v2({
           'price': [[-1., 1.], [5., 6.]]
       }, [bucketized_price], None)
       with _initialized_session():
@@ -512,11 +555,9 @@ class BucketizedColumnTest(test.TestCase):
             transformation_cache, None)
         self.assertAllClose(
             # One-hot tensor.
-            [[[1., 0., 0., 0., 0.]],
-             [[0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.]],
-             [[0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            [[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 1.]]],
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_dense_tensor_two_input_values(self):
     """Tests _get_dense_tensor() for input with shape=[2]."""
@@ -533,7 +574,7 @@ class BucketizedColumnTest(test.TestCase):
             # One-hot tensor.
             [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
              [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_sparse_tensors_one_input_value(self):
     """Tests _get_sparse_tensors() for input with shape=[1]."""
@@ -602,20 +643,23 @@ class BucketizedColumnTest(test.TestCase):
       predictions = model(features)
       bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
@@ -627,12 +671,12 @@ class BucketizedColumnTest(test.TestCase):
       predictions = model(features)
       bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.],
              [60.], [70.], [80.], [90.], [100.]]))
@@ -642,9 +686,9 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_old_linear_model_one_input_value(self):
     """Tests linear_model() for input with shape=[1]."""
@@ -656,20 +700,23 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_old_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
@@ -681,12 +728,12 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
                                          [60.], [70.], [80.], [90.], [100.]]))
@@ -696,13 +743,13 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_old_linear_model_one_input_value_old_numeric(self):
     """Tests linear_model() for input with shape=[1]."""
-    price = fc_old.numeric_column('price', shape=[1])
+    price = fc_old._numeric_column('price', shape=[1])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
@@ -710,20 +757,52 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
+
+  def test_serialization(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    self.assertEqual([price], bucketized_price.parents)
+
+    config = bucketized_price._get_config()
+    self.assertEqual({
+        'source_column': {
+            'class_name': 'NumericColumn',
+            'config': {
+                'key': 'price',
+                'shape': (2,),
+                'default_value': None,
+                'dtype': 'float32',
+                'normalizer_fn': None
+            }
+        },
+        'boundaries': (0, 2, 4, 6)
+    }, config)
+
+    new_bucketized_price = fc.BucketizedColumn._from_config(config)
+    self.assertEqual(bucketized_price, new_bucketized_price)
+    self.assertIsNot(price, new_bucketized_price.source_column)
+
+    new_bucketized_price = fc.BucketizedColumn._from_config(
+        config, columns_by_name={price.name: price})
+    self.assertEqual(bucketized_price, new_bucketized_price)
+    self.assertIs(price, new_bucketized_price.source_column)
 
 
 class HashedCategoricalColumnTest(test.TestCase):
@@ -785,7 +864,7 @@ class HashedCategoricalColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -802,7 +881,7 @@ class HashedCategoricalColumnTest(test.TestCase):
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
-    outputs = fc._transform_features({
+    outputs = fc._transform_features_v2({
         'wire': wire_tensor
     }, [hashed_sparse], None)
     output = outputs[hashed_sparse]
@@ -921,13 +1000,14 @@ class HashedCategoricalColumnTest(test.TestCase):
       })
       wire_var, bias = model.variables
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
@@ -943,13 +1023,28 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
+
+  def test_serialization(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(['wire'], wire_column.parents)
+
+    config = wire_column._get_config()
+    self.assertEqual({
+        'key': 'wire',
+        'hash_bucket_size': 4,
+        'dtype': 'string'
+    }, config)
+
+    self.assertEqual(wire_column,
+                     fc.HashedCategoricalColumn._from_config(config))
 
 
 class CrossedColumnTest(test.TestCase):
@@ -999,7 +1094,7 @@ class CrossedColumnTest(test.TestCase):
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_is_v2_column(self):
-    a = fc_old.numeric_column('a', dtype=dtypes.int32)
+    a = fc_old._numeric_column('a', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     crossed1 = fc.crossed_column(['d1', 'd2'], 10)
     self.assertTrue(crossed1._is_v2_column)
@@ -1066,7 +1161,7 @@ class CrossedColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([price_cross_wire]))
+        features=fc.make_parse_example_spec_v2([price_cross_wire]))
     self.assertIn('price', features)
     self.assertIn('wire', features)
     with self.cached_session():
@@ -1081,8 +1176,8 @@ class CrossedColumnTest(test.TestCase):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
     hash_bucket_size = 10
-    price_cross_wire = fc.crossed_column(
-        [bucketized_price, 'wire'], hash_bucket_size)
+    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'],
+                                         hash_bucket_size)
     features = {
         'price': constant_op.constant([[1., 2.], [5., 6.]]),
         'wire': sparse_tensor.SparseTensor(
@@ -1090,10 +1185,10 @@ class CrossedColumnTest(test.TestCase):
             indices=[[0, 0], [1, 0], [1, 1]],
             dense_shape=[2, 2]),
     }
-    outputs = fc._transform_features(features, [price_cross_wire], None)
+    outputs = fc._transform_features_v2(features, [price_cross_wire], None)
     output = outputs[price_cross_wire]
     with self.cached_session() as sess:
-      output_val = sess.run(output)
+      output_val = self.evaluate(output)
       self.assertAllEqual(
           [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
       for val in output_val.values:
@@ -1188,19 +1283,20 @@ class CrossedColumnTest(test.TestCase):
       })
       crossed_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_linear_model_with_weights(self):
 
-    class _TestColumnWithWeights(fc.CategoricalColumn):
+    class _TestColumnWithWeights(BaseFeatureColumnForTests,
+                                 fc.CategoricalColumn):
       """Produces sparse IDs and sparse weights."""
 
       @property
@@ -1280,19 +1376,20 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_old_linear_model_with_weights(self):
 
-    class _TestColumnWithWeights(fc.CategoricalColumn,
+    class _TestColumnWithWeights(BaseFeatureColumnForTests,
+                                 fc.CategoricalColumn,
                                  fc_old._CategoricalColumn):
       """Produces sparse IDs and sparse weights."""
 
@@ -1373,7 +1470,7 @@ class CrossedColumnTest(test.TestCase):
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc_old.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    a = fc_old._numeric_column('a', dtype=dtypes.int32, shape=(2,))
     b = fc.bucketized_column(a, boundaries=(0, 1))
     crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
@@ -1389,15 +1486,56 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
+
+  def test_serialization(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+
+    self.assertEqual([b, 'c'], crossed.parents)
+
+    config = crossed._get_config()
+    self.assertEqual({
+        'hash_bucket_size':
+            5,
+        'hash_key':
+            5,
+        'keys': ({
+            'config': {
+                'boundaries': (0, 1),
+                'source_column': {
+                    'config': {
+                        'dtype': 'int32',
+                        'default_value': None,
+                        'key': 'a',
+                        'normalizer_fn': None,
+                        'shape': (2,)
+                    },
+                    'class_name': 'NumericColumn'
+                }
+            },
+            'class_name': 'BucketizedColumn'
+        }, 'c')
+    }, config)
+
+    new_crossed = fc.CrossedColumn._from_config(config)
+    self.assertEqual(crossed, new_crossed)
+    self.assertIsNot(b, new_crossed.keys[0])
+
+    new_crossed = fc.CrossedColumn._from_config(
+        config, columns_by_name={b.name: b})
+    self.assertEqual(crossed, new_crossed)
+    self.assertIs(b, new_crossed.keys[0])
+
 
 
 class LinearModelTest(test.TestCase):
@@ -1413,7 +1551,7 @@ class LinearModelTest(test.TestCase):
 
   def test_should_be_dense_or_categorical_column(self):
 
-    class NotSupportedColumn(fc.FeatureColumn):
+    class NotSupportedColumn(BaseFeatureColumnForTests):
 
       @property
       def _is_v2_column(self):
@@ -1446,6 +1584,14 @@ class LinearModelTest(test.TestCase):
           feature_columns=[fc.numeric_column('a'),
                            fc.numeric_column('a')])
 
+  def test_not_dict_input_features(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = [[1.], [5.]]
+      model = fc.LinearModel([price])
+      with self.assertRaisesRegexp(ValueError, 'We expected a dictionary here'):
+        model(features)
+
   def test_dense_bias(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
@@ -1454,10 +1600,10 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1471,11 +1617,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1493,12 +1640,13 @@ class LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
 
-    class _DenseAndSparseColumn(fc.DenseColumn, fc.CategoricalColumn):
+    class _DenseAndSparseColumn(BaseFeatureColumnForTests, fc.DenseColumn,
+                                fc.CategoricalColumn):
 
       @property
       def _is_v2_column(self):
@@ -1547,7 +1695,7 @@ class LinearModelTest(test.TestCase):
         sess.run(dense_and_sparse_column_var.assign(
             [[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
@@ -1557,12 +1705,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1576,15 +1724,15 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
                 1000., 1100., 1200.
             ], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
@@ -1594,9 +1742,9 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, _ = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1611,7 +1759,7 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, _ = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -1634,7 +1782,7 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1655,7 +1803,7 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
@@ -1665,12 +1813,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -1690,11 +1838,11 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -1708,14 +1856,14 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price1_var, price2_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_dense_trainable_default(self):
     price = fc.numeric_column('price')
@@ -1816,6 +1964,8 @@ class LinearModelTest(test.TestCase):
           'sparse_feature': [['a'], ['x']],
       }
       model(features)
+      for var in model.variables:
+        self.assertTrue(isinstance(var, variables_lib.RefVariable))
       variable_names = [var.name for var in model.variables]
       self.assertItemsEqual([
           'linear_model/dense_feature_bucketized/weights:0',
@@ -1825,6 +1975,23 @@ class LinearModelTest(test.TestCase):
           'linear_model/bias_weights:0',
       ], variable_names)
 
+  def test_fit_and_predict(self):
+    columns = [fc.numeric_column('a')]
+
+    model = fc.LinearModel(columns)
+    model.compile(
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        loss='categorical_crossentropy',
+        metrics=['accuracy'])
+
+    x = {'a': np.random.random((10, 1))}
+    y = np.random.randint(20, size=(10, 1))
+    y = keras.utils.to_categorical(y, num_classes=20)
+    model.fit(x, y, epochs=1, batch_size=5)
+    model.fit(x, y, epochs=1, batch_size=5)
+    model.evaluate(x, y, batch_size=5)
+    model.predict(x, batch_size=5)
+
   def test_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
     price2 = fc.numeric_column('price2')
@@ -1921,7 +2088,8 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]],
+                          self.evaluate(net))
 
       coord.request_stop()
       coord.join(threads)
@@ -1957,7 +2125,8 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
@@ -2040,14 +2209,14 @@ class LinearModelTest(test.TestCase):
       price_var1, bias1 = model1.variables
       price_var2, bias2 = model2.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
 
 class OldLinearModelTest(test.TestCase):
@@ -2063,7 +2232,8 @@ class OldLinearModelTest(test.TestCase):
 
   def test_should_be_dense_or_categorical_column(self):
 
-    class NotSupportedColumn(fc.FeatureColumn, fc_old._FeatureColumn):
+    class NotSupportedColumn(BaseFeatureColumnForTests, fc.FeatureColumn,
+                             fc_old._FeatureColumn):
 
       @property
       def _is_v2_column(self):
@@ -2114,10 +2284,10 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2131,11 +2301,12 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2154,13 +2325,14 @@ class OldLinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
 
-    class _DenseAndSparseColumn(fc.DenseColumn, fc.CategoricalColumn,
-                                fc_old._DenseColumn, fc_old._CategoricalColumn):
+    class _DenseAndSparseColumn(BaseFeatureColumnForTests, fc.DenseColumn,
+                                fc.CategoricalColumn, fc_old._DenseColumn,
+                                fc_old._CategoricalColumn):
 
       @property
       def _is_v2_column(self):
@@ -2235,7 +2407,7 @@ class OldLinearModelTest(test.TestCase):
             dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
                                                 [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
@@ -2245,12 +2417,12 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2264,15 +2436,15 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
                                   [1000., 1100., 1200.],
                                   [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
@@ -2281,9 +2453,9 @@ class OldLinearModelTest(test.TestCase):
       predictions = fc_old.linear_model(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2297,7 +2469,7 @@ class OldLinearModelTest(test.TestCase):
       predictions = fc_old.linear_model(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -2321,7 +2493,7 @@ class OldLinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2343,7 +2515,7 @@ class OldLinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
@@ -2353,12 +2525,12 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -2377,11 +2549,11 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -2393,14 +2565,14 @@ class OldLinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -2673,7 +2845,8 @@ class OldLinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
@@ -2753,14 +2926,14 @@ class OldLinearModelTest(test.TestCase):
       price_var1 = get_linear_model_column_var(price, name='linear_model')
       price_var2 = get_linear_model_column_var(price, name='linear_model_1')
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
   def test_linear_model_v1_shared_embedding_all_other_v2(self):
     price = fc.numeric_column('price')  # v2
@@ -2768,11 +2941,11 @@ class OldLinearModelTest(test.TestCase):
         'sparse_feature', hash_bucket_size=5)  # v2
     some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)  # v2
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)  # v2
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)  # v2
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)  # v1
     all_cols = [
         price, some_embedding_column, shared_embedding_a, shared_embedding_b
@@ -2796,7 +2969,7 @@ class OldLinearModelTest(test.TestCase):
       fc_old.linear_model(features, all_cols)
       bias = get_linear_model_bias()
       with _initialized_session():
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
 
   def test_linear_model_v1_shared_embedding_with_v2_cat_all_other_v2(self):
     price = fc.numeric_column('price')  # v2
@@ -2808,7 +2981,7 @@ class OldLinearModelTest(test.TestCase):
         key='aaa', num_buckets=3)  # v2
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)  # v2
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)  # v1
     all_cols = [
         price, some_embedding_column, shared_embedding_a, shared_embedding_b
@@ -2832,19 +3005,19 @@ class OldLinearModelTest(test.TestCase):
       fc_old.linear_model(features, all_cols)
       bias = get_linear_model_bias()
       with _initialized_session():
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
 
   def test_linear_model_v1_v2_mix(self):
     price = fc.numeric_column('price')  # v2
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)  # v1
-    some_embedding_column = fc_old.embedding_column(
+    some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)  # v1
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)  # v2
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)  # v2
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)  # v1
     all_cols = [
         price, some_embedding_column, shared_embedding_a, shared_embedding_b
@@ -2868,13 +3041,13 @@ class OldLinearModelTest(test.TestCase):
       fc_old.linear_model(features, all_cols)
       bias = get_linear_model_bias()
       with _initialized_session():
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
 
   def test_linear_model_v2_shared_embedding_all_other_v1(self):
-    price = fc_old.numeric_column('price')  # v1
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
+    price = fc.numeric_column('price')  # v1
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)  # v1
-    some_embedding_column = fc_old.embedding_column(
+    some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)  # v1
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)  # v2
@@ -2906,13 +3079,13 @@ class OldLinearModelTest(test.TestCase):
         fc_old.linear_model(features, all_cols)
 
 
-class FeatureLayerTest(test.TestCase):
+class DenseFeaturesTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def test_retrieving_input(self):
     features = {'a': [0.]}
-    feature_layer = fc.FeatureLayer(fc.numeric_column('a'))
-    inputs = self.evaluate(feature_layer(features))
+    dense_features = fc.DenseFeatures(fc.numeric_column('a'))
+    inputs = self.evaluate(dense_features(features))
     self.assertAllClose([[0.]], inputs)
 
   def test_reuses_variables(self):
@@ -2941,11 +3114,11 @@ class FeatureLayerTest(test.TestCase):
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
 
-      feature_layer = fc.FeatureLayer([embedding_column])
+      dense_features = fc.DenseFeatures([embedding_column])
       features = {'a': sparse_input}
 
-      inputs = feature_layer(features)
-      variables = feature_layer.variables
+      inputs = dense_features(features)
+      variables = dense_features.variables
 
       # Sanity check: test that the inputs are correct.
       self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
@@ -2953,13 +3126,13 @@ class FeatureLayerTest(test.TestCase):
       # Check that only one variable was created.
       self.assertEqual(1, len(variables))
 
-      # Check that invoking feature_layer on the same features does not create
+      # Check that invoking dense_features on the same features does not create
       # additional variables
-      _ = feature_layer(features)
+      _ = dense_features(features)
       self.assertEqual(1, len(variables))
-      self.assertEqual(variables[0], feature_layer.variables[0])
+      self.assertEqual(variables[0], dense_features.variables[0])
 
-  def test_feature_column_feature_layer_gradient(self):
+  def test_feature_column_dense_features_gradient(self):
     with context.eager_mode():
       sparse_input = sparse_tensor.SparseTensor(
           indices=((0, 0), (1, 0), (2, 0)),
@@ -2986,11 +3159,11 @@ class FeatureLayerTest(test.TestCase):
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
 
-      feature_layer = fc.FeatureLayer([embedding_column])
+      dense_features = fc.DenseFeatures([embedding_column])
       features = {'a': sparse_input}
 
       def scale_matrix():
-        matrix = feature_layer(features)
+        matrix = dense_features(features)
         return 2 * matrix
 
       # Sanity check: Verify that scale_matrix returns the correct output.
@@ -3008,11 +3181,11 @@ class FeatureLayerTest(test.TestCase):
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
-      fc.FeatureLayer(feature_columns=[])(features={})
+      fc.DenseFeatures(feature_columns=[])(features={})
 
   def test_should_be_dense_column(self):
     with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
-      fc.FeatureLayer(feature_columns=[
+      fc.DenseFeatures(feature_columns=[
           fc.categorical_column_with_hash_bucket('wire_cast', 4)
       ])(
           features={
@@ -3022,7 +3195,7 @@ class FeatureLayerTest(test.TestCase):
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.FeatureLayer(feature_columns={'a': fc.numeric_column('a')})(
+      fc.DenseFeatures(feature_columns={'a': fc.numeric_column('a')})(
           features={
               'a': [[0]]
           })
@@ -3030,22 +3203,22 @@ class FeatureLayerTest(test.TestCase):
   def test_bare_column(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
-      net = fc.FeatureLayer(fc.numeric_column('a'))(features)
+      net = fc.DenseFeatures(fc.numeric_column('a'))(features)
       with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+        self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
       columns = (fc.numeric_column(key) for key in features)
-      net = fc.FeatureLayer(columns)(features)
+      net = fc.DenseFeatures(columns)(features)
       with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+        self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      fc.FeatureLayer(
+      fc.DenseFeatures(
           feature_columns=[fc.numeric_column('a'),
                            fc.numeric_column('a')])(
                                features={
@@ -3056,17 +3229,17 @@ class FeatureLayerTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      net = fc.FeatureLayer([price])(features)
+      net = fc.DenseFeatures([price])(features)
       with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+        self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      net = fc.FeatureLayer([price])(features)
+      net = fc.DenseFeatures([price])(features)
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_compute_output_shape(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3076,12 +3249,13 @@ class FeatureLayerTest(test.TestCase):
           'price1': [[1., 2.], [5., 6.]],
           'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]]
       }
-      feature_layer = fc.FeatureLayer([price1, price2])
-      self.assertEqual((None, 6), feature_layer.compute_output_shape((None,)))
-      net = feature_layer(features)
+      dense_features = fc.DenseFeatures([price1, price2])
+      self.assertEqual((None, 6), dense_features.compute_output_shape((None,)))
+      net = dense_features(features)
       with _initialized_session():
         self.assertAllClose(
-            [[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]], net.eval())
+            [[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]],
+            self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -3090,15 +3264,15 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           Exception,
           r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-        fc.FeatureLayer([price])(features)
+        fc.DenseFeatures([price])(features)
 
   def test_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      net = fc.FeatureLayer([price])(features)
+      net = fc.DenseFeatures([price])(features)
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3108,9 +3282,9 @@ class FeatureLayerTest(test.TestCase):
           'price1': [[1., 2.], [5., 6.]],
           'price2': [[3.], [4.]]
       }
-      net = fc.FeatureLayer([price1, price2])(features)
+      net = fc.DenseFeatures([price1, price2])(features)
       with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_cols_to_output_tensors(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3118,12 +3292,12 @@ class FeatureLayerTest(test.TestCase):
     with ops.Graph().as_default():
       cols_dict = {}
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      feature_layer = fc.FeatureLayer([price1, price2])
-      net = feature_layer(features, cols_dict)
+      dense_features = fc.DenseFeatures([price1, price2])
+      net = dense_features(features, cols_dict)
       with _initialized_session():
         self.assertAllClose([[1., 2.], [5., 6.]], cols_dict[price1].eval())
         self.assertAllClose([[3.], [4.]], cols_dict[price2].eval())
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_column_order(self):
     price_a = fc.numeric_column('price_a')
@@ -3133,11 +3307,11 @@ class FeatureLayerTest(test.TestCase):
           'price_a': [[1.]],
           'price_b': [[3.]],
       }
-      net1 = fc.FeatureLayer([price_a, price_b])(features)
-      net2 = fc.FeatureLayer([price_b, price_a])(features)
+      net1 = fc.DenseFeatures([price_a, price_b])(features)
+      net2 = fc.DenseFeatures([price_b, price_a])(features)
       with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+        self.assertAllClose([[1., 3.]], self.evaluate(net1))
+        self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
     animal = fc.categorical_column_with_identity('animal', num_buckets=4)
@@ -3148,7 +3322,7 @@ class FeatureLayerTest(test.TestCase):
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
       with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
-        fc.FeatureLayer([animal])(features)
+        fc.DenseFeatures([animal])(features)
 
   def test_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -3161,7 +3335,7 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.FeatureLayer([price1, price2])(features)
+        fc.DenseFeatures([price1, price2])(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -3176,7 +3350,7 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.FeatureLayer([price1, price2, price3])(features)
+        fc.DenseFeatures([price1, price2, price3])(features)
 
   def test_runtime_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -3186,7 +3360,7 @@ class FeatureLayerTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      net = fc.FeatureLayer([price1, price2])(features)
+      net = fc.DenseFeatures([price1, price2])(features)
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'Dimensions of inputs should match'):
@@ -3200,7 +3374,7 @@ class FeatureLayerTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
           'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
       }
-      net = fc.FeatureLayer([price1, price2])(features)
+      net = fc.DenseFeatures([price1, price2])(features)
       with _initialized_session() as sess:
         sess.run(
             net,
@@ -3220,14 +3394,14 @@ class FeatureLayerTest(test.TestCase):
           'sparse_feature': [['a'], ['x']],
       }
       all_cols = [some_embedding_column]
-      fc.FeatureLayer(all_cols)(features)
-      fc.FeatureLayer(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
       # Make sure that 2 variables get created in this case.
       self.assertEqual(2, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
       expected_var_names = [
-          'feature_layer/sparse_feature_embedding/embedding_weights:0',
-          'feature_layer_1/sparse_feature_embedding/embedding_weights:0'
+          'dense_features/sparse_feature_embedding/embedding_weights:0',
+          'dense_features_1/sparse_feature_embedding/embedding_weights:0'
       ]
       self.assertItemsEqual(
           expected_var_names,
@@ -3257,8 +3431,8 @@ class FeatureLayerTest(test.TestCase):
                   dense_shape=(2, 2)),
       }
       all_cols = [embedding_column_a, embedding_column_b]
-      fc.FeatureLayer(all_cols)(features)
-      fc.FeatureLayer(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
@@ -3290,7 +3464,7 @@ class FeatureLayerTest(test.TestCase):
                   values=(1, 2, 1),
                   dense_shape=(2, 2)),
       }
-      fc.FeatureLayer(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
@@ -3309,7 +3483,7 @@ class FeatureLayerTest(test.TestCase):
                   dense_shape=(2, 2)),
       }
 
-      fc.FeatureLayer(all_cols)(features1)
+      fc.DenseFeatures(all_cols)(features1)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
@@ -3327,13 +3501,13 @@ class FeatureLayerTest(test.TestCase):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in dense_features.
     one_hot_body_style = fc.indicator_column(body_style)
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in dense_features.
     embedded_body_style = fc.embedding_column(
         body_style, dimension=5, initializer=_initializer)
 
@@ -3345,7 +3519,7 @@ class FeatureLayerTest(test.TestCase):
         batch_size=2,
         shuffle=False)
     features = input_fn()
-    net = fc.FeatureLayer([price, one_hot_body_style, embedded_body_style])(
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_body_style])(
         features)
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
@@ -3372,15 +3546,15 @@ class FeatureLayerTest(test.TestCase):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in dense_features.
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in dense_features.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
     embedded_country = fc.embedding_column(
@@ -3400,7 +3574,7 @@ class FeatureLayerTest(test.TestCase):
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
     self.assertEqual(1, features['country'].shape.ndims)
 
-    net = fc.FeatureLayer([price, one_hot_body_style, embedded_country])(
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_country])(
         features)
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
@@ -3422,15 +3596,15 @@ class FeatureLayerTest(test.TestCase):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in dense_features.
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in dense_features.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
     embedded_country = fc.embedding_column(
@@ -3454,7 +3628,7 @@ class FeatureLayerTest(test.TestCase):
         dense_shape=(2,))
     country_data = np.array([['US'], ['CA']])
 
-    net = fc.FeatureLayer([price, one_hot_body_style, embedded_country])(
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_country])(
         features)
     self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
@@ -3472,7 +3646,7 @@ class FeatureLayerTest(test.TestCase):
               }))
 
   def test_with_rank_0_feature(self):
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
     features = {
         'price': constant_op.constant(0),
@@ -3481,13 +3655,13 @@ class FeatureLayerTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      fc.FeatureLayer([price])(features)
+      fc.DenseFeatures([price])(features)
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = fc.FeatureLayer([price])(features)
+    net = fc.DenseFeatures([price])(features)
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
@@ -3621,7 +3795,7 @@ class FunctionalInputLayerTest(test.TestCase):
       features = features = {'a': [0.]}
       net = fc_old.input_layer(features, fc.numeric_column('a'))
       with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+        self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
@@ -3629,7 +3803,7 @@ class FunctionalInputLayerTest(test.TestCase):
       columns = (fc.numeric_column(key) for key in features)
       net = fc_old.input_layer(features, columns)
       with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+        self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
@@ -3645,7 +3819,7 @@ class FunctionalInputLayerTest(test.TestCase):
       features = {'price': [[1.], [5.]]}
       net = fc_old.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+        self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
@@ -3653,7 +3827,7 @@ class FunctionalInputLayerTest(test.TestCase):
       features = {'price': [[1., 2.], [5., 6.]]}
       net = fc_old.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -3670,7 +3844,7 @@ class FunctionalInputLayerTest(test.TestCase):
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       net = fc_old.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3679,7 +3853,7 @@ class FunctionalInputLayerTest(test.TestCase):
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       net = fc_old.input_layer(features, [price1, price2])
       with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_fills_cols_to_vars(self):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
@@ -3723,11 +3897,11 @@ class FunctionalInputLayerTest(test.TestCase):
         'sparse_feature', hash_bucket_size=5)
     some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
     with ops.Graph().as_default():
       features = {
@@ -3810,8 +3984,8 @@ class FunctionalInputLayerTest(test.TestCase):
       net1 = fc_old.input_layer(features, [price_a, price_b])
       net2 = fc_old.input_layer(features, [price_b, price_a])
       with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+        self.assertAllClose([[1., 3.]], self.evaluate(net1))
+        self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
     animal = fc.categorical_column_with_identity('animal', num_buckets=4)
@@ -4046,7 +4220,7 @@ class FunctionalInputLayerTest(test.TestCase):
 
 class MakeParseExampleSpecTest(test.TestCase):
 
-  class _TestFeatureColumn(fc.FeatureColumn,
+  class _TestFeatureColumn(BaseFeatureColumnForTests,
                            collections.namedtuple('_TestFeatureColumn',
                                                   ('parse_spec'))):
 
@@ -4061,12 +4235,19 @@ class MakeParseExampleSpecTest(test.TestCase):
     def transform_feature(self, transformation_cache, state_manager):
       pass
 
+    def _transform_feature(self, inputs):
+      pass
+
     @property
     def parse_example_spec(self):
       return self.parse_spec
 
+    @property
+    def _parse_example_spec(self):
+      return self.parse_spec
+
   def test_no_feature_columns(self):
-    actual = fc.make_parse_example_spec([])
+    actual = fc.make_parse_example_spec_v2([])
     self.assertDictEqual({}, actual)
 
   def test_invalid_type(self):
@@ -4076,15 +4257,17 @@ class MakeParseExampleSpecTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'All feature_columns must be FeatureColumn instances.*invalid_column'):
-      fc.make_parse_example_spec(
-          (self._TestFeatureColumn({key1: parse_spec1}), 'invalid_column'))
+      fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+          key1: parse_spec1
+      }), 'invalid_column'))
 
   def test_one_feature_column(self):
     key1 = 'key1'
     parse_spec1 = parsing_ops.FixedLenFeature(
         shape=(2,), dtype=dtypes.float32, default_value=0.)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }),))
     self.assertDictEqual({key1: parse_spec1}, actual)
 
   def test_two_feature_columns(self):
@@ -4093,9 +4276,11 @@ class MakeParseExampleSpecTest(test.TestCase):
         shape=(2,), dtype=dtypes.float32, default_value=0.)
     key2 = 'key2'
     parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key2: parse_spec2})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key2: parse_spec2
+    })))
     self.assertDictEqual({key1: parse_spec1, key2: parse_spec2}, actual)
 
   def test_equal_keys_different_parse_spec(self):
@@ -4106,17 +4291,21 @@ class MakeParseExampleSpecTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'feature_columns contain different parse_spec for key key1'):
-      fc.make_parse_example_spec(
-          (self._TestFeatureColumn({key1: parse_spec1}),
-           self._TestFeatureColumn({key1: parse_spec2})))
+      fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+          key1: parse_spec1
+      }), self._TestFeatureColumn({
+          key1: parse_spec2
+      })))
 
   def test_equal_keys_equal_parse_spec(self):
     key1 = 'key1'
     parse_spec1 = parsing_ops.FixedLenFeature(
         shape=(2,), dtype=dtypes.float32, default_value=0.)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key1: parse_spec1})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key1: parse_spec1
+    })))
     self.assertDictEqual({key1: parse_spec1}, actual)
 
   def test_multiple_features_dict(self):
@@ -4128,9 +4317,12 @@ class MakeParseExampleSpecTest(test.TestCase):
     parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
     key3 = 'key3'
     parse_spec3 = parsing_ops.VarLenFeature(dtype=dtypes.int32)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key2: parse_spec2, key3: parse_spec3})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key2: parse_spec2,
+        key3: parse_spec3
+    })))
     self.assertDictEqual(
         {key1: parse_spec1, key2: parse_spec2, key3: parse_spec3}, actual)
 
@@ -4180,8 +4372,11 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     self.assertEqual(7, column.num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
@@ -4189,8 +4384,11 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_deep_copy(self):
     original = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(7, column.num_buckets)
@@ -4226,11 +4424,13 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
   def test_invalid_vocabulary_size(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=-1)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=0)
 
   def test_too_large_vocabulary_size(self):
@@ -4253,13 +4453,17 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           num_oov_buckets=-1)
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           dtype=dtypes.float64)
 
   def test_invalid_buckets_and_default_value(self):
@@ -4315,7 +4519,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -4379,15 +4583,16 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    id_tensor = fc._transform_features({'aaa': inputs}, [column], None)[column]
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
     with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_file(
@@ -4571,13 +4776,14 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       })
       wire_var, bias = model.variables
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_file(
@@ -4597,13 +4803,36 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  def test_serialization(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+
+    self.assertEqual(['wire'], wire_column.parents)
+
+    config = wire_column._get_config()
+    self.assertEqual({
+        'default_value': -1,
+        'dtype': 'string',
+        'key': 'wire',
+        'num_oov_buckets': 1,
+        'vocabulary_file': self._wire_vocabulary_file_name,
+        'vocabulary_size': 3
+    }, config)
+
+    self.assertEqual(wire_column,
+                     fc.VocabularyFileCategoricalColumn._from_config(config))
 
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
@@ -4636,7 +4865,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+        key='aaa',
+        vocabulary_list=(12, 24, 36),
+        dtype=dtypes.int32,
         default_value=-99)
     self.assertEqual(3, column.num_buckets)
     self.assertEqual({
@@ -4656,7 +4887,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.float32)
 
   def test_invalid_mapping_dtype(self):
@@ -4669,7 +4901,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.int32)
 
   def test_mismatched_string_dtype(self):
@@ -4698,8 +4931,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=(12, 24, 36),
-          num_oov_buckets=-1)
+          key='aaa', vocabulary_list=(12, 24, 36), num_oov_buckets=-1)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
@@ -4712,8 +4944,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_input_dtype_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(12, 24, 36),
@@ -4726,8 +4957,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_input_dtype_string(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=(12, 24, 36))
+        key='aaa', vocabulary_list=(12, 24, 36))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -4749,7 +4979,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -4771,7 +5001,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -4784,8 +5014,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -4806,26 +5035,25 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    id_tensor = fc._transform_features({'aaa': inputs}, [column], None)[column]
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
     with _initialized_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     id_weight_pair = column.get_sparse_tensors(
         fc.FeatureTransformationCache({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
@@ -4973,13 +5201,14 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       })
       wire_var, bias = model.variables
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_list(
@@ -4998,13 +5227,35 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  def test_serialization(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+
+    self.assertEqual(['aaa'], wire_column.parents)
+
+    config = wire_column._get_config()
+    self.assertEqual({
+        'default_value': -1,
+        'dtype': 'string',
+        'key': 'aaa',
+        'num_oov_buckets': 1,
+        'vocabulary_list': ('omar', 'stringer', 'marlo')
+    }, config)
+
+    self.assertEqual(wire_column,
+                     fc.VocabularyListCategoricalColumn._from_config(config))
+
 
 
 class IdentityCategoricalColumnTest(test.TestCase):
@@ -5072,7 +5323,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -5109,15 +5360,16 @@ class IdentityCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
         dense_shape=(2, 2))
-    id_tensor = fc._transform_features({'aaa': inputs}, [column], None)[column]
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
     with _initialized_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
@@ -5230,13 +5482,13 @@ class IdentityCategoricalColumnTest(test.TestCase):
       })
       weight_var, bias = model.variables
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
@@ -5252,13 +5504,27 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
+
+  def test_serialization(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+
+    self.assertEqual(['aaa'], column.parents)
+
+    config = column._get_config()
+    self.assertEqual({
+        'default_value': None,
+        'key': 'aaa',
+        'number_buckets': 3
+    }, config)
+
+    self.assertEqual(column, fc.IdentityCategoricalColumn._from_config(config))
 
 
 class TransformFeaturesTest(test.TestCase):
@@ -5278,7 +5544,7 @@ class TransformFeaturesTest(test.TestCase):
                   indices=[[0, 0], [1, 0], [1, 1]],
                   dense_shape=[2, 2])
       }
-      transformed = fc._transform_features(
+      transformed = fc._transform_features_v2(
           features, [bucketized_price, hashed_sparse], None)
       with _initialized_session():
         self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
@@ -5289,7 +5555,7 @@ class TransformFeaturesTest(test.TestCase):
   def test_column_order(self):
     """When the column is both dense and sparse, uses sparse tensors."""
 
-    class _LoggerColumn(fc.FeatureColumn):
+    class _LoggerColumn(BaseFeatureColumnForTests):
 
       def __init__(self, name):
         self._name = name
@@ -5315,12 +5581,12 @@ class TransformFeaturesTest(test.TestCase):
       column1 = _LoggerColumn('1')
       column2 = _LoggerColumn('2')
       call_logger = {'count': 0}
-      fc._transform_features({}, [column1, column2], None)
+      fc._transform_features_v2({}, [column1, column2], None)
       self.assertEqual(0, column1.call_order)
       self.assertEqual(1, column2.call_order)
 
       call_logger = {'count': 0}
-      fc._transform_features({}, [column2, column1], None)
+      fc._transform_features_v2({}, [column2, column1], None)
       self.assertEqual(0, column1.call_order)
       self.assertEqual(1, column2.call_order)
 
@@ -5335,7 +5601,7 @@ class IndicatorColumnTest(test.TestCase):
     self.assertEqual(indicator_a.variable_shape, [1, 4])
     self.assertTrue(indicator_a._is_v2_column)
 
-    b = fc_old.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    b = fc_old._categorical_column_with_hash_bucket('b', hash_bucket_size=100)
     indicator_b = fc.indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
     self.assertEqual(indicator_b.name, 'b_indicator')
@@ -5350,7 +5616,8 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = transformation_cache.get(animal, None)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_2D_shape_succeeds(self):
     # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
@@ -5365,7 +5632,8 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = transformation_cache.get(animal, None)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_multi_hot(self):
     animal = fc.indicator_column(
@@ -5378,7 +5646,7 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = transformation_cache.get(animal, None)
     with self.cached_session():
-      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+      self.assertAllEqual([[0., 2., 0., 0.]], self.evaluate(output))
 
   def test_multi_hot2(self):
     animal = fc.indicator_column(
@@ -5390,7 +5658,7 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = transformation_cache.get(animal, None)
     with self.cached_session():
-      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 1., 1., 0.]], self.evaluate(output))
 
   def test_deep_copy(self):
     a = fc.categorical_column_with_hash_bucket('a', 4)
@@ -5412,7 +5680,7 @@ class IndicatorColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_indicator]))
+        features=fc.make_parse_example_spec_v2([a_indicator]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -5433,10 +5701,11 @@ class IndicatorColumnTest(test.TestCase):
             values=('marlo', 'skywalker', 'omar'),
             dense_shape=(2, 2))
     }
-    indicator_tensor = fc._transform_features(features, [a_indicator],
-                                              None)[a_indicator]
+    indicator_tensor = fc._transform_features_v2(features, [a_indicator],
+                                                 None)[a_indicator]
     with _initialized_session():
-      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]],
+                          self.evaluate(indicator_tensor))
 
   def test_transform_with_weighted_column(self):
     # Github issue 12557
@@ -5445,13 +5714,13 @@ class IndicatorColumnTest(test.TestCase):
     weights = fc.weighted_categorical_column(ids, 'weights')
     indicator = fc.indicator_column(weights)
     features = {
-        'ids': constant_op.constant([['c', 'b', 'a']]),
-        'weights': constant_op.constant([[2., 4., 6.]])
+        'ids': constant_op.constant([['c', 'b', 'a', 'c']]),
+        'weights': constant_op.constant([[2., 4., 6., 1.]])
     }
-    indicator_tensor = fc._transform_features(features, [indicator],
-                                              None)[indicator]
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
     with _initialized_session():
-      self.assertAllEqual([[6., 4., 2.]], indicator_tensor.eval())
+      self.assertAllEqual([[6., 4., 3.]], self.evaluate(indicator_tensor))
 
   def test_transform_with_missing_value_in_weighted_column(self):
     # Github issue 12583
@@ -5463,10 +5732,10 @@ class IndicatorColumnTest(test.TestCase):
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
         'weights': constant_op.constant([[2., 4., 6.]])
     }
-    indicator_tensor = fc._transform_features(features, [indicator],
-                                              None)[indicator]
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 4., 2.]], self.evaluate(indicator_tensor))
 
   def test_transform_with_missing_value_in_categorical_column(self):
     # Github issue 12583
@@ -5476,10 +5745,10 @@ class IndicatorColumnTest(test.TestCase):
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
     }
-    indicator_tensor = fc._transform_features(features, [indicator],
-                                              None)[indicator]
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor))
 
   def test_linear_model(self):
     animal = fc.indicator_column(
@@ -5496,10 +5765,10 @@ class IndicatorColumnTest(test.TestCase):
       weight_var, _ = model.variables
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
   def test_old_linear_model(self):
     animal = fc.indicator_column(
@@ -5515,14 +5784,14 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
   def test_old_linear_model_old_categorical(self):
     animal = fc.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
+        fc_old._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -5534,12 +5803,12 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
-  def test_feature_layer(self):
+  def test_dense_features(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
@@ -5548,9 +5817,9 @@ class IndicatorColumnTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      net = fc.FeatureLayer([animal])(features)
+      net = fc.DenseFeatures([animal])(features)
       with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+        self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
 
   def test_input_layer(self):
     animal = fc.indicator_column(
@@ -5563,11 +5832,11 @@ class IndicatorColumnTest(test.TestCase):
       }
       net = fc_old.input_layer(features, [animal])
       with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+        self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
 
   def test_input_layer_old_categorical(self):
     animal = fc.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
+        fc_old._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -5576,7 +5845,35 @@ class IndicatorColumnTest(test.TestCase):
       }
       net = fc_old.input_layer(features, [animal])
       with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+        self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
+
+  def test_serialization(self):
+    parent = fc.categorical_column_with_identity('animal', num_buckets=4)
+    animal = fc.indicator_column(parent)
+
+    self.assertEqual([parent], animal.parents)
+
+    config = animal._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'class_name': 'IdentityCategoricalColumn',
+            'config': {
+                'key': 'animal',
+                'default_value': None,
+                'number_buckets': 4
+            }
+        }
+    }, config)
+
+    new_animal = fc.IndicatorColumn._from_config(config)
+    self.assertEqual(animal, new_animal)
+    self.assertIsNot(parent, new_animal.categorical_column)
+
+    new_animal = fc.IndicatorColumn._from_config(
+        config, columns_by_name={parent.name: parent})
+    self.assertEqual(animal, new_animal)
+    self.assertIs(parent, new_animal.categorical_column)
+
 
 
 class _TestStateManager(fc.StateManager):
@@ -5592,6 +5889,7 @@ class _TestStateManager(fc.StateManager):
                       shape,
                       dtype=None,
                       trainable=True,
+                      use_resource=True,
                       initializer=None):
     if feature_column not in self._all_variables:
       self._all_variables[feature_column] = {}
@@ -5604,6 +5902,7 @@ class _TestStateManager(fc.StateManager):
           shape=shape,
           dtype=dtype,
           trainable=self._trainable and trainable,
+          use_resource=use_resource,
           initializer=initializer)
       var_dict[name] = var
       return var
@@ -5639,7 +5938,7 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertTrue(embedding_column._is_v2_column)
 
   def test_is_v2_column(self):
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     embedding_column = fc.embedding_column(
@@ -5651,10 +5950,14 @@ class EmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
@@ -5673,10 +5976,14 @@ class EmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     original = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     for embedding_column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', embedding_column.categorical_column.name)
       self.assertEqual(3, embedding_column.categorical_column.num_buckets)
@@ -5714,7 +6021,7 @@ class EmbeddingColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_embedded]))
+        features=fc.make_parse_example_spec_v2([a_embedded]))
     self.assertIn('aaa', features)
     with self.cached_session():
       _assert_sparse_tensor_value(
@@ -5734,12 +6041,12 @@ class EmbeddingColumnTest(test.TestCase):
             values=(0, 1, 0),
             dense_shape=(2, 2))
     }
-    outputs = fc._transform_features(features, [a, a_embedded], None)
+    outputs = fc._transform_features_v2(features, [a, a_embedded], None)
     output_a = outputs[a]
     output_embedded = outputs[a_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_embedded))
 
   def test_get_dense_tensor(self):
     # Inputs.
@@ -5782,7 +6089,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -5799,7 +6107,7 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_get_dense_tensor_old_categorical(self):
     # Inputs.
@@ -5840,7 +6148,7 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
         categorical_column,
@@ -5859,7 +6167,7 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_get_dense_tensor_3d(self):
     # Inputs.
@@ -5904,7 +6212,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -5921,7 +6230,7 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
@@ -5964,7 +6273,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -6034,7 +6344,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
     state_manager = _TestStateManager()
@@ -6052,7 +6363,7 @@ class EmbeddingColumnTest(test.TestCase):
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
   def test_linear_model(self):
     # Inputs.
@@ -6104,15 +6415,16 @@ class EmbeddingColumnTest(test.TestCase):
       bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
-      linear_weights = trainable_vars[
-          'linear_model/aaa_embedding/weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -6127,9 +6439,10 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
-  def test_feature_layer(self):
+  def test_dense_features(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -6175,21 +6488,23 @@ class EmbeddingColumnTest(test.TestCase):
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
-    l = fc.FeatureLayer((embedding_column,))
-    feature_layer = l({'aaa': sparse_input})
+    l = fc.DenseFeatures((embedding_column,))
+    dense_features = l({'aaa': sparse_input})
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
+    for v in global_vars:
+      self.assertTrue(isinstance(v, variables_lib.RefVariable))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
 
-  def test_feature_layer_not_trainable(self):
+  def test_dense_features_not_trainable(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -6236,17 +6551,19 @@ class EmbeddingColumnTest(test.TestCase):
         trainable=False)
 
     # Provide sparse input and get dense result.
-    feature_layer = fc.FeatureLayer((embedding_column,))({'aaa': sparse_input})
+    dense_features = fc.DenseFeatures((embedding_column,))({
+        'aaa': sparse_input
+    })
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
     self.assertItemsEqual(
         [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
 
   def test_input_layer(self):
     # Inputs.
@@ -6308,7 +6625,7 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in trainable_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(feature_layer))
 
   def test_old_linear_model(self):
     # Inputs.
@@ -6365,11 +6682,13 @@ class EmbeddingColumnTest(test.TestCase):
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -6384,7 +6703,8 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
   def test_old_linear_model_old_categorical(self):
     # Inputs.
@@ -6411,7 +6731,7 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
         categorical_column,
@@ -6441,11 +6761,13 @@ class EmbeddingColumnTest(test.TestCase):
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -6460,7 +6782,58 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
+
+  def test_serialization(self):
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return ValueError('Not expected to be called')
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2, initializer=_initializer)
+
+    self.assertEqual([categorical_column], embedding_column.parents)
+
+    config = embedding_column._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'class_name': 'IdentityCategoricalColumn',
+            'config': {
+                'number_buckets': 3,
+                'key': 'aaa',
+                'default_value': None
+            }
+        },
+        'ckpt_to_load_from': None,
+        'combiner': 'mean',
+        'dimension': 2,
+        'initializer': '_initializer',
+        'max_norm': None,
+        'tensor_name_in_ckpt': None,
+        'trainable': True
+    }, config)
+
+    custom_objects = {
+        '_initializer': _initializer,
+    }
+
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config, custom_objects=custom_objects)
+    self.assertEqual(embedding_column, new_embedding_column)
+    self.assertIsNot(categorical_column,
+                     new_embedding_column.categorical_column)
+
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config,
+        custom_objects=custom_objects,
+        columns_by_name={categorical_column.name: categorical_column})
+    self.assertEqual(embedding_column, new_embedding_column)
+    self.assertIs(categorical_column, new_embedding_column.categorical_column)
 
 
 class SharedEmbeddingColumnTest(test.TestCase):
@@ -6610,7 +6983,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_embedded, b_embedded]))
+        features=fc.make_parse_example_spec_v2([a_embedded, b_embedded]))
     self.assertIn('aaa', features)
     self.assertIn('bbb', features)
     with self.cached_session():
@@ -6643,17 +7016,17 @@ class SharedEmbeddingColumnTest(test.TestCase):
             values=(1, 2, 1),
             dense_shape=(2, 2)),
     }
-    outputs = fc._transform_features(features, [a, a_embedded, b, b_embedded],
-                                     None)
+    outputs = fc._transform_features_v2(features,
+                                        [a, a_embedded, b, b_embedded], None)
     output_a = outputs[a]
     output_a_embedded = outputs[a_embedded]
     output_b = outputs[b]
     output_b_embedded = outputs[b_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_a_embedded.eval())
-      _assert_sparse_tensor_value(
-          self, output_b.eval(), output_b_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_a_embedded))
+      _assert_sparse_tensor_value(self, self.evaluate(output_b),
+                                  self.evaluate(output_b_embedded))
 
   def test_get_dense_tensor(self):
     # Inputs.
@@ -6719,9 +7092,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+      self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
+      self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
+      self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
@@ -6842,13 +7215,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/bbb_shared_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -6866,9 +7241,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
-  def _test_feature_layer(self, trainable=True):
+  def _test_dense_features(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
     sparse_input_a = sparse_tensor.SparseTensorValue(
@@ -6954,7 +7329,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     }
 
     # Provide sparse input and get dense result.
-    feature_layer = fc.FeatureLayer(
+    dense_features = fc.DenseFeatures(
         feature_columns=(embedding_column_b, embedding_column_a,
                          embedding_column_c, embedding_column_d))(
                              features)
@@ -6964,6 +7339,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(
         ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
         tuple([v.name for v in global_vars]))
+    for v in global_vars:
+      self.assertTrue(isinstance(v, variables_lib.RefVariable))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     if trainable:
       self.assertItemsEqual(
@@ -6974,13 +7351,33 @@ class SharedEmbeddingColumnTest(test.TestCase):
     shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
+
+  def test_dense_features(self):
+    self._test_dense_features()
+
+  def test_dense_features_no_trainable(self):
+    self._test_dense_features(trainable=False)
+
+  def test_serialization(self):
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return ValueError('Not expected to be called')
+
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
+        [categorical_column_a, categorical_column_b],
+        dimension=2,
+        initializer=_initializer)
 
-  def test_feature_layer(self):
-    self._test_feature_layer()
+    self.assertEqual([categorical_column_a], embedding_column_a.parents)
+    self.assertEqual([categorical_column_b], embedding_column_b.parents)
+    # TODO(rohanj): Add tests for (from|get)_config once implemented
 
-  def test_feature_layer_no_trainable(self):
-    self._test_feature_layer(trainable=False)
 
 
 class WeightedCategoricalColumnTest(test.TestCase):
@@ -7000,7 +7397,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_is_v2_column(self):
     column = fc.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+        categorical_column=fc_old._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     self.assertFalse(column._is_v2_column)
@@ -7045,7 +7442,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         values=('omar', 'stringer', 'marlo'),
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'Bad dtype'):
-      fc._transform_features({
+      fc._transform_features_v2({
           'ids': strings,
           'values': strings
       }, (column,), None)
@@ -7068,7 +7465,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(
         ValueError, 'values is not in features dictionary'):
-      fc._transform_features({'ids': inputs}, (column,), None)
+      fc._transform_features_v2({'ids': inputs}, (column,), None)
 
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
@@ -7085,7 +7482,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_weighted]))
+        features=fc.make_parse_example_spec_v2([a_weighted]))
     self.assertIn('aaa', features)
     self.assertIn('weights', features)
     with self.cached_session():
@@ -7117,7 +7514,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0.5, 1.0, 0.1),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = fc._transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': inputs,
         'values': weights,
     }, (column,), None)[column]
@@ -7127,15 +7524,13 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
   def test_transform_features_dense_input(self):
     column = fc.weighted_categorical_column(
@@ -7146,7 +7541,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0.5, 1.0, 0.1),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = fc._transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': ((0, -1), (1, 0)),
         'values': weights,
     }, (column,), None)[column]
@@ -7156,15 +7551,13 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
   def test_transform_features_dense_weights(self):
     column = fc.weighted_categorical_column(
@@ -7175,7 +7568,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=(2, 1, 0),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = fc._transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': inputs,
         'values': ((.5, 0.), (1., .1)),
     }, (column,), None)[column]
@@ -7185,15 +7578,13 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((.5, 1., .1), dtype=np.float32),
-              dense_shape=(2, 2)),
-          weight_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(weight_tensor))
 
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
@@ -7216,14 +7607,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
       })
       weight_var, bias = model.variables
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model_mismatched_shape(self):
     column = fc.weighted_categorical_column(
@@ -7269,7 +7660,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_linear_model_mismatched_dense_shape(self):
     column = fc.weighted_categorical_column(
@@ -7288,14 +7679,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
       })
       weight_var, bias = model.variables
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     column = fc.weighted_categorical_column(
@@ -7318,14 +7709,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_old_linear_model_mismatched_shape(self):
     column = fc.weighted_categorical_column(
@@ -7370,7 +7761,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_old_linear_model_mismatched_dense_shape(self):
     column = fc.weighted_categorical_column(
@@ -7389,18 +7780,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_old_linear_model_old_categorical(self):
     column = fc.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+        categorical_column=fc_old._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -7419,16 +7810,147 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   # TODO(ptucker): Add test with embedding of weighted categorical.
 
+  def test_serialization(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='ids', num_buckets=3)
+    column = fc.weighted_categorical_column(
+        categorical_column=categorical_column, weight_feature_key='weight')
+
+    self.assertEqual([categorical_column, 'weight'], column.parents)
+
+    config = column._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'config': {
+                'key': 'ids',
+                'number_buckets': 3,
+                'default_value': None
+            },
+            'class_name': 'IdentityCategoricalColumn'
+        },
+        'dtype': 'float32',
+        'weight_feature_key': 'weight'
+    }, config)
+
+    self.assertEqual(column, fc.WeightedCategoricalColumn._from_config(config))
+
+    new_column = fc.WeightedCategoricalColumn._from_config(
+        config, columns_by_name={categorical_column.name: categorical_column})
+    self.assertEqual(column, new_column)
+    self.assertIs(categorical_column, new_column.categorical_column)
+
+
+class FeatureColumnForSerializationTest(BaseFeatureColumnForTests):
+
+  @property
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    return 'BadParentsFeatureColumn'
+
+  def transform_feature(self, transformation_cache, state_manager):
+    return 'Output'
+
+  @property
+  def parse_example_spec(self):
+    pass
+
+
+class SerializationTest(test.TestCase):
+  """Tests for serialization, deserialization helpers."""
+
+  def test_serialize_non_feature_column(self):
+
+    class NotAFeatureColumn(object):
+      pass
+
+    with self.assertRaisesRegexp(ValueError, 'is not a FeatureColumn'):
+      fc.serialize_feature_column(NotAFeatureColumn())
+
+  def test_deserialize_invalid_config(self):
+    with self.assertRaisesRegexp(ValueError, 'Improper config format: {}'):
+      fc.deserialize_feature_column({})
+
+  def test_deserialize_config_missing_key(self):
+    config_missing_key = {
+        'config': {
+            # Dtype is missing and should cause a failure.
+            # 'dtype': 'int32',
+            'default_value': None,
+            'key': 'a',
+            'normalizer_fn': None,
+            'shape': (2,)
+        },
+        'class_name': 'NumericColumn'
+    }
+    with self.assertRaisesRegexp(ValueError, 'Invalid config:'):
+      fc.deserialize_feature_column(config_missing_key)
+
+  def test_deserialize_invalid_class(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown feature_column_v2: NotExistingFeatureColumnClass'):
+      fc.deserialize_feature_column({
+          'class_name': 'NotExistingFeatureColumnClass',
+          'config': {}
+      })
+
+  def test_deserialization_deduping(self):
+    price = fc.numeric_column('price')
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+
+    configs = fc.serialize_feature_columns([price, bucketized_price])
+
+    deserialized_feature_columns = fc.deserialize_feature_columns(configs)
+    self.assertEqual(2, len(deserialized_feature_columns))
+    new_price = deserialized_feature_columns[0]
+    new_bucketized_price = deserialized_feature_columns[1]
+
+    # Ensure these are not the original objects:
+    self.assertIsNot(price, new_price)
+    self.assertIsNot(bucketized_price, new_bucketized_price)
+    # But they are equivalent:
+    self.assertEquals(price, new_price)
+    self.assertEquals(bucketized_price, new_bucketized_price)
+
+    # Check that deduping worked:
+    self.assertIs(new_bucketized_price.source_column, new_price)
+
+  def deserialization_custom_objects(self):
+    # Note that custom_objects is also tested extensively above per class, this
+    # test ensures that the public wrappers also handle it correctly.
+    def _custom_fn(input_tensor):
+      return input_tensor + 42.
+
+    price = fc.numeric_column('price', normalizer_fn=_custom_fn)
+
+    configs = fc.serialize_feature_columns([price])
+
+    deserialized_feature_columns = fc.deserialize_feature_columns(configs)
+
+    self.assertEqual(1, len(deserialized_feature_columns))
+    new_price = deserialized_feature_columns[0]
+
+    # Ensure these are not the original objects:
+    self.assertIsNot(price, new_price)
+    # But they are equivalent:
+    self.assertEquals(price, new_price)
+
+    # Check that normalizer_fn points to the correct function.
+    self.assertIs(new_price.normalizer_fn, _custom_fn)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index 9a9ee46aabb13a2dc9bff153c49814da5724ebf6..30dc959e9a9f717bdb5c56bfbdde5ffa9d48c257 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -21,9 +21,11 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
@@ -70,6 +72,17 @@ class AutomaticControlDependencies(object):
       self._returned_tensors.add(indices)
       self._returned_tensors.add(values)
       return ops.IndexedSlices(values, indices, dense_shape=tensor.dense_shape)
+    elif isinstance(tensor, sparse_tensor.SparseTensor):
+      values = array_ops.identity(tensor.values)
+      indices = array_ops.identity(tensor.indices)
+      self._returned_tensors.add(indices)
+      self._returned_tensors.add(values)
+      return sparse_tensor.SparseTensor(
+          indices, values, dense_shape=tensor.dense_shape)
+    elif isinstance(tensor, tensor_array_ops.TensorArray):
+      flow = array_ops.identity(tensor.flow)
+      self._returned_tensors.add(flow)
+      return tensor_array_ops.build_ta_with_new_flow(tensor, flow)
     # We want to make the return values depend on the stateful operations, but
     # we don't want to introduce a cycle, so we make the return value the result
     # of a new identity operation that the stateful operations definitely don't
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index 4b2706d4cf8df70818158d18ebf260c52d4f4218..53d84b2dc760c9a4e1c332ef4aa0e6bf3327662e 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -106,12 +106,12 @@ def convert_to_eager_tensor(value, ctx, dtype=None):
     tensor = scalar_cache.get(cache_key, None)
     if tensor is not None:
       return ops.EagerTensor(
-          value, context=handle, device=device, dtype=dtype, other_value=tensor)
-    t = ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+          value, handle, device, dtype, tensor)
+    t = ops.EagerTensor(value, handle, device, dtype)
     scalar_cache[cache_key] = t
     return t
   else:
-    return ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+    return ops.EagerTensor(value, handle, device, dtype)
 
 
 @tf_export("constant")
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index 7f6e0a75a5c508e35ff5bf3c28d4ab31af205715..e7ac6444a4ac1e116675dbb059cd1953df1213ab 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -23,7 +23,7 @@ import threading
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("DeviceSpec")
+@tf_export(v1=["DeviceSpec"])
 class DeviceSpec(object):
   """Represents a (possibly partial) specification for a TensorFlow device.
 
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index e36643b338ed55cde0531de37fef241425657282..f7a12d27df7b90b45cf0e02920b7199aeb310213 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import builtins
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow
@@ -548,8 +549,8 @@ _NP_TO_TF = frozenset([
     (np.int8, int8),
     (np.complex64, complex64),
     (np.complex128, complex128),
-    (np.object, string),
-    (np.bool, bool),
+    (np.object_, string),
+    (np.bool_, bool),
     (_np_qint8, qint8),
     (_np_quint8, quint8),
     (_np_qint16, qint16),
@@ -652,12 +653,15 @@ _QUANTIZED_DTYPES_NO_REF = frozenset([qint8, quint8, qint16, quint16, qint32])
 _QUANTIZED_DTYPES_REF = frozenset(
     [qint8_ref, quint8_ref, qint16_ref, quint16_ref, qint32_ref])
 QUANTIZED_DTYPES = _QUANTIZED_DTYPES_REF.union(_QUANTIZED_DTYPES_NO_REF)
-tf_export("dtypes.QUANTIZED_DTYPES", "QUANTIZED_DTYPES").export_constant(
-    __name__, "QUANTIZED_DTYPES")
+tf_export(
+    "dtypes.QUANTIZED_DTYPES",
+    v1=["dtypes.QUANTIZED_DTYPES", "QUANTIZED_DTYPES"]).export_constant(
+        __name__, "QUANTIZED_DTYPES")
 
 _PYTHON_TO_TF = {
-    float: float32,
-    bool: bool,
+    builtins.float: float32,
+    builtins.bool: bool,
+    builtins.object: string
 }
 
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index a873670e0461884d06cde1db4db2cf2db98fde3c..719fdc0953ae4d5bbe016b3dc2730f5601c3494e 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -81,10 +81,10 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertIs(dtypes.int8, dtypes.as_dtype(np.int8))
     self.assertIs(dtypes.complex64, dtypes.as_dtype(np.complex64))
     self.assertIs(dtypes.complex128, dtypes.as_dtype(np.complex128))
-    self.assertIs(dtypes.string, dtypes.as_dtype(np.object))
+    self.assertIs(dtypes.string, dtypes.as_dtype(np.object_))
     self.assertIs(dtypes.string,
                   dtypes.as_dtype(np.array(["foo", "bar"]).dtype))
-    self.assertIs(dtypes.bool, dtypes.as_dtype(np.bool))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(np.bool_))
     with self.assertRaises(TypeError):
       dtypes.as_dtype(np.dtype([("f1", np.uint), ("f2", np.int32)]))
 
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index bc3c81b2a2f19bfa89bb2e2a418ea8239a5075d9..37a634d80679b095d319cabcd29208a35c4fe44f 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -40,6 +40,7 @@ _ParseTag = collections.namedtuple("_ParseTag", ["type", "name"])
 
 _BAD_FILE_SUBSTRINGS = [
     os.path.join("tensorflow", "python"),
+    os.path.join("tensorflow", "contrib"),
     "<embedded",
 ]
 
@@ -267,8 +268,8 @@ def compute_field_dict(op):
 def interpolate(error_message, graph):
   """Interpolates an error message.
 
-  The error message can contain tags of the form ^^type:name^^ which will
-  be replaced.
+  The error message can contain tags of the form `{{type name}}` which will be
+  replaced.
 
   Args:
     error_message: A string to interpolate.
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index 9a3751f4e51f79ff85bdc26e58fb82b7b2418785..f1e508c3658207eaa7729ee2f51001368c48ac86 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -26,16 +26,17 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
-from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.lazy_loader import LazyLoader
 
 # This is to avoid a circular dependency:
@@ -296,7 +297,7 @@ def func_graph_from_py_func(name,
                             kwargs,
                             signature=None,
                             func_graph=None,
-                            experimental_autograph=False,
+                            autograph=False,
                             add_control_dependencies=True,
                             arg_names=None,
                             op_return_value=None):
@@ -316,7 +317,7 @@ def func_graph_from_py_func(name,
       inputs.
     func_graph: Optional. An instance of FuncGraph. If provided, we will use
       this graph else a new one is built and returned.
-    experimental_autograph: whether to use autograph to compile `python_func`.
+    autograph: whether to use autograph to compile `python_func`.
       See https://www.tensorflow.org/guide/autograph for more information.
     add_control_dependencies: If True, automatically adds control dependencies
       to ensure program order matches execution order and stateful ops always
@@ -373,7 +374,7 @@ def func_graph_from_py_func(name,
         # captured Operations).
         with ops.control_dependencies([x]):
           x = array_ops.identity(op_return_value)
-      else:
+      elif not isinstance(x, tensor_array_ops.TensorArray):
         try:
           x = ops.convert_to_tensor_or_indexed_slices(x)
         except (ValueError, TypeError):
@@ -388,30 +389,29 @@ def func_graph_from_py_func(name,
 
     this_tape = tape.push_new_tape()
     try:
-      if experimental_autograph:
+      if autograph:
         from tensorflow.python import autograph  # pylint: disable=g-import-not-at-top
         _, original_func = tf_decorator.unwrap(python_func)
 
-        # AutoGraph does not yet rebind the returned method, and must receive
-        # `self` explicitly.
-        # TODO(mdan): Have the result automatically bind it instead.
-        if (tf_inspect.ismethod(original_func) and
-            hasattr(original_func, "__self__")):
-          effective_func_args = (original_func.__self__,) + func_args
-        else:
-          effective_func_args = func_args
-
-        func_outputs = autograph.converted_call(
-            original_func, None,
-            autograph.ConversionOptions(
-                verbose=True,
-                recursive=True,
-                strip_decorators=(function.defun, def_function.function),
-                optional_features=(),
-            ), *effective_func_args, **func_kwargs)
-      else:
-        func_outputs = python_func(*func_args, **func_kwargs)
-      # invariant: `func_outputs` contains only Tensors and `None`s.
+        def wrapper(*args, **kwargs):
+          return autograph.converted_call(
+              original_func, None,
+              autograph.ConversionOptions(
+                  verbose=True,
+                  recursive=True,
+                  strip_decorators=(function.defun, def_function.function),
+                  optional_features=(),
+              ), *args, **kwargs)
+
+        # Wrapping around a decorator allows checks like tf_inspect.getargspec
+        # to be accurate.
+        converted_func = tf_decorator.make_decorator(original_func, wrapper)
+        tf_decorator.rewrap(python_func, original_func, converted_func)
+
+      func_outputs = python_func(*func_args, **func_kwargs)
+
+      # invariant: `func_outputs` contains only Tensors, IndexedSlices,
+      # SparseTensors, TensorArrays and `None`s.
       func_outputs = nest.map_structure(convert, func_outputs)
 
       check_mutation(func_args_before, func_args)
@@ -498,7 +498,17 @@ def check_mutation(n1, n2):
 
 
 def flatten(sequence):
-  """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
+  """Like `nest.flatten` but also unpacks other Tensor-like objects.
+
+  Flattens non-tensor objects into their constituent tensors.
+
+  Args:
+    sequence: A nested structure of Tensors, IndexedSlices, SparseTensors and
+      TensorArrays.
+
+  Returns:
+    A list of tensors.
+  """
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
   flat_sequence = nest.flatten(sequence)
   outputs = []
@@ -508,11 +518,58 @@ def flatten(sequence):
         outputs.extend([item.values, item.indices, item.dense_shape])
       else:
         outputs.extend([item.values, item.indices])
+    elif isinstance(item, sparse_tensor.SparseTensor):
+      outputs.extend([item.indices, item.values, item.dense_shape])
+    elif isinstance(item, tensor_array_ops.TensorArray):
+      outputs.append(item.flow)
     else:
       outputs.append(item)
   return outputs
 
 
+def pack_sequence_as(structure, flat_sequence):
+  """Like `nest.pack_sequence_as` but also packs other Tensor-like objects.
+
+  Args:
+    structure: The structure to pack into. May contain Tensors, IndexedSlices,
+      TensorArrays or SparseTensors.
+    flat_sequence: An iterable containing tensors.
+
+  Returns:
+    A nested structure.
+
+  Raises:
+    AssertionError if `structure` and `flat_sequence` are not compatible.
+  """
+  flattened_structure = nest.flatten(structure)
+  flat_sequence_with_slices_and_tas = []
+  index = 0
+  for t in flattened_structure:
+    if isinstance(t, ops.IndexedSlices):
+      if t.dense_shape is not None:
+        flat_sequence_with_slices_and_tas.append(
+            ops.IndexedSlices(*flat_sequence[index:index + 3]))
+        index += 3
+      else:
+        flat_sequence_with_slices_and_tas.append(
+            ops.IndexedSlices(*flat_sequence[index:index + 2]))
+        index += 2
+    elif isinstance(t, sparse_tensor.SparseTensor):
+      flat_sequence_with_slices_and_tas.append(
+          sparse_tensor.SparseTensor(*flat_sequence[index:index + 3]))
+      index += 3
+    elif isinstance(t, tensor_array_ops.TensorArray):
+      flow = flat_sequence[index]
+      ta = tensor_array_ops.build_ta_with_new_flow(t, flow)
+      flat_sequence_with_slices_and_tas.append(ta)
+      index += 1
+    else:
+      flat_sequence_with_slices_and_tas.append(flat_sequence[index])
+      index += 1
+  assert len(flattened_structure) == len(flat_sequence_with_slices_and_tas)
+  return nest.pack_sequence_as(structure, flat_sequence_with_slices_and_tas)
+
+
 def _create_substitute_placeholder(value, name=None, dtype=None):
   """Creates a placeholder for `value` and propagates shape info to it."""
   # Note: setting ops.control_dependencies(None) ensures we always put
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 13ee6c5d2d7bfb9898a491622b6002cfa78f1952..90deb9765f2fd33dac3f5c009e07e61eb3e684e3 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -102,7 +102,7 @@ class FunctionTest(test.TestCase):
       call = MyIdentityFunc([18.0])
       self.assertEqual("MyIdentity", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([18.0], sess.run(call))
+        self.assertAllEqual([18.0], self.evaluate(call))
 
   def testIdentityImplicitDeref(self):
 
@@ -116,8 +116,8 @@ class FunctionTest(test.TestCase):
       self.assertEqual("MyIdentity", call.op.name)
       for cfg in _OptimizerOptions():
         with session.Session(config=cfg) as sess:
-          sess.run(var.initializer)
-          self.assertAllEqual([18.0], sess.run(call))
+          self.evaluate(var.initializer)
+          self.assertAllEqual([18.0], self.evaluate(call))
 
   def testIdentityOutputName(self):
 
@@ -130,7 +130,7 @@ class FunctionTest(test.TestCase):
       call = MyIdentityFunc([18.0])
       self.assertEqual("MyIdentity", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([18.0], sess.run(call))
+        self.assertAllEqual([18.0], self.evaluate(call))
 
   def testTooManyOutputNames(self):
 
@@ -158,7 +158,7 @@ class FunctionTest(test.TestCase):
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([5.0], sess.run(call))
+        self.assertAllEqual([5.0], self.evaluate(call))
 
   def testFunctionWithNoOutput(self):
 
@@ -187,7 +187,7 @@ class FunctionTest(test.TestCase):
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([5.0], sess.run(call))
+        self.assertAllEqual([5.0], self.evaluate(call))
 
   def testDefineFunctionDuplicateOutputs(self):
 
@@ -224,8 +224,8 @@ class FunctionTest(test.TestCase):
       call_g = XSquarePlusOneGrad([2.0], [0.1])
 
       with session.Session() as sess:
-        self.assertAllClose([5.0], sess.run(call_f))
-        self.assertAllClose([0.4], sess.run(call_g))
+        self.assertAllClose([5.0], self.evaluate(call_f))
+        self.assertAllClose([0.4], self.evaluate(call_g))
 
   def testTanhSymGrad(self):
 
@@ -387,7 +387,7 @@ class FunctionTest(test.TestCase):
       call = AConstant()
       self.assertEqual("AConstant", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([42], sess.run(call))
+        self.assertAllEqual([42], self.evaluate(call))
 
   def testDefineFunctionNames(self):
 
@@ -468,7 +468,7 @@ class FunctionTest(test.TestCase):
 
       loop = control_flow_ops.while_loop(lambda x: x < 1e5, Body, [1.0])
 
-      ans = sess.run(loop)
+      ans = self.evaluate(loop)
       self.assertAllClose(ans, 131072.)
 
   def testControlFlowStrictness(self):
@@ -552,8 +552,8 @@ class FunctionTest(test.TestCase):
 
     with self.session(graph=g):
       v.initializer.run()
-      self.assertAllEqual(expected_val.eval(), actual_val.eval())
-      self.assertAllEqual(expected_shape, actual_shape.eval())
+      self.assertAllEqual(expected_val.eval(), self.evaluate(actual_val))
+      self.assertAllEqual(expected_shape, self.evaluate(actual_shape))
 
   def testDefineErrors(self):
     with ops.Graph().as_default():
@@ -650,8 +650,8 @@ class FunctionTest(test.TestCase):
       # pylint: enable=unexpected-keyword-arg
       self.assertEqual("next", call2.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([1], sess.run(call1))
-        self.assertAllEqual([0], sess.run(call2))
+        self.assertAllEqual([1], self.evaluate(call1))
+        self.assertAllEqual([0], self.evaluate(call2))
 
   def testNestedFunction(self):
 
@@ -794,7 +794,7 @@ class FunctionTest(test.TestCase):
       y = Foo()
 
     with self.session(graph=g) as sess:
-      self.assertEqual(sess.run(y), 10)
+      self.assertEqual(self.evaluate(y), 10)
 
   def testCaptureInCond(self):
     g = ops.Graph()
@@ -809,8 +809,8 @@ class FunctionTest(test.TestCase):
       z = Foo(False)
 
     with self.session(graph=g) as sess:
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 2)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 2)
 
   def testStableName(self):
 
@@ -900,7 +900,7 @@ class FunctionTest(test.TestCase):
     self.assertEqual(global_vars[0].name, "linear/w:0")
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       output_val = sess.run(
           output_op, feed_dict={input_op: np.random.rand(32, 100)})
       self.assertEqual(output_val.shape, (32, 100))
@@ -928,7 +928,7 @@ class FunctionTest(test.TestCase):
     self.assertEqual(global_vars[0].name, "vs1/var:0")
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       out1, out2 = sess.run(
           [out1_op, out2_op], feed_dict={input_op: np.linspace(1, 10, 10)})
       self.assertAllEqual(out1, np.linspace(2, 11, 10))
@@ -991,8 +991,8 @@ class FunctionTest(test.TestCase):
     result_2 = Bar(constant_op.constant(100, dtype=dtypes.int64))
 
     with session.Session() as sess:
-      self.assertEqual(4.0, sess.run(result_1))
-      self.assertEqual(100, sess.run(result_2))
+      self.assertEqual(4.0, self.evaluate(result_1))
+      self.assertEqual(100, self.evaluate(result_2))
       self.assertEqual((4.0, 100), sess.run((result_1, result_2)))
 
   def testStatefulFunction(self):
@@ -1052,8 +1052,8 @@ class FunctionTest(test.TestCase):
     for config in _OptimizerOptions():
       config.device_count["CPU"] = 2
       with session.Session(config=config) as sess:
-        self.assertEqual(42.0, sess.run(f_0))
-        self.assertEqual(44.0, sess.run(f_1))
+        self.assertEqual(42.0, self.evaluate(f_0))
+        self.assertEqual(44.0, self.evaluate(f_1))
         self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
 
   def testGuaranteedConstsAreCaptured(self):
@@ -1076,7 +1076,7 @@ class FunctionTest(test.TestCase):
       return output
 
     with self.session(use_gpu=False) as sess:
-      sess.run(var.initializer)
+      self.evaluate(var.initializer)
       _ = sess.run(CapturesGuaranteedConst(), {also_not_const: 1.0})
 
   def testSameFunctionDifferentGrads(self):
@@ -1651,8 +1651,8 @@ class ModuleFunctionTest(test.TestCase):
       y = LinearWithCApi(a, b, c)
       z = Linear2WithCApi(a, b, c, d, e)
       with session.Session() as sess:
-        self.assertAllEqual([[1]], sess.run(y))
-        self.assertAllEqual([[5]], sess.run(z))
+        self.assertAllEqual([[1]], self.evaluate(y))
+        self.assertAllEqual([[5]], self.evaluate(z))
 
 
 class VariableHoistingTest(test.TestCase):
@@ -1704,7 +1704,7 @@ class VariableHoistingTest(test.TestCase):
     self.assertEqual("Foo/b", b.op.name)
 
     with self.session(graph=g) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       w, b, x, y0, loss, dw, db = sess.run([w, b, x, y0, loss, dw, db])
 
     self.assertAllEqual(w.shape, (64, 64))
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index 47e1344eaeda7d0cc6a4b0e652071f79f1bc24fa..ee0fd227eec688ec7c48dad241931f6700173ee0 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -27,7 +27,7 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('io.write_graph', 'train.write_graph')
+@tf_export('io.write_graph', v1=['io.write_graph', 'train.write_graph'])
 def write_graph(graph_or_graph_def, logdir, name, as_text=True):
   """Writes a graph proto to a file.
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 394fac6c856197030f85aab5b11fa881eddf670d..1b61ac925ce3d555525c9086172d43c75a3af10c 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 _VARIABLE_OPS = {
@@ -50,7 +51,10 @@ def _is_variable_op(op):
   return op in _VARIABLE_OPS
 
 
-@tf_export("graph_util.must_run_on_cpu")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.must_run_on_cpu")
+@tf_export(v1=["graph_util.must_run_on_cpu"])
 def must_run_on_cpu(node, pin_variables_on_cpu=False):
   """Returns True if the given node_def must run on CPU, otherwise False.
 
@@ -149,7 +153,10 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   return nodes_to_keep
 
 
-@tf_export("graph_util.extract_sub_graph")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.extract_sub_graph")
+@tf_export(v1=["graph_util.extract_sub_graph"])
 def extract_sub_graph(graph_def, dest_nodes):
   """Extract the subgraph that can reach any of the nodes in 'dest_nodes'.
 
@@ -187,7 +194,10 @@ def extract_sub_graph(graph_def, dest_nodes):
   return out
 
 
-@tf_export("graph_util.tensor_shape_from_node_def_name")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+@tf_export(v1=["graph_util.tensor_shape_from_node_def_name"])
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
   # To get a tensor, the name must be in the form <input>:<port>, for example
@@ -202,7 +212,10 @@ def tensor_shape_from_node_def_name(graph, input_name):
   return shape
 
 
-@tf_export("graph_util.convert_variables_to_constants")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.convert_variables_to_constants")
+@tf_export(v1=["graph_util.convert_variables_to_constants"])
 def convert_variables_to_constants(sess,
                                    input_graph_def,
                                    output_node_names,
@@ -289,7 +302,10 @@ def convert_variables_to_constants(sess,
   return output_graph_def
 
 
-@tf_export("graph_util.remove_training_nodes")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+@tf_export(v1=["graph_util.remove_training_nodes"])
 def remove_training_nodes(input_graph, protected_nodes=None):
   """Prunes out nodes that aren't needed for inference.
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 563a177dd06b3b165335c91c3a92ff8877609efc..7a9f2e8d860148f9d5397cea67f9dc572cc047e2 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -211,7 +211,7 @@ class DeviceFunctionsTest(test.TestCase):
       with session.Session() as sess:
         init = variables.variables_initializer([variable_node])
         sess.run(init)
-        output = sess.run(output_node)
+        output = self.evaluate(output_node)
         self.assertNear(4.0, output, 0.00001)
         variable_graph_def = sess.graph.as_graph_def()
 
@@ -242,8 +242,8 @@ class DeviceFunctionsTest(test.TestCase):
         output_node = math_ops_lib.multiply(
             variable_node, 2.0, name="output_node")
         with session.Session() as sess:
-          sess.run(variable_node.initializer)
-          output = sess.run(output_node)
+          self.evaluate(variable_node.initializer)
+          output = self.evaluate(output_node)
           self.assertNear(2.0, output, 0.00001)
           variable_graph_def = sess.graph.as_graph_def()
           # First get the constant_graph_def when variable_names_whitelist is
@@ -256,7 +256,7 @@ class DeviceFunctionsTest(test.TestCase):
 
           # Then initialize the unused variable, and get another
           # constant_graph_def when variable_names_whitelist is not set.
-          sess.run(another_variable.initializer)
+          self.evaluate(another_variable.initializer)
           constant_graph_def_without_variable_whitelist = (
               graph_util.convert_variables_to_constants(
                   sess, variable_graph_def, ["output_node"]))
@@ -295,7 +295,7 @@ class DeviceFunctionsTest(test.TestCase):
             ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"])
       with session.Session() as sess:
         output_node = sess.graph.get_tensor_by_name("output_node:0")
-        output = sess.run(output_node)
+        output = self.evaluate(output_node)
         self.assertNear(2.0, output, 0.00001)
 
   def create_node_def(self, op, name, inputs):
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c9ac27e788709da5fc5533062694f3b680de9853..71ebfd6ceb00507a87a5b9510ff3e465b28f5615 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -431,17 +431,16 @@ def import_graph_def(graph_def,
     #
     # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
     # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
-    # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
-    # _USE_C_SHAPES is removed.
-    if graph_def.library and graph_def.library.function:
-      # pylint: disable=protected-access
-      functions = function._from_library(graph_def.library)
-      for f in functions:
-        f.add_to_graph(graph)
-      # pylint: enable=protected-access
 
     _ProcessNewOps(graph)
 
+  if graph_def.library and graph_def.library.function:
+    # pylint: disable=protected-access
+    functions = function._from_library(graph_def.library)
+    for f in functions:
+      f.add_to_graph(graph)
+    # pylint: enable=protected-access
+
   # Treat input mappings that don't appear in the graph as an error, because
   # they are likely to be due to a typo.
   missing_unused_input_keys = (
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 2b4d8e7299559b689763e18f204556890a412410..a57f0b36540b120cd4f1273c10578d3981541898 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -398,10 +398,10 @@ class ImportGraphDefTest(test.TestCase):
       # TODO(b/76173421): make this work (currently DCHECKS)
       # with self.cached_session() as sess:
       #   sess.run(imported_init)
-      #   self.assertEqual(sess.run(imported_var), 1.0)
-      #   self.assertEqual(sess.run(imported_assign), 2.0)
-      #   self.assertEqual(list(sess.run(imported_shape)), [])
-      #   self.assertEqual(list(sess.run(new_var_shape)), [])
+      #   self.assertEqual(self.evaluate(imported_var), 1.0)
+      #   self.assertEqual(self.evaluate(imported_assign), 2.0)
+      #   self.assertEqual(list(self.evaluate(imported_shape)), [])
+      #   self.assertEqual(list(self.evaluate(new_var_shape)), [])
 
   def testWhileLoop(self):
     # Produce GraphDef containing while loop.
@@ -418,7 +418,7 @@ class ImportGraphDefTest(test.TestCase):
                                               return_elements=[r.name])
       self.assertEqual(imported_r.name, "import/" + r.name)
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(imported_r), 10)
+        self.assertEqual(self.evaluate(imported_r), 10)
 
   def testImportWhileLoopInCond(self):
     # Produce GraphDef containing while loop.
@@ -458,7 +458,7 @@ class ImportGraphDefTest(test.TestCase):
           lambda i: i < 2, ImportFn, [0],
           shape_invariants=[tensor_shape.TensorShape(None)])
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(out), 10)
+        self.assertEqual(self.evaluate(out), 10)
 
   def testTypeMismatchInGraphDef(self):
     # TODO(skyewm): improve error message
@@ -930,7 +930,7 @@ class ImportGraphDefTest(test.TestCase):
           name="",
           return_elements=["id:0"])
       with self.cached_session():
-        self.assertEqual(5.0, t.eval())
+        self.assertEqual(5.0, self.evaluate(t))
 
   def testInvalidInputForReturnOperations(self):
     with ops.Graph().as_default():
@@ -1071,7 +1071,7 @@ class ImportGraphDefTest(test.TestCase):
       tensor_input = np.ones(input_shape, dtype=np.float32)
       t = constant_op.constant(tensor_input, shape=input_shape)
       g = array_ops.identity(t)
-      g.eval()
+      self.evaluate(g)
 
   def testVersion(self):
     v0 = versions.GRAPH_DEF_VERSION_MIN_CONSUMER
@@ -1255,7 +1255,7 @@ class ImportGraphDefTest(test.TestCase):
     z = TestFunc()
 
     with self.cached_session():
-      z_val = z.eval()
+      z_val = self.evaluate(z)
       self.assertEqual(z_val, -2.0)
 
   def testImportGraphWithFunctionTwice(self):
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 908a5f521e15690dee0683ee25dea86e43b5f1f0..727f6aa44c2ed11414e805eb635a9adbc5519da6 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -31,6 +31,7 @@ from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-i
 from tensorflow.python import pywrap_tensorflow as py_tf
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -83,7 +84,8 @@ def load_op_library(library_filename):
   return module
 
 
-@tf_export('load_file_system_library')
+@deprecation.deprecated(date=None, instructions='Use tf.load_library instead.')
+@tf_export(v1=['load_file_system_library'])
 def load_file_system_library(library_filename):
   """Loads a TensorFlow plugin, containing file system implementation.
 
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 33631282bd03a15daddb334e6f40e6b52f84c750..ddf6f66e8ab5e17aa611cce40b01953fb7a5d3b1 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -462,7 +462,7 @@ def _is_default_attr_value(op_def, attr_name, attr_value):
   return False
 
 
-def _strip_graph_default_valued_attrs(meta_graph_def):
+def strip_graph_default_valued_attrs(meta_graph_def):
   """Strips default valued attributes for node defs in given MetaGraphDef.
 
   This method also sets `meta_info_def.stripped_default_attrs` in the given
@@ -587,7 +587,7 @@ def create_meta_graph_def(meta_info_def=None,
 
   # Strip default valued attributes in graph_def.
   if strip_default_attrs:
-    _strip_graph_default_valued_attrs(meta_graph_def)
+    strip_graph_default_valued_attrs(meta_graph_def)
 
   # Adds saver_def.
   if saver_def:
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index fc98b91a016cf40b32607320bb2ebb65cc7d6a63..3605ed7fa2aae33253a6d533b8265cb9bec79048 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -492,8 +492,8 @@ class ScopedMetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
       grad = gradients_impl.gradients([output], [var])
       with session.Session() as sess:
-        sess.run(init_op)
-        expected_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        expected_grad_value = self.evaluate(grad)
 
     # Restore the MetaGraphDef into a new Graph with an import scope.
     with ops.Graph().as_default():
@@ -518,8 +518,8 @@ class ScopedMetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with session.Session() as sess:
-        sess.run(init_op)
-        actual_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
   def testImportWhileLoopInWhileLoop(self):
@@ -544,7 +544,7 @@ class ScopedMetaGraphTest(test.TestCase):
       _, x = control_flow_ops.while_loop(lambda i, x: i < 2, body, [0, 0.0],
                                          name="")
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         sess.run(x)
 
   def testScopedImportUnderNameScope(self):
@@ -869,7 +869,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
 
       initializer = variables.local_variables_initializer()
       sess.run(initializer)
-      sess.run(update_op)
+      self.evaluate(update_op)
 
     meta_graph.export_scoped_meta_graph(
         filename=meta_graph_filename, graph=graph)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 14e4c0ca41aa2bf5369b99dca6f8c303e418b6f1..c465d2bc109f6f4d6248996acaef9093d17994a4 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -40,7 +40,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
-from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import error_interpolation
@@ -318,22 +317,13 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-
     # This will be set by self._as_tf_output().
     self._tf_output = None
-
     # This will be set by self.shape().
     self._shape_val = None
-
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
-
-    if not _USE_C_SHAPES:
-      # Attributes used for C++ shape inference. Not inspected, only forwarded.
-      # If set, will be a HandleData object from cpp_shape_inference.proto.
-      self._handle_data = None
-
     self._id = uid()
 
   @property
@@ -408,17 +398,7 @@ class Tensor(_TensorLike):
 
     """
     if self._shape_val is None:
-      if _USE_C_SHAPES:
-        self._shape_val = self._c_api_shape()
-      else:
-        # Call set_shape_and_handle_data_for_outputs in topological order on all
-        # ops that are needed to compute self.op's shape. We do this instead of
-        # having set_shape_and_handle_data_for_outputs recursively call
-        # Operation.shape on self.op.inputs to overflowing the call stack.
-        need_shapes = self._get_input_ops_without_shapes(self.op)
-        need_shapes.sort(key=lambda op: op._id)
-        for op in need_shapes:
-          set_shape_and_handle_data_for_outputs(op)
+      self._shape_val = self._c_api_shape()
     return self._shape_val
 
   def _get_input_ops_without_shapes(self, target_op):
@@ -533,14 +513,10 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if _USE_C_SHAPES:  # pylint: disable=protected-access
-      # Reset cached shape.
-      self._shape_val = None
-    else:
-      self._shape_val = self.shape.merge_with(shape)
+    # Reset cached shape.
+    self._shape_val = None
 
-    # Update C shape even if _USE_C_SHAPES = False, since we still want
-    # set_shape to be reflected in the C API graph for when we run it.
+    # We want set_shape to be reflected in the C API graph for when we run it.
     if not isinstance(shape, tensor_shape.TensorShape):
       shape = tensor_shape.TensorShape(shape)
     dim_list = []
@@ -634,10 +610,7 @@ class Tensor(_TensorLike):
     return id(self) == id(other)
 
   def __copy__(self):
-    # Make sure _shape_val is computed before we copy.
     # TODO(b/77597810): get rid of Tensor copies.
-    if self._shape_val is None:
-      set_shape_and_handle_data_for_outputs(self.op)
     cls = self.__class__
     result = cls.__new__(cls)
     result.__dict__.update(self.__dict__)
@@ -890,6 +863,12 @@ class _EagerTensorBase(Tensor):
     """Returns the number of Tensor dimensions."""
     return self.shape.ndims
 
+  def __len__(self):
+    """Returns the length of the first dimension in the Tensor."""
+    if not self.shape.ndims:
+      raise TypeError("Scalar tensor has no `len()`")
+    return self._shape_tuple()[0]
+
   def _cpu_nograd(self):
     """A copy of this Tensor with contents backed by host memory.
 
@@ -1001,7 +980,7 @@ _tensor_conversion_func_lock = threading.Lock()
 register_dense_tensor_like_type(Tensor)
 
 
-@tf_export("convert_to_tensor")
+@tf_export(v1=["convert_to_tensor"])
 def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
   """Converts the given `value` to a `Tensor`.
 
@@ -1050,12 +1029,65 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
     TypeError: If no conversion function is registered for `value`.
     RuntimeError: If a registered conversion function returns an invalid value.
 
+  """
+  return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
+
+
+@tf_export("convert_to_tensor", v1=[])
+def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None):
+  """Converts the given `value` to a `Tensor`.
+
+  This function converts Python objects of various types to `Tensor`
+  objects. It accepts `Tensor` objects, numpy arrays, Python lists,
+  and Python scalars. For example:
+
+  ```python
+  import numpy as np
+
+  def my_func(arg):
+    arg = tf.convert_to_tensor(arg, dtype=tf.float32)
+    return tf.matmul(arg, arg) + arg
+
+  # The following calls are equivalent.
+  value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]]))
+  value_2 = my_func([[1.0, 2.0], [3.0, 4.0]])
+  value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
+  ```
+
+  This function can be useful when composing a new operation in Python
+  (such as `my_func` in the example above). All standard Python op
+  constructors apply this function to each of their Tensor-valued
+  inputs, which allows those ops to accept numpy arrays, Python lists,
+  and scalars in addition to `Tensor` objects.
+
+  Note: This function diverges from default Numpy behavior for `float` and
+    `string` types when `None` is present in a Python list or scalar. Rather
+    than silently converting `None` values, an error will be thrown.
+
+  Args:
+    value: An object whose type has a registered `Tensor` conversion function.
+    dtype: Optional element type for the returned tensor. If missing, the
+      type is inferred from the type of `value`.
+    dtype_hint: Optional element type for the returned tensor,
+      used when dtype is None. In some cases, a caller may not have a
+      dtype in mind when converting to a tensor, so dtype_hint
+      can be used as a soft preference.  If the conversion to
+      `dtype_hint` is not possible, this argument has no effect.
+    name: Optional name to use if a new `Tensor` is created.
+
+  Returns:
+    An `Output` based on `value`.
+
+  Raises:
+    TypeError: If no conversion function is registered for `value`.
+    RuntimeError: If a registered conversion function returns an invalid value.
+
   """
   return internal_convert_to_tensor(
       value=value,
       dtype=dtype,
       name=name,
-      preferred_dtype=preferred_dtype,
+      preferred_dtype=dtype_hint,
       as_ref=False)
 
 
@@ -1068,7 +1100,8 @@ def internal_convert_to_tensor(value,
                                name=None,
                                as_ref=False,
                                preferred_dtype=None,
-                               ctx=None):
+                               ctx=None,
+                               accept_symbolic_tensors=True):
   """Converts the given `value` to an `Tensor`.
 
   This function converts Python objects of various types to `Tensor`
@@ -1092,6 +1125,10 @@ def internal_convert_to_tensor(value,
       can be used as a soft preference.  If the conversion to
       `preferred_dtype` is not possible, this argument has no effect.
     ctx: Optional: The value of context.context().
+    accept_symbolic_tensors: Whether Keras graph tensors should be accepted as
+      a valid tensor type during eager execution.
+      If False, this function will raise an exception if it is passed such
+      a tensor during eager eager execution.
 
   Returns:
     A `Tensor` based on `value`.
@@ -1115,6 +1152,19 @@ def internal_convert_to_tensor(value,
         raise RuntimeError("Attempting to capture an EagerTensor without "
                            "building a function.")
       return graph.capture(value, name=name)
+  elif ((not accept_symbolic_tensors) and
+        isinstance(value, Tensor) and
+        ctx.executing_eagerly()):
+    # Found a symbolic tensor in an eager context.
+    # This happens when we use the Keras functional API (i.e. calling layers
+    # on the output of `keras.Input()`, which is symbolic) while eager
+    # execution is enabled.
+    if _is_keras_symbolic_tensor(value):
+      # If the graph of the tensor isn't the Keras graph, we should still
+      # fail, for the time being. TODO(fchollet): consider allowing
+      # all symbolic tensors to raise this exception in this case.
+      raise core._SymbolicException(  # pylint: disable=protected-access
+          "Using the symbolic output of a Keras layer during eager execution.")
 
   if dtype is not None:
     dtype = dtypes.as_dtype(dtype)
@@ -1253,7 +1303,7 @@ def convert_n_to_tensor(values, dtype=None, name=None, preferred_dtype=None):
       as_ref=False)
 
 
-@tf_export("convert_to_tensor_or_indexed_slices")
+@tf_export(v1=["convert_to_tensor_or_indexed_slices"])
 def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
   """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
@@ -2058,12 +2108,6 @@ class Operation(object):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
 
-    # Make sure output shapes are already computed for this op in case we create
-    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
-    # lazily upon request.
-    if not _USE_C_SHAPES:
-      set_shape_and_handle_data_for_outputs(self)
-
     # Reset cached inputs.
     self._inputs_val = None
     c_api.UpdateEdge(
@@ -2424,8 +2468,9 @@ class RegisterGradient(object):
     return f
 
 
-@tf_export("NoGradient", "NotDifferentiable")
-def NotDifferentiable(op_type):
+@deprecation.deprecated_endpoints("NotDifferentiable", "NoGradient")
+@tf_export("no_gradient", v1=["no_gradient", "NotDifferentiable", "NoGradient"])
+def no_gradient(op_type):
   """Specifies that ops of type `op_type` is not differentiable.
 
   This function should *not* be used for operations that have a
@@ -2458,8 +2503,9 @@ def NotDifferentiable(op_type):
   _gradient_registry.register(None, op_type)
 
 
-# Alias for the old name, will be eventually removed.
-NoGradient = NotDifferentiable
+# Aliases for the old names, will be eventually removed.
+NoGradient = no_gradient
+NotDifferentiable = no_gradient
 
 
 def get_gradient_function(op):
@@ -2535,72 +2581,9 @@ class RegisterShape(object):
     return f
 
 
-# TODO(b/74620627): remove when _USE_C_SHAPES is removed
-def _set_shape_and_handle_data_for_outputs_c_api(op):
-  """Set shapes and resource handle data using info from the C API."""
-  assert not _USE_C_SHAPES
-  for output in op.outputs:
-    output._shape_val = output._c_api_shape()
-    # Set the resource handle data for compatibility with the Python shape
-    # inference code.
-    serialized = c_api.GetHandleShapeAndType(op._graph._c_graph,  # pylint: disable=protected-access
-                                             output._as_tf_output())
-    if serialized:
-      output._handle_data = (
-          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
-          .FromString(compat.as_bytes(serialized)))
-    else:
-      output._handle_data = None
-
-
-# TODO(b/74620627): remove when _USE_C_SHAPES is removed
-def set_shape_and_handle_data_for_outputs(op):
-  """Set the shapes and resource handle data for op's outputs.
-
-  When _USE_C_SHAPES = False, this is lazily called when a tensor's shape is
-  first requested. Usually this should work automatically, but some edge cases
-  may require manually calling this first to make sure Tensor._shape_val and
-  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
-  Tensor).
-  """
-  if _USE_C_SHAPES: return
-
-  if op.graph._is_function(op.type):
-    for output in op.outputs:
-      output._shape_val = tensor_shape.unknown_shape()
-    return
-
-  try:
-    shape_func = _shape_registry.lookup(op.type)
-  except LookupError:
-    try:
-      shape_func = _default_shape_function_registry.lookup(op.type)
-    except LookupError:
-      shape_func = _call_cpp_shape_fn_and_require_op
-
-  shapes = shape_func(op)
-  if shapes is None:
-    raise RuntimeError(
-        "Shape function for op %s did not return any shapes" % op)
-  elif isinstance(shapes, dict):
-    # Returned by call_cpp_shape_fn
-    shapes_dict = shapes
-    shapes = shapes_dict["shapes"]
-    handle_datas = shapes_dict["handle_data"]
-    for output, handle_data in zip(op.outputs, handle_datas):
-      # Don't override any existing handle data that may have been manually set.
-      # pylint: disable=protected-access
-      if output._handle_data is None:
-        output._handle_data = handle_data
-      # pylint: enable=protected-access
-
-  if len(op.outputs) != len(shapes):
-    raise RuntimeError(
-        "Shape function for op %s returned %d shapes but expected %d %s %s" %
-        (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
-  for output, s in zip(op.outputs, shapes):
-    output._shape_val = tensor_shape.unknown_shape()
-    output._shape_val = output._shape_val.merge_with(s)
+def set_shape_and_handle_data_for_outputs(_):
+  """No op. TODO(b/74620627): Remove this."""
+  pass
 
 
 class OpStats(object):
@@ -3453,11 +3436,6 @@ class Graph(object):
 
     # pylint: disable=protected-access
     for op in new_ops:
-      # Operations created by the C API always retrieve shapes from the C API so
-      # we preserve the shapes of ops created in import_graph_def (from the
-      # "_output_shapes" attr of the imported NodeDef).
-      if not _USE_C_SHAPES:
-        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
@@ -4913,7 +4891,7 @@ class Graph(object):
 # apply to inner graph mode code. Fix that.
 
 
-@tf_export("device")
+@tf_export(v1=["device"])
 def device(device_name_or_function):
   """Wrapper for `Graph.device()` using the default graph.
 
@@ -4943,7 +4921,41 @@ def device(device_name_or_function):
     return get_default_graph().device(device_name_or_function)
 
 
-@tf_export("container")
+@tf_export("device", v1=[])
+def device_v2(device_name):
+  """Specifies the device for ops created/executed in this context.
+
+  `device_name` can be fully specified, as in "/job:worker/task:1/device:cpu:0",
+  or partially specified, containing only a subset of the "/"-separated
+  fields. Any fields which are specified override device annotations from outer
+  scopes. For example:
+
+  with tf.device('/job:foo'):
+    # ops created here have devices with /job:foo
+    with tf.device('/job:bar/task:0/device:gpu:2'):
+      # ops created here have the fully specified device above
+    with tf.device('/device:gpu:1'):
+      # ops created here have the device '/job:foo/device:gpu:1'
+
+  Args:
+    device_name: The device name to use in the context.
+
+  Returns:
+    A context manager that specifies the default device to use for newly
+    created ops.
+
+  Raises:
+    RuntimeError: If a function is passed in.
+  """
+  if callable(device_name):
+    raise RuntimeError("tf.device does not support functions.")
+  if context.executing_eagerly():
+    return context.device(device_name)
+  else:
+    return get_default_graph().device(device_name)
+
+
+@tf_export(v1=["container"])
 def container(container_name):
   """Wrapper for `Graph.container()` using the default graph.
 
@@ -5564,7 +5576,7 @@ def reset_default_graph():
   _default_graph_stack.reset()
 
 
-@tf_export("get_default_graph")
+@tf_export(v1=["get_default_graph"])
 def get_default_graph():
   """Returns the default graph for the current thread.
 
@@ -5691,7 +5703,7 @@ def _get_graph_from_inputs(op_input_list, graph=None):
   return graph or get_default_graph()
 
 
-@tf_export("GraphKeys")
+@tf_export(v1=["GraphKeys"])
 class GraphKeys(object):
   """Standard names to use for graph collections.
 
@@ -5857,7 +5869,7 @@ def dismantle_graph(graph):
   graph.__dict__ = {}
 
 
-@tf_export("add_to_collection")
+@tf_export(v1=["add_to_collection"])
 def add_to_collection(name, value):
   """Wrapper for `Graph.add_to_collection()` using the default graph.
 
@@ -5876,7 +5888,8 @@ def add_to_collection(name, value):
   """
   get_default_graph().add_to_collection(name, value)
 
-@tf_export("add_to_collections")
+
+@tf_export(v1=["add_to_collections"])
 def add_to_collections(names, value):
   """Wrapper for `Graph.add_to_collections()` using the default graph.
 
@@ -5896,7 +5909,7 @@ def add_to_collections(names, value):
   get_default_graph().add_to_collections(names, value)
 
 
-@tf_export("get_collection_ref")
+@tf_export(v1=["get_collection_ref"])
 def get_collection_ref(key):
   """Wrapper for `Graph.get_collection_ref()` using the default graph.
 
@@ -5920,7 +5933,7 @@ def get_collection_ref(key):
   return get_default_graph().get_collection_ref(key)
 
 
-@tf_export("get_collection")
+@tf_export(v1=["get_collection"])
 def get_collection(key, scope=None):
   """Wrapper for `Graph.get_collection()` using the default graph.
 
@@ -6000,6 +6013,13 @@ class name_scope(object):  # pylint: disable=invalid-name
     self._values = values
     self._ctx = context.context()
     self._in_eager_mode = self._ctx.executing_eagerly()
+    self._has_symbolic_input_in_eager = False
+    if self._values and self._in_eager_mode:
+      # The presence of a graph tensor in `self._values` overrides the context.
+      for value in self._values:
+        if hasattr(value, "graph"):
+          self._has_symbolic_input_in_eager = True
+          self._name_scope = value.graph.name_scope(self._name)
 
   def __enter__(self):
     """Start the scope block.
@@ -6011,6 +6031,9 @@ class name_scope(object):  # pylint: disable=invalid-name
       ValueError: if neither `name` nor `default_name` is provided
         but `values` are.
     """
+    if self._has_symbolic_input_in_eager:
+      return self._name_scope.__enter__()
+
     if self._in_eager_mode:
       self._old_name = self._ctx.scope_name
       if not self._name:
@@ -6053,7 +6076,9 @@ class name_scope(object):  # pylint: disable=invalid-name
         raise
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
-    if self._in_eager_mode:
+    if self._has_symbolic_input_in_eager:
+      self._name_scope.__exit__(type_arg, value_arg, traceback_arg)
+    elif self._in_eager_mode:
       self._ctx.scope_name = self._old_name
     else:
       self._name_scope.__exit__(type_arg, value_arg, traceback_arg)
@@ -6118,7 +6143,7 @@ def prepend_name_scope(name, import_scope):
 
 # pylint: disable=g-doc-return-or-yield
 # pylint: disable=not-context-manager
-@tf_export("op_scope")
+@tf_export(v1=["op_scope"])
 @tf_contextlib.contextmanager
 def op_scope(values, name, default_name=None):
   """DEPRECATED. Same as name_scope above, just different argument order."""
@@ -6213,4 +6238,8 @@ def _op_to_colocate_with(v):
   return internal_convert_to_tensor_or_indexed_slices(v, as_ref=True).op
 
 
+def _is_keras_symbolic_tensor(x):
+  return hasattr(x, "graph") and getattr(x.graph, "name", None) == "keras_graph"
+
+
 register_tensor_conversion_function(Operation, _operation_conversion_error)
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 0fb17081e758a7f43a1fb1e6d415da3ed630aea7..b9c690849d38f873e07ca83d0ad962439f14f360 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -317,7 +317,7 @@ class OperationTest(test_util.TensorFlowTestCase):
       values = [[2], [3], [5], [7]]
       tensor = ops.convert_to_tensor(values)
       self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+      self.assertAllEqual(values, self.evaluate(tensor))
 
   def testShapeTuple(self):
     with self.cached_session():
@@ -346,18 +346,18 @@ class OperationTest(test_util.TensorFlowTestCase):
       tensor = ops.convert_to_tensor(
           [constant_op.constant(row) for row in values])
       self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+      self.assertAllEqual(values, self.evaluate(tensor))
       tensor = ops.convert_to_tensor(
           [[constant_op.constant(v) for v in row] for row in values])
       self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+      self.assertAllEqual(values, self.evaluate(tensor))
 
   def testConvertToTensorNestedMix(self):
     with self.cached_session():
       values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
       tensor = ops.convert_to_tensor(values)
       self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(((2,), (3,), (5,), (7,)), tensor.eval())
+      self.assertAllEqual(((2,), (3,), (5,), (7,)), self.evaluate(tensor))
 
   def testConvertToTensorPreferred(self):
     with self.cached_session():
@@ -517,21 +517,21 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEquals(x.consumers(), [])
     self.assertEquals(y.consumers(), [z.op, z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 4)
+      self.assertEquals(self.evaluate(z), 4)
 
     z.op._update_input(0, x)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
     self.assertEquals(x.consumers(), [z.op])
     self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 3)
+      self.assertEquals(self.evaluate(z), 3)
 
     z.op._update_input(1, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
     self.assertEquals(x.consumers(), [z.op])
     self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 3)
+      self.assertEquals(self.evaluate(z), 3)
 
   def testUpdateInputGraphError(self):
     g_0 = ops.Graph()
@@ -701,7 +701,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(g.get_operation_by_name("myop"), op)
     self.assertEqual(g.get_tensor_by_name("myop:0"), op.outputs[0])
 
-  @test_util.enable_c_shapes
   def testShape(self):
     g = ops.Graph()
     with g.as_default():
@@ -2491,12 +2490,14 @@ class KernelLabelTest(test_util.TensorFlowTestCase):
       # pylint: enable=protected-access
       default_3 = test_ops.kernel_label()
 
-      self.assertAllEqual(b"My label is: default", default_1.eval())
-      self.assertAllEqual(b"My label is: default", default_2.eval())
-      self.assertAllEqual(b"My label is: default", default_3.eval())
-      self.assertAllEqual(b"My label is: overload_1", overload_1_1.eval())
-      self.assertAllEqual(b"My label is: overload_1", overload_1_2.eval())
-      self.assertAllEqual(b"My label is: overload_2", overload_2.eval())
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_1))
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_2))
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_3))
+      self.assertAllEqual(b"My label is: overload_1",
+                          self.evaluate(overload_1_1))
+      self.assertAllEqual(b"My label is: overload_1",
+                          self.evaluate(overload_1_2))
+      self.assertAllEqual(b"My label is: overload_2", self.evaluate(overload_2))
 
 
 class AsGraphDefTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 2022fbcbaad8697c147ae63fbea295270046f7f2..465016b808726f28909013e994b9b23b915d982a 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -355,15 +355,12 @@ string GenEagerPythonOp::Code() {
 }
 
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
-  // Handle graph-mode case
-  strings::StrAppend(&result_,
-                     "  _ctx = _context._context\n"
-                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
-                     function_setup,
-                     "    _, _, _op = _op_def_lib._apply_op_helper(\n");
+  strings::StrAppend(&result_, "  # Add nodes to the TensorFlow graph.\n");
+  strings::StrAppend(&result_, function_setup,
+                     "  _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
   if (num_outs_ > 0) {
-    strings::StrAppend(&result_, "    _result = _op.outputs[:]\n");
+    strings::StrAppend(&result_, "  _result = _op.outputs[:]\n");
     // Special case handling for stateful op with single list output
     // that might be empty.
     if (num_outs_ == 1 && op_def_.is_stateful() &&
@@ -372,10 +369,10 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
       // TODO(josh11b): Can skip this if the number_attr/type_list_attr has
       // a constraint indicating that this can never be empty.
       strings::StrAppend(&result_,
-                         "    if not _result:\n"
-                         "      return _op\n");
+                         "  if not _result:\n"
+                         "    return _op\n");
     }
-    strings::StrAppend(&result_, "    _inputs_flat = _op.inputs\n");
+    strings::StrAppend(&result_, "  _inputs_flat = _op.inputs\n");
 
     // Compute graph-mode attrs.
     if (op_def_.attr_size() > 0) {
@@ -387,14 +384,13 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
                            attr_name, "\")");
       }
       strings::StrAppend(&attr_values, ")");
-      strings::StrAppend(&result_,
-                         WordWrap("    _attrs = (", attr_values, kRightMargin),
-                         "\n");
+      strings::StrAppend(
+          &result_, WordWrap("  _attrs = (", attr_values, kRightMargin), "\n");
     } else {
-      strings::StrAppend(&result_, "    _attrs = None\n");
+      strings::StrAppend(&result_, "  _attrs = None\n");
     }
   } else {
-    strings::StrAppend(&result_, "    return _op\n");
+    strings::StrAppend(&result_, "  return _op\n");
   }
 }
 
@@ -643,25 +639,26 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   AddDocStringOutputs();
   strings::StrAppend(&result_, "  \"\"\"\n");
 
+  strings::StrAppend(&result_,
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is not None and _ctx._eager_context.is_eager:",
+                     "\n");
+  if (eager_not_allowed_error.empty()) {
+    AddEagerFastPathExecute();
+  } else {
+    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
+  }
+
   // Handle graph-mode case
   string function_setup;
-  if (!GetEagerFunctionSetup("    ", &function_setup)) {
+  if (!GetEagerFunctionSetup("  ", &function_setup)) {
     result_ = function_setup;
     return false;
   }
   HandleGraphMode(function_setup);
-  AddEagerFunctionTeardown("    ", output_sizes,
+  AddEagerFunctionTeardown("  ", output_sizes,
                            true /* execute_record_gradient */);
 
-  // Handle eager-mode case
-  strings::StrAppend(&result_, "  else:\n");
-
-  if (eager_not_allowed_error.empty()) {
-    AddEagerFastPathExecute();
-  } else {
-    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
-  }
-
   strings::StrAppend(&result_, "\n\n");
   return true;
 }
@@ -669,13 +666,14 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
 bool GenEagerPythonOp::AddEagerFallbackCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& num_outputs_expr, const string& eager_not_allowed_error) {
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
+
   if (!eager_not_allowed_error.empty()) {
     strings::StrAppend(&result_, "  ", eager_not_allowed_error);
     return true;
   }
 
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
-             strings::StrCat(parameters, ", ctx=None"));
   strings::StrAppend(
       &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
   strings::StrAppend(&result_, "  This is for function ", function_name_,
@@ -750,12 +748,16 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
   strings::StrAppend(&fallback_params, "ctx=_ctx");
   strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
+  strings::StrAppend(&result_, "      try:\n");
   strings::StrAppend(
-      &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
+      &result_, "        ", "return ", function_name_, kEagerFallbackSuffix,
       "(\n",
-      WordWrap(strings::StrCat("          "),
+      WordWrap(strings::StrCat("            "),
                strings::StrCat(fallback_params, ")"), kRightMargin),
       "\n");
+  strings::StrAppend(&result_, "      except _core._SymbolicException:\n");
+  strings::StrAppend(&result_,
+                     "        pass  # Add nodes to the TensorFlow graph.\n");
 
   // Any errors thrown from execute need to be unwrapped from
   // _NotOkStatusException.
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 777bb2fe8c544440e2897c73ecabf332b7fd18ee..6b7f56a92cc02fd9f44a541ed3536b35653031d9 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -34,7 +34,7 @@ def _truncate_seed(seed):
   return seed % _MAXINT32  # Truncate to fit into 32-bit integer
 
 
-@tf_export('random.get_seed', v1=['random.get_seed', 'get_seed'])
+@tf_export(v1=['random.get_seed', 'get_seed'])
 @deprecation.deprecated_endpoints('get_seed')
 def get_seed(op_seed):
   """Returns the local seeds an operation should use given an op-specific seed.
@@ -45,7 +45,7 @@ def get_seed(op_seed):
   graph, or for only specific operations.
 
   For details on how the graph-level seed interacts with op seeds, see
-  `tf.set_random_seed`.
+  `tf.random.set_random_seed`.
 
   Args:
     op_seed: integer.
@@ -82,7 +82,7 @@ def get_seed(op_seed):
   return seeds
 
 
-@tf_export('random.set_random_seed', 'set_random_seed')
+@tf_export(v1=['random.set_random_seed', 'set_random_seed'])
 def set_random_seed(seed):
   """Sets the graph-level random seed.
 
@@ -154,7 +154,7 @@ def set_random_seed(seed):
   sessions, set a graph-level seed:
 
   ```python
-  tf.set_random_seed(1234)
+  tf.random.set_random_seed(1234)
   a = tf.random_uniform([1])
   b = tf.random_normal([1])
 
@@ -182,3 +182,103 @@ def set_random_seed(seed):
     context.set_global_seed(seed)
   else:
     ops.get_default_graph().seed = seed
+
+
+@tf_export('random.set_seed', v1=[])
+def set_seed(seed):
+  """Sets the graph-level random seed.
+
+  Operations that rely on a random seed actually derive it from two seeds:
+  the graph-level and operation-level seeds. This sets the graph-level seed.
+
+  Its interactions with operation-level seeds is as follows:
+
+    1. If neither the graph-level nor the operation seed is set:
+      A random seed is used for this op.
+    2. If the graph-level seed is set, but the operation seed is not:
+      The system deterministically picks an operation seed in conjunction
+      with the graph-level seed so that it gets a unique random sequence.
+    3. If the graph-level seed is not set, but the operation seed is set:
+      A default graph-level seed and the specified operation seed are used to
+      determine the random sequence.
+    4. If both the graph-level and the operation seed are set:
+      Both seeds are used in conjunction to determine the random sequence.
+
+  To illustrate the user-visible effects, consider these examples:
+
+  To generate different sequences across sessions, set neither
+  graph-level nor op-level seeds:
+
+  ```python
+  a = tf.random_uniform([1])
+  b = tf.random_normal([1])
+
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A3'
+    print(sess2.run(a))  # generates 'A4'
+    print(sess2.run(b))  # generates 'B3'
+    print(sess2.run(b))  # generates 'B4'
+  ```
+
+  To generate the same repeatable sequence for an op across sessions, set the
+  seed for the op:
+
+  ```python
+  a = tf.random_uniform([1], seed=1)
+  b = tf.random_normal([1])
+
+  # Repeatedly running this block with the same graph will generate the same
+  # sequence of values for 'a', but different sequences of values for 'b'.
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A1'
+    print(sess2.run(a))  # generates 'A2'
+    print(sess2.run(b))  # generates 'B3'
+    print(sess2.run(b))  # generates 'B4'
+  ```
+
+  To make the random sequences generated by all ops be repeatable across
+  sessions, set a graph-level seed:
+
+  ```python
+  tf.random.set_seed(1234)
+  a = tf.random_uniform([1])
+  b = tf.random_normal([1])
+
+  # Repeatedly running this block with the same graph will generate the same
+  # sequences of 'a' and 'b'.
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A1'
+    print(sess2.run(a))  # generates 'A2'
+    print(sess2.run(b))  # generates 'B1'
+    print(sess2.run(b))  # generates 'B2'
+  ```
+
+  Args:
+    seed: integer.
+  """
+  # TODO(go/tf2-random): change doc, update to match design doc
+  set_random_seed(seed)
diff --git a/tensorflow/python/framework/smart_cond_test.py b/tensorflow/python/framework/smart_cond_test.py
index b8a9672b06da9b24d567a9779fb703ac7178d411..174ada9fe1178bb3333e013e017946c3f9ba1b30 100644
--- a/tensorflow/python/framework/smart_cond_test.py
+++ b/tensorflow/python/framework/smart_cond_test.py
@@ -109,8 +109,8 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
                               exclusive=True)
     with session.Session() as sess:
       # No feed_dict necessary
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 1)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 1)
 
   def testFalse(self):
     conditions = [(False, raise_exception)]
@@ -121,8 +121,8 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
                               default=lambda: constant_op.constant(1),
                               exclusive=True)
     with session.Session() as sess:
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 1)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 1)
 
   def testMix(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 68f15f5e62ad1f24182578b444d355e2337fdd05..3643fc5e00475b8d2ebc2e2fc23fa6fd19bea114 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -248,7 +248,7 @@ tf_export("SparseTensorValue")(SparseTensorValue)
 pywrap_tensorflow.RegisterType("SparseTensorValue", SparseTensorValue)
 
 
-@tf_export("convert_to_tensor_or_sparse_tensor")
+@tf_export(v1=["convert_to_tensor_or_sparse_tensor"])
 def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None):
   """Converts value to a `SparseTensor` or `Tensor`.
 
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index 22423c4f58ca510a2e247b9cd783d5596ca65e46..9ee1bd75a53297063e3d47aa1093d856aaf95b7c 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -46,11 +46,11 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
       self.assertEqual(sp.get_shape(), (4, 5))
 
       with self.cached_session() as sess:
-        value = sp.eval()
+        value = self.evaluate(sp)
         self.assertAllEqual(indices, value.indices)
         self.assertAllEqual(values, value.values)
         self.assertAllEqual(shape, value.dense_shape)
-        sess_run_value = sess.run(sp)
+        sess_run_value = self.evaluate(sp)
         self.assertAllEqual(sess_run_value.indices, value.indices)
         self.assertAllEqual(sess_run_value.values, value.values)
         self.assertAllEqual(sess_run_value.dense_shape, value.dense_shape)
@@ -85,7 +85,7 @@ class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
       value = [42, 43]
       from_value = sparse_tensor.convert_to_tensor_or_sparse_tensor(
           value)
-      self.assertAllEqual(value, from_value.eval())
+      self.assertAllEqual(value, self.evaluate(from_value))
 
   def test_convert_sparse(self):
     with self.cached_session():
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 5a58d271488080eb1ba0036ec60404b5e28adb76..960a3dad7389553955c999e444a9f98c1857f588 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -169,7 +169,7 @@ def dimension_at_index(shape, index):
     return shape.dims[index]
 
 
-@tf_export("Dimension")
+@tf_export(v1=["Dimension"])
 class Dimension(object):
   """Represents the value of one dimension in a TensorShape."""
 
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index fbea930fe0e6a4545b9a5ac55c0a7684b3cd8e28..c44636edc4ec5101c588766714c98a7da15793e4 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -24,14 +24,15 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("TensorSpec")
 class TensorSpec(object):
   """Describes a tf.Tensor.
 
-  A TensorSpec allows an API to describe the Tensors that it accepts or
-  returns, before that Tensor exists. This allows dynamic and flexible graph
-  construction and configuration.
+  Metadata for describing the `tf.Tensor` objects accepted or returned
+  by some TensorFlow APIs.
   """
 
   __slots__ = ["_shape", "_shape_tuple", "_dtype", "_name"]
@@ -69,11 +70,6 @@ class TensorSpec(object):
     else:
       raise ValueError("`tensor` should be a tf.Tensor")
 
-  @classmethod
-  def is_bounded(cls):
-    del cls
-    return False
-
   @property
   def shape(self):
     """Returns the `TensorShape` that represents the shape of the tensor."""
@@ -86,21 +82,21 @@ class TensorSpec(object):
 
   @property
   def name(self):
-    """Returns the name of the described tensor."""
+    """Returns the (optionally provided) name of the described tensor."""
     return self._name
 
-  @property
-  def is_discrete(self):
-    """Whether spec is discrete."""
-    return self.dtype.is_integer
+  def is_compatible_with(self, spec_or_tensor):
+    """Returns True if spec_or_tensor is compatible with this TensorSpec.
 
-  @property
-  def is_continuous(self):
-    """Whether spec is continuous."""
-    return self.dtype.is_floating
+    Two tensors are considered compatible if they have the same dtype
+    and their shapes are compatible (see `tf.TensorShape.is_compatible_with`).
 
-  def is_compatible_with(self, spec_or_tensor):
-    """True if the shape and dtype of `spec_or_tensor` are compatible."""
+    Args:
+      spec_or_tensor: A tf.TensorSpec or a tf.Tensor
+
+    Returns:
+      True if spec_or_tensor is compatible with self.
+    """
     return (self._dtype.is_compatible_with(spec_or_tensor.dtype) and
             self._shape.is_compatible_with(spec_or_tensor.shape))
 
@@ -188,11 +184,6 @@ class BoundedTensorSpec(TensorSpec):
     self._maximum = np.array(maximum, dtype=self.dtype.as_numpy_dtype())
     self._maximum.setflags(write=False)
 
-  @classmethod
-  def is_bounded(cls):
-    del cls
-    return True
-
   @classmethod
   def from_spec(cls, spec):
     dtype = dtypes.as_dtype(spec.dtype)
@@ -223,4 +214,3 @@ class BoundedTensorSpec(TensorSpec):
   def __reduce__(self):
     return BoundedTensorSpec, (self._shape, self._dtype, self._minimum,
                                self._maximum, self._name)
-
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
index 40611e5f840db224f6343f9fdb3852b58c45f5a6..e3aad7cc236470629226ca871c7321669412eb4a 100644
--- a/tensorflow/python/framework/tensor_spec_test.py
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -134,22 +134,6 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(bounded_spec.dtype, spec.dtype)
     self.assertEqual(bounded_spec.name, spec.name)
 
-  def testIsDiscrete(self):
-    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
-    self.assertTrue(discrete_spec.is_discrete)
-    self.assertFalse(continuous_spec.is_discrete)
-
-  def testIsContinuous(self):
-    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
-    self.assertFalse(discrete_spec.is_continuous)
-    self.assertTrue(continuous_spec.is_continuous)
-
-  def testIsBounded(self):
-    unbounded_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    self.assertFalse(unbounded_spec.is_bounded())
-
   def testSerialization(self):
     desc = tensor_spec.TensorSpec([1, 5], dtypes.float32, "test")
     self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
@@ -165,11 +149,6 @@ class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "not compatible"):
       tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, 0, (1, 1, 1))
 
-  def testIsBounded(self):
-    bounded_spec = tensor_spec.BoundedTensorSpec(
-        (1, 2), dtypes.int32, minimum=0, maximum=1)
-    self.assertTrue(bounded_spec.is_bounded())
-
   def testMinimumMaximumAttributes(self):
     spec = tensor_spec.BoundedTensorSpec(
         (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 0582e986032b10182a6850900110921d4cdceb00..9db94f5288cc515e5a764a19520c057bffa64a9b 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -339,11 +339,29 @@ _TF_TO_IS_OK = {
     dtypes.string: [_FilterStr],
     dtypes.uint16: [_FilterInt],
     dtypes.uint8: [_FilterInt],
+    dtypes.uint32: [_FilterInt],
+    dtypes.uint64: [_FilterInt],
 }
 
 
 def _AssertCompatible(values, dtype):
-  fn_list = _TF_TO_IS_OK.get(dtype, [_FilterNotTensor])
+  if dtype is None:
+    fn_list = [_FilterNotTensor]
+  else:
+    try:
+      fn_list = _TF_TO_IS_OK[dtype]
+    except KeyError:
+      # There isn't a specific fn_list, so we try to do the best possible.
+      if dtype.is_integer:
+        fn_list = [_FilterInt]
+      elif dtype.is_floating:
+        fn_list = [_FilterFloat]
+      elif dtype.is_complex:
+        fn_list = [_FilterComplex]
+      elif dtype.is_quantized:
+        fn_list = [_FilterInt, _FilterTuple]
+      else:
+        fn_list = [_FilterNotTensor]
   mismatch = _FirstNotNone([fn(values) for fn in fn_list])
   if mismatch is not None:
     if dtype is None:
@@ -353,7 +371,7 @@ def _AssertCompatible(values, dtype):
                       (dtype.name, repr(mismatch), type(mismatch).__name__))
 
 
-@tf_export("make_tensor_proto")
+@tf_export(v1=["make_tensor_proto"])
 def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   """Create a TensorProto.
 
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index bdf759f22047fe62a7820bc170654fed07f7adc9..87d65c8c466c0887953ab434e02f1e4d8e615460 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -771,7 +771,7 @@ class TensorUtilTest(test.TestCase):
     with self.cached_session() as sess:
       ma = MockArray(np.array([10, 20, 30]))
       t = ops.convert_to_tensor(ma)
-      a = sess.run(t)
+      a = self.evaluate(t)
       self.assertEquals(np.int64, a.dtype)
       self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f561d16bc3e038c77ccb06501f74e2c5c4a1daf6..bf0ebaea9973f686681a7ccc13cc915cfe68c11c 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -53,7 +53,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
-from tensorflow.python.eager import tape  # pylint: disable=unused-import
+from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -61,10 +61,12 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
@@ -113,8 +115,28 @@ def assert_ops_in_graph(expected_ops, graph):
   return actual_ops
 
 
-@tf_export("test.assert_equal_graph_def")
-def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
+@tf_export("test.assert_equal_graph_def", v1=[])
+def assert_equal_graph_def_v2(actual, expected):
+  """Asserts that two `GraphDef`s are (mostly) the same.
+
+  Compares two `GraphDef` protos for equality, ignoring versions and ordering of
+  nodes, attrs, and control inputs.  Node names are used to match up nodes
+  between the graphs, so the naming of nodes must be consistent. This function
+  ignores randomized attribute values that may appear in V2 checkpoints.
+
+  Args:
+    actual: The `GraphDef` we have.
+    expected: The `GraphDef` we expected.
+
+  Raises:
+    AssertionError: If the `GraphDef`s do not match.
+    TypeError: If either argument is not a `GraphDef`.
+  """
+  assert_equal_graph_def(actual, expected, checkpoint_v2=True)
+
+
+@tf_export(v1=["test.assert_equal_graph_def"])
+def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
   Compares two `GraphDef` protos for equality, ignoring versions and ordering of
@@ -131,6 +153,10 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     AssertionError: If the `GraphDef`s do not match.
     TypeError: If either argument is not a `GraphDef`.
   """
+  assert_equal_graph_def(actual, expected, checkpoint_v2)
+
+
+def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
   if not isinstance(actual, graph_pb2.GraphDef):
     raise TypeError(
         "Expected tf.GraphDef for actual, got %s" % type(actual).__name__)
@@ -353,53 +379,12 @@ def skip_if(condition):
 
 
 def enable_c_shapes(fn):
-  """Decorator for enabling C shapes on a test.
-
-  Note this enables the C shapes after running the test class's setup/teardown
-  methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  # pylint: disable=protected-access
-  def wrapper(*args, **kwargs):
-    prev_value = ops._USE_C_SHAPES
-    ops._USE_C_SHAPES = True
-    try:
-      fn(*args, **kwargs)
-    finally:
-      ops._USE_C_SHAPES = prev_value
-
-  # pylint: enable=protected-access
-
-  return wrapper
+  """No-op. TODO(b/74620627): Remove this."""
+  return fn
 
 
 def with_c_shapes(cls):
-  """Adds methods that call original methods but with C API shapes enabled.
-
-  Note this enables C shapes in new methods after running the test class's
-  setup method.
-
-  Args:
-    cls: class to decorate
-
-  Returns:
-    cls with new test methods added
-  """
-  # If C shapes are already enabled, don't do anything. Some tests break if the
-  # same test is run twice, so this allows us to turn on the C shapes by default
-  # without breaking these tests.
-  if ops._USE_C_SHAPES:
-    return cls
-
-  for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test"):
-      setattr(cls, name + "WithCShapes", enable_c_shapes(value))
+  """No-op. TODO(b/74620627): Remove this."""
   return cls
 
 
@@ -422,13 +407,40 @@ def enable_control_flow_v2(fn):
   def wrapper(*args, **kwargs):
     enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
     enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
+    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
     control_flow_ops.ENABLE_COND_V2 = True
     control_flow_ops.ENABLE_WHILE_V2 = True
+    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
     try:
       fn(*args, **kwargs)
     finally:
       control_flow_ops.ENABLE_COND_V2 = enable_cond_v2_old
       control_flow_ops.ENABLE_WHILE_V2 = enable_while_v2_old
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
+
+  return wrapper
+
+
+def enable_tensor_array_v2(fn):
+  """Decorator for enabling _GraphTensorArrayV2 on a test.
+
+  Note this enables _GraphTensorArrayV2 after running the test class's
+  setup/teardown methods.
+
+  Args:
+    fn: the function to be wrapped
+
+  Returns:
+    The wrapped function
+  """
+
+  def wrapper(*args, **kwargs):
+    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
+    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
+    try:
+      fn(*args, **kwargs)
+    finally:
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
 
   return wrapper
 
@@ -629,6 +641,109 @@ def assert_no_new_tensors(f):
   return decorator
 
 
+def _find_reference_cycle(objects, idx):
+
+  def get_ignore_reason(obj, blacklist):
+    """Tests whether an object should be omitted from the dependency graph."""
+    if len(blacklist) > 100:
+      return "<depth limit>"
+    if tf_inspect.isframe(obj):
+      if "test_util.py" in tf_inspect.getframeinfo(obj)[0]:
+        return "<test code>"
+    for b in blacklist:
+      if b is obj:
+        return "<test code>"
+    if obj is blacklist:
+      return "<test code>"
+    return None
+
+  # Note: this function is meant to help with diagnostics. Its output is purely
+  # a human readable representation, so you may freely modify it to suit your
+  # needs.
+  def describe(obj, blacklist, leaves_only=False):
+    """Returns a custom human-readable summary of obj.
+
+    Args:
+      obj: the value to describe.
+      blacklist: same as blacklist in get_ignore_reason.
+      leaves_only: boolean flag used when calling describe recursively. Useful
+        for summarizing collections.
+    """
+    if get_ignore_reason(obj, blacklist):
+      return "{}{}".format(get_ignore_reason(obj, blacklist), type(obj))
+    if tf_inspect.isframe(obj):
+      return "frame: {}".format(tf_inspect.getframeinfo(obj))
+    elif tf_inspect.ismodule(obj):
+      return "module: {}".format(obj.__name__)
+    else:
+      if leaves_only:
+        return "{}, {}".format(type(obj), id(obj))
+      elif isinstance(obj, list):
+        return "list({}): {}".format(
+            id(obj), [describe(e, blacklist, leaves_only=True) for e in obj])
+      elif isinstance(obj, tuple):
+        return "tuple({}): {}".format(
+            id(obj), [describe(e, blacklist, leaves_only=True) for e in obj])
+      elif isinstance(obj, dict):
+        return "dict({}): {} keys".format(id(obj), len(obj.keys()))
+      elif tf_inspect.isfunction(obj):
+        return "function({}) {}; globals ID: {}".format(
+            id(obj), obj.__name__, id(obj.__globals__))
+      else:
+        return "{}, {}".format(type(obj), id(obj))
+
+  def build_ref_graph(obj, graph, reprs, blacklist):
+    """Builds a reference graph as <referrer> -> <list of refferents>.
+
+    Args:
+      obj: The object to start from. The graph will be built by recursively
+        adding its referrers.
+      graph: Dict holding the graph to be built. To avoid creating extra
+        references, the graph holds object IDs rather than actual objects.
+      reprs: Auxiliary structure that maps object IDs to their human-readable
+        description.
+      blacklist: List of objects to ignore.
+    """
+    referrers = gc.get_referrers(obj)
+    blacklist = blacklist + (referrers,)
+
+    obj_id = id(obj)
+    for r in referrers:
+      if get_ignore_reason(r, blacklist) is None:
+        r_id = id(r)
+        if r_id not in graph:
+          graph[r_id] = []
+        if obj_id not in graph[r_id]:
+          graph[r_id].append(obj_id)
+          build_ref_graph(r, graph, reprs, blacklist)
+          reprs[r_id] = describe(r, blacklist)
+
+  def find_cycle(el, graph, reprs, path):
+    """Finds and prints a single cycle in the dependency graph."""
+    if el not in graph:
+      return
+    for r in graph[el]:
+      if r in path:
+        logging.error("Reference cycle sample:")
+        for p in path + (r,):
+          logging.error(reprs.get(p, "unknown object " + str(p)))
+        return True
+      else:
+        if find_cycle(r, graph, reprs, path + (r,)):
+          return True
+    return False
+
+  obj = objects[idx]
+  graph = {}  # referrer ID -> object ID
+  reprs = {}  # object ID -> description
+  build_ref_graph(obj, graph, reprs, (objects, graph, reprs, get_ignore_reason,
+                                      describe, build_ref_graph, find_cycle))
+  for k in graph:
+    if find_cycle(k, graph, reprs, ()):
+      return True
+  return False
+
+
 def assert_no_garbage_created(f):
   """Test method decorator to assert that no garbage has been created.
 
@@ -644,6 +759,10 @@ def assert_no_garbage_created(f):
 
   def decorator(self, **kwargs):
     """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage."""
+    # Force-load `distribution_strategy_context` to prevent GC at
+    # test time when using eager. Remove once b/117329403 is resolved.
+    tape.distribution_strategy_context.get_distribution_strategy()
+
     gc.disable()
     previous_debug_flags = gc.get_debug()
     gc.set_debug(gc.DEBUG_SAVEALL)
@@ -651,7 +770,8 @@ def assert_no_garbage_created(f):
     previous_garbage = len(gc.garbage)
     f(self, **kwargs)
     gc.collect()
-    if len(gc.garbage) > previous_garbage:
+    new_garbage = len(gc.garbage)
+    if new_garbage > previous_garbage:
       logging.error(
           "The decorated test created work for Python's garbage collector, "
           "likely due to a reference cycle. New objects in cycle(s):")
@@ -675,11 +795,19 @@ def assert_no_garbage_created(f):
           logging.error(obj)
           logging.error("  Object __repr__:")
           logging.error(repr(obj))
-        except Exception:
+        except Exception:  # pylint: disable=broad-except
           logging.error("(Exception while printing object)")
+
+    # When garbage is created, this call can help identify reference cycles,
+    # which are typically the cause of such garbage.
+    if new_garbage > previous_garbage:
+      for i in range(previous_garbage, new_garbage):
+        if _find_reference_cycle(gc.garbage, i):
+          break
+
     # This will fail if any garbage has been created, typically because of a
     # reference cycle.
-    self.assertEqual(previous_garbage, len(gc.garbage))
+    self.assertEqual(previous_garbage, new_garbage)
     # TODO(allenl): Figure out why this debug flag reset doesn't work. It would
     # be nice to be able to decorate arbitrary tests in a large test suite and
     # not hold on to every object in other tests.
@@ -835,20 +963,20 @@ def run_in_graph_and_eager_modes(func=None,
           "`run_test_in_graph_and_eager_modes` only supports test methods. "
           "Did you mean to use `run_all_in_graph_and_eager_modes`?")
 
-    def decorated(self, **kwargs):
+    def decorated(self, *args, **kwargs):
       try:
         with context.graph_mode():
           with self.test_session(use_gpu=use_gpu, config=config):
-            f(self, **kwargs)
+            f(self, *args, **kwargs)
       except unittest.case.SkipTest:
         pass
 
       def run_eagerly(self, **kwargs):
         if not use_gpu:
           with ops.device("/device:CPU:0"):
-            f(self, **kwargs)
+            f(self, *args, **kwargs)
         else:
-          f(self, **kwargs)
+          f(self, *args, **kwargs)
 
       if assert_no_eager_garbage:
         ops.reset_default_graph()
@@ -934,6 +1062,27 @@ def device(use_gpu):
     yield
 
 
+@contextlib.contextmanager
+def use_gpu():
+  """Uses gpu when requested and available."""
+  with device(use_gpu=True):
+    yield
+
+
+@contextlib.contextmanager
+def force_gpu():
+  """Force the gpu to be used."""
+  with ops.device("/device:GPU:0"):
+    yield
+
+
+@contextlib.contextmanager
+def force_cpu():
+  """Force the cpu to be used."""
+  with ops.device("/device:CPU:0"):
+    yield
+
+
 class CapturedWrites(object):
   """A utility class to load the captured writes made to a stream."""
 
@@ -1135,6 +1284,9 @@ class TensorFlowTestCase(googletest.TestCase):
       return self._eval_helper(tensor())
     else:
       try:
+        if sparse_tensor.is_sparse(tensor):
+          return sparse_tensor.SparseTensorValue(tensor.indices, tensor.values,
+                                                 tensor.dense_shape)
         return tensor.numpy()
       except AttributeError as e:
         six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e)
@@ -1670,9 +1822,16 @@ class TensorFlowTestCase(googletest.TestCase):
     msg = msg if msg else ""
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
-    self.assertEqual(
-        a.shape, b.shape, "Shape mismatch: expected %s, got %s."
-        " %s" % (a.shape, b.shape, msg))
+    # Arbitrary bounds so that we don't print giant tensors.
+    if (b.ndim <= 3 or b.size < 500):
+      self.assertEqual(
+          a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+          " Contents: %s. \n%s." % (a.shape, b.shape, b, msg))
+    else:
+      self.assertEqual(
+          a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+          " %s" % (a.shape, b.shape, msg))
+
     same = (a == b)
 
     if (a.dtype in [
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 79b1979925f5582b5677728327e9d665ad8df174..cbefe86481421396c0d67f042cf876e3b8e39b53 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -24,6 +24,7 @@ import random
 import threading
 import weakref
 
+from absl.testing import parameterized
 import numpy as np
 
 from google.protobuf import text_format
@@ -46,7 +47,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-class TestUtilTest(test_util.TensorFlowTestCase):
+class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def test_assert_ops_in_graph(self):
     with self.test_session():
@@ -728,6 +729,12 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertEqual(modes[0:2], ["setup_graph", "run_graph"])
     self.assertEqual(modes[2:], ["setup_eager", "run_eager"])
 
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  @test_util.run_in_graph_and_eager_modes
+  def test_run_in_graph_and_eager_works_with_parameterized_keyword(self, arg):
+    self.assertEqual(arg, True)
+
 
 # Its own test case to reproduce variable sharing issues which only pop up when
 # setUp() is overridden and super() is not called.
diff --git a/tensorflow/python/framework/traceable_stack.py b/tensorflow/python/framework/traceable_stack.py
index 7f4d28237ffba80e5aa604b880fccf00482a9ca5..c4e35a83256c2d546ae45d6b8ed9292de1f7ff0b 100644
--- a/tensorflow/python/framework/traceable_stack.py
+++ b/tensorflow/python/framework/traceable_stack.py
@@ -58,7 +58,7 @@ class TraceableObject(object):
     frame_records = tf_stack.extract_stack()
     if not frame_records:
       return self.FAILURE
-    if len(frame_records) >= local_offset:
+    if len(frame_records) > local_offset:
       # Negative indexing is one-indexed instead of zero-indexed.
       negative_offset = -(local_offset + 1)
       self.filename, self.lineno = frame_records[negative_offset][:2]
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
index ab1d0ed25b9130fabcffbb8da2265c046206da46..30c1e1468146ce58216acbfbb1aef1ab1408027f 100644
--- a/tensorflow/python/grappler/constant_folding_test.py
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -61,7 +61,7 @@ class ConstantFoldingTest(test.TestCase):
           back_prop=False,
           parallel_iterations=1)
       with session.Session() as sess:
-        y_v = sess.run(y)
+        y_v = self.evaluate(y)
         self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
 
 
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index e6229e18566d7b6431f77ac32118bb56cda615ec..7dbaf449cad6f65fbf84054f9e2d5a631b46d13b 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -25,8 +25,8 @@ from google.protobuf import message
 from google.protobuf import text_format
 from tensorflow.contrib.fused_conv.ops import gen_fused_conv2d_bias_activation_op  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
@@ -79,10 +79,11 @@ def get_metagraph():
 
 def main(_):
   metagraph = get_metagraph()
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  config = config_pb2.ConfigProto()
   if FLAGS.rewriter_config is not None:
-    text_format.Merge(FLAGS.rewriter_config, rewriter_config)
-  optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+    text_format.Merge(FLAGS.rewriter_config,
+                      config.graph_options.rewrite_options)
+  optimized_graph = tf_optimizer.OptimizeGraph(config, metagraph)
   metagraph.graph_def.CopyFrom(optimized_graph)
 
   report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report,
diff --git a/tensorflow/python/grappler/graph_placer.py b/tensorflow/python/grappler/graph_placer.py
index 654013b23c5811acbd10633d692e2d214d530b26..9c05ad81790d61fe0d19e5738d64e6502ca88915 100644
--- a/tensorflow/python/grappler/graph_placer.py
+++ b/tensorflow/python/grappler/graph_placer.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 import time
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.grappler import cluster as gcluster
@@ -54,9 +54,9 @@ def PlaceGraph(metagraph,
     cluster = gcluster.Cluster()
 
   # Optimize the metagraph to speedup the placement
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  config = config_pb2.ConfigProto()
   optimized_graph = tf_optimizer.OptimizeGraph(
-      rewriter_config, metagraph, verbose=verbose, cluster=cluster)
+      config, metagraph, verbose=verbose, cluster=cluster)
   optimized_metagraph = meta_graph_pb2.MetaGraphDef()
   optimized_metagraph.CopyFrom(metagraph)
   optimized_metagraph.graph_def.CopyFrom(optimized_graph)
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 8cc971c61d5964d0fad1bfa843c3ef8d3407599f..55ccfbb93c38336a4e6d676e9dee4d1d8d0c9e27 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -241,7 +241,7 @@ class LayoutOptimizerTest(test.TestCase):
       if restore:
         saver.restore(sess, checkpoint_path)
       else:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
 
       np.random.seed(0)
       for _ in range(2):
@@ -262,7 +262,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _two_layer_model(x)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -365,7 +365,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(pad)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -396,7 +396,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -425,7 +425,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(cast)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -456,7 +456,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -486,7 +486,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -516,7 +516,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -545,7 +545,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -574,7 +574,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -603,7 +603,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -632,7 +632,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -662,7 +662,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -691,7 +691,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -724,7 +724,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(concat)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -835,7 +835,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reverse)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -905,7 +905,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -966,7 +966,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1179,7 +1179,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(s)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1214,7 +1214,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(s)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1347,7 +1347,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1374,7 +1374,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop_with_branch()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1398,7 +1398,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop_with_vec_and_4d()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1422,7 +1422,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _model_with_second_port()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1443,11 +1443,13 @@ class LayoutOptimizerTest(test.TestCase):
 
   def testGradient(self):
     meta_graph = _simple_metagraph()
-    rewrite_options = rewriter_config_pb2.RewriterConfig(
-        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
-        min_graph_nodes=-1)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
+            min_graph_nodes=-1))
     optimized_graph = tf_optimizer.OptimizeGraph(
-        rewrite_options, meta_graph, cluster=_get_cluster())
+        config, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
@@ -1458,11 +1460,13 @@ class LayoutOptimizerTest(test.TestCase):
 
   def testDepthwise(self):
     meta_graph = _simple_metagraph(depthwise=True)
-    rewrite_options = rewriter_config_pb2.RewriterConfig(
-        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
-        min_graph_nodes=-1)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
+            min_graph_nodes=-1))
     optimized_graph = tf_optimizer.OptimizeGraph(
-        rewrite_options, meta_graph, cluster=_get_cluster())
+        config, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 03b42f6453975c097810b300324f8ab0a2879329..d233629cbbde9cd4929039dbf115c27a9e3cce25 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -49,11 +49,13 @@ class MemoryOptimizerSwapTest(test.TestCase):
     graph_size = len(mg.graph_def.node)
     nodes = [node.name for node in mg.graph_def.node]
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), graph_size)
     self.assertItemsEqual([node.name for node in graph.node], nodes)
@@ -72,13 +74,15 @@ class MemoryOptimizerSwapTest(test.TestCase):
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
     graph_size = len(mg.graph_def.node)
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
-        min_graph_nodes=-1)
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
+            min_graph_nodes=-1))
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), graph_size + 2)
     self.assertTrue(
@@ -127,7 +131,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
   def testRewritingDefaultGradientNames(self):
     """Tests that rewriting occurs with default gradient names."""
     (original_metagraph, _, _, _) = self._GetMetaGraph()
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
@@ -135,8 +140,9 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS), original_metagraph)
+            memory_optimization=(
+                rewriter_config_pb2.RewriterConfig.RECOMPUTATION_HEURISTICS)))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -153,7 +159,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
     """Tests that rewriting occurs with non-standard gradient names."""
     (original_metagraph, _, _, _) = self._GetMetaGraph(
         optimizer_scope_name='optimizer')
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
@@ -161,11 +168,11 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig
+            .RECOMPUTATION_HEURISTICS,
             # Checks that name scope "gradients/" also match sub-scope.
-            memory_optimizer_target_node_name_scope='gradients/'),
-        original_metagraph)
+            memory_optimizer_target_node_name_scope='gradients/'))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -182,18 +189,19 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
     """Tests that rewriting occurs with non-standard gradient names."""
     (original_metagraph, _, _,
      _) = self._GetMetaGraph(optimizer_scope_name='foo/bar')
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
             dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig
+            .RECOMPUTATION_HEURISTICS,
             # This should not match anything.
-            memory_optimizer_target_node_name_scope='r/gradients/'),
-        original_metagraph)
+            memory_optimizer_target_node_name_scope='r/gradients/'))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertEqual(
         len(rewritten_graph_def.node), len(original_metagraph.graph_def.node))
     self.assertEqual(0,
@@ -223,10 +231,10 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
       train_op = graph.get_operation_by_name(train_op_name)
       loss_op = graph.get_tensor_by_name(loss_op_name)
       with session.Session(config=config, graph=graph) as sess:
-        sess.run(init_op)
-        sess.run(train_op)
-        sess.run(train_op)
-        return sess.run(loss_op)
+        self.evaluate(init_op)
+        self.evaluate(train_op)
+        self.evaluate(train_op)
+        return self.evaluate(loss_op)
 
   def testRecomputationRewritingNoErrors(self):
     """Tests that graph output is not significantly different with rewriting."""
@@ -287,8 +295,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
           rewrite_options=manual_memory_config)
       session_config = config_pb2.ConfigProto(graph_options=graph_options)
       with session.Session(config=session_config) as sess:
-        sess.run(init_op)
-        sess.run(train_op)
+        self.evaluate(init_op)
+        self.evaluate(train_op)
 
   def testHintDoesRewrite(self):
     graph = self._annotated_graph()[0]
@@ -298,11 +306,12 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         0,
         len([node for node in metagraph.graph_def.node
              if 'Recomputed/' in node.name]))
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL),
-        metagraph)
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, metagraph)
     self.assertEqual(
         9,
         len([node for node in rewritten_graph_def.node
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index 39ca71e99af06c19fb7fe5bf185c29106729f5e9..daa5bc9444046dfcf7694ae7a5998352d460afa1 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -34,8 +34,8 @@ limitations under the License.
   $1 = &temp;
 }
 
-%typemap(in) const tensorflow::RewriterConfig& (
-    tensorflow::RewriterConfig temp) {
+%typemap(in) const tensorflow::ConfigProto& (
+    tensorflow::ConfigProto temp) {
   char* c_string;
   Py_ssize_t py_size;
   if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
@@ -46,7 +46,7 @@ limitations under the License.
   if (!temp.ParseFromString(string(c_string, py_size))) {
     PyErr_SetString(
         PyExc_TypeError,
-        "The RewriterConfig could not be parsed as a valid protocol buffer");
+        "The ConfigProto could not be parsed as a valid protocol buffer");
     SWIG_fail;
   }
   $1 = &temp;
@@ -67,8 +67,8 @@ limitations under the License.
   #include "tensorflow/core/grappler/clusters/utils.h"
   #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
   #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+  #include "tensorflow/core/protobuf/config.pb.h"
   #include "tensorflow/core/protobuf/meta_graph.pb.h"
-  #include "tensorflow/core/protobuf/rewriter_config.pb.h"
   #include "tensorflow/core/public/session_options.h"
 
 
@@ -94,7 +94,7 @@ void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* dev
 
 PyObject* TF_OptimizeGraph(
       GCluster cluster,
-      const tensorflow::RewriterConfig& rewriter_config,
+      const tensorflow::ConfigProto& config_proto,
       const tensorflow::MetaGraphDef& metagraph,
       bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
@@ -110,7 +110,7 @@ PyObject* TF_OptimizeGraph(
 
     tensorflow::DeviceBase* cpu_device = nullptr;
     tensorflow::GraphDef out_graph;
-    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
+    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, config_proto);
     tensorflow::Status status = optimizer.Optimize(cluster.get(), *grappler_item, &out_graph);
     if (verbose) {
       optimizer.PrintResult();
@@ -127,7 +127,7 @@ PyObject* TF_OptimizeGraph(
 // Wrap this function
 PyObject* TF_OptimizeGraph(
     GCluster cluster,
-    const tensorflow::RewriterConfig& rewriter_config,
+    const tensorflow::ConfigProto& config_proto,
     const tensorflow::MetaGraphDef& metagraph, bool verbose,
     const string& graph_id, TF_Status* out_status);
 
diff --git a/tensorflow/python/grappler/tf_optimizer.py b/tensorflow/python/grappler/tf_optimizer.py
index a73a4a98fc5a883cf8681a20ca332f16f3b7f0ce..e72667b6f3184c7f2900fb410102a08220c44e2e 100644
--- a/tensorflow/python/grappler/tf_optimizer.py
+++ b/tensorflow/python/grappler/tf_optimizer.py
@@ -19,22 +19,26 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_opt
 from tensorflow.python.framework import errors
 from tensorflow.python.grappler import cluster as gcluster
 
 
-def OptimizeGraph(rewriter_config,
+def OptimizeGraph(config_proto,
                   metagraph,
                   verbose=True,
                   graph_id=b'graph_to_optimize',
                   cluster=None):
   """Optimize the provided metagraph."""
+  if not isinstance(config_proto, config_pb2.ConfigProto):
+    raise TypeError('Expected config_proto to be a ConfigProto, saw type %s' %
+                    type(config_proto))
   with errors.raise_exception_on_not_ok_status() as status:
     if cluster is None:
       cluster = gcluster.Cluster()
     ret_from_swig = tf_opt.TF_OptimizeGraph(cluster.tf_cluster,
-                                            rewriter_config.SerializeToString(),
+                                            config_proto.SerializeToString(),
                                             metagraph.SerializeToString(),
                                             verbose, graph_id, status)
   if ret_from_swig is None:
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index eca0f679829507212608e75f2c792b4bddf9b1da..0a4d4cbe2db26d903e805382c96e8811e53e4f10 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,7 +17,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
@@ -45,11 +45,12 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     train_op.append(d)
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.optimizers.append('constfold')
     rewriter_config.min_graph_nodes = -1
 
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), 1)
     self.assertItemsEqual([node.name for node in graph.node], ['d'])
@@ -68,17 +69,19 @@ class PyWrapOptimizeGraphTest(test.TestCase):
 
     # Optimize the graph.
     mg = meta_graph.create_meta_graph_def(graph=g)
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.min_graph_nodes = -1
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    optimized_graph = tf_optimizer.OptimizeGraph(config, mg)
 
     # Check that the nodes referenced in various collections have been preserved
-    self.assertEqual(len(optimized_graph.node), 5)
-    self.assertEqual(d.op.name, optimized_graph.node[0].name)
-    self.assertEqual(a1.op.name, optimized_graph.node[1].name)
-    self.assertEqual('Variable/initial_value', optimized_graph.node[2].name)
-    self.assertEqual(a2.op.name, optimized_graph.node[3].name)
-    self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
+    optimized_graph_nodes = [node.name for node in optimized_graph.node]
+    expected_nodes = [
+        d.op.name, a1.op.name, a2.op.name, 'Variable/initial_value',
+        'Variable/Assign'
+    ]
+    self.assertEqual(len(optimized_graph_nodes), len(expected_nodes))
+    self.assertAllInSet(optimized_graph_nodes, expected_nodes)
 
   def testLoops(self):
     g = ops.Graph()
@@ -110,9 +113,10 @@ class PyWrapOptimizeGraphTest(test.TestCase):
 
     # Optimize the graph.
     mg = meta_graph.create_meta_graph_def(graph=g)
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.min_graph_nodes = -1
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    optimized_graph = tf_optimizer.OptimizeGraph(config, mg)
     mg.graph_def.CopyFrom(optimized_graph)
 
     # Check that the nodes referenced in various collections have been preserved
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index dd4e11f0eebc900cd1e3a578eb7d3e73c19c2513..540dd03768f6ce838088b3409e3dc20e984ec6f7 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -3,10 +3,10 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 package(default_visibility = ["//visibility:public"])
 
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
@@ -124,6 +124,7 @@ py_library(
         "engine/base_layer.py",
         "engine/distributed_training_utils.py",
         "engine/input_layer.py",
+        "engine/input_spec.py",
         "engine/network.py",
         "engine/saving.py",
         "engine/sequential.py",
@@ -146,6 +147,8 @@ py_library(
     deps = [
         ":backend",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/training/checkpointable:data_structures",
         "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
@@ -264,6 +267,7 @@ py_test(
     name = "optimizers_test",
     size = "medium",
     srcs = ["optimizers_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -271,6 +275,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -318,7 +323,7 @@ py_test(
 
 py_test(
     name = "advanced_activations_test",
-    size = "small",
+    size = "medium",
     srcs = ["layers/advanced_activations_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -344,6 +349,7 @@ py_test(
     name = "convolutional_test",
     size = "large",
     srcs = ["layers/convolutional_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -368,7 +374,7 @@ cuda_py_test(
 
 py_test(
     name = "pooling_test",
-    size = "small",
+    size = "large",
     srcs = ["layers/pooling_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -381,6 +387,7 @@ py_test(
     name = "core_test",
     size = "medium",
     srcs = ["layers/core_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -401,9 +408,11 @@ cuda_py_test(
 
 py_test(
     name = "local_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/local_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -492,7 +501,7 @@ py_test(
 
 py_test(
     name = "recurrent_test",
-    size = "medium",
+    size = "large",
     srcs = ["layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -502,6 +511,17 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "unified_rnn_test",
+    size = "medium",
+    srcs = ["layers/unified_rnn_test.py"],
+    additional_deps = [
+        ":keras",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "serialization_test",
     size = "small",
@@ -690,8 +710,9 @@ py_test(
 
 py_test(
     name = "training_test",
-    size = "enormous",
+    size = "medium",
     srcs = ["engine/training_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -701,6 +722,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "training_dataset_test",
+    size = "medium",
+    srcs = ["engine/training_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "training_generator_test",
     size = "enormous",
@@ -725,6 +759,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -758,6 +793,7 @@ py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -824,6 +860,7 @@ py_test(
     name = "models_test",
     size = "medium",
     srcs = ["models_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67509773
     deps = [
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index b1999d9566b53c3151056f2cb1cfe70fdbd20805..54421d9022069e62af5a58be33de7a6766c0edd8 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -25,6 +25,7 @@ import collections
 import itertools
 import json
 import os
+import threading
 import weakref
 
 import numpy as np
@@ -32,6 +33,7 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import func_graph
@@ -72,9 +74,9 @@ py_sum = sum
 # while executing eagerly (such as the functional API for model-building).
 _GRAPH = None
 
-# This is the default internal TF session used by Keras.
-# It can be set manually via `set_session(sess)`.
-_SESSION = None
+# This is a thread local object that will hold the default internal TF session
+# used by Keras. It can be set manually via `set_session(sess)`.
+_SESSION = threading.local()
 
 # This dictionary holds a mapping {graph: learning_phase}.
 # A learning phase is a bool tensor used to run Keras models in
@@ -336,7 +338,7 @@ def clear_session():
   global _GRAPH_TF_OPTIMIZERS  # pylint: disable=global-variable-not-assigned
   ops.reset_default_graph()
   reset_uids()
-  _SESSION = None
+  _SESSION.session = None
   graph = get_graph()
   with graph.as_default():
     phase = array_ops.placeholder_with_default(
@@ -375,27 +377,22 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  with ops.init_scope():
-    # We always check & set the learning phase inside the init_scope,
-    # otherwise the wrong default_graph will be used to look up the learning
-    # phase inside of functions & defuns.
-    #
-    # This is because functions & defuns (both in graph & in eager mode)
-    # will always execute non-eagerly using a function-specific default
-    # subgraph.
-    if context.executing_eagerly():
-      if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
-        # Fallback to inference mode as default.
-        return 0
-      return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+  if context.executing_eagerly():
+    if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
+      # Fallback to inference mode as default.
+      return 0
+    return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+  return symbolic_learning_phase()
 
-    graph = get_graph()
-    with graph.as_default():
-      if graph not in _GRAPH_LEARNING_PHASES:
-        phase = array_ops.placeholder_with_default(
-            False, shape=(), name='keras_learning_phase')
-        _GRAPH_LEARNING_PHASES[graph] = phase
-      return _GRAPH_LEARNING_PHASES[graph]
+
+def symbolic_learning_phase():
+  graph = get_graph()
+  with graph.as_default():
+    if graph not in _GRAPH_LEARNING_PHASES:
+      phase = array_ops.placeholder_with_default(
+          False, shape=(), name='keras_learning_phase')
+      _GRAPH_LEARNING_PHASES[graph] = phase
+    return _GRAPH_LEARNING_PHASES[graph]
 
 
 @tf_export('keras.backend.set_learning_phase')
@@ -448,6 +445,20 @@ def learning_phase_scope(value):
         _GRAPH_LEARNING_PHASES[get_graph()] = previous_value
 
 
+def _get_session():
+  """Returns the session object for the current thread."""
+  global _SESSION
+  default_session = ops.get_default_session()
+  if default_session is not None:
+    session = default_session
+  else:
+    if getattr(_SESSION, 'session', None) is None:
+      _SESSION.session = session_module.Session(
+          config=get_default_session_config())
+    session = _SESSION.session
+  return session
+
+
 @tf_export(v1=['keras.backend.get_session'])
 def get_session():
   """Returns the TF session to be used by the backend.
@@ -465,14 +476,7 @@ def get_session():
   Returns:
       A TensorFlow session.
   """
-  global _SESSION
-  default_session = ops.get_default_session()
-  if default_session is not None:
-    session = default_session
-  else:
-    if _SESSION is None:
-      _SESSION = session_module.Session(config=get_default_session_config())
-    session = _SESSION
+  session = _get_session()
   if not _MANUAL_VAR_INIT:
     with session.graph.as_default():
       _initialize_variables(session)
@@ -497,7 +501,7 @@ def set_session(session):
       session: A TF Session.
   """
   global _SESSION
-  _SESSION = session
+  _SESSION.session = session
 
 
 def get_default_session_config():
@@ -694,7 +698,6 @@ def variable(value, dtype=None, name=None, constraint=None):
     v = sparse_tensor.SparseTensor(
         indices=indices, values=sparse_coo.data, dense_shape=sparse_coo.shape)
     v._keras_shape = sparse_coo.shape
-    v._uses_learning_phase = False
     return v
   v = resource_variable_ops.ResourceVariable(
       value,
@@ -705,7 +708,6 @@ def variable(value, dtype=None, name=None, constraint=None):
     v._keras_shape = value.shape
   elif hasattr(value, 'shape'):
     v._keras_shape = int_shape(value)
-  v._uses_learning_phase = False
   track_variable(v)
   return v
 
@@ -868,7 +870,6 @@ def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
       x = array_ops.sparse_placeholder(dtype, shape=shape, name=name)
     else:
       x = array_ops.placeholder(dtype, shape=shape, name=name)
-  x._uses_learning_phase = False
   return x
 
 
@@ -1719,10 +1720,7 @@ def var(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  m = math_ops.reduce_mean(x, axis, True)
-  devs_squared = math_ops.square(x - m)
-  return math_ops.reduce_mean(
-      devs_squared, axis, keepdims)
+  return math_ops.reduce_variance(x, axis=axis, keepdims=keepdims)
 
 
 @tf_export('keras.backend.std')
@@ -1740,7 +1738,9 @@ def std(x, axis=None, keepdims=False):
   Returns:
       A tensor with the standard deviation of elements of `x`.
   """
-  return math_ops.sqrt(var(x, axis=axis, keepdims=keepdims))
+  if x.dtype.base_dtype == dtypes_module.bool:
+    x = math_ops.cast(x, floatx())
+  return math_ops.reduce_std(x, axis=axis, keepdims=keepdims)
 
 
 @tf_export('keras.backend.mean')
@@ -2555,7 +2555,7 @@ def arange(start, stop=None, step=1, dtype='int32'):
     result = cast(result, dtype)
   return result
 
-
+@tf_export('keras.backend.tile')
 def tile(x, n):
   """Creates a tensor by tiling `x` by `n`.
 
@@ -2903,7 +2903,7 @@ def print_tensor(x, message=''):
 # GRAPH MANIPULATION
 
 
-class Function(object):
+class GraphExecutionFunction(object):
   """Runs a computation graph.
 
   It's possible to pass arguments to `tf.Session.run()` via `session_kwargs`.
@@ -2927,13 +2927,13 @@ class Function(object):
                **session_kwargs):
     updates = updates or []
     if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` to a TensorFlow backend function '
+      raise TypeError('`inputs` to a Keras backend function '
                       'should be a list or tuple.')
     if not isinstance(outputs, (list, tuple)):
-      raise TypeError('`outputs` of a TensorFlow backend function '
+      raise TypeError('`outputs` of a Keras backend function '
                       'should be a list or tuple.')
     if not isinstance(updates, (list, tuple)):
-      raise TypeError('`updates` in a TensorFlow backend function '
+      raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
     self.inputs = list(inputs)
     self.outputs = list(outputs)
@@ -3080,14 +3080,106 @@ class Function(object):
     return fetched[:len(self.outputs)]
 
 
+class EagerExecutionFunction(object):
+  """Helper class for constructing a TF graph function from the Keras graph.
+
+  Arguments:
+    inputs: Feed placeholders to the computation graph.
+    outputs: Output tensors to fetch.
+    updates: Additional update ops to be run at function call.
+    name: A name to help users identify what this function does.
+    session_kwargs: Unsupported.
+  """
+
+  def __init__(self, inputs, outputs, updates=None, name=None):
+    updates = updates or []
+    if not isinstance(inputs, (list, tuple)):
+      raise TypeError('`inputs` to a Keras backend function '
+                      'should be a list or tuple.')
+    if not isinstance(outputs, (list, tuple)):
+      raise TypeError('`outputs` of a Keras backend function '
+                      'should be a list or tuple.')
+    if not isinstance(updates, (list, tuple)):
+      raise TypeError('`updates` in a Keras backend function '
+                      'should be a list or tuple.')
+    self.inputs = list(inputs)
+    self.outputs = list(outputs)
+    self.name = name
+
+    graph = get_graph()
+    # Consolidate updates
+    with graph.as_default():
+      with ops.control_dependencies(self.outputs):
+        # In general, updates should be run after the outputs have been
+        # computed. However, we can only ensure this when we create
+        # the updates here (i.e. when updates are passed as tuples).
+        # We cannot modify the control dependencies of preexisting update ops.
+        updates_ops = []
+        for update in updates:
+          # For legacy reasons it is allowed to pass an update as a tuple
+          # `(variable, new_value)` (this maps to an assign op).
+          if isinstance(update, tuple):
+            p, new_p = update
+            updates_ops.append(state_ops.assign(p, new_p))
+          else:
+            # Assumed already an op -- we cannot control its execution order.
+            updates_ops.append(update)
+
+      # We set the update ops to run at the end by conditioning it on output[0]
+      if updates and not self.outputs:
+        # Edge case; never happens in practice
+        raise ValueError('Cannot create a Keras backend function with updates'
+                         ' but no outputs during eager execution.')
+      with ops.control_dependencies(updates_ops):
+        self.outputs[0] = array_ops.identity(self.outputs[0])
+
+    # Prepare graph function
+    # TODO(fchollet): can we restrict `captures` to variables actually used in
+    # the relevant subgraph?
+    graph.inputs = self.inputs + list(graph.captures.values())
+    graph.outputs = self.outputs
+    graph_fn = eager_function.Function(graph)
+    graph_fn._num_positional_args = len(self.inputs)
+    graph_fn._arg_keywords = []
+    self._graph_fn = graph_fn
+
+    # Handle placeholders with default
+    # (treated as required placeholder by graph functions)
+    self._placeholder_default_values = {}
+    with graph.as_default():
+      for x in self.inputs:
+        if x.op.type == 'PlaceholderWithDefault':
+          self._placeholder_default_values[x] = tensor_util.constant_value(
+              x.op.inputs[0])
+
+  def __call__(self, inputs):
+    converted_inputs = []
+    for tensor, value in zip(self.inputs, inputs):
+      if value is None:
+        # Assume `value` is a placeholder with default
+        value = self._placeholder_default_values.get(tensor, None)
+        if value is None:
+          raise ValueError(
+              'You must feed a value for placeholder %s' % (tensor,))
+      value = ops.convert_to_tensor(value, dtype=tensor.dtype)
+      if value.dtype != tensor.dtype:
+        # Temporary workaround due to `convert_to_tensor` not casting floats.
+        # See b/119637405
+        value = math_ops.cast(value, tensor.dtype)
+      converted_inputs.append(value)
+    outputs = self._graph_fn(*converted_inputs)
+    return [x.numpy() for x in outputs]
+
+
 @tf_export('keras.backend.function')
-def function(inputs, outputs, updates=None, **kwargs):
+def function(inputs, outputs, updates=None, name=None, **kwargs):
   """Instantiates a Keras function.
 
   Arguments:
       inputs: List of placeholder tensors.
       outputs: List of output tensors.
       updates: List of update ops.
+      name: String, name of function.
       **kwargs: Passed to `tf.Session.run`.
 
   Returns:
@@ -3097,16 +3189,19 @@ def function(inputs, outputs, updates=None, **kwargs):
       ValueError: if invalid kwargs are passed in or if in eager execution.
   """
   if context.executing_eagerly():
-    raise ValueError(
-        '`keras.backend.function` is not supported with eager execution.')
+    if kwargs:
+      raise ValueError('Session keyword arguments are not support during '
+                       'eager execution. You passed: %s' % (kwargs,))
+    return EagerExecutionFunction(inputs, outputs, updates=updates, name=name)
+
   if kwargs:
     for key in kwargs:
       if (key not in tf_inspect.getfullargspec(session_module.Session.run)[0]
-          and key not in tf_inspect.getfullargspec(Function.__init__)[0]):
+          and key not in ['inputs', 'outputs', 'updates', 'name']):
         msg = ('Invalid argument "%s" passed to K.function with TensorFlow '
                'backend') % key
         raise ValueError(msg)
-  return Function(inputs, outputs, updates=updates, **kwargs)
+  return GraphExecutionFunction(inputs, outputs, updates=updates, **kwargs)
 
 
 @tf_export('keras.backend.gradients')
@@ -3154,7 +3249,8 @@ def rnn(step_function,
         constants=None,
         unroll=False,
         input_length=None,
-        time_major=False):
+        time_major=False,
+        zero_output_for_mask=False):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -3192,7 +3288,9 @@ def rnn(step_function,
           RNN calculation. However, most TensorFlow data is batch-major, so by
           default this function accepts input and emits output in batch-major
           form.
-
+      zero_output_for_mask: Boolean. If True, the output for masked timestep
+          will be zeros, whereas in the False case, output from previous
+          timestep is returned.
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
           last_output: the latest output of the rnn, of shape `(samples, ...)`
@@ -3238,23 +3336,20 @@ def rnn(step_function,
   if constants is None:
     constants = []
 
-  global uses_learning_phase  # pylint: disable=global-variable-undefined
-  uses_learning_phase = False
-
   # tf.where needs its condition tensor to be the same shape as its two
   # result tensors, but in our case the condition (mask) tensor is
   # (nsamples, 1), and inputs are (nsamples, ndimensions) or even more.
   # So we need to broadcast the mask to match the shape of inputs.
   # That's what the tile call does, it just repeats the mask along its
   # second dimension n times.
-  def _expand_mask(mask_t, input_t):
+  def _expand_mask(mask_t, input_t, fixed_dim=1):
     assert not nest.is_sequence(mask_t)
     assert not nest.is_sequence(input_t)
     rank_diff = len(input_t.shape) - len(mask_t.shape)
     for _ in range(rank_diff):
-      mask_t = array_ops.expand_dims(mask_t)
-    expand_dims = [1] + input_t.shape.as_list()[1:]
-    return array_ops.tile(mask_t, expand_dims)
+      mask_t = array_ops.expand_dims(mask_t, -1)
+    multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:]
+    return array_ops.tile(mask_t, multiples)
 
   if unroll:
     if not time_steps:
@@ -3292,9 +3387,6 @@ def rnn(step_function,
         inp = _get_input_tensor(i)
         mask_t = mask_list[i]
         output, new_states = step_function(inp, states + constants)
-        if getattr(output, '_uses_learning_phase', False):
-          uses_learning_phase = True
-
         tiled_mask_t = _expand_mask(mask_t, output)
 
         if not successive_outputs:
@@ -3315,12 +3407,21 @@ def rnn(step_function,
       last_output = successive_outputs[-1]
       new_states = successive_states[-1]
       outputs = array_ops.stack(successive_outputs)
+
+      if zero_output_for_mask:
+        last_output = array_ops.where(
+            _expand_mask(mask_list[-1], last_output),
+            last_output,
+            zeros_like(last_output))
+        outputs = array_ops.where(
+            _expand_mask(mask, outputs, fixed_dim=2),
+            outputs,
+            zeros_like(outputs))
+
     else:
       for i in range(time_steps):
         inp = _get_input_tensor(i)
         output, states = step_function(inp, states + constants)
-        if getattr(output, '_uses_learning_phase', False):
-          uses_learning_phase = True
         successive_outputs.append(output)
         successive_states.append(states)
       last_output = successive_outputs[-1]
@@ -3362,6 +3463,13 @@ def rnn(step_function,
 
     time = constant_op.constant(0, dtype='int32', name='time')
 
+    while_loop_kwargs = {
+        'cond': lambda time, *_: time < time_steps_t,
+        'maximum_iterations': input_length,
+        'parallel_iterations': 32,
+        'swap_memory': True,
+    }
+
     if mask is not None:
       if not states:
         raise ValueError('No initial states provided! '
@@ -3379,16 +3487,21 @@ def rnn(step_function,
           tensor_array_name='mask_ta')
       mask_ta = mask_ta.unstack(mask)
 
-      def _step(time, output_ta_t, *states):
+      # Mask for the T output will be base on the output of T - 1. In the case
+      # T = 0, a zero filled tensor will be used.
+      flat_zero_output = tuple(array_ops.zeros_like(o)
+                               for o in nest.flatten(output_time_zero))
+      def _step(time, output_ta_t, prev_output, *states):
         """RNN step function.
 
         Arguments:
             time: Current timestep value.
             output_ta_t: TensorArray.
+            prev_output: tuple of outputs from time - 1.
             *states: List of states.
 
         Returns:
-            Tuple: `(time + 1,output_ta_t) + tuple(new_states)`
+            Tuple: `(time + 1, output_ta_t, output) + tuple(new_states)`
         """
         current_input = tuple(ta.read(time) for ta in input_ta)
         # maybe set shape.
@@ -3396,17 +3509,14 @@ def rnn(step_function,
         mask_t = mask_ta.read(time)
         output, new_states = step_function(current_input,
                                            tuple(states) + tuple(constants))
-        if getattr(output, '_uses_learning_phase', False):
-          global uses_learning_phase  # pylint: disable=global-variable-undefined
-          uses_learning_phase = True
-
+        # mask output
         flat_output = nest.flatten(output)
-        # This assume the state[0] is same shape as the output
-        flat_previous_output = nest.flatten(states[0])
+        flat_mask_output = (flat_zero_output if zero_output_for_mask
+                            else nest.flatten(prev_output))
         tiled_mask_t = tuple(_expand_mask(mask_t, o) for o in flat_output)
         flat_new_output = tuple(
-            array_ops.where(m, o, po) for m, o, po in zip(
-                tiled_mask_t, flat_output, flat_previous_output))
+            array_ops.where(m, o, zo) for m, o, zo in zip(
+                tiled_mask_t, flat_output, flat_mask_output))
 
         # mask states
         flat_state = nest.flatten(states)
@@ -3415,16 +3525,23 @@ def rnn(step_function,
           new_state.set_shape(state.shape)
         tiled_mask_t = tuple(_expand_mask(mask_t, s) for s in flat_state)
         flat_final_state = tuple(
-            array_ops.where(m, o, po)
-            for m, o, po in zip(tiled_mask_t, flat_new_state, flat_state))
+            array_ops.where(m, s, ps)
+            for m, s, ps in zip(tiled_mask_t, flat_new_state, flat_state))
         new_states = nest.pack_sequence_as(new_states, flat_final_state)
 
         output_ta_t = tuple(
             ta.write(time, out)
             for ta, out in zip(output_ta_t, flat_new_output))
-        return (time + 1, output_ta_t) + tuple(new_states)
+        return (time + 1, output_ta_t,
+                tuple(flat_new_output)) + tuple(new_states)
+
+      final_outputs = control_flow_ops.while_loop(
+          body=_step,
+          loop_vars=(time, output_ta, flat_zero_output) + states,
+          **while_loop_kwargs)
+      # Skip final_outputs[2] which is the output for final timestep.
+      new_states = final_outputs[3:]
     else:
-
       def _step(time, output_ta_t, *states):
         """RNN step function.
 
@@ -3440,10 +3557,6 @@ def rnn(step_function,
         current_input = nest.pack_sequence_as(inputs, current_input)
         output, new_states = step_function(current_input,
                                            tuple(states) + tuple(constants))
-        if getattr(output, '_uses_learning_phase', False):
-          global uses_learning_phase  # pylint: disable=global-variable-undefined
-          uses_learning_phase = True
-
         flat_state = nest.flatten(states)
         flat_new_state = nest.flatten(new_states)
         for state, new_state in zip(flat_state, flat_new_state):
@@ -3452,25 +3565,21 @@ def rnn(step_function,
         flat_output = nest.flatten(output)
         output_ta_t = tuple(
             ta.write(time, out) for ta, out in zip(output_ta_t, flat_output))
+        new_states = nest.pack_sequence_as(initial_states, flat_new_state)
         return (time + 1, output_ta_t) + tuple(new_states)
 
-    final_outputs = control_flow_ops.while_loop(
-        cond=lambda time, *_: time < time_steps_t,
-        body=_step,
-        loop_vars=(time, output_ta) + states,
-        maximum_iterations=input_length,
-        parallel_iterations=32,
-        swap_memory=True)
-    last_time = final_outputs[0]
+      final_outputs = control_flow_ops.while_loop(
+          body=_step,
+          loop_vars=(time, output_ta) + states,
+          **while_loop_kwargs)
+      new_states = final_outputs[2:]
+
     output_ta = final_outputs[1]
-    new_states = final_outputs[2:]
 
     outputs = tuple(o.stack() for o in output_ta)
+    last_output = tuple(o[-1] for o in outputs)
+
     outputs = nest.pack_sequence_as(output_time_zero, outputs)
-    last_output = tuple(o.read(last_time - 1) for o in output_ta)
-    if not context.executing_eagerly():
-      for o in last_output:
-        o._uses_learning_phase = uses_learning_phase
     last_output = nest.pack_sequence_as(output_time_zero, last_output)
 
   # static shape inference
@@ -3574,17 +3683,14 @@ def in_train_phase(x, alt, training=None):
   """
   if training is None:
     training = learning_phase()
-    uses_learning_phase = True
-  else:
-    uses_learning_phase = False
 
-  if training is 1 or training is True:
+  if training == 1 or training is True:
     if callable(x):
       return x()
     else:
       return x
 
-  elif training is 0 or training is False:
+  elif training == 0 or training is False:
     if callable(alt):
       return alt()
     else:
@@ -3592,8 +3698,6 @@ def in_train_phase(x, alt, training=None):
 
   # else: assume learning phase is a placeholder tensor.
   x = switch(training, x, alt)
-  if uses_learning_phase:
-    x._uses_learning_phase = True
   return x
 
 
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 4368b69ebe575c65dc8b87d182972b8d0a2b7304..48fdd56e9f68790de0939328f0d9aaf84a587877 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -135,7 +136,7 @@ class BackendUtilsTest(test.TestCase):
       x = keras.Input((3,))
       y = keras.layers.BatchNormalization()(x)
       if not context.executing_eagerly():
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         sess.run(y, feed_dict={x: np.random.random((2, 3))})
 
   def test_learning_phase_scope(self):
@@ -1222,6 +1223,121 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
       for s, u_s in zip(additional_state_list[2], additional_state_list[3]):
         self.assertAllClose(s, u_s, atol=1e-04)
 
+  def test_rnn_output_and_state_masking_independent(self):
+    num_samples = 2
+    num_timesteps = 4
+    state_and_io_size = 2
+    mask_last_num_timesteps = 2  # for second sample only
+
+    # a step function that just outputs inputs,
+    # but increments states +1 per timestep
+    def step_function(inputs, states):
+      return inputs, [s + 1 for s in states]
+
+    inputs_vals = np.random.random((num_samples, num_timesteps,
+                                    state_and_io_size))
+    initial_state_vals = np.random.random((num_samples, state_and_io_size))
+    # masking of two last timesteps for second sample only
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[1, -mask_last_num_timesteps:] = 0
+
+    # outputs expected to be same as inputs for the first sample
+    expected_outputs = inputs_vals.copy()
+    # but for the second sample all outputs in masked region should be the same
+    # as last output before masked region
+    expected_outputs[1, -mask_last_num_timesteps:] = \
+        expected_outputs[1, -(mask_last_num_timesteps + 1)]
+
+    expected_last_state = initial_state_vals.copy()
+    # first state should be incremented for every timestep (no masking)
+    expected_last_state[0] += num_timesteps
+    # second state should not be incremented for last two timesteps
+    expected_last_state[1] += (num_timesteps - mask_last_num_timesteps)
+
+    # verify same expected output for `unroll=true/false`
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, outputs, last_states = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(keras.backend.eval(outputs), expected_outputs)
+      self.assertAllClose(
+          keras.backend.eval(last_states[0]), expected_last_state)
+
+  def test_rnn_output_num_dim_larger_than_2_masking(self):
+    num_samples = 3
+    num_timesteps = 4
+    num_features = 5
+
+    def step_function(inputs, states):
+      outputs = keras.backend.tile(keras.backend.expand_dims(inputs), [1, 1, 2])
+      return outputs, [keras.backend.identity(s) for s in states]
+      # Note: cannot just return states (which can be a problem) ->
+      # tensorflow/python/ops/resource_variable_ops.py", line 824, in set_shape
+      # NotImplementedError: ResourceVariable does not implement set_shape()
+
+    inputs_vals = np.random.random((num_samples, num_timesteps, num_features))
+    initial_state_vals = np.random.random((num_samples, 6))
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[-1, -1] = 0  # final timestep masked for last sample
+
+    expected_outputs = np.repeat(inputs_vals[..., None], repeats=2, axis=-1)
+    # for the last sample, the final timestep (in masked region) should be the
+    # same as the second to final output (before masked region)
+    expected_outputs[-1, -1] = expected_outputs[-1, -2]
+
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, outputs, _ = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(keras.backend.eval(outputs), expected_outputs)
+
+  def test_rnn_state_num_dim_larger_than_2_masking(self):
+    num_samples = 3
+    num_timesteps = 4
+
+    def step_function(inputs, states):
+      return inputs, [s + 1 for s in states]
+
+    inputs_vals = np.random.random((num_samples, num_timesteps, 5))
+    initial_state_vals = np.random.random((num_samples, 6, 7))
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[0, -2:] = 0  # final two timesteps masked for first sample
+
+    expected_last_state = initial_state_vals.copy()
+    expected_last_state[0] += (num_timesteps - 2)
+    expected_last_state[1:] += num_timesteps
+
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, _, last_states = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(
+          keras.backend.eval(last_states[0]), expected_last_state)
+
   def test_normalize_batch_in_training(self):
     val = np.random.random((10, 3, 10, 10))
     x = keras.backend.variable(val)
@@ -1400,7 +1516,38 @@ class BackendGraphTests(test.TestCase):
     x = keras.backend.variable(1)
     self.assertEqual(keras.backend.is_placeholder(x), False)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_basics(self):
+    x1 = keras.backend.placeholder(shape=(), dtype='float32')
+    x2 = keras.backend.placeholder(shape=(), dtype='int32')
+    v = keras.backend.variable(10.)
+    with keras.backend.get_graph().as_default():
+      y1 = x1 + keras.backend.cast(x2, 'float32') + v
+      y2 = x1 * keras.backend.cast(x2, 'float32')
+      with ops.control_dependencies([y1]):
+        u = keras.backend.update(v, 5.)
+    f = keras.backend.function([x1, x2], [y1, y2], updates=[u])
+    output_values = f([2, 3])
+    self.assertEqual(output_values, [15., 6.])
+    self.assertEqual(keras.backend.eval(v), 5.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_placeholder_with_default(self):
+    with keras.backend.get_graph().as_default():
+      x1 = array_ops.placeholder_with_default(
+          np.array(2., dtype='float32'), shape=())
+      x2 = array_ops.placeholder_with_default(
+          np.array(3, dtype='int32'), shape=())
+    y1 = x1 + keras.backend.cast(x2, 'float32')
+    y2 = x1 * keras.backend.cast(x2, 'float32')
+    f = keras.backend.function([x1, x2], [y1, y2])
+    output_values = f([4, 5])
+    self.assertEqual(output_values, [9., 20.])
+    output_values = f([None, None])
+    self.assertEqual(output_values, [5., 6.])
+
   def test_function_tf_feed_symbols(self):
+    # Test Keras backend functions with TF tensor inputs.
     with self.cached_session():
       # Test feeding a resource variable to `function`.
       x1 = keras.backend.placeholder(shape=())
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 4c12c83a4c2cc00d20835858e534e4e5973f3991..8223e795bc2d1667d3c42729c67de9c173fef710 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -19,9 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import deque
-from collections import Iterable
-from collections import OrderedDict
+import collections
 import copy
 import csv
 import io
@@ -56,6 +54,7 @@ except ImportError:
   requests = None
 
 
+# pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
@@ -68,7 +67,8 @@ def configure_callbacks(callbacks,
                         samples=None,
                         validation_steps=None,
                         verbose=1,
-                        count_mode='steps'):
+                        count_mode='steps',
+                        mode='train'):
   """Configures callbacks for use in various training loops.
 
   Arguments:
@@ -88,37 +88,41 @@ def configure_callbacks(callbacks,
       validation_steps: Number of batches to run per validation epoch.
       verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
       count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
+      mode: String. One of 'train', 'test', or 'predict'. Which loop mode to
+        configure callbacks for.
 
   Returns:
       Instance of CallbackList used to control all Callbacks.
   """
-
-  # Add additional callbacks
-  model.history = History()
-  stateful_metric_names = None
-  if hasattr(model, 'stateful_metric_names'):
-    stateful_metric_names = model.stateful_metric_names
-  callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
-              ] + (callbacks or []) + [model.history]
-  if verbose:
-    callbacks.append(
-        ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
+  # Check if callbacks have already been configured.
+  if isinstance(callbacks, CallbackList):
+    return callbacks
+
+  if not callbacks:
+    callbacks = []
+
+  # Add additional callbacks during training.
+  if mode == 'train':
+    model.history = History()
+    stateful_metric_names = None
+    if hasattr(model, 'metrics_names'):
+      stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
+    callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
+                ] + (callbacks or []) + [model.history]
+    if verbose:
+      callbacks.append(
+          ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
   callback_list = CallbackList(callbacks)
 
   # Set callback model
-  callback_model = model._get_callback_model()  # pylint: disable=protected-access
-  if do_validation and val_inputs and not context.executing_eagerly():
-    # Need to create the test_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the test_function
-    callback_model._make_test_function()  # pylint: disable=protected-access
+  callback_model = model._get_callback_model()
   callback_list.set_model(callback_model)
 
   # Set callback parameters
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
   # when we standardize first batch of data.
-  if model._is_compiled:  # pylint: disable=protected-access
+  if mode != 'predict' and hasattr(model, 'metrics_names'):
     callback_metrics = copy.copy(model.metrics_names)
     if do_validation:
       callback_metrics += ['val_' + n for n in model.metrics_names]
@@ -137,21 +141,29 @@ def configure_callbacks(callbacks,
   callback_list.set_params(callback_params)
 
   # Pass validation data to callbacks
-  if not val_inputs:
+  # TODO(omalleyt): remove this once val hooks are ready.
+  if model._distribution_strategy or not val_inputs:
     val_data = []
-  elif _is_generator_like(val_inputs):
-    val_data = val_inputs
   else:
-    val_data = val_inputs + val_targets
-    if val_sample_weights:
-      val_data += val_sample_weights
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      val_data += [0.]
+    if not model.run_eagerly:
+      # Need to create the eval_function before start of the first epoch
+      # because TensorBoard callback on_epoch_begin adds summary to the
+      # list of fetches of the eval_function
+      callback_model._make_eval_function()
+    if _is_generator_like(val_inputs):
+      val_data = val_inputs
+    else:
+      val_data = val_inputs + val_targets
+      if val_sample_weights:
+        val_data += val_sample_weights
+      if not isinstance(K.symbolic_learning_phase(), int):
+        val_data += [False]
   for cbk in callbacks:
     cbk.validation_data = val_data
 
   callback_list.model.stop_training = False
   return callback_list
+# pylint: enable=protected-access
 
 
 def _is_generator_like(data):
@@ -175,6 +187,12 @@ class CallbackList(object):
     self.queue_length = queue_length
     self.params = {}
     self.model = None
+    self._reset_batch_timing()
+
+  def _reset_batch_timing(self):
+    self._delta_t_batch = 0.
+    self._delta_ts = collections.defaultdict(
+        lambda: collections.deque([], maxlen=self.queue_length))
 
   def append(self, callback):
     self.callbacks.append(callback)
@@ -189,72 +207,96 @@ class CallbackList(object):
     for callback in self.callbacks:
       callback.set_model(model)
 
-  def on_epoch_begin(self, epoch, logs=None):
+  def _call_batch_hook(self, mode, hook, batch, logs=None):
+    """Helper function for all batch_{begin | end} methods."""
+    # TODO(omalleyt): add batch hooks for test/predict.
+    if mode != 'train':
+      return
+
+    hook_name = 'on_{mode}_batch_{hook}'.format(mode=mode, hook=hook)
+    if hook == 'begin':
+      self._t_enter_batch = time.time()
+    if hook == 'end':
+      # Batch is ending, calculate batch time.
+      self._delta_t_batch = time.time() - self._t_enter_batch
+
+    logs = logs or {}
+    t_before_callbacks = time.time()
+    for callback in self.callbacks:
+      batch_hook = getattr(callback, hook_name)
+      batch_hook(batch, logs)
+    self._delta_ts[hook_name].append(time.time() - t_before_callbacks)
+
+    delta_t_median = np.median(self._delta_ts[hook_name])
+    if (self._delta_t_batch > 0. and
+        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
+      logging.warning(
+          'Method (%s) is slow compared '
+          'to the batch update (%f). Check your callbacks.', hook_name,
+          delta_t_median)
+
+  def _call_begin_hook(self, mode):
+    """Helper function for on_{train|test|predict}_begin methods."""
+    # TODO(omalleyt): add test/predict methods.
+    if mode == 'train':
+      self.on_train_begin()
+
+  def _call_end_hook(self, mode):
+    """Helper function for on_{train|test|predict}_end methods."""
+    # TODO(omalleyt): add test/predict methods.
+    if mode == 'train':
+      self.on_train_end()
+
+  def on_batch_begin(self, batch, logs=None):
+    self._call_batch_hook('train', 'begin', batch, logs=logs)
+
+  def on_batch_end(self, batch, logs=None):
+    self._call_batch_hook('train', 'end', batch, logs=logs)
+
+  def on_epoch_begin(self, epoch, logs=None, mode='train'):
     """Called at the start of an epoch.
 
     Arguments:
         epoch: integer, index of epoch.
         logs: dictionary of logs.
+        mode: One of 'train'/'test'/'predict'
     """
-    logs = logs or {}
-    for callback in self.callbacks:
-      callback.on_epoch_begin(epoch, logs)
-    self._delta_t_batch = 0.
-    self._delta_ts_batch_begin = deque([], maxlen=self.queue_length)
-    self._delta_ts_batch_end = deque([], maxlen=self.queue_length)
+    if mode == 'train':
+      logs = logs or {}
+      for callback in self.callbacks:
+        callback.on_epoch_begin(epoch, logs)
+    self._reset_batch_timing()
 
-  def on_epoch_end(self, epoch, logs=None):
+  def on_epoch_end(self, epoch, logs=None, mode='train'):
     """Called at the end of an epoch.
 
     Arguments:
         epoch: integer, index of epoch.
         logs: dictionary of logs.
+        mode: One of 'train'/'test'/'predict'
     """
-    logs = logs or {}
-    for callback in self.callbacks:
-      callback.on_epoch_end(epoch, logs)
+    if mode == 'train':
+      logs = logs or {}
+      for callback in self.callbacks:
+        callback.on_epoch_end(epoch, logs)
 
-  def on_batch_begin(self, batch, logs=None):
-    """Called right before processing a batch.
+  def on_train_batch_begin(self, batch, logs=None):
+    """Called at the beginning of a training batch in `fit` methods.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
         logs: dictionary of logs.
     """
-    logs = logs or {}
-    t_before_callbacks = time.time()
-    for callback in self.callbacks:
-      callback.on_batch_begin(batch, logs)
-    self._delta_ts_batch_begin.append(time.time() - t_before_callbacks)
-    delta_t_median = np.median(self._delta_ts_batch_begin)
-    if (self._delta_t_batch > 0. and
-        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      logging.warning('Method on_batch_begin() is slow compared '
-                      'to the batch update (%f). Check your callbacks.',
-                      delta_t_median)
-    self._t_enter_batch = time.time()
+    self._call_batch_hook('train', 'begin', batch, logs=logs)
 
-  def on_batch_end(self, batch, logs=None):
-    """Called at the end of a batch.
+  def on_train_batch_end(self, batch, logs=None):
+    """Called at the end of a training batch in `fit` methods.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
         logs: dictionary of logs.
     """
-    logs = logs or {}
-    if not hasattr(self, '_t_enter_batch'):
-      self._t_enter_batch = time.time()
-    self._delta_t_batch = time.time() - self._t_enter_batch
-    t_before_callbacks = time.time()
-    for callback in self.callbacks:
-      callback.on_batch_end(batch, logs)
-    self._delta_ts_batch_end.append(time.time() - t_before_callbacks)
-    delta_t_median = np.median(self._delta_ts_batch_end)
-    if (self._delta_t_batch > 0. and
-        (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      logging.warning('Method on_batch_end() is slow compared '
-                      'to the batch update (%f). Check your callbacks.',
-                      delta_t_median)
+    self._call_batch_hook('train', 'end', batch, logs=logs)
 
   def on_train_begin(self, logs=None):
     """Called at the beginning of training.
@@ -330,6 +372,14 @@ class Callback(object):
   def on_batch_end(self, batch, logs=None):
     pass
 
+  def on_train_batch_begin(self, batch, logs=None):
+    # For backwards compatibility
+    self.on_batch_begin(batch, logs=logs)
+
+  def on_train_batch_end(self, batch, logs=None):
+    # For backwards compatibility
+    self.on_batch_end(batch, logs=logs)
+
   def on_train_begin(self, logs=None):
     pass
 
@@ -432,18 +482,19 @@ class ProgbarLogger(Callback):
     self.epochs = self.params['epochs']
 
   def on_epoch_begin(self, epoch, logs=None):
+    self.seen = 0
+    if self.use_steps:
+      self.target = self.params['steps']
+    else:
+      self.target = self.params['samples']
+
     if self.verbose:
-      print('Epoch %d/%d' % (epoch + 1, self.epochs))
-      if self.use_steps:
-        target = self.params['steps']
-      else:
-        target = self.params['samples']
-      self.target = target
+      if self.epochs > 1:
+        print('Epoch %d/%d' % (epoch + 1, self.epochs))
       self.progbar = Progbar(
           target=self.target,
           verbose=self.verbose,
           stateful_metrics=self.stateful_metrics)
-    self.seen = 0
 
   def on_batch_begin(self, batch, logs=None):
     if self.seen < self.target:
@@ -1124,17 +1175,19 @@ class TensorBoard(Callback):
     self._total_batches_seen += 1
 
   def on_epoch_begin(self, epoch, logs=None):
-    """Add histogram op to Model test_function callbacks, reset batch count."""
+    """Add histogram op to Model eval_function callbacks, reset batch count."""
 
     # check if histogram summary should be run for this epoch
     if self.histogram_freq and epoch % self.histogram_freq == 0:
       self._epoch = epoch
       self._current_val_batch = 0
+      # pylint: disable=protected-access
       # add the histogram summary op if it should run this epoch
-      if self.merged not in self.model.test_function.fetches:
-        self.model.test_function.fetches.append(self.merged)
-        self.model.test_function.fetch_callbacks[
+      if self.merged not in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.append(self.merged)
+        self.model._eval_function.fetch_callbacks[
             self.merged] = self._fetch_callback
+      # pylint: enable=protected-access
 
   def on_epoch_end(self, epoch, logs=None):
     """Checks if summary ops should run next epoch, logs scalar summaries."""
@@ -1152,10 +1205,12 @@ class TensorBoard(Callback):
 
     # pop the histogram summary op after each epoch
     if self.histogram_freq:
-      if self.merged in self.model.test_function.fetches:
-        self.model.test_function.fetches.remove(self.merged)
-      if self.merged in self.model.test_function.fetch_callbacks:
-        self.model.test_function.fetch_callbacks.pop(self.merged)
+      # pylint: disable=protected-access
+      if self.merged in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.remove(self.merged)
+      if self.merged in self.model._eval_function.fetch_callbacks:
+        self.model._eval_function.fetch_callbacks.pop(self.merged)
+      # pylint: enable=protected-access
 
     if self.embeddings_data is None and self.embeddings_freq:
       raise ValueError('To visualize embeddings, embeddings_data must '
@@ -1187,7 +1242,7 @@ class TensorBoard(Callback):
 
           feed_dict.update({self.batch_id: i, self.step: step})
 
-          if self.model.uses_learning_phase:
+          if not isinstance(K.learning_phase(), int):
             feed_dict[K.learning_phase()] = False
 
           self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
@@ -1381,7 +1436,7 @@ class CSVLogger(Callback):
       is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
       if isinstance(k, six.string_types):
         return k
-      elif isinstance(k, Iterable) and not is_zero_dim_ndarray:
+      elif isinstance(k, collections.Iterable) and not is_zero_dim_ndarray:
         return '"[%s]"' % (', '.join(map(str, k)))
       else:
         return k
@@ -1409,7 +1464,7 @@ class CSVLogger(Callback):
       if self.append_header:
         self.writer.writeheader()
 
-    row_dict = OrderedDict({'epoch': epoch})
+    row_dict = collections.OrderedDict({'epoch': epoch})
     row_dict.update((key, handle_value(logs[key])) for key in self.keys)
     self.writer.writerow(row_dict)
     self.csv_file.flush()
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index bb85347033cdee893195b17ae549d077b5f07613..9d9ede22c018a85d716534848ba65a98f463e4f5 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -422,8 +422,7 @@ class KerasCallbacksTest(test.TestCase):
             num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
         model.compile(
             loss='categorical_crossentropy',
-            optimizer=keras.optimizers.SGD(lr=0.1),
-            metrics=['accuracy'])
+            optimizer=keras.optimizers.SGD(lr=0.1))
         return model
 
       model = make_model()
@@ -673,8 +672,8 @@ class KerasCallbacksTest(test.TestCase):
           callbacks=cbks,
           epochs=20)
       loss = history.history['loss']
-      assert len(loss) == 1
-      assert loss[0] == np.inf
+      self.assertEqual(len(loss), 1)
+      self.assertEqual(loss[0], np.inf)
 
   def test_TensorBoard(self):
     np.random.seed(1337)
diff --git a/tensorflow/python/keras/engine/__init__.py b/tensorflow/python/keras/engine/__init__.py
index 26aed34766f9e1e2094db7a4c8b66ff057dacc4b..005f6462ffa4e6120c66373f7be9e31d5eac5449 100644
--- a/tensorflow/python/keras/engine/__init__.py
+++ b/tensorflow/python/keras/engine/__init__.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 # TODO(fchollet): Remove hourglass imports once external code is done importing
 # non-public APIs.
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
 
 del absolute_import
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 7870192638c02031893d6670d72e93cc23f89486..c8e964d1f72b2c3ab1a422567fb9298b756e3073 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -36,6 +36,7 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 # A module that only depends on `keras.layers` import these from here.
@@ -68,6 +69,14 @@ class CallConvention(enum.Enum):
   POSITIONAL_ARGUMENTS_ARE_INPUTS = 3
 
 
+def _create_mean_metric(value, name=None):
+  # TODO(psv): Remove this import when b/110718070 is fixed.
+  from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
+  metric_obj = metrics_module.Mean(name=name)
+  result = metric_obj(value)
+  return metric_obj, result
+
+
 @tf_export('keras.layers.Layer')
 class Layer(checkpointable.CheckpointableBase):
   """Base layer class.
@@ -170,6 +179,13 @@ class Layer(checkpointable.CheckpointableBase):
     # in eager mode or graph mode alternatively, we need to keep track of
     # eager losses and symbolic losses via separate attributes.
     self._eager_losses = []
+    # A list of metric instances corresponding to the symbolic metric tensors
+    # added using the `add_metric` API.
+    self._metrics = []
+    # TODO(psv): Remove this property.
+    # A dictionary that maps metric names to metric result tensors. The results
+    # are the running averages of metric values over an epoch.
+    self._metrics_tensors = {}
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
     self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
@@ -183,16 +199,6 @@ class Layer(checkpointable.CheckpointableBase):
 
     self.supports_masking = False
 
-    # Mark if a layer supports using graph functions in the eager
-    # fit/predict/evaluate loop
-    # TODO(kaftan): merge this with the _static_graph_friendly flag once
-    # enough eager function bugs involving control flow / tensorarrays have
-    # been fixed,  and static-graph-friendly layers will almost always work in
-    # eager graph functions.
-    # We conservatively make this flag opt-in for now to avoid causing existing
-    # custom layers to crash.
-    self._can_use_graph_functions = False
-
     call_argspec = tf_inspect.getfullargspec(self.call)
     if 'training' in call_argspec.args:
       self._expects_training_arg = True
@@ -200,7 +206,7 @@ class Layer(checkpointable.CheckpointableBase):
       self._expects_training_arg = False
 
     # Whether the `call` method can be used to build a TF graph without issues.
-    self._static_graph_friendly = True
+    self._call_is_graph_friendly = True
 
     # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
@@ -222,17 +228,6 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       self._initial_weights = None
 
-  @property
-  def _is_static_graph_friendly(self):
-    return self._static_graph_friendly
-
-  @_is_static_graph_friendly.setter
-  def _is_static_graph_friendly(self, value):
-    if value not in {True, False}:
-      raise ValueError('`static_graph_friendly` requires a boolean value. '
-                       'Received: {}'.format(value))
-    self._static_graph_friendly = value
-
   def _init_set_name(self, name, zero_based=True):
     if not name:
       self._name = unique_layer_name(
@@ -298,8 +293,6 @@ class Layer(checkpointable.CheckpointableBase):
 
   @property
   def updates(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
     if not self.trainable and not self.stateful:
       return []
     return self._updates
@@ -366,9 +359,6 @@ class Layer(checkpointable.CheckpointableBase):
     Raises:
       RuntimeError: If called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
-
     # Updates disabled if layer is not trainable and not explicitly stateful.
     if not self.trainable and not self.stateful:
       return []
@@ -459,6 +449,84 @@ class Layer(checkpointable.CheckpointableBase):
         else:
           self._losses.append(_tag_unconditional(loss))
 
+  @doc_controls.for_subclass_implementers
+  def add_metric(self, value, aggregation=None, name=None):
+    """Adds metric tensor to the layer.
+
+    Args:
+      value: Metric tensor.
+      aggregation: Sample-wise metric reduction function. If `aggregation=None`,
+        it indicates that the metric tensor provided has been aggregated
+        already. eg, `model.add_metric(BinaryAccuracy(name='acc')(y_true,
+        y_pred))`. If aggregation='mean', the given metric tensor will be
+        sample-wise reduced using `mean` function. eg, `model.add_metric(
+        tf.reduce_mean(outputs), name='output_mean', aggregation='mean')`.
+      name: String metric name.
+
+    Raises:
+      ValueError: If `aggregation` is anything other than None or `mean`.
+    """
+    if aggregation is not None and aggregation != 'mean':
+      raise ValueError(
+          'We currently support only `mean` sample-wise metric aggregation. '
+          'You provided aggregation=`%s`' % aggregation)
+
+    if tf_utils.is_symbolic_tensor(value):
+      self._symbolic_add_metric(value, aggregation, name)
+    else:
+      self._eager_add_metric(value, aggregation, name)
+
+  def _get_existing_metric(self, name=None):
+    match = [m for m in self._metrics if m.name == name]
+    if not match:
+      return
+    if len(match) > 1:
+      raise ValueError(
+          'Please provide different names for the metrics you have added. '
+          'We found {} metrics with the name: "{}"'.format(len(match), name))
+    return match[0]
+
+  def _eager_add_metric(self, value, aggregation=None, name=None):
+    # If the given metric is available in `metrics` list we just update state
+    # on it, otherwise we create a new metric instance and
+    # add it to the `metrics` list.
+    match = self._get_existing_metric(name)
+    if match:
+      match(value)  # Update the metric state.
+      return
+    else:
+      if aggregation is None:
+        raise ValueError('We do not support adding an aggregated metric tensor '
+                         'in `call` in eager execution.')
+      metric_obj, _ = _create_mean_metric(value, name)
+      self._metrics.append(metric_obj)
+
+  def _symbolic_add_metric(self, value, aggregation=None, name=None):
+    if aggregation is None:
+      # Iterate over the metrics and check if the given metric exists already.
+      # This can happen when a metric instance is created in subclassed model
+      # layer `__init__` and we have tracked that instance already in
+      # model.__setattr__.
+      match = self._get_existing_metric(name)
+      if match:
+        result_tensor = value
+        if match.name not in self._metrics_tensors:
+          self._metrics_tensors[match.name] = result_tensor
+          return
+        else:
+          raise ValueError(
+              'We currently do not support reusing a metric instance.')
+      else:
+        # We track the instance using the metadata on the result tensor.
+        result_tensor = value
+        metric_obj = result_tensor._metric_obj
+    else:
+      # If a non-aggregated tensor is given as input (ie. `aggregation` is
+      # explicitly set to `mean`), we wrap the tensor in `Mean` metric.
+      metric_obj, result_tensor = _create_mean_metric(value, name)
+    self._metrics.append(metric_obj)
+    self._metrics_tensors[metric_obj.name] = result_tensor
+
   def get_losses_for(self, inputs):
     """Retrieves losses relevant to a specific set of inputs.
 
@@ -471,9 +539,6 @@ class Layer(checkpointable.CheckpointableBase):
     Raises:
       RuntimeError: If called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
     if inputs is None:
       # Requesting unconditional losses.
       return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
@@ -733,7 +798,8 @@ class Layer(checkpointable.CheckpointableBase):
     with ops.name_scope(self._name_scope()):
       if not self.built:
         # Check input assumptions set before layer building, e.g. input rank.
-        self._assert_input_compatibility(inputs)
+        input_spec.assert_input_compatibility(
+            self.input_spec, inputs, self.name)
         if input_list and self._dtype is None:
           try:
             self._dtype = input_list[0].dtype.base_dtype.name
@@ -758,7 +824,8 @@ class Layer(checkpointable.CheckpointableBase):
       if build_graph:
         # Symbolic execution on symbolic tensors. We will attempt to build
         # the corresponding TF subgraph inside `backend.get_graph()`
-        self._assert_input_compatibility(inputs)
+        input_spec.assert_input_compatibility(
+            self.input_spec, inputs, self.name)
         graph = backend.get_graph()
         with graph.as_default():
           if not executing_eagerly:
@@ -772,10 +839,10 @@ class Layer(checkpointable.CheckpointableBase):
               # Any issue during graph-building means we will later run the
               # model in eager mode, whether the issue was related to
               # graph mode or not. This provides a nice debugging experience.
-              self._is_static_graph_friendly = False
+              self._call_is_graph_friendly = False
               # We will use static shape inference to return symbolic tensors
               # matching the specifications of the layer outputs.
-              # Since we have set `self._is_static_graph_friendly = False`,
+              # Since we have set `self._call_is_graph_friendly = False`,
               # we will never attempt to run the underlying TF graph (which is
               # disconnected).
               # TODO(fchollet): consider py_func as an alternative, which
@@ -792,7 +859,6 @@ class Layer(checkpointable.CheckpointableBase):
                              '(layer: ' + self.name + ').')
           self._handle_activity_regularization(inputs, outputs)
           self._set_mask_metadata(inputs, outputs, previous_mask)
-          self._set_learning_phase_metadata(inputs, outputs)
           if have_all_keras_metadata(inputs):
             inputs, outputs = self._set_connectivity_metadata_(
                 inputs, outputs, args, kwargs)
@@ -831,23 +897,6 @@ class Layer(checkpointable.CheckpointableBase):
     """
     return self.__call__(inputs, *args, **kwargs)
 
-  def _set_learning_phase_metadata(self, inputs, outputs):
-    # Update learning phase info. To work with subclassed models,
-    # this should be done even if Keras metadata is absent.
-    output_tensors = generic_utils.to_list(outputs)
-    uses_lp = any(
-        [getattr(x, '_uses_learning_phase', False)
-         for x in generic_utils.to_list(inputs)])
-    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
-    for i in range(len(output_tensors)):
-      try:
-        output_tensors[i]._uses_learning_phase = getattr(
-            output_tensors[i], '_uses_learning_phase', False) or uses_lp
-      except AttributeError:
-        # An output element happens to be a C type (such as tuple or dict).
-        # We don't track learning phase info in such edge cases.
-        pass
-
   def _set_mask_metadata(self, inputs, outputs, previous_mask):
     # In some cases the mask of the outputs has already been computed by
     # inner layers and does not need to be recomputed by this layer.
@@ -1007,7 +1056,6 @@ class Layer(checkpointable.CheckpointableBase):
       # use `compute_output_shape` manually (these users will have to
       # implement `compute_output_shape` themselves).
       self.build(input_shape)
-
       with context.graph_mode():
         graph = func_graph.FuncGraph('graph')
         with graph.as_default():
@@ -1447,101 +1495,6 @@ class Layer(checkpointable.CheckpointableBase):
     """Deprecated, do NOT use! Only for compatibility with external Keras."""
     return self._outbound_nodes
 
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
-
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
-
-    Arguments:
-        inputs: input tensor or list of input tensors.
-
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
-    """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
-    else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.shape.ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.shape.ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.shape.as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.shape.ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.shape.ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.shape.as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.shape.as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.shape.as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
-
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
 
@@ -1624,54 +1577,22 @@ class Layer(checkpointable.CheckpointableBase):
     """
     return cls(**config)
 
+  @property
+  def _static_graph_friendly(self):
+    """Whether the layer can be called to create a static graph.
 
-@tf_export(
-    'keras.layers.InputSpec', v1=['keras.layers.InputSpec', 'layers.InputSpec'])
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
-
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
-
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
-
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
-
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
+    Because of nesting, there are two components to being "graph-friendly":
+      1) all inner layers are graph-friendly
+      2) the way they are composed is graph-friendly.
+    We denote the latter as "_call_is_graph_friendly", and define
+    "_static_graph_friendly" as being the combination of
+    "_call_is_graph_friendly" and "all inner layers are _static_graph_friendly".
+    For atomic layers (no inner layers), this is just "_call_is_graph_friendly".
 
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+    Returns:
+      Boolean.
+    """
+    return self._call_is_graph_friendly
 
 
 class Node(object):
@@ -1975,3 +1896,8 @@ def default(method):
 
 def generate_placeholders_from_shape(shape):
   return array_ops.placeholder(shape=shape, dtype=backend.floatx())
+
+
+# Avoid breaking users who directly import this symbol from this file.
+# TODO(fchollet): remove this.
+InputSpec = input_spec.InputSpec  # pylint:disable=invalid-name
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index bda26dabcc65a5d5220ed57271f95d5ba47bf03d..798775b6a5b29aa72a2c766584811aa469db2471 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -81,14 +81,14 @@ class BaseLayerTest(test.TestCase):
     inputs = keras.Input((3,))
     outputs = DynamicLayer1()(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
     inputs = keras.Input((3,))
     outputs = DynamicLayer2()(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
@@ -102,7 +102,7 @@ class BaseLayerTest(test.TestCase):
     outputs = inner_model(x)
 
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
@@ -116,11 +116,71 @@ class BaseLayerTest(test.TestCase):
     inputs = keras.Input((3,))
     outputs = InvalidLayer()(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
       model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
+  def test_using_symbolic_tensors_with_tf_ops(self):
+    # Single-input.
+    x = keras.Input((3,))
+    y = math_ops.square(x)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+    # Multi-inputs.
+    x1, x2 = keras.Input((3,)), keras.Input((3,))
+    y = array_ops.concat([x1, x2], axis=1)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+    # Mixing Keras symbolic tensors and graph tensors from the same graph works.
+    with keras.backend.get_graph().as_default():
+      x1 = keras.Input((3,))
+    x2 = keras.Input((3,))
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+    # Creating same op type (matmul) multiple times in the Keras graph works.
+    x1 = keras.Input((3,))
+    x2 = keras.Input((3,))
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+  def test_mixing_eager_and_graph_tensors(self):
+    with ops.Graph().as_default():
+      x1 = array_ops.ones((3, 3))
+    x2 = array_ops.ones((3, 3))
+    self.assertTrue(isinstance(x2, ops.EagerTensor))
+    with self.assertRaisesRegexp(TypeError,
+                                 'provided list of inputs contains '
+                                 'objects other than \'EagerTensor\''):
+      math_ops.matmul(x1, x2)
+
+  def test_mixing_numpy_arrays_and_graph_tensors(self):
+    with ops.Graph().as_default():
+      x1 = array_ops.ones((3, 3))
+    x2 = np.ones((3, 3), dtype='float32')
+    with self.assertRaisesRegexp(TypeError,
+                                 'provided list of inputs contains '
+                                 'objects other than \'EagerTensor\''):
+      math_ops.matmul(x1, x2)
+
+  def test_mixing_keras_symbolic_tensors_and_eager_tensors(self):
+    x1 = keras.Input((3,))
+    x2 = array_ops.ones((3, 3))
+    with self.assertRaisesRegexp(
+        TypeError,
+        'mix computation of symbolic Tensors'):
+      math_ops.matmul(x1, x2)
+
+  def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self):
+    # For the time being we treat Numpy arrays as EagerTensors when mixing both.
+    x1 = keras.Input((3,))
+    x2 = np.ones((3, 3), dtype='float32')
+    with self.assertRaisesRegexp(
+        TypeError,
+        'mix computation of symbolic Tensors'):
+      math_ops.matmul(x1, x2)
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index ec553db2f8e175885b4cdfe24ddbffe319dad5ce..25685fb5cfe9124d4354793f2789bd5950be7345 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.client import session as session_module
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -293,19 +294,63 @@ def validate_all_tensor_shapes(x, x_values):
                        ' inputs {}'.format(x))
 
 
+def _wait_for_variable_initialization(session):
+  """Utility to wait for variables to be initialized."""
+  all_variables = K._get_variables(K.get_graph())  # pylint: disable=protected-access
+  candidate_vars = []
+  for v in all_variables:
+    if not getattr(v, '_keras_initialized', False):
+      candidate_vars.append(v)
+
+  if not candidate_vars:
+    return
+
+  while True:
+    is_initialized = session.run(
+        [variables.is_variable_initialized(v) for v in candidate_vars])
+    uninitialized_vars = []
+    for flag, v in zip(is_initialized, candidate_vars):
+      if not flag:
+        uninitialized_vars.append(v)
+      v._keras_initialized = True  # pylint: disable=protected-access
+    if not uninitialized_vars:
+      break
+
+
+def init_restore_or_wait_for_variables():
+  """Initialize or restore variables or wait for variables to be initialized."""
+  session = K._get_session()  # pylint: disable=protected-access
+  worker_context = dc_context.get_current_worker_context()
+  if not worker_context or worker_context.should_init:
+    # TODO(yuefengz): if checkpoints exit, restore from checkpoint.
+    K._initialize_variables(session)  # pylint: disable=protected-access
+  else:
+    _wait_for_variable_initialization(session)
+
+
 def configure_and_create_session(distribution_strategy):
   """Configure session config and create a session with it."""
   # TODO(priyag): Throw error if a session already exists.
   session_config = K.get_default_session_config()
-  distribution_strategy.configure(session_config)
 
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
-    # TODO(priyag): Remove this workaround when Distributed Coordinator is
-    # integrated with keras and we can create a session from there.
-    master = distribution_strategy._tpu_cluster_resolver.master()  # pylint: disable=protected-access
+  if is_tpu_strategy(distribution_strategy):
+    # TODO(priyag, yuefengz): Remove this workaround when Distribute
+    # Coordinator is integrated with keras and we can create a session from
+    # there.
+    distribution_strategy.configure(session_config)
+    master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
     session = session_module.Session(config=session_config, target=master)
   else:
-    session = session_module.Session(config=session_config)
+    worker_context = dc_context.get_current_worker_context()
+    if worker_context:
+      dc_session_config = worker_context.session_config
+      # Merge the default session config to the one from distribute coordinator,
+      # which is fine for now since they don't have conflicting configurations.
+      dc_session_config.MergeFrom(session_config)
+      session = session_module.Session(
+          config=dc_session_config, target=worker_context.master_target)
+    else:
+      session = session_module.Session(config=session_config)
 
   K.set_session(session)
 
@@ -334,7 +379,7 @@ def validate_inputs(x, y, distribution_strategy):
                      'Iterator. You must pass a `tf.data.Dataset` object or a '
                      'numpy array as input.')
 
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
+  if is_tpu_strategy(distribution_strategy):
     for i in [x, y]:
       if isinstance(i, dataset_ops.Dataset):
         shapes = nest.flatten(i.output_shapes)
@@ -346,40 +391,97 @@ def validate_inputs(x, y, distribution_strategy):
               'Found unknown shape {} in input {}.'.format(s, i))
 
 
-def get_input_batch_params(first_x_value, batch_size, distribution_strategy):
+# TODO(b/118776054): Currently we support global batch size for TPUStrategy and
+# core MirroredStrategy only. Remove this check when contrib MirroredStrategy is
+# no longer needed.
+def global_batch_size_supported(distribution_strategy):
+  return distribution_strategy.extended._global_batch_size  # pylint: disable=protected-access
+
+
+# TODO(sourabhbajaj): Remove this once we use the same API for all strategies.
+def is_tpu_strategy(strategy):
+  """We're executing TPU Strategy."""
+  return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
+
+
+def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
+                     is_training=False):
   """Calculate the number of batches and steps/steps_per_epoch.
 
   Args:
+    distribution_strategy: The DistributionStrategy used to compile the model.
     first_x_value: This is the first input numpy array that is passed in as the
       model input.
-    batch_size: The specified batch_size or the default batch_size of 32.
-    distribution_strategy: The current DistributionStrategy used to compile the
-      model.
+    steps:  The specified number of steps.
+    batch_size: The specified batch_size.
+    is_training: Boolean to relax the constraints on consuming all the training
+      samples to keep compatibility till we support partial batches.
 
   Returns:
-    The steps or steps_per_epoch argument depending on if a user is
-    calling `fit`, `evaluate` or `predict`.
+    steps: The steps or steps_per_epoch argument depending on if a user is
+        calling `fit`, `evaluate` or `predict`. If the is_training flag is set
+        we don't require the number of samples to be used completely.
+    batch_size: The batch size to be used in model iterations.
 
   Raises:
     ValueError: If the number of batches or steps evaluates to 0.
 
   """
-  num_batches = first_x_value.shape[0] // batch_size
-  if not num_batches:
-    raise ValueError('Please specify a batch_size that is smaller than'
-                     'the number of input samples %d.' % first_x_value.shape[0])
-  # TODO(anjalisridhar): TPU currently supports using the num_replicas property.
-  # We might want to look into implementing worker_devices. In multi worker
-  # strategy, perhaps num_replicas works better?
-  steps = num_batches // distribution_strategy.num_replicas
-  if not steps:
-    # TODO(anjalisridhar): Number of replicas in the error message may not
-    # convey what we want to the user. Is there another terminology that we can
-    # use that is consistent across different strategies?
-    raise ValueError('The number of batches %d is smaller than the number '
-                     'of replicas %d used for DistributionStrategy. ' %
-                     (num_batches, distribution_strategy.num_replicas))
-  return steps
+  num_samples = first_x_value.shape[0]
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  # Currently this is only supported in TPUStrategy and CoreMirroredStrategy.
+  use_per_replica_batch = not global_batch_size_supported(
+      distribution_strategy)
+
+  if steps is None:
+    if batch_size is None:
+      # If neither the batch size or number of steps are set. We choose the
+      # global batch size as the minimum of number of samples and 32. 32 is
+      # chosen to provide backward compatibility.
+      global_batch_size = min(num_samples, 32)
+    else:
+      # If the user provided the batch size we need to handle the case
+      # between different strategies that use the global/per-replica batch size
+      global_batch_size = batch_size
+      if use_per_replica_batch:
+        global_batch_size *= distribution_strategy.num_replicas_in_sync
+    if not is_training and num_samples % global_batch_size:
+      raise ValueError('The number of samples %s is not divisible by '
+                       'batch size %s.' % (num_samples, global_batch_size))
+    steps = num_samples // global_batch_size
+  else:
+    if batch_size is None:
+      # We calculate the batch size based on the number of steps specified
+      if num_samples % steps:
+        raise ValueError('The number of samples %s is not divisible by '
+                         'steps %s. Please change the number of steps to a '
+                         'value that can consume all the samples' % (
+                             num_samples, steps))
+      global_batch_size = num_samples // steps
+    else:
+      # If the user provided the batch size we need to handle the case
+      # between different strategies that use the global/per-replica batch size
+      global_batch_size = batch_size
+      if use_per_replica_batch:
+        global_batch_size *= distribution_strategy.num_replicas_in_sync
+
+      if num_samples < (global_batch_size * steps):
+        raise ValueError('Number of samples %s is less than samples required '
+                         'for specified batch_size %s and steps %s' % (
+                             num_samples, global_batch_size, steps))
+
+  # We need to return the per replica or global batch size based on the strategy
+  if use_per_replica_batch:
+    if global_batch_size % distribution_strategy.num_replicas_in_sync:
+      raise ValueError(
+          'The batch size (%s) could not be sharded evenly across the sync '
+          'replicas (%s) in the distribution strategy.' % (
+              global_batch_size, distribution_strategy.num_replicas_in_sync))
+    batch_size = global_batch_size // distribution_strategy.num_replicas_in_sync
+  else:
+    batch_size = global_batch_size
+
+  return steps, batch_size
 
 
 def get_batch_dimension(iterator):
@@ -390,33 +492,6 @@ def get_batch_dimension(iterator):
   return dims[0] if dims else None
 
 
-def get_batch_size(num_replicas, num_samples, steps):
-  """Calculate and return batch size for numpy inputs.
-
-  Args:
-    num_replicas: Number of devices over which the model input is distributed.
-    num_samples: Total number of input samples in the input numpy arrays.
-    steps: Number of steps that we run the model for.
-
-  Returns:
-    batch size used to create the Dataset object from the input numpy arrays.
-
-  """
-  if num_samples % steps != 0:
-    logging.warning('The number of input samples %d is not evenly '
-                    'divisible by the number of steps %d. '
-                    'Some samples will not be processed as expected.' %
-                    (num_samples, steps))
-  global_batch_size = num_samples // steps
-  if global_batch_size % num_replicas != 0:
-    logging.warning('The total number of batches per step %d is not evenly '
-                    'divisible by the number of replicas %d used in '
-                    'DistributionStrategy. Some samples will not be processed '
-                    'as expected.' %
-                    (global_batch_size, num_replicas))
-  return global_batch_size // num_replicas
-
-
 def get_cpu_device(distribution_strategy):
   """Returns the CPU device of the TPU host or the default CPU device string.
 
@@ -432,12 +507,12 @@ def get_cpu_device(distribution_strategy):
     NotImplementedError: We currently don't support copying numpy data to
     multiple hosts in the case of Cloud TPU pods.
   """
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
-    if distribution_strategy.num_hosts > 1:
+  if is_tpu_strategy(distribution_strategy):
+    if distribution_strategy.extended.num_hosts > 1:
       raise NotImplementedError('TPUDistributionStrategy does not '
                                 'support numpy inputs when running on Cloud'
                                 'TPU pods.')
-    return distribution_strategy.get_host_cpu_device(0)
+    return distribution_strategy.extended.get_host_cpu_device(0)
   else:
     # For all strategies except TPUDistributionStrategy
     # TODO(anjalisridhar): We may need to modify this when we add support for
@@ -496,7 +571,7 @@ def _get_var_for_numpy(distribution_strategy, input_array):
                                 input_var.dtype.size
 
   # Calculate number of elements we want to copy per slice.
-  batch_size_per_slice = np.ceil((64 << 20) / byte_size_per_batch_element)
+  batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
 
   # Copy slices of the above size starting at 0, except the last slice will be
   # smaller.
diff --git a/tensorflow/python/keras/engine/feature_columns_integration_test.py b/tensorflow/python/keras/engine/feature_columns_integration_test.py
index e0478ee357b7a5e93d73be2c939930172b5943f7..b7549e013c909a72198018985e2c96d2c20199ea 100644
--- a/tensorflow/python/keras/engine/feature_columns_integration_test.py
+++ b/tensorflow/python/keras/engine/feature_columns_integration_test.py
@@ -18,11 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.eager import context
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.platform import test
@@ -33,7 +35,7 @@ class TestDNNModel(keras.models.Model):
 
   def __init__(self, feature_columns, units, name=None, **kwargs):
     super(TestDNNModel, self).__init__(name=name, **kwargs)
-    self._input_layer = fc.FeatureLayer(feature_columns, name='input_layer')
+    self._input_layer = fc.DenseFeatures(feature_columns, name='input_layer')
     self._dense_layer = keras.layers.Dense(units, name='dense_layer')
 
   def call(self, features):
@@ -42,7 +44,7 @@ class TestDNNModel(keras.models.Model):
     return net
 
 
-class FeatureColumnsIntegrationTest(test.TestCase):
+class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
 
   """
@@ -51,7 +53,7 @@ class FeatureColumnsIntegrationTest(test.TestCase):
   def test_sequential_model(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
-        fc.FeatureLayer(columns),
+        fc.DenseFeatures(columns),
         keras.layers.Dense(64, activation='relu'),
         keras.layers.Dense(20, activation='softmax')
     ])
@@ -72,7 +74,7 @@ class FeatureColumnsIntegrationTest(test.TestCase):
   def test_sequential_model_with_ds_input(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
-        fc.FeatureLayer(columns),
+        fc.DenseFeatures(columns),
         keras.layers.Dense(64, activation='relu'),
         keras.layers.Dense(20, activation='softmax')
     ])
@@ -112,8 +114,10 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.evaluate(x=x, y=y, batch_size=5)
     dnn_model.predict(x=x, batch_size=5)
 
+  @parameterized.parameters(True, False)
   @tf_test_util.run_in_graph_and_eager_modes
-  def test_subclassed_model_with_feature_columns_with_ds_input(self):
+  def test_subclassed_model_with_feature_columns_with_ds_input(self,
+                                                               run_eagerly):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
 
@@ -122,7 +126,8 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.compile(
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=run_eagerly and context.executing_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -140,10 +145,10 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
 
-    feature_layer = fc.FeatureLayer([col_a, col_b], name='fc')
+    feature_layer = fc.DenseFeatures([col_a, col_b], name='fc')
     dense = keras.layers.Dense(4)
 
-    # This seems problematic.... We probably need something for FeatureLayer
+    # This seems problematic.... We probably need something for DenseFeatures
     # the way Input is for InputLayer.
     output = dense(feature_layer)
 
@@ -167,11 +172,11 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     col_b = fc.numeric_column('b')
     col_c = fc.numeric_column('c')
 
-    fc1 = fc.FeatureLayer([col_a, col_b], name='fc1')
-    fc2 = fc.FeatureLayer([col_b, col_c], name='fc2')
+    fc1 = fc.DenseFeatures([col_a, col_b], name='fc1')
+    fc2 = fc.DenseFeatures([col_b, col_c], name='fc2')
     dense = keras.layers.Dense(4)
 
-    # This seems problematic.... We probably need something for FeatureLayer
+    # This seems problematic.... We probably need something for DenseFeatures
     # the way Input is for InputLayer.
     output = dense(fc1) + dense(fc2)
 
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index 6f5d1fa7cfb9ee12e0e866cd3b22ca01bd1a46f6..590b935d40810f74b35fbf5814f3cdbf74ed2d5d 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -84,7 +84,6 @@ class InputLayer(base_layer.Layer):
     self.sparse = sparse
     self.batch_size = batch_size
     self.supports_masking = True
-    self._can_use_graph_functions = True
 
     if isinstance(input_shape, tensor_shape.TensorShape):
       input_shape = tuple(input_shape.as_list())
@@ -194,6 +193,16 @@ def Input(  # pylint: disable=invalid-name
       model = Model(x, y)
       ```
 
+      Note that even if eager execution is enabled,
+      `Input` produces a symbolic tensor (i.e. a placeholder).
+      This symbolic tensor can be used with other
+      TensorFlow ops, as such:
+
+      ```python
+      x = Input(shape=(32,))
+      y = tf.square(x)
+      ```
+
   Raises:
     ValueError: in case of invalid arguments.
   """
diff --git a/tensorflow/python/keras/engine/input_spec.py b/tensorflow/python/keras/engine/input_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..7277c16fe51197af3bf0e045814ccc29f7feaf7c
--- /dev/null
+++ b/tensorflow/python/keras/engine/input_spec.py
@@ -0,0 +1,170 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Contains the InputSpec class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.layers.InputSpec',
+           v1=['keras.layers.InputSpec', 'layers.InputSpec'])
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+def assert_input_compatibility(input_spec, inputs, layer_name):
+  """Checks compatibility between the layer and provided inputs.
+
+  This checks that the tensor(s) `inputs` verify the input assumptions
+  of a layer (if any). If not, a clear and actional exception gets raised.
+
+  Arguments:
+      input_spec: An InputSpec instance, or None.
+      inputs: Input tensor or list of input tensors.
+      layer_name: String, name of the layer (for error message formatting).
+
+  Raises:
+      ValueError: in case of mismatch between
+          the provided inputs and the expectations of the layer.
+  """
+  if not input_spec:
+    return
+  if not isinstance(input_spec, (list, tuple)):
+    input_spec = nest.flatten(input_spec)
+
+  inputs = nest.flatten(inputs)
+  if len(inputs) != len(input_spec):
+    raise ValueError('Layer ' + layer_name + ' expects ' +
+                     str(len(input_spec)) + ' inputs, '
+                     'but it received ' + str(len(inputs)) +
+                     ' input tensors. Inputs received: ' + str(inputs))
+  for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+    if spec is None:
+      continue
+
+    if (spec.ndim is not None or
+        spec.min_ndim is not None or
+        spec.max_ndim is not None):
+      if x.shape.ndims is None:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'its rank is undefined, but the layer requires a '
+                         'defined rank.')
+
+    # Check ndim.
+    if spec.ndim is not None:
+      ndim = x.shape.ndims
+      if ndim != spec.ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                         str(ndim) + '. Full shape received: ' +
+                         str(x.shape.as_list()))
+    if spec.max_ndim is not None:
+      ndim = x.shape.ndims
+      if ndim is not None and ndim > spec.max_ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected max_ndim=' + str(spec.max_ndim) +
+                         ', found ndim=' + str(ndim))
+    if spec.min_ndim is not None:
+      ndim = x.shape.ndims
+      if ndim is not None and ndim < spec.min_ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         ': expected min_ndim=' + str(spec.min_ndim) +
+                         ', found ndim=' + str(ndim) +
+                         '. Full shape received: ' +
+                         str(x.shape.as_list()))
+    # Check dtype.
+    if spec.dtype is not None:
+      if x.dtype != spec.dtype:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected dtype=' + str(spec.dtype) +
+                         ', found dtype=' + str(x.dtype))
+    # Check specific shape axes.
+    if spec.axes:
+      shape = x.shape.as_list()
+      if shape is not None:
+        for axis, value in spec.axes.items():
+          if hasattr(value, 'value'):
+            value = value.value
+          if value is not None and shape[int(axis)] not in {value, None}:
+            raise ValueError(
+                'Input ' + str(input_index) + ' of layer ' + layer_name + ' is'
+                ' incompatible with the layer: expected axis ' + str(axis) +
+                ' of input shape to have value ' + str(value) +
+                ' but received input with shape ' + str(shape))
+    # Check shape.
+    if spec.shape is not None:
+      shape = x.shape.as_list()
+      if shape is not None:
+        for spec_dim, dim in zip(spec.shape, shape):
+          if spec_dim is not None and dim is not None:
+            if spec_dim != dim:
+              raise ValueError('Input ' + str(input_index) +
+                               ' is incompatible with layer ' + layer_name +
+                               ': expected shape=' + str(spec.shape) +
+                               ', found shape=' + str(shape))
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 76a164da994cba04dbf982c575e7264afc05e9e9..41631764830ff965706ffb051273786a99b41b2c 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -112,11 +112,6 @@ class Network(base_layer.Layer):
     self.trainable = True
     self._is_compiled = False
     self._expects_training_arg = False
-    # A list of "extra" variables assigned to attributes of this class, included
-    # in self.weights and self.variables. Always empty for graph networks (but
-    # included in base_init to avoid excessive special casing when retrieving
-    # the value).
-    self._extra_variables = []
     # In many internal cases one needs to compute both the model's output
     # and its output mask without relying on `__call__` (which would do both and
     # set mask metadata), but for models, computing the mask requires to
@@ -134,12 +129,19 @@ class Network(base_layer.Layer):
       self.optimizer = None
 
     # Private attributes to implement compatibility with Layer.
+    self._trainable_weights = []
+    self._non_trainable_weights = []
     self._updates = []  # Used in symbolic mode only.
     self._losses = []
     self._eager_losses = []
+    # A list of metric instances corresponding to the symbolic metric tensors
+    # added using the `add_metric` API.
+    self._metrics = []
+    # A dictionary that maps metric names to metric result tensors.
+    self._metrics_tensors = {}
     self._scope = None  # Never used.
     self._reuse = None  # Never used.
-    self._can_use_graph_functions = False
+    self._call_is_graph_friendly = True
     if context.executing_eagerly():
       self._graph = None
     else:
@@ -258,10 +260,6 @@ class Network(base_layer.Layer):
 
     self._track_layers(layers)
 
-    # A Graph network supports defun-ed eager loops if all of its layers do.
-    self._can_use_graph_functions = all(
-        layer._can_use_graph_functions for layer in layers)
-
     # Create the node linking internal inputs to internal outputs.
     base_layer.Node(
         outbound_layer=self,
@@ -282,9 +280,7 @@ class Network(base_layer.Layer):
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
         self._feed_input_shapes.append(backend.int_shape(self.inputs[i]))
-        # layer.input gives an error in eager mode
-        if not context.executing_eagerly():
-          self._feed_inputs.append(layer.input)
+        self._feed_inputs.append(layer.input)
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
@@ -301,13 +297,12 @@ class Network(base_layer.Layer):
     self.outputs = []
     self.inputs = []
     self.built = False
-    self._static_graph_friendly = True
 
   @property
-  def _is_static_graph_friendly(self):
+  def _static_graph_friendly(self):
     if self._is_graph_network:
-      return all(layer._is_static_graph_friendly for layer in self.layers)
-    return self._static_graph_friendly
+      return all(layer._static_graph_friendly for layer in self.layers)
+    return self._call_is_graph_friendly
 
   def _determine_call_convention(self, call_argspec):
     """Decides how `self.call()` is invoked. See base_layer.CallConvention."""
@@ -415,45 +410,22 @@ class Network(base_layer.Layer):
             # simply by assigning them to attributes.
           not self._is_graph_network
           and isinstance(value, variables.Variable)):
-        self._extra_variables.append(value)
+        if value.trainable:
+          # Could already be added via `add_weight`.
+          if value not in self._trainable_weights:
+            self._trainable_weights.append(value)
+        else:
+          if value not in self._non_trainable_weights:
+            self._non_trainable_weights.append(value)
+
+    # Keeping track of metric instance created in subclassed model/layer.
+    # We do this so that we can maintain the correct order of metrics by adding
+    # the instance to the `metrics` list as soon as it is created.
+    from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
+    if isinstance(value, metrics_module.Metric):
+      self._metrics.append(value)
     super(Network, self).__setattr__(name, value)
 
-  def add_variable(self, name, shape, dtype=None, initializer=None,
-                   regularizer=None, trainable=True, constraint=None):
-    if self._is_graph_network:
-      raise NotImplementedError('`add_variable` is not supported on Networks.')
-    else:
-      raise NotImplementedError(
-          '`add_variable` is not supported on Networks. However, you may '
-          'assign variables to attributes and they will show up in the weights '
-          'and variables properties.')
-
-  def add_weight(self,
-                 name,
-                 shape,
-                 dtype=None,
-                 initializer=None,
-                 regularizer=None,
-                 trainable=None,
-                 constraint=None,
-                 partitioner=None,
-                 use_resource=None,
-                 synchronization=variables.VariableSynchronization.AUTO,
-                 aggregation=variables.VariableAggregation.NONE,
-                 **kwargs):
-    if self._is_graph_network:
-      raise NotImplementedError('`add_weight` is not supported on Networks.')
-    else:
-      raise NotImplementedError(
-          '`add_weight` is not supported on Networks. However, you may '
-          'assign variables to attributes and they will show up in the weights '
-          'and variables properties.')
-
-  @property
-  def uses_learning_phase(self):
-    return any(
-        [getattr(x, '_uses_learning_phase', False) for x in self.outputs])
-
   @property
   def stateful(self):
     return any([(hasattr(layer, 'stateful') and layer.stateful)
@@ -562,14 +534,13 @@ class Network(base_layer.Layer):
 
   @property
   def _unfiltered_updates(self):
-    if context.executing_eagerly():
-      return []
     updates = []
     for layer in self.layers:
       if isinstance(layer, Network):
         updates += layer._unfiltered_updates
       else:
         updates += layer.updates
+    updates += self._updates
     return updates
 
   @property
@@ -646,9 +617,6 @@ class Network(base_layer.Layer):
     Returns:
         A list of update ops.
     """
-    if context.executing_eagerly():
-      return []
-
     if not self.trainable and not self.stateful:
       return []
 
@@ -664,7 +632,7 @@ class Network(base_layer.Layer):
       else:
         relevant_inputs.append(inputs)
     if not relevant_inputs:
-      return updates
+      return list(set(updates))
 
     reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
@@ -672,8 +640,7 @@ class Network(base_layer.Layer):
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
     # A layer could be used multiple times in a nested structure,
     # so the updates list must be de-duped.
-    return list(set(
-        relevant_conditional_updates + unconditional_updates + self._updates))
+    return list(set(relevant_conditional_updates + unconditional_updates))
 
   @property
   def losses(self):
@@ -733,14 +700,38 @@ class Network(base_layer.Layer):
     return checkpointable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
-        extra_variables=self._extra_variables)
+        extra_variables=self._trainable_weights)
 
   @property
   def non_trainable_weights(self):
     return checkpointable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
-        extra_variables=self._extra_variables)
+        extra_variables=self._non_trainable_weights + self._trainable_weights)
+
+  @property
+  def metrics(self):
+    """Returns the network's symbolic metrics.
+
+    Model overrides this function to include the metrics from `compile` API.
+    """
+    metrics = []
+    for layer in self.layers:
+      metrics += layer._metrics  # pylint: disable=protected-access
+    return metrics + self._metrics
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    # TODO(psv): Remove this property.
+    metrics_tensors = {}
+    for layer in self.layers:
+      if isinstance(layer, Network):
+        metrics_tensors.update(layer._all_metrics_tensors)
+      else:
+        metrics_tensors.update(layer._metrics_tensors)
+    metrics_tensors.update(self._metrics_tensors)
+    return metrics_tensors
 
   @property
   def input_spec(self):
@@ -900,9 +891,7 @@ class Network(base_layer.Layer):
 
   def compute_output_shape(self, input_shape):
     if not self._is_graph_network:
-      if context.executing_eagerly():
-        return super(Network, self).compute_output_shape(input_shape)
-      raise NotImplementedError
+      return super(Network, self).compute_output_shape(input_shape)
 
     if isinstance(input_shape, list):
       input_shapes = []
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 61bff7fff23d188117ab6d86dc4ff2940568a055..22c48e3f13aba5e861be16b8c8b1da79d4d8e250 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -79,6 +79,10 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
 
   from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
 
+  # TODO(psv) Add warning when we save models that contain non-serializable
+  # entities like metrics added using `add_metric` and losses added using
+  # `add_loss.`
+
   if not isinstance(filepath, h5py.File):
     # If file exists and should not be overwritten.
     if not overwrite and os.path.isfile(filepath):
@@ -126,8 +130,8 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
                     'config': model.optimizer.get_config()
                 },
                 'loss': model.loss,
-                'metrics': model.metrics,
-                'weighted_metrics': model.weighted_metrics,
+                'metrics': model._compile_metrics,
+                'weighted_metrics': model._compile_weighted_metrics,
                 'sample_weight_mode': model.sample_weight_mode,
                 'loss_weights': model.loss_weights,
             },
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index c25702d964ed523666b8b78e7dc2f0ce9bca477f..26866d4714dbfe724d23e21a57885ecc466b033d 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -120,8 +120,8 @@ class Sequential(Model):
     return layers[:]
 
   @property
-  def _is_static_graph_friendly(self):
-    return all(layer._is_static_graph_friendly for layer in self.layers)
+  def _static_graph_friendly(self):
+    return all(layer._static_graph_friendly for layer in self.layers)
 
   @checkpointable.no_automatic_dependency_tracking
   def add(self, layer):
@@ -190,8 +190,6 @@ class Sequential(Model):
       self._layers.append(layer)
     if self._layers:
       self._track_layers(self._layers)
-    self._can_use_graph_functions = all(
-        layer._can_use_graph_functions for layer in self.layers)
 
   @checkpointable.no_automatic_dependency_tracking
   def pop(self):
@@ -213,8 +211,6 @@ class Sequential(Model):
       self.outputs = [self.layers[-1].output]
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
       self.built = True
-    self._can_use_graph_functions = all(
-        layer._can_use_graph_functions for layer in self.layers)
 
   def build(self, input_shape=None):
     if self._is_graph_network:
@@ -357,6 +353,12 @@ class Sequential(Model):
       model.built = False
     return model
 
+  @property
+  def input_spec(self):
+    if self.layers and hasattr(self.layers[0], 'input_spec'):
+      return self.layers[0].input_spec
+    return None
+
 
 def get_input_shape_and_dtype(layer):
   """Retrieve input shape and input dtype of layer if applicable.
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 401dff308ad12158788676cc68eddfa85e919554..54db0f3566077d74f623edd9ef0a0699cb103ed4 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
@@ -317,6 +318,15 @@ class TestSequential(test.TestCase, parameterized.TestCase):
          'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0'],
         [v.name for v in model.variables])
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_input_assumptions_propagation(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1))
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(ValueError,
+                                   'expected min_ndim=2, found ndim=0'):
+        model(1.0)
+
 
 class TestSequentialEagerIntegration(test.TestCase):
 
@@ -353,27 +363,23 @@ class TestSequentialEagerIntegration(test.TestCase):
     model.fit(x, y, epochs=1)
 
   @tf_test_util.run_in_graph_and_eager_modes
-  def test_sequential_can_use_graph_functions(self):
-    model = testing_utils.get_small_sequential_mlp(4, 3)
-    self.assertTrue(model._can_use_graph_functions)
-    inner_model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.add(inner_model)
-
-    self.assertTrue(model._can_use_graph_functions)
-
-    inner_model_two = testing_utils.get_small_sequential_mlp(5, 7)
-    self.assertTrue(inner_model_two._can_use_graph_functions)
-
-    layer = keras.layers.Lambda(lambda x: x)
-    layer._can_use_graph_functions = False
-    inner_model_two.add(layer)
-    self.assertFalse(inner_model_two._can_use_graph_functions)
-
-    model.add(inner_model_two)
-    self.assertFalse(model._can_use_graph_functions)
-
-    model.pop()
-    self.assertTrue(model._can_use_graph_functions)
+  def test_sequential_model_fails_with_dict_inputs(self):
+    num_classes = 5
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes)
+    model.compile(
+        rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        metrics=['acc'],
+        weighted_metrics=['mae'],
+        loss='categorical_crossentropy')
+
+    x = {'dense_input': np.random.random((10, 1))}
+    y = np.random.randint(num_classes, size=(10, 1))
+
+    with self.assertRaisesRegexp(
+        ValueError, 'Passing a dictionary input to a Sequential Model which '
+        'doesn\'t have FeatureLayer as the first layer is an error'):
+      model.fit(x, y, batch_size=5, epochs=1)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index 298044c5df28ad7cde9c5a2a34e466708e4886ee..b4a4babf25924a615472cb11e15b7ddc49253bc3 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -341,62 +341,24 @@ class TopologyConstructionTest(test.TestCase):
     self.assertListEqual(model.trainable_weights, [])
     self.assertListEqual(model.non_trainable_weights, weights)
 
-  def test_learning_phase(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(32,), name='input_a')
-      b = keras.layers.Input(shape=(32,), name='input_b')
-
-      a_2 = keras.layers.Dense(16, name='dense_1')(a)
-      dp = keras.layers.Dropout(0.5, name='dropout')
-      b_2 = dp(b)
-
-      self.assertFalse(a_2._uses_learning_phase)
-      self.assertTrue(b_2._uses_learning_phase)
-
-      # test merge
-      m = keras.layers.concatenate([a_2, b_2])
-      self.assertTrue(m._uses_learning_phase)
-
-      # Test recursion
-      model = keras.models.Model([a, b], [a_2, b_2])
-      self.assertTrue(model.uses_learning_phase)
-
-      c = keras.layers.Input(shape=(32,), name='input_c')
-      d = keras.layers.Input(shape=(32,), name='input_d')
-
-      c_2, b_2 = model([c, d])
-      self.assertTrue(c_2._uses_learning_phase)
-      self.assertTrue(b_2._uses_learning_phase)
-
-      # try actually running graph
-      fn = keras.backend.function(
-          model.inputs + [keras.backend.learning_phase()], model.outputs)
-      input_a_np = np.random.random((10, 32))
-      input_b_np = np.random.random((10, 32))
-      fn_outputs_no_dp = fn([input_a_np, input_b_np, 0])
-      fn_outputs_dp = fn([input_a_np, input_b_np, 1])
-      # output a: nothing changes
-      self.assertEqual(fn_outputs_no_dp[0].sum(), fn_outputs_dp[0].sum())
-      # output b: dropout applied
-      self.assertNotEqual(fn_outputs_no_dp[1].sum(), fn_outputs_dp[1].sum())
-
   def test_layer_call_arguments(self):
     # Test the ability to pass and serialize arguments to `call`.
     inp = keras.layers.Input(shape=(2,))
     x = keras.layers.Dense(3)(inp)
     x = keras.layers.Dropout(0.5)(x, training=True)
     model = keras.models.Model(inp, x)
-    self.assertFalse(model.uses_learning_phase)
+    # Would be `dropout/cond/Merge` by default
+    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
 
     # Test that argument is kept when applying the model
     inp2 = keras.layers.Input(shape=(2,))
     out2 = model(inp2)
-    self.assertFalse(out2._uses_learning_phase)
+    self.assertTrue(out2.op.name.endswith('dropout/mul'))
 
     # Test that argument is kept after loading a model
     config = model.get_config()
     model = keras.models.Model.from_config(config)
-    self.assertFalse(model.uses_learning_phase)
+    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
 
   def test_node_construction(self):
     # test basics
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index f7f61d2a04987f6fe04d2820f1e8691e461e18dd..56f069c057088b2b09319864225acc8bb3884c8e 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import weakref
 import numpy as np
 
@@ -31,7 +32,6 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_distributed
@@ -41,8 +41,6 @@ from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -124,11 +122,8 @@ class Model(Network):
     # initializing _distribution_strategy here since it is possible to call
     # predict on a model without compiling it.
     self._distribution_strategy = None
-    # This flag must be disabled upon model mutation, such as changing the model
-    # layers or recompiling the model to use a different optimizer. New function
-    # definitions are generated whenever this flag is disabled, ensuring that
-    # internal graph functions are always using the current model structure.
-    self._built_graph_functions = False
+
+    self.run_eagerly = None
 
   def _set_sample_weight_attributes(self, sample_weight_mode,
                                     skip_target_weighing_indices):
@@ -180,19 +175,66 @@ class Model(Network):
       metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
     j = 1
     base_metric_name = metric_name
-    while metric_name in self.metrics_names:
+    while metric_name in self._compile_metrics_names:
       metric_name = '%s_%d' % (base_metric_name, j)
       j += 1
 
     return metric_name
 
+  @property
+  def metrics(self):
+    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
+    metrics = []
+    if self._is_compiled:
+      metrics += self._compile_stateful_metric_functions
+    return metrics + super(Model, self).metrics
+
+  @property
+  def metrics_names(self):
+    """Returns the model's display labels for all outputs."""
+    metrics_names = []
+    if self._is_compiled:
+      metrics_names += self._compile_metrics_names  # Includes names of losses.
+
+    # Add metric names from layers.
+    for layer in self.layers:
+      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
+    metrics_names += [m.name for m in self._metrics]
+    return metrics_names
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
+  @property
+  def _all_stateful_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_stateful_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
   def _init_metric_attributes(self):
     """Initialized model metric attributes."""
-    self.metrics_names = ['loss']
-    self.metrics_tensors = []
-    self.metrics_updates = []
-    self.stateful_metric_names = []
-    self.stateful_metric_functions = []
+    # List of all metric names in the model.
+    self._compile_metrics_names = ['loss']
+    # List of stateful metric functions. Used for resetting metric state during
+    # training/eval.
+    # This includes loss functions when there are multiple outputs.
+    self._compile_stateful_metric_functions = []
+    # Dict of all aggregated metric result tensors. This includes aggregated
+    # loss result tensors when there are multiple outputs.
+    self._compile_stateful_metrics_tensors = {}
+    # Dict of all metric result tensors (aggregated or not - based on the
+    # values given in compile.). This includes aggregated loss result tensors
+    # when there are multiple outputs.
+    self._compile_metrics_tensors = {}
 
   def _set_per_output_metric_attributes(self, metrics_dict, output_index):
     """Sets the metric attributes on the model for the given output.
@@ -201,33 +243,47 @@ class Model(Network):
       metrics_dict: A dict with metric names as keys and metric fns as values.
       output_index: The index of the model output for which the metric
         attributes are added.
+
+    Returns:
+      Metrics dict updated with unique metric names as keys.
     """
-    for metric_name, metric_fn in metrics_dict.items():
+    updated_metrics_dict = collections.OrderedDict()
+    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
       metric_name = self._add_unique_metric_name(metric_name, output_index)
-      # Keep track of metric name.
-      self.metrics_names.append(metric_name)
-
-      # Keep track of stateful metric attributes (name and metric function).
-      if isinstance(metric_fn, base_layer.Layer) and metric_fn.stateful:
-        self.stateful_metric_names.append(metric_name)
-        self.stateful_metric_functions.append(metric_fn)
+      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
+      # Keep track of metric name, function and stateful function.
+      self._compile_metrics_names.append(metric_name)
+      self._compile_stateful_metric_functions.append(stateful_metric_fn)
+    return updated_metrics_dict
 
   def _set_metric_attributes(self, outputs, skip_target_indices=None):
     """Sets the metric attributes on the model for all the model outputs."""
     skip_target_indices = skip_target_indices or []
+    updated_per_output_metrics = []
+    updated_per_output_weighted_metrics = []
     for i in range(len(outputs)):
       if i in skip_target_indices:
+        updated_per_output_metrics.append(self._per_output_metrics[i])
+        updated_per_output_weighted_metrics.append(
+            self._per_output_weighted_metrics[i])
         continue
-      self._set_per_output_metric_attributes(self._per_output_metrics[i], i)
-      self._set_per_output_metric_attributes(
-          self._per_output_weighted_metrics[i], i)
+      updated_per_output_metrics.append(
+          self._set_per_output_metric_attributes(self._per_output_metrics[i],
+                                                 i))
+      updated_per_output_weighted_metrics.append(
+          self._set_per_output_metric_attributes(
+              self._per_output_weighted_metrics[i], i))
+
+    self._per_output_metrics = updated_per_output_metrics
+    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
 
   def _handle_per_output_metrics(self,
                                  metrics_dict,
                                  y_true,
                                  y_pred,
                                  mask,
-                                 weights=None):
+                                 weights=None,
+                                 return_stateful_result=True):
     """Calls metric functions for a single output.
 
     Arguments:
@@ -236,52 +292,50 @@ class Model(Network):
       y_pred: Predicted output.
       mask: Computed mask value for the current output.
       weights: Weights to be applied on the current output.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
     Returns:
       A list of metric result tensors.
     """
     metric_results = []
-    for metric_name, metric_fn in metrics_dict.items():
+    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
       with K.name_scope(metric_name):
+
+        def _call_stateful_fn(fn):
+          return training_utils.call_metric_function(
+              fn, y_true, y_pred, weights=weights, mask=mask)
+
+        def _call_stateless_fn(fn):
+          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
+          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
+
+        def _track_metric_tensors(name, stateless_result, stateful_result):
+          self._compile_metrics_tensors[name] = stateless_result
+          self._compile_stateful_metrics_tensors[name] = stateful_result
+
         if isinstance(metric_fn, metrics_module.Metric):
-          # Call the stateful metric function.
-          if mask is not None:
-            mask = math_ops.cast(mask, y_pred.dtype)
-            # Update weights with mask.
-            if weights is None:
-              weights = mask
-            else:
-              # Update shape of weights if possible before adding mask.
-              # Update dimensions of weights to match with mask if possible.
-              mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
-                  mask, None, weights)
-              try:
-                # Broadcast weights if possible.
-                weights = weights_broadcast_ops.broadcast_weights(weights, mask)
-              except ValueError:
-                pass
-                # TODO(psv): Handle case when mask and weight shapes are not
-                # compatible.
-              weights *= mask
-
-          metric_result = metric_fn(y_true, y_pred, weights)
+          # If the given metric fn is stateful, call the fn and return result.
+          metric_result = _call_stateful_fn(metric_fn)
+          metric_results.append(metric_result)
+          if not self.run_eagerly:
+            _track_metric_tensors(metric_name, metric_result, metric_result)
+        elif self.run_eagerly:
+          # In eager mode, if the given metric fn is not stateful, we invoke the
+          # given fn or its stateful version based on the given flag.
+          if return_stateful_result:
+            metric_result = _call_stateful_fn(stateful_fn)
+          else:
+            metric_result = _call_stateless_fn(metric_fn)
+          metric_results.append(metric_result)
         else:
-          # Call the stateless metric function.
-          weighted_metric_fn = training_utils.weighted_masked_objective(
-              metric_fn)
-          metric_result = weighted_metric_fn(
-              y_true, y_pred, weights=weights, mask=mask)
-
-        if not context.executing_eagerly():
-          # Keep track of metric result tensor.
-          self.metrics_tensors.append(metric_result)
-
-      metric_results.append(metric_result)
-      is_stateful = isinstance(metric_fn,
-                               base_layer.Layer) and metric_fn.stateful
-      if is_stateful and not context.executing_eagerly():
-        # Keep track of updates created by stateful metrics.
-        self.metrics_updates += metric_fn.updates
+          # In graph mode, we build the sub-graph for both the stateful and the
+          # stateless fns.
+          stateful_metric_result = _call_stateful_fn(stateful_fn)
+          metric_result = _call_stateless_fn(metric_fn)
+          _track_metric_tensors(metric_name, metric_result,
+                                stateful_metric_result)
+
     return metric_results
 
   def _handle_metrics(self,
@@ -289,7 +343,8 @@ class Model(Network):
                       skip_target_indices=None,
                       targets=None,
                       sample_weights=None,
-                      masks=None):
+                      masks=None,
+                      return_stateful_result=True):
     """Handles calling metric functions.
 
     Arguments:
@@ -298,6 +353,8 @@ class Model(Network):
       targets: List of targets.
       sample_weights: Optional list of sample weight arrays.
       masks: List of computed output mask values.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
     Returns:
       A list of metric result tensors.
@@ -305,6 +362,7 @@ class Model(Network):
     skip_target_indices = skip_target_indices or []
     metric_results = []
     with K.name_scope('metrics'):
+      # Invoke all metrics added using `compile`.
       for i in range(len(outputs)):
         if i in skip_target_indices:
           continue
@@ -312,17 +370,62 @@ class Model(Network):
         target = targets[i] if targets else None
         output_mask = masks[i] if masks else None
         metric_results.extend(
-            self._handle_per_output_metrics(self._per_output_metrics[i], target,
-                                            output, output_mask))
+            self._handle_per_output_metrics(
+                self._per_output_metrics[i],
+                target,
+                output,
+                output_mask,
+                return_stateful_result=return_stateful_result))
         metric_results.extend(
             self._handle_per_output_metrics(
                 self._per_output_weighted_metrics[i],
                 target,
                 output,
                 output_mask,
-                weights=sample_weights[i]))
+                weights=sample_weights[i],
+                return_stateful_result=return_stateful_result))
+
+    # Add metric results from the `add_metric` metrics in eager mode.
+    if context.executing_eagerly():
+      for m in self.metrics:
+        if m not in self._compile_stateful_metric_functions:
+          metric_results.append(m.result())
     return metric_results
 
+  @property
+  def run_eagerly(self):
+    """Settable attribute indicating whether the model should run eagerly.
+
+    Running eagerly means that your model will be run step by step,
+    like Python code. Your model might run slower, but it should become easier
+    for you to debug it by stepping into individual layer calls.
+
+    By default, we will attempt to compile your model to a static graph to
+    deliver the best execution performance.
+
+    Returns:
+      Boolean, whether the model should run eagerly.
+    """
+    if self._run_eagerly is True and not context.executing_eagerly():
+      raise ValueError('You can only set `run_eagerly=True` if eager execution '
+                       'is enabled.')
+    if self._static_graph_friendly:
+      if self._run_eagerly is None:
+        return False
+      else:
+        return self._run_eagerly
+    else:
+      if self._run_eagerly is False:
+        # TODO(fchollet): consider using py_func to enable this.
+        raise ValueError('Your model contains layers that can only be '
+                         'successfully run in eager execution. '
+                         'You cannot set `run_eagerly=False`.')
+      return context.executing_eagerly()
+
+  @run_eagerly.setter
+  def run_eagerly(self, value):
+    self._run_eagerly = value
+
   @checkpointable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
@@ -384,9 +487,8 @@ class Model(Network):
         ValueError: In case of invalid arguments for
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
-    # The correct graph function may have changed,
-    # already-built ones must be updated
-    self._built_graph_functions = False
+    run_eagerly = kwargs.pop('run_eagerly', None)
+    self._run_eagerly = run_eagerly
 
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
@@ -396,9 +498,9 @@ class Model(Network):
         raise NotImplementedError(
             'optimizer must be an instance of '
             'tf.train.Optimizer, not a %s' % type(optimizer))
-      if context.executing_eagerly():
+      if self.run_eagerly:
         raise NotImplementedError('DistributionStrategy is not supported '
-                                  'when eager execution is enabled.')
+                                  'when running a model eagerly.')
       if sample_weight_mode:
         raise NotImplementedError('sample_weight_mode is not supported with '
                                   'DistributionStrategy.')
@@ -410,11 +512,12 @@ class Model(Network):
                          'DistributionStrategy.')
 
     loss = loss or {}
-    if context.executing_eagerly() and not isinstance(
+    if self.run_eagerly and not isinstance(
         optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
       raise ValueError(
-          'optimizer must be an instance of tf.train.Optimizer, not '
-          'a %s' % type(optimizer))
+          'When running a model in eager execution, the optimizer must be an '
+          'instance of tf.train.Optimizer. Received: '
+          '%s' % optimizer)
 
     self.optimizer = optimizers.get(optimizer)
     # We've disabled automatic dependency tracking for this method, but do want
@@ -423,12 +526,14 @@ class Model(Network):
       self._track_checkpointable(
           self.optimizer, name='optimizer', overwrite=True)
     self.loss = loss
-    self.metrics = metrics or []
+    self._compile_metrics = metrics or []
     self.loss_weights = loss_weights
     self.sample_weight_mode = sample_weight_mode
-    self.weighted_metrics = weighted_metrics
-    if context.executing_eagerly() and target_tensors is not None:
-      raise ValueError('target_tensors is not supported in Eager mode.')
+    self._compile_weighted_metrics = weighted_metrics
+    if self.run_eagerly and target_tensors is not None:
+      raise ValueError(
+          'target_tensors argument is not supported when '
+          'running a model eagerly.')
     self.target_tensors = target_tensors
 
     # Set DistributionStrategy specific parameters.
@@ -438,6 +543,8 @@ class Model(Network):
     if self._distribution_strategy is not None:
       distributed_training_utils.configure_and_create_session(
           self._distribution_strategy)
+    # Initialize model metric attributes.
+    self._init_metric_attributes()
     if not self.built:
       # Model is not compilable because it does not know its number of inputs
       # and outputs, nor their shapes and names. We will compile after the first
@@ -474,21 +581,19 @@ class Model(Network):
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
 
-    weighted_losses = [training_utils.weighted_masked_objective(fn)
-                       for fn in loss_functions]
     skip_target_indices = []
     skip_target_weighing_indices = []
     self._feed_outputs = []
     self._feed_output_names = []
     self._feed_output_shapes = []
     self._feed_loss_fns = []
-    for i in range(len(weighted_losses)):
-      if weighted_losses[i] is None:
+    for i in range(len(loss_functions)):
+      if loss_functions[i] is None:
         skip_target_indices.append(i)
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    if not context.executing_eagerly():
+    if not self.run_eagerly:
       masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
       if not isinstance(masks, list):
         masks = [masks]
@@ -519,11 +624,8 @@ class Model(Network):
                       str(loss_weights) + ' - expected a list of dicts.')
     self.loss_weights_list = loss_weights_list
 
-    # Initialize model metric attributes.
-    self._init_metric_attributes()
-
     # Initialization for Eager mode execution.
-    if context.executing_eagerly():
+    if self.run_eagerly:
       # Prepare sample weights.
       self._set_sample_weight_attributes(sample_weight_mode,
                                          skip_target_weighing_indices)
@@ -536,7 +638,7 @@ class Model(Network):
       self.total_loss = None
       for i in range(len(self.outputs)):
         if len(self.outputs) > 1:
-          self.metrics_names.append(self.output_names[i] + '_loss')
+          self._compile_metrics_names.append(self.output_names[i] + '_loss')
 
       # Set metric attributes on model.
       self._set_metric_attributes(
@@ -550,127 +652,152 @@ class Model(Network):
       self._collected_trainable_weights = self.trainable_weights
       return
 
-    # Prepare targets of model.
-    self.targets = []
-    self._feed_targets = []
-    if target_tensors not in (None, []):
-      if isinstance(target_tensors, list):
-        if len(target_tensors) != len(self.outputs):
-          raise ValueError(
-              'When passing a list as `target_tensors`, '
-              'it should have one entry per model output. '
-              'The model has ' + str(len(self.outputs)) +
-              ' outputs, but you passed target_tensors=' + str(target_tensors))
-      elif isinstance(target_tensors, dict):
-        for name in target_tensors:
-          if name not in self.output_names:
+    with K.get_graph().as_default():
+      # Prepare targets of model.
+      self.targets = []
+      self._feed_targets = []
+      if target_tensors not in (None, []):
+        if isinstance(target_tensors, list):
+          if len(target_tensors) != len(self.outputs):
             raise ValueError(
-                'Unknown entry in `target_tensors` '
-                'dictionary: "' + name + '". '
-                'Only expected the following keys: ' + str(self.output_names))
-        tmp_target_tensors = []
-        for name in self.output_names:
-          tmp_target_tensors.append(target_tensors.get(name, None))
-        target_tensors = tmp_target_tensors
-      elif tensor_util.is_tensor(target_tensors):
-        target_tensors = [target_tensors]
-      else:
-        raise TypeError('Expected `target_tensors` to be a list or tuple or '
-                        'dict or a single tensor, but got:', target_tensors)
-
-    for i in range(len(self.outputs)):
-      if i in skip_target_indices:
-        self.targets.append(None)
-      else:
-        shape = K.int_shape(self.outputs[i])
-        name = self.output_names[i]
-        if target_tensors not in (None, []):
-          target = target_tensors[i]
-        else:
-          target = None
-        if target is None or K.is_placeholder(target):
-          if target is None:
-            target = K.placeholder(
-                ndim=len(shape),
-                name=name + '_target',
-                sparse=K.is_sparse(self.outputs[i]),
-                dtype=K.dtype(self.outputs[i]))
-          self._feed_targets.append(target)
-          self._feed_outputs.append(self.outputs[i])
-          self._feed_output_names.append(name)
-          self._feed_output_shapes.append(shape)
-          self._feed_loss_fns.append(self.loss_functions[i])
+                'When passing a list as `target_tensors`, '
+                'it should have one entry per model output. '
+                'The model has %s outputs, but you passed target_tensors=%s' %
+                (len(self.outputs), target_tensors))
+        elif isinstance(target_tensors, dict):
+          for name in target_tensors:
+            if name not in self.output_names:
+              raise ValueError(
+                  'Unknown entry in `target_tensors` '
+                  'dictionary: "' + name + '". '
+                  'Only expected the following keys: ' + str(self.output_names))
+          tmp_target_tensors = []
+          for name in self.output_names:
+            tmp_target_tensors.append(target_tensors.get(name, None))
+          target_tensors = tmp_target_tensors
+        elif tensor_util.is_tensor(target_tensors):
+          target_tensors = [target_tensors]
         else:
-          skip_target_weighing_indices.append(i)
-        self.targets.append(target)
-
-    # Prepare sample weights.
-    self._set_sample_weight_attributes(sample_weight_mode,
-                                       skip_target_weighing_indices)
-    # Save all metric attributes per output of the model.
-    self._cache_output_metric_attributes(metrics, weighted_metrics)
-
-    # Compute total loss.
-    total_loss = None
-    with K.name_scope('loss'):
+          raise TypeError('Expected `target_tensors` to be a list or tuple or '
+                          'dict or a single tensor, but got:', target_tensors)
+
       for i in range(len(self.outputs)):
         if i in skip_target_indices:
-          continue
-        y_true = self.targets[i]
-        y_pred = self.outputs[i]
-        weighted_loss = weighted_losses[i]
-        sample_weight = self.sample_weights[i]
-        mask = masks[i]
-        loss_weight = loss_weights_list[i]
-        with K.name_scope(self.output_names[i] + '_loss'):
-          output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
-        if len(self.outputs) > 1:
-          self.metrics_tensors.append(output_loss)
-          self.metrics_names.append(self.output_names[i] + '_loss')
-        if total_loss is None:
-          total_loss = loss_weight * output_loss
-        else:
-          total_loss += loss_weight * output_loss
-      if total_loss is None:
-        if not self.losses:
-          raise ValueError('The model cannot be compiled '
-                           'because it has no loss to optimize.')
+          self.targets.append(None)
         else:
-          total_loss = 0.
-
-      # Add regularization penalties
-      # and other layer-specific losses.
-      for loss_tensor in self.losses:
-        total_loss += loss_tensor
-
-    # Set metric attributes on model.
-    self._set_metric_attributes(
-        self.outputs,
-        skip_target_indices=skip_target_indices,
-    )
-    # Invoke metric functions for all the outputs.
-    self._handle_metrics(
-        self.outputs,
-        masks=masks,
-        targets=self.targets,
-        skip_target_indices=skip_target_indices,
-        sample_weights=self.sample_weights)
-
-    # Prepare gradient updates and state updates.
-    self.total_loss = total_loss
-
-    # Functions for train, test and predict will
-    # be compiled lazily when required.
-    # This saves time when the user is not using all functions.
-    self._function_kwargs = kwargs
-
-    self.train_function = None
-    self.test_function = None
-    self.predict_function = None
-
-    # Collected trainable weights, sorted in topological order.
-    trainable_weights = self.trainable_weights
-    self._collected_trainable_weights = trainable_weights
+          shape = K.int_shape(self.outputs[i])
+          name = self.output_names[i]
+          if target_tensors not in (None, []):
+            target = target_tensors[i]
+          else:
+            target = None
+          if target is None or K.is_placeholder(target):
+            if target is None:
+              target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
+                  self.loss_functions[i],
+                  K.dtype(self.outputs[i]))
+
+              target = K.placeholder(
+                  ndim=len(shape),
+                  name=name + '_target',
+                  sparse=K.is_sparse(self.outputs[i]),
+                  dtype=target_dtype)
+            self._feed_targets.append(target)
+            self._feed_outputs.append(self.outputs[i])
+            self._feed_output_names.append(name)
+            self._feed_output_shapes.append(shape)
+            self._feed_loss_fns.append(self.loss_functions[i])
+          else:
+            skip_target_weighing_indices.append(i)
+          self.targets.append(target)
+
+      # Prepare sample weights.
+      self._set_sample_weight_attributes(sample_weight_mode,
+                                         skip_target_weighing_indices)
+      # Save all metric attributes per output of the model.
+      self._cache_output_metric_attributes(metrics, weighted_metrics)
+
+      # Compute total loss.
+      total_loss = None
+      with K.name_scope('loss'):
+        for i in range(len(self.outputs)):
+          if i in skip_target_indices:
+            continue
+          y_true = self.targets[i]
+          y_pred = self.outputs[i]
+          loss_fn = loss_functions[i]
+          sample_weight = self.sample_weights[i]
+          mask = masks[i]
+          loss_weight = loss_weights_list[i]
+          with K.name_scope(self.output_names[i] + '_loss'):
+            weighted_loss = training_utils.weighted_masked_objective(loss_fn)
+            output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+
+          if len(self.outputs) > 1:
+            # Keep track of the un-aggregated loss result tensor.
+            self._compile_metrics_tensors[self.output_names[i] +
+                                          '_loss'] = output_loss
+
+            # Keep track of stateful result tensor and function for the loss.
+            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
+                loss_fn, name=loss_fn.__name__)
+            result_tensor = training_utils.call_metric_function(
+                mean_wrapped_loss,
+                y_true,
+                y_pred,
+                weights=sample_weight,
+                mask=mask)
+            self._compile_stateful_metrics_tensors[self.output_names[i] +
+                                                   '_loss'] = result_tensor
+            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
+
+            self._compile_metrics_names.append(self.output_names[i] + '_loss')
+          if total_loss is None:
+            total_loss = loss_weight * output_loss
+          else:
+            total_loss += loss_weight * output_loss
+        if total_loss is None:
+          if not self.losses:
+            raise ValueError('The model cannot be compiled '
+                             'because it has no loss to optimize.')
+          else:
+            total_loss = 0.
+
+        # Add regularization penalties
+        # and other layer-specific losses.
+        for loss_tensor in self.losses:
+          total_loss += loss_tensor
+
+      # Set metric attributes on model.
+      self._set_metric_attributes(
+          self.outputs,
+          skip_target_indices=skip_target_indices,
+      )
+      # Invoke metric functions for all the outputs.
+      self._handle_metrics(
+          self.outputs,
+          masks=masks,
+          targets=self.targets,
+          skip_target_indices=skip_target_indices,
+          sample_weights=self.sample_weights)
+
+      # Prepare gradient updates and state updates.
+      self.total_loss = total_loss
+
+      # Functions for train, test and predict will
+      # be compiled lazily when required.
+      # This saves time when the user is not using all functions.
+      self._function_kwargs = kwargs
+
+      self._fit_function = None
+      self._eval_function = None
+      self.train_function = None
+      self.test_function = None
+      self.predict_function = None
+
+      # Collected trainable weights, sorted in topological order.
+      trainable_weights = self.trainable_weights
+      self._collected_trainable_weights = trainable_weights
 
   def _check_trainable_weights_consistency(self):
     """Check trainable weights count consistency.
@@ -690,69 +817,124 @@ class Model(Network):
           ' trainable weights, did you set `model.trainable`'
           ' without calling `model.compile` after ?', 1)
 
-  def _make_train_function(self):
-    if not hasattr(self, 'train_function'):
+  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
+    if not hasattr(self, fn_name):
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
-    if self.train_function is None:
+    if getattr(self, fn_name) is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        inputs += [K.learning_phase()]
+      if not isinstance(K.symbolic_learning_phase(), int):
+        inputs += [K.symbolic_learning_phase()]
+
+      with K.get_graph().as_default():
+        with K.name_scope('training'):
+          with K.name_scope(self.optimizer.__class__.__name__):
+            # Training updates
+            updates = self.optimizer.get_updates(
+                params=self._collected_trainable_weights, loss=self.total_loss)
+      # Unconditional updates
+      updates += self.get_updates_for(None)
+      # Conditional updates relevant to this model
+      updates += self.get_updates_for(self.inputs)
+      # Add stateful metrics updates.
+      if metric_updates is not None:
+        updates += metric_updates
 
       with K.name_scope('training'):
-        with K.name_scope(self.optimizer.__class__.__name__):
-          # Training updates
-          updates = self.optimizer.get_updates(
-              params=self._collected_trainable_weights, loss=self.total_loss)
-        # Unconditional updates
-        updates += self.get_updates_for(None)
-        # Conditional updates relevant to this model
-        updates += self.get_updates_for(self.inputs)
-        # Stateful metrics updates
-        updates += self.metrics_updates
         # Gets loss and metrics. Updates weights at each call.
-        self.train_function = K.function(
-            inputs, [self.total_loss] + self.metrics_tensors,
+        fn = K.function(
+            inputs,
+            outputs,
             updates=updates,
             name='train_function',
             **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
-  def _make_test_function(self):
-    if not hasattr(self, 'test_function'):
+  def _make_train_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper('train_function',
+                                     [self.total_loss] + metrics_tensors)
+
+  def _make_fit_function(self):
+    # TODO(psv/anjalisridhar): Remove updates after we fix b/118841692
+    # Stateful metrics updates
+    metric_updates = []
+    for m in self.metrics:
+      metric_updates += m.updates
+
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper(
+        '_fit_function', [self.total_loss] + metrics_tensors, metric_updates)
+
+  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
+    if not hasattr(self, fn_name):
       raise RuntimeError('You must compile your model before using it.')
-    if self.test_function is None:
+    if getattr(self, fn_name) is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        inputs += [K.learning_phase()]
-      # Return loss and metrics, no gradient updates.
-      # Does update the network states.
-      self.test_function = K.function(
-          inputs, [self.total_loss] + self.metrics_tensors,
-          updates=self.state_updates + self.metrics_updates,
-          name='test_function',
-          **self._function_kwargs)
+
+      with K.name_scope('evaluation'):
+        updates = self.state_updates
+        # Add stateful metrics updates.
+        if metric_updates is not None:
+          updates += metric_updates
+        # Return loss and metrics, no gradient updates.
+        # Does update the network states.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='test_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
+
+  def _make_test_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper('test_function',
+                                    [self.total_loss] + metrics_tensors)
+
+  def _make_eval_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper('_eval_function',
+                                    [self.total_loss] + metrics_tensors)
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
       self.predict_function = None
     if self.predict_function is None:
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        inputs = self._feed_inputs + [K.learning_phase()]
-      else:
-        inputs = self._feed_inputs
+      inputs = self._feed_inputs
       # Gets network outputs. Does not update weights.
       # Does update the network states.
       kwargs = getattr(self, '_function_kwargs', {})
-      self.predict_function = K.function(
-          inputs,
-          self.outputs,
-          updates=self.state_updates,
-          name='predict_function',
-          **kwargs)
+      with K.name_scope('predict'):
+        self.predict_function = K.function(
+            inputs,
+            self.outputs,
+            updates=self.state_updates,
+            name='predict_function',
+            **kwargs)
+
+  def _get_execution_function(self, mode):
+    if mode == 'train':
+      self._make_fit_function()
+      return self._fit_function
+    if mode == 'test':
+      self._make_eval_function()
+      return self._eval_function
+    if mode == 'predict':
+      self._make_predict_function()
+      return self.predict_function
 
   def _get_iterator_get_next_tensors(self, iterator):
     get_next_op = self._iterator_get_next.get(iterator, None)
@@ -808,7 +990,8 @@ class Model(Network):
                                 'when using DistributionStrategy.')
 
     if (sample_weight is not None and sample_weight.all() and
-        self._distribution_strategy.__class__.__name__ == 'TPUStrategy'):
+        distributed_training_utils.is_tpu_strategy(
+            self._distribution_strategy)):
       raise NotImplementedError('`sample_weight` is currently not supported '
                                 'when using TPUStrategy.')
 
@@ -824,11 +1007,6 @@ class Model(Network):
 
     first_x_value = nest.flatten(x)[0]
     if isinstance(first_x_value, np.ndarray):
-      assert steps is not None
-      x_shape = first_x_value.shape
-      if batch_size is None:
-        batch_size = distributed_training_utils.get_batch_size(
-            self._distribution_strategy.num_replicas, x_shape[0], steps)
       # We need to use the drop_remainder argument to allow for a static
       # input shape which is required for TPUs.
       drop_remainder = self._distribution_strategy.require_static_shapes
@@ -863,19 +1041,13 @@ class Model(Network):
         var_x = distributed_training_utils.get_var_for_numpy(
             self._distribution_strategy, x)
         x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.repeat()
         x = x.batch(batch_size, drop_remainder=drop_remainder)
 
     assert isinstance(x, dataset_ops.Dataset)
 
-    # TODO(anjalisridhar): We want distribute_dataset() to accept a Dataset or a
-    # function which returns a Dataset. Currently distribute_dataset() only
-    # accepts a function that returns a Dataset. Once we add support for being
-    # able to clone a Dataset on multiple workers we can remove this lambda.
-    result = self._distribution_strategy.distribute_dataset(lambda: x)
-    iterator = result.make_initializable_iterator()
     with self._distribution_strategy.scope():
-      K.get_session().run(iterator.initializer)
+      iterator = self._distribution_strategy.make_dataset_iterator(x)
+      K.get_session().run(iterator.initialize())
 
     training_utils.validate_iterator_input(x, y, sample_weight,
                                            validation_split)
@@ -987,7 +1159,7 @@ class Model(Network):
     # For eager iterators, when we have to process multiple batches of samples,
     # we will standardize the data when we actually loop over iterator and get
     # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator and steps is not None:
+    if is_x_eager_iterator:
       return x, y, sample_weight
 
     # If input data is a dataset iterator in graph mode or if it is an eager
@@ -1031,6 +1203,8 @@ class Model(Network):
     all_inputs = []
     is_build_called = False
     is_compile_called = False
+    # Whether this is a subclassed model that expects dictionary inputs
+    # rather than list inputs (e.g. FeatureColumn-based models).
     dict_inputs = False
     if not self.inputs:
       # We need to use `x` to set the model inputs.
@@ -1057,9 +1231,16 @@ class Model(Network):
       # to match the value shapes.
       if not self.inputs:
         is_build_called = True
-        self._set_inputs(x)
+        cast_inputs = x
+        if training_utils.has_tensors(x):
+          cast_inputs = training_utils.cast_if_floating_dtype(x)
+        self._set_inputs(cast_inputs)
     else:
       dict_inputs = isinstance(self.inputs, dict)
+    if dict_inputs and context.executing_eagerly():
+      # No support for graph functions when the model expects dictionary inputs
+      # (i.e. FeatureColumn-based models).
+      self.run_eagerly = True
 
     if y is not None:
       if not self.optimizer:
@@ -1069,6 +1250,8 @@ class Model(Network):
       if not self._is_compiled:
         # On-the-fly compilation of the model.
         # We need to use `y` to set the model targets.
+        if training_utils.has_tensors(y):
+          y = training_utils.cast_if_floating_dtype(y)
         if isinstance(y, (list, tuple)):
           if not all(isinstance(v, np.ndarray) or
                      tensor_util.is_tensor(v) for v in y):
@@ -1093,19 +1276,22 @@ class Model(Network):
                              'TensorFlow tensors. '
                              'You passed: x=' + str(x) + '; y=' + str(y))
 
-        if context.executing_eagerly():
+        if self.run_eagerly:
           target_tensors = None
         else:
           # Handle target tensors if any passed.
           if not isinstance(y, (list, tuple)):
             y = [y]
-          target_tensors = [v for v in y if tensor_util.is_tensor(v)]
+          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
         is_compile_called = True
-        self.compile(optimizer=self.optimizer,
-                     loss=self.loss,
-                     metrics=self.metrics,
-                     loss_weights=self.loss_weights,
-                     target_tensors=target_tensors)
+        self.compile(
+            optimizer=self.optimizer,
+            loss=self.loss,
+            metrics=self._compile_metrics,
+            weighted_metrics=self._compile_weighted_metrics,
+            loss_weights=self.loss_weights,
+            target_tensors=target_tensors,
+            run_eagerly=self.run_eagerly)
 
     # In graph mode, if we had just set inputs and targets as symbolic tensors
     # by invoking build and compile on the model respectively, we do not have to
@@ -1113,15 +1299,15 @@ class Model(Network):
     # part of the graph.
     # Note: in this case, `any` and `all` are equivalent since we disallow
     # mixed symbolic/value inputs.
-    if (not context.executing_eagerly() and is_build_called and
+    if (not self.run_eagerly and is_build_called and
         is_compile_called and
-        any(tensor_util.is_tensor(v) for v in all_inputs)):
+        any(_is_symbolic_tensor(v) for v in all_inputs)):
       return [], [], []
 
     # What follows is input validation and standardization to list format,
     # in the case where all inputs are value arrays.
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       # In eager mode, do not do shape validation
       # since the network has no input nodes (placeholders) to be fed.
       feed_input_names = self.input_names
@@ -1177,7 +1363,9 @@ class Model(Network):
       y = training_utils.standardize_input_data(
           y,
           feed_output_names,
-          feed_output_shapes,
+          # Don't enforce target shapes to match output shapes.
+          # Precise checks will be run in `check_loss_and_target_compatibility`.
+          shapes=None,
           check_batch_axis=False,  # Don't enforce the batch size.
           exception_prefix='target')
 
@@ -1195,7 +1383,7 @@ class Model(Network):
       # Check that all arrays have the same length.
       if not self._distribution_strategy:
         training_utils.check_array_lengths(x, y, sample_weights)
-        if self._is_graph_network and not context.executing_eagerly():
+        if self._is_graph_network and not self.run_eagerly:
           # Additional checks to avoid users mistakenly using improper loss fns.
           training_utils.check_loss_and_target_compatibility(
               y, self._feed_loss_fns, feed_output_shapes)
@@ -1256,8 +1444,8 @@ class Model(Network):
         # We assert that the first layer is a FeatureLayer.
         if not training_utils.is_feature_layer(self.layers[0]):
           raise ValueError('Passing a dictionary input to a Sequential Model '
-                           'which doesnt have FeatureLayer as the first layer '
-                           'is an error')
+                           'which doesn\'t have FeatureLayer as the first layer'
+                           ' is an error.')
         input_shape = (None,)
         self.build(input_shape=input_shape)
       else:
@@ -1283,8 +1471,7 @@ class Model(Network):
 
     if outputs is None:
       # Obtain symbolic outputs by calling the model.
-      graph = K.get_graph()
-      with graph.as_default():
+      with K.get_graph().as_default():
         if self._expects_training_arg:
           outputs = self.call(inputs, training=training)
         else:
@@ -1444,7 +1631,6 @@ class Model(Network):
     """
     # TODO(fchollet): this method may be creating reference cycles, which would
     # lead to accumulating garbage in memory when called in a loop. Investigate.
-
     if data_utils.is_generator_or_sequence(x):
       training_utils.check_generator_arguments(y, sample_weight)
       return self.fit_generator(
@@ -1462,9 +1648,6 @@ class Model(Network):
           shuffle=shuffle,
           initial_epoch=initial_epoch)
 
-    # Backwards compatibility
-    if batch_size is None and steps_per_epoch is None:
-      batch_size = 32
     # Legacy support
     if 'nb_epoch' in kwargs:
       logging.warning(
@@ -1482,9 +1665,15 @@ class Model(Network):
           x, y, self._distribution_strategy)
 
       first_x_value = nest.flatten(x)[0]
-      if not steps_per_epoch and isinstance(first_x_value, np.ndarray):
-        steps_per_epoch = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps_per_epoch, batch_size = (
+            distributed_training_utils.get_input_params(
+                self._distribution_strategy, first_x_value, steps_per_epoch,
+                batch_size, is_training=True))
+
+    # Backwards compatibility
+    if batch_size is None and steps_per_epoch is None:
+      batch_size = 32
 
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1525,9 +1714,10 @@ class Model(Network):
         distributed_training_utils.validate_inputs(
             val_x, val_y, self._distribution_strategy)
         first_valx_value = nest.flatten(val_x)[0]
-        if not validation_steps and isinstance(first_valx_value, np.ndarray):
-          validation_steps = distributed_training_utils.get_input_batch_params(
-              first_valx_value, batch_size, self._distribution_strategy)
+        if isinstance(first_valx_value, np.ndarray):
+          validation_steps, _ = distributed_training_utils.get_input_params(
+              self._distribution_strategy, first_valx_value, validation_steps,
+              batch_size)
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
@@ -1557,7 +1747,7 @@ class Model(Network):
       val_y = None
       val_sample_weights = None
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       return training_eager.fit_loop(
           self,
           inputs=x,
@@ -1575,9 +1765,11 @@ class Model(Network):
           initial_epoch=initial_epoch,
           steps_per_epoch=steps_per_epoch,
           validation_steps=validation_steps)
-    elif self._distribution_strategy:
-      return training_distributed.fit_loop(
-          self, x,
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_fit_loop(
+          self,
+          x,
           epochs=epochs,
           verbose=verbose,
           callbacks=callbacks,
@@ -1585,9 +1777,23 @@ class Model(Network):
           initial_epoch=initial_epoch,
           steps_per_epoch=steps_per_epoch,
           validation_steps=validation_steps)
+    elif isinstance(x, iterator_ops.EagerIterator):
+      return training_generator.fit_generator(
+          self,
+          x,
+          steps_per_epoch=steps_per_epoch,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          workers=0,
+          initial_epoch=initial_epoch)
     else:
       return training_arrays.fit_loop(
-          self, x, y,
+          self,
+          x,
+          y,
           sample_weights=sample_weights,
           batch_size=batch_size,
           epochs=epochs,
@@ -1690,19 +1896,18 @@ class Model(Network):
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
-
     # Validate and standardize user data.
     if self._distribution_strategy:
       distributed_training_utils.validate_inputs(
           x, y, self._distribution_strategy)
       first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray) and not steps:
-        steps = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
+
+    # Backwards compatibility.
+    if batch_size is None and steps is None:
+      batch_size = 32
 
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1713,7 +1918,7 @@ class Model(Network):
         steps_name='steps',
         steps=steps)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       return training_eager.test_loop(
           self,
           inputs=x,
@@ -1722,12 +1927,17 @@ class Model(Network):
           batch_size=batch_size,
           verbose=verbose,
           steps=steps)
-    elif self._distribution_strategy:
-      return training_distributed.test_loop(
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_test_loop(
+          self, iterator=x, verbose=verbose, steps=steps)
+    elif isinstance(x, iterator_ops.EagerIterator):
+      return training_generator.evaluate_generator(
           self,
-          iterator=x,
+          x,
+          steps=steps,
           verbose=verbose,
-          steps=steps)
+          workers=0)
     else:
       return training_arrays.test_loop(
           self,
@@ -1801,41 +2011,44 @@ class Model(Network):
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
-
     if self._distribution_strategy:
-      # Turn off prefetching since this is currently not deterministic. Once
-      # b/112498930 is fixed we can turn it back on.
-      # `_prefetch_on_device` is currently a property of only
-      # `MirroredStrategy`.
-      if hasattr(self._distribution_strategy, '_prefetch_on_device'):
-        self._distribution_strategy._prefetch_on_device = False  # pylint: disable=protected-access
       distributed_training_utils.validate_inputs(
           x, None, self._distribution_strategy)
       first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray) and not steps:
-        steps = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
+
+    # Backwards compatibility.
+    if batch_size is None and steps is None:
+      batch_size = 32
 
     # Validate and standardize user data.
-    # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
-    # means that we end up calculating it twice which we should avoid.
-    x, _, _ = self._standardize_user_data(
-        x, check_steps=True, steps_name='steps', steps=steps)
+    if self._distribution_strategy:
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps,
+          batch_size=batch_size)
+    else:
+      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
+      # means we need to special case distribution strategy which needs the
+      # batch size.
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       return training_eager.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
-    elif self._distribution_strategy:
-      results = training_distributed.predict_loop(
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_predict_loop(
           self, x, verbose=verbose, steps=steps)
-      # Turn prefetching back on since we turned it off previously.
-      if hasattr(self._distribution_strategy, '_prefetch_on_device'):
-        self._distribution_strategy._prefetch_on_device = True  # pylint: disable=protected-access
-      return results
+    elif isinstance(x, iterator_ops.EagerIterator):
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          verbose=verbose,
+          workers=0)
     else:
       return training_arrays.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
@@ -1846,32 +2059,28 @@ class Model(Network):
     Arguments:
         x: Input data. It could be:
           - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
+              (in case the model has multiple inputs).
           - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
+              (in case the model has multiple inputs).
           - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
+              if the model has named inputs.
           - A `tf.data` dataset or a dataset iterator.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely). If `x` is a dataset or a
-          dataset iterator, `y` should not be specified
+        y: Target data. Like the input data `x`, it could be either Numpy
+          array(s) or TensorFlow tensor(s). It should be consistent with `x`
+          (you cannot have Numpy inputs and tensor targets, or inversely). If
+          `x` is a dataset or a dataset iterator, `y` should not be specified
           (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
-            weights to apply to the model's loss for each sample.
-            In the case of temporal data, you can pass a 2D array
-            with shape (samples, sequence_length),
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile(). This argument is not
-            supported when `x` is a dataset or a dataset iterator.
-        class_weight: Optional dictionary mapping
-            class indices (integers) to
-            a weight (float) to apply to the model's loss for the samples
-            from this class during training.
-            This can be useful to tell the model to "pay more attention" to
-            samples from an under-represented class.
+          weights to apply to the model's loss for each sample. In the case of
+          temporal data, you can pass a 2D array with shape (samples,
+          sequence_length), to apply a different weight to every timestep of
+          every sample. In this case you should make sure to specify
+          sample_weight_mode="temporal" in compile(). This argument is not
+          supported when `x` is a dataset or a dataset iterator.
+        class_weight: Optional dictionary mapping class indices (integers) to a
+          weight (float) to apply to the model's loss for the samples from this
+          class during training. This can be useful to tell the model to "pay
+          more attention" to samples from an under-represented class.
 
     Returns:
         Scalar training loss
@@ -1890,17 +2099,17 @@ class Model(Network):
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, class_weight=class_weight)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       outputs = training_eager.train_on_batch(
           self, x, y, sample_weights=sample_weights)
     else:
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = x + y + sample_weights + [1]
+      if not isinstance(K.symbolic_learning_phase(), int):
+        ins = x + y + sample_weights + [True]
       else:
         ins = x + y + sample_weights
 
       self._make_train_function()
-      outputs = self.train_function(ins)
+      outputs = self.train_function(ins)  # pylint: disable=not-callable
 
     if len(outputs) == 1:
       return outputs[0]
@@ -1949,16 +2158,13 @@ class Model(Network):
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       outputs = training_eager.test_on_batch(
           self, x, y, sample_weights=sample_weights)
     else:
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = x + y + sample_weights + [0]
-      else:
-        ins = x + y + sample_weights
+      inputs = x + y + sample_weights
       self._make_test_function()
-      outputs = self.test_function(ins)
+      outputs = self.test_function(inputs)  # pylint: disable=not-callable
 
     if len(outputs) == 1:
       return outputs[0]
@@ -1987,28 +2193,21 @@ class Model(Network):
                                 'models compiled with DistributionStrategy.')
     # Validate and standardize user data.
     inputs, _, _ = self._standardize_user_data(x)
-    if context.executing_eagerly():
-      if (isinstance(x, iterator_ops.EagerIterator) or
-          (isinstance(x, dataset_ops.Dataset) and context.executing_eagerly())):
+    if self.run_eagerly:
+      if (isinstance(inputs, iterator_ops.EagerIterator) or
+          (isinstance(inputs, dataset_ops.Dataset))):
         inputs = training_utils.cast_if_floating_dtype(inputs)
-      else:
+      elif isinstance(inputs, collections.Sequence):
         inputs = [
-            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs
-        ]
+            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
       return self(inputs)  # pylint: disable=not-callable
 
-    if not context.executing_eagerly():
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = inputs + [0]
-      else:
-        ins = inputs
-
-      self._make_predict_function()
-      outputs = self.predict_function(ins)
+    self._make_predict_function()
+    outputs = self.predict_function(inputs)
 
-      if len(outputs) == 1:
-        return outputs[0]
-      return outputs
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
 
   def fit_generator(self,
                     generator,
@@ -2118,11 +2317,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`fit_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`fit_generator` is not yet enabled for unbuilt Model subclasses')
-
     return training_generator.fit_generator(
         self,
         generator,
@@ -2189,12 +2383,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`evaluate_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`evaluate_generator` is not yet enabled for '
-          'unbuilt Model subclasses')
-
     return training_generator.evaluate_generator(
         self,
         generator,
@@ -2246,11 +2434,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`predict_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`predict_generator` is not yet enabled for unbuilt Model subclasses')
-
     return training_generator.predict_generator(
         self,
         generator,
@@ -2322,3 +2505,7 @@ class DistributedCallbackModel(Model):
       logging.warning('You are accessing attribute ' + item + ' of the '
                       'DistributedCallbackModel that may not have been set '
                       'correctly.')
+
+
+def _is_symbolic_tensor(x):
+  return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor)
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index 95b864bef028ecb8de3c85cd3df13bc41fd8dff3..390357303e2d519e20fe492313806944b643624a 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -19,14 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 import numpy as np
 
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras.engine import training_distributed
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.platform import tf_logging as logging
 
@@ -36,22 +38,197 @@ except ImportError:
   issparse = None
 
 
-def fit_loop(model,
-             inputs,
-             targets,
-             sample_weights=None,
-             batch_size=None,
-             epochs=100,
-             verbose=1,
-             callbacks=None,
-             val_inputs=None,
-             val_targets=None,
-             val_sample_weights=None,
-             shuffle=True,
-             initial_epoch=0,
-             steps_per_epoch=None,
-             validation_steps=None):
-  """Abstract fit function for arrays of data.
+class Aggregator(object):
+  """Abstract base class used to aggregate batch-level outputs of a loop.
+
+  Arguments:
+    use_steps: Whether the loop is using `step` or `batch_size`.
+    num_samples_or_steps: Either `batch_size*num_batches` or `steps`.
+  """
+
+  def __init__(self, use_steps, num_samples_or_steps):
+    self.use_steps = use_steps
+    self.num_samples_or_steps = num_samples_or_steps
+    self.results = []
+
+  def create(self, batch_outs):
+    """Create the initial results from the first batch outputs.
+
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+    """
+    raise NotImplementedError
+
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    """Aggregate batch-level results into total results.
+
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+      batch_start: The start index of this batch. Always `None` if `use_steps`
+        is `True`.
+      batch_end: The end index of this batch. Always `None` if `use_steps` is
+        `True`.
+    """
+    raise NotImplementedError
+
+  def finalize(self):
+    """Prepare the total results to be returned."""
+    raise NotImplementedError
+
+
+class MetricsAggregator(Aggregator):
+  """Aggregator that calculates loss and metrics info."""
+
+  def create(self, batch_outs):
+    self.results = [0.] * len(batch_outs)
+
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    # Loss.
+    if self.use_steps:
+      self.results[0] += batch_outs[0]
+    else:
+      self.results[0] += batch_outs[0] * (batch_end - batch_start)
+    # Metrics (always stateful, just grab current values.)
+    self.results[1:] = batch_outs[1:]
+
+  def finalize(self):
+    self.results[0] /= self.num_samples_or_steps
+
+
+class OutputsAggregator(Aggregator):
+  """Aggregator that concatenates outputs."""
+
+  def create(self, batch_outs):
+    if self.use_steps:
+      # Cannot pre-allocate the returned NumPy arrays bc
+      # batch sizes are unknown. Concatenate batches at the end.
+      for _ in batch_outs:
+        self.results.append([])
+    else:
+      # Pre-allocate NumPy arrays.
+      for batch_out in batch_outs:
+        shape = (self.num_samples_or_steps,) + batch_out.shape[1:]
+        self.results.append(np.zeros(shape, dtype=batch_out.dtype))
+
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    if self.use_steps:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i].append(batch_out)
+    else:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i][batch_start:batch_end] = batch_out
+
+  def finalize(self):
+    if self.use_steps:
+      self.results = [np.concatenate(result, axis=0) for result in self.results]
+
+
+def _get_model_feed(model, mode):
+  if mode == 'predict':
+    feed = model._feed_inputs
+  else:
+    feed = (
+        model._feed_inputs + model._feed_targets + model._feed_sample_weights)
+  return feed
+
+
+def _validate_arguments(steps_per_epoch, validation_steps, kwargs):
+  for k in kwargs:
+    if k != 'steps':
+      raise ValueError('Invalid argument passed: {}'.format(k))
+
+  # Validate inputs when in training mode.
+  if validation_steps and steps_per_epoch is None:
+    raise ValueError('Can only use `validation_steps` '
+                     'when doing step-wise '
+                     'training, i.e. `steps_per_epoch` '
+                     'must be set.')
+
+
+def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
+  if (val_inputs and steps_per_epoch is None and verbose and inputs and
+      hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
+    print('Train on %d samples, validate on %d samples' %
+          (inputs[0].shape[0], val_inputs[0].shape[0]))
+
+
+def _get_progbar(model, count_mode):
+  stateful_metric_names = None
+  if hasattr(model, 'metrics_names'):
+    stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
+  return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names)
+
+
+def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
+  """Returns total number of samples (when training in batch mode) or steps."""
+  if steps_per_epoch:
+    return steps_per_epoch
+  return training_utils.check_num_samples(ins, batch_size, steps_per_epoch,
+                                          'steps_per_epoch')
+
+
+def _make_logs(model, outputs, mode, prefix=''):
+  """Used to make logs to send to `on_batch_end` methods."""
+  logs = {}
+  # TODO(omalleyt): handle outputs in prediction when Callback
+  # hooks are ready.
+  if mode in ['train', 'test']:
+    if hasattr(model, 'metrics_names'):
+      for label, output in zip(model.metrics_names, outputs):
+        logs[prefix + label] = output
+  return logs
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of 'train'/'test'/'predict'.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  if model._distribution_strategy:
+    return training_distributed._prepare_feed_values(model, inputs, targets,
+                                                     sample_weights, mode)
+  inputs = training_utils.ModelInputs(inputs).as_list()
+  targets = targets or []
+  sample_weights = sample_weights or []
+  ins = inputs + targets + sample_weights
+  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
+    ins += [True]
+  return ins
+
+
+def _get_execution_function(model, mode):
+  """Get function to run one step of model execution."""
+  if model._distribution_strategy:
+    return training_distributed._get_execution_function(model, mode)
+  return model._get_execution_function(mode)
+
+
+def model_iteration(model,
+                    inputs,
+                    targets=None,
+                    sample_weights=None,
+                    batch_size=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    val_inputs=None,
+                    val_targets=None,
+                    val_sample_weights=None,
+                    shuffle=True,
+                    initial_epoch=0,
+                    steps_per_epoch=None,
+                    validation_steps=None,
+                    mode='train',
+                    **kwargs):
+  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
 
   Arguments:
       model: Keras Model instance.
@@ -66,52 +243,51 @@ def fit_loop(model,
       val_targets: List of target arrays.
       val_sample_weights: Optional list of sample weight arrays.
       shuffle: Whether to shuffle the data at the beginning of each epoch
-          concatenation of list the display names of the outputs of
-           `f` and the list of display names of the outputs of `f_val`.
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for
-          (only if doing validation from data tensors).
-          Ignored with the default value of `None`.
+        concatenation of list the display names of the outputs of `f` and the
+        list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training (useful for resuming a
+        previous training run)
+      steps_per_epoch: Total number of steps (batches of samples) before
+        declaring one epoch finished and starting the next epoch. Ignored with
+        the default value of `None`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with the default value of `None`.
+      mode: One of 'train'/'test'/'predict'.
+      **kwargs: Additional arguments for backwards compatibility.
 
   Returns:
-      `History` object.
+      - In 'train' mode: `History` object.
+      - In 'test' mode: Evaluation metrics.
+      - In 'predict' mode: Outputs of the Model called on inputs.
 
   Raises:
       ValueError: in case of invalid arguments.
   """
-  model._make_train_function()
-  f = model.train_function
-
-  sample_weights = sample_weights or []
-  val_sample_weights = val_sample_weights or []
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = inputs + targets + sample_weights + [1]
-  else:
-    ins = inputs + targets + sample_weights
-
-  do_validation = False
-  if val_inputs:
-    do_validation = True
-    if (steps_per_epoch is None and verbose and inputs and
-        hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
-      print('Train on %d samples, validate on %d samples' %
-            (inputs[0].shape[0], val_inputs[0].shape[0]))
-  if validation_steps:
-    do_validation = True
-    if steps_per_epoch is None:
-      raise ValueError('Can only use `validation_steps` '
-                       'when doing step-wise '
-                       'training, i.e. `steps_per_epoch` '
-                       'must be set.')
-
-  num_train_samples = training_utils.check_num_samples(
-      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-  count_mode = 'steps' if steps_per_epoch else 'samples'
+  # Backwards compatibility.
+  if 'steps' in kwargs:
+    steps_per_epoch = kwargs['steps']
+
+  _validate_arguments(steps_per_epoch, validation_steps, kwargs)
+  if mode == 'train':
+    _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
+
+  # Enter DistributionStrategy scope.
+  if model._distribution_strategy:
+    scope = model._distribution_strategy.scope()
+    scope.__enter__()
+
+  # Get step function and loop type.
+  f = _get_execution_function(model, mode)
+  use_steps = steps_per_epoch is not None
+  do_validation = val_inputs is not None
+
+  # Prepare input data.
+  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
+  num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
+                                                   steps_per_epoch)
+
+  # Configure callbacks.
+  count_mode = 'steps' if use_steps else 'samples'
   callbacks = cbks.configure_callbacks(
       callbacks,
       model,
@@ -122,81 +298,101 @@ def fit_loop(model,
       batch_size=batch_size,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
-      samples=num_train_samples,
+      samples=num_samples_or_steps,
       validation_steps=validation_steps,
-      verbose=verbose,
-      count_mode=count_mode)
-
-  if num_train_samples is not None:
-    index_array = np.arange(num_train_samples)
+      verbose=0,  # Handle ProgBarLogger separately in this loop.
+      count_mode=count_mode,
+      mode=mode)
+  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
+  progbar = _get_progbar(model, count_mode)
+  progbar.params = callbacks.params
+  progbar.params['verbose'] = verbose
+
+  # Find beforehand arrays that need sparse-to-dense conversion.
+  if issparse is not None:
+    indices_for_conversion_to_dense = []
+    feed = _get_model_feed(model, mode)
+    for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
+      if issparse(input_data) and not K.is_sparse(feed_tensor):
+        indices_for_conversion_to_dense.append(i)
+
+  # Select aggregation method.
+  if mode == 'predict':
+    aggregator = OutputsAggregator(use_steps, num_samples_or_steps)
+  else:
+    aggregator = MetricsAggregator(use_steps, num_samples_or_steps)
 
-  # To prevent a slowdown, we find beforehand the arrays that need conversion.
-  feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
-  indices_for_conversion_to_dense = []
-  for i in range(len(feed)):
-    if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
-      indices_for_conversion_to_dense.append(i)
+  if model._distribution_strategy:
+    training_distributed._copy_weights_to_distributed_model(model)
 
-  callbacks.on_train_begin()
+  callbacks.model.stop_training = False
+  callbacks._call_begin_hook(mode)
+  progbar.on_train_begin()
   for epoch in range(initial_epoch, epochs):
-    # Reset stateful metrics
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    # Update callbacks
-    callbacks.on_epoch_begin(epoch)
+    if callbacks.model.stop_training:
+      break
+
+    # Setup work for each epoch
+    results = []
     epoch_logs = {}
-    if steps_per_epoch is not None:
-      # Step-wise fit loop.
-      for step_index in range(steps_per_epoch):
-        batch_logs = {'batch': step_index, 'size': 1}
-        callbacks.on_batch_begin(step_index, batch_logs)
+    if hasattr(model, 'metrics'):
+      for m in model.metrics:
+        m.reset_states()
+    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_begin(epoch, epoch_logs)
+
+    if use_steps:
+      # Step-wise loop.
+      for step in range(steps_per_epoch):
+        batch_logs = {'batch': step, 'size': 1}
+        callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
+        progbar.on_batch_begin(step, batch_logs)
+
+        # Get outputs.
         try:
-          outs = f(ins)
+          batch_outs = f(ins)
         except errors.OutOfRangeError:
           logging.warning('Your dataset iterator ran out of data; '
                           'interrupting training. Make sure that your dataset '
                           'can generate at least `steps_per_epoch * epochs` '
                           'batches (in this case, %d batches). You may need to'
                           'use the repeat() function when building your '
-                          'dataset.' %
-                          steps_per_epoch * epochs)
+                          'dataset.' % steps_per_epoch * epochs)
           break
+        if not isinstance(batch_outs, list):
+          batch_outs = [batch_outs]
 
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
+        if model._distribution_strategy:
+          batch_outs = training_distributed._per_device_aggregate_batch(
+              batch_outs, model, mode)
+
+        # Aggregate results.
+        if step == 0:
+          aggregator.create(batch_outs)
+        aggregator.aggregate(batch_outs)
+
+        # Callbacks batch end.
+        batch_logs.update(_make_logs(model, batch_outs, mode))
+        callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+        progbar.on_batch_end(step, batch_logs)
 
-        callbacks.on_batch_end(step_index, batch_logs)
         if callbacks.model.stop_training:
           break
-
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_inputs,
-            val_targets,
-            sample_weights=val_sample_weights,
-            steps=validation_steps,
-            verbose=0)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(model.metrics_names, val_outs):
-          epoch_logs['val_' + l] = o
     else:
-      # Sample-wise fit loop.
+      # Sample-wise loop.
+      index_array = np.arange(num_samples_or_steps)
       if shuffle == 'batch':
         index_array = training_utils.batch_shuffle(index_array, batch_size)
       elif shuffle:
         np.random.shuffle(index_array)
-
-      batches = make_batches(num_train_samples, batch_size)
+      batches = make_batches(num_samples_or_steps, batch_size)
 
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
+
+        # Slice into a batch.
         try:
-          if isinstance(ins[-1], int):
+          if ins and isinstance(ins[-1], int):
             # Do not slice the training phase flag.
             ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
           else:
@@ -205,256 +401,71 @@ def fit_loop(model,
           raise TypeError('TypeError while preparing batch. '
                           'If using HDF5 input data, '
                           'pass shuffle="batch".')
-        batch_logs = {}
-        batch_logs['batch'] = batch_index
-        batch_logs['size'] = len(batch_ids)
-        callbacks.on_batch_begin(batch_index, batch_logs)
-        for i in indices_for_conversion_to_dense:
-          ins_batch[i] = ins_batch[i].toarray()
-
-        outs = f(ins_batch)
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
-
-        callbacks.on_batch_end(batch_index, batch_logs)
-        if callbacks.model.stop_training:
-          break
-
-        if batch_index == len(batches) - 1:  # Last batch.
-          if do_validation:
-            val_outs = test_loop(
-                model,
-                val_inputs,
-                val_targets,
-                sample_weights=val_sample_weights,
-                batch_size=batch_size,
-                verbose=0)
-            if not isinstance(val_outs, list):
-              val_outs = [val_outs]
-            # Same labels assumed.
-            for l, o in zip(model.metrics_names, val_outs):
-              epoch_logs['val_' + l] = o
-    callbacks.on_epoch_end(epoch, epoch_logs)
-    if callbacks.model.stop_training:
-      break
-  callbacks.on_train_end()
-  return model.history
-
-
-def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
-  """Abstract method to loop over some data in batches.
 
-  Arguments:
-      model: Keras Model instance.
-      inputs: list of tensors to be fed to `f`.
-      batch_size: integer batch size.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
+        # Sparse to dense conversion.
+        if issparse is not None:
+          for i in indices_for_conversion_to_dense:
+            ins_batch[i] = ins_batch[i].toarray()
 
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  model._make_predict_function()
-  f = model.predict_function
+        # Callbacks batch_begin.
+        batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
+        callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs)
+        progbar.on_batch_begin(batch_index, batch_logs)
 
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = inputs + [0]
-  else:
-    ins = inputs
+        # Get outputs.
+        batch_outs = f(ins_batch)
+        if not isinstance(batch_outs, list):
+          batch_outs = [batch_outs]
 
-  num_samples = training_utils.check_num_samples(
-      inputs, batch_size, steps, 'steps')
-  if verbose == 1:
-    if steps is not None:
-      progbar = Progbar(target=steps)
-    else:
-      progbar = Progbar(target=num_samples)
-
-  indices_for_conversion_to_dense = []
-  for i in range(len(model._feed_inputs)):
-    if (issparse is not None and issparse(inputs[i]) and
-        not K.is_sparse(model._feed_inputs[i])):
-      indices_for_conversion_to_dense.append(i)
-
-  if steps is not None:
-    # Step-based predictions.
-    # Since we do not know how many samples
-    # we will see, we cannot pre-allocate
-    # the returned Numpy arrays.
-    # Instead, we store one array per batch seen
-    # and concatenate them upon returning.
-    unconcatenated_outs = []
-    for step in range(steps):
-      batch_outs = f(ins)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if step == 0:
-        for batch_out in batch_outs:
-          unconcatenated_outs.append([])
-      for i, batch_out in enumerate(batch_outs):
-        unconcatenated_outs[i].append(batch_out)
-      if verbose == 1:
-        progbar.update(step + 1)
-    if len(unconcatenated_outs) == 1:
-      return np.concatenate(unconcatenated_outs[0], axis=0)
-    return [
-        np.concatenate(unconcatenated_outs[i], axis=0)
-        for i in range(len(unconcatenated_outs))
-    ]
-  else:
-    # Sample-based predictions.
-    outs = []
-    batches = make_batches(num_samples, batch_size)
-    index_array = np.arange(num_samples)
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      if ins and isinstance(ins[-1], int):
-        # Do not slice the training phase flag.
-        ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-      else:
-        ins_batch = slice_arrays(ins, batch_ids)
-      for i in indices_for_conversion_to_dense:
-        ins_batch[i] = ins_batch[i].toarray()
-
-      batch_outs = f(ins_batch)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if batch_index == 0:
-        # Pre-allocate the results arrays.
-        for batch_out in batch_outs:
-          shape = (num_samples,) + batch_out.shape[1:]
-          outs.append(np.zeros(shape, dtype=batch_out.dtype))
-      for i, batch_out in enumerate(batch_outs):
-        outs[i][batch_start:batch_end] = batch_out
-      if verbose == 1:
-        progbar.update(batch_end)
-    if len(outs) == 1:
-      return outs[0]
-    return outs
-
-
-def test_loop(model,
-              inputs,
-              targets,
-              sample_weights=None,
-              batch_size=None,
-              verbose=0,
-              steps=None):
-  """Abstract method to loop over some data in batches.
+        # Aggregate results.
+        if batch_index == 0:
+          aggregator.create(batch_outs)
+        aggregator.aggregate(batch_outs, batch_start, batch_end)
 
-  Arguments:
-      model: Keras Model instance.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      batch_size: integer batch size or `None`.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
+        # Callbacks batch end.
+        batch_logs.update(_make_logs(model, batch_outs, mode))
+        callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
+        progbar.on_batch_end(batch_index, batch_logs)
 
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  model._make_test_function()
-  f = model.test_function
+        if callbacks.model.stop_training:
+          break
 
-  sample_weights = sample_weights or []
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = inputs + targets + sample_weights + [0]
-  else:
-    ins = inputs + targets + sample_weights
-
-  if hasattr(model, 'metrics'):
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    stateful_metric_indices = [
-        i for i, name in enumerate(model.metrics_names)
-        if str(name) in model.stateful_metric_names
-    ]
-  else:
-    stateful_metric_indices = []
-
-  num_samples = training_utils.check_num_samples(
-      ins, batch_size, steps, 'steps')
-  outs = []
-  if verbose == 1:
-    if steps is not None:
-      progbar = Progbar(target=steps)
-    else:
-      progbar = Progbar(target=num_samples)
-
-  # To prevent a slowdown, we find beforehand the arrays that need conversion.
-  feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
-  indices_for_conversion_to_dense = []
-  for i in range(len(feed)):
-    if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
-      indices_for_conversion_to_dense.append(i)
-
-  if steps is not None:
-    for step in range(steps):
-      batch_outs = f(ins)
-      if isinstance(batch_outs, list):
-        if step == 0:
-          for _ in enumerate(batch_outs):
-            outs.append(0.)
-        for i, batch_out in enumerate(batch_outs):
-          if i in stateful_metric_indices:
-            outs[i] = batch_out
-          else:
-            outs[i] += batch_out
-      else:
-        if step == 0:
-          outs.append(0.)
-        outs[0] += batch_outs
-      if verbose == 1:
-        progbar.update(step + 1)
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        outs[i] /= steps
-  else:
-    batches = make_batches(num_samples, batch_size)
-    index_array = np.arange(num_samples)
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      if isinstance(ins[-1], int):
-        # Do not slice the training phase flag.
-        ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-      else:
-        ins_batch = slice_arrays(ins, batch_ids)
-      for i in indices_for_conversion_to_dense:
-        ins_batch[i] = ins_batch[i].toarray()
-
-      batch_outs = f(ins_batch)
-
-      if isinstance(batch_outs, list):
-        if batch_index == 0:
-          outs.extend([0.] * len(batch_outs))
-        for i, batch_out in enumerate(batch_outs):
-          if i in stateful_metric_indices:
-            outs[i] = batch_out
-          else:
-            outs[i] += batch_out * len(batch_ids)
-      else:
-        if batch_index == 0:
-          outs.append(0.)
-        outs[0] += batch_outs * len(batch_ids)
-      if verbose == 1:
-        progbar.update(batch_end)
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        outs[i] /= num_samples
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+    aggregator.finalize()
+    results = aggregator.results
+    epoch_logs.update(_make_logs(model, results, mode))
+    if len(results) == 1:
+      results = results[0]
+
+    # Run the test loop every epoch during training.
+    if do_validation and not callbacks.model.stop_training:
+      val_results = model_iteration(
+          model,
+          val_inputs,
+          targets=val_targets,
+          sample_weights=val_sample_weights,
+          batch_size=batch_size,
+          steps_per_epoch=validation_steps,
+          callbacks=callbacks,
+          verbose=0,
+          mode='test')
+      if not isinstance(val_results, list):
+        val_results = [val_results]
+      epoch_logs.update(_make_logs(model, val_results, mode, prefix='val_'))
+
+    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_end(epoch, epoch_logs)
+  callbacks._call_end_hook(mode)
+
+  if model._distribution_strategy:
+    training_distributed._copy_weights_to_original_model(model, mode)
+    scope.__exit__(None, None, None)
+
+  if mode == 'train':
+    return model.history
+  return results
+
+
+# For backwards compatibility for internal users of these loops.
+fit_loop = functools.partial(model_iteration, mode='train')
+test_loop = functools.partial(model_iteration, mode='test', shuffle=False)
+predict_loop = functools.partial(model_iteration, mode='predict', shuffle=False)
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e79e5842a1c7c63f84a57e90e4ba96429589f9aa
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -0,0 +1,343 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+
+from absl.testing import parameterized
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class TestTrainingWithDatasetIterators(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      {'model': 'functional'},
+      {'model': 'subclass'},
+  )
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_training_and_eval_methods_on_iterators_single_io(self, model):
+    if model == 'functional':
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    elif model == 'subclass':
+      model = testing_utils.get_small_sequential_mlp(1, 4)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(iterator, steps=2, verbose=1)
+    model.predict(iterator, steps=2)
+
+    # Test with validation data
+    model.fit(iterator,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=iterator, validation_steps=2)
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(iterator,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
+
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          iterator,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
+
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(iterator, iterator,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(iterator, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(iterator, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(iterator, verbose=0)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_get_next_op_created_once(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    # Finalize graph to make sure we are not appending another iterator
+    # get_next op in the graph.
+    ops.get_default_graph().finalize()
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_iterators_running_out_of_data(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(2)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'dataset iterator ran out of data')
+
+
+class TestTrainingWithDataset(test.TestCase, parameterized.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_calling_model_on_same_dataset(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    # Call fit with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+    # Finalize the graph to make sure new ops aren't added when calling on the
+    # same dataset
+    ops.get_default_graph().finalize()
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_training_and_eval_methods_on_dataset(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+    # Test with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(dataset,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
+
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          dataset,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
+
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(dataset, dataset,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(dataset, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(dataset, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(dataset, verbose=0)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_dataset_with_sample_weights(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    sample_weights = np.ones((10), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
+                                                      sample_weights))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+  @parameterized.parameters(
+      {'model': 'functional'},
+      {'model': 'subclass'},
+  )
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_dataset_with_sparse_labels(self, model):
+    if model == 'functional':
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    elif model == 'subclass':
+      model = testing_utils.get_small_sequential_mlp(1, 4)
+
+    for loss in ['sparse_categorical_crossentropy',
+                 losses_impl.sparse_softmax_cross_entropy]:
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      model.compile(optimizer, loss)
+
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.random.randint(0, 4, size=10, dtype=np.int32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  def test_dataset_input_shape_validation(self):
+    with self.cached_session():
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          r'expected (.*?) to have shape \(3,\) but got array with shape \(1,\)'
+      ):
+        model.train_on_batch(dataset)
+
+      # Wrong input shape
+      inputs = np.zeros((10, 5))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   r'expected (.*?) to have shape \(3,\)'):
+        model.train_on_batch(dataset)
+
+
+class TestMetricsWithDatasetIterators(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_metrics_correctness_with_iterator(self):
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Dense(
+            8, activation='relu', input_dim=4, kernel_initializer='ones'))
+    model.add(
+        keras.layers.Dense(
+            1, activation='sigmoid', kernel_initializer='ones'))
+    model.compile(
+        loss='binary_crossentropy',
+        metrics=['accuracy', metrics_module.BinaryAccuracy()],
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(123)
+    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
+    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
+
+    y = np.zeros((100, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(outs[1], 0.)
+    self.assertEqual(outs[2], 0.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 418bebccb091b8c57ab1ecc38e5e488bc7ad6e2f..53261fdd262202532a76347cb82eafe4aedb8268 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import enum
 import numpy as np
 
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -34,7 +35,6 @@ from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
@@ -48,186 +48,15 @@ class _Mode(enum.Enum):
 # TODO(priyag, sourabhbajaj): Refactor this file to address code duplication.
 
 
-def fit_loop(
-    model,
-    iterator,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    val_iterator=None,
-    initial_epoch=0,
-    steps_per_epoch=None,
-    validation_steps=None):
-  """Fit loop for training with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      epochs: Number of times to iterate over the data
-      verbose: Integer, Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      val_iterator: Iterator for validation data.
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for
-          (only if doing validation from data tensors).
-          Ignored with the default value of `None`.
-
-  Returns:
-      `History` object.
-
-  Raises:
-      ValueError: in case of invalid arguments.
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_fit_loop(
-        model, iterator, epochs, verbose, callbacks, initial_epoch,
-        steps_per_epoch, val_iterator, validation_steps)
-
-  if not model._grouped_model:
-    clone_model_on_replicas(model, current_strategy, make_callback_model=True)
-
-  def _per_device_train_function(model):
-    model._make_train_function()
-    return (model.train_function.inputs,
-            model.train_function.outputs,
-            model.train_function.updates_op,
-            model.train_function.session_kwargs)
-
-  inputs, targets, sample_weights = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_train_function`.
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_train_function, model._grouped_model)
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of update ops on
-    # all the devices over which the model is distributed.
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs,
-         grouped_updates, grouped_session_args, with_loss_tensor=True)
-
-    # Dataset inputs and targets are also per devices values that need to be
-    # unwrapped.
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-    dataset_targets = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, targets)
-
-    # Create a train function that is composed of all the parameters above.
-    distributed_train_function = K.Function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_train_function',
-        **all_session_args)
-
-    # We need to set sample_weights to None since there are sample weight
-    # placeholders that are created with default values.
-    sample_weights = [None for _ in range(len(model.outputs) *
-                                          current_strategy.num_replicas)]
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + dataset_targets + sample_weights + [1]
-    else:
-      ins = dataset_inputs + dataset_targets
-
-    do_validation = False
-    if validation_steps:
-      do_validation = True
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        val_inputs=None,
-        val_targets=None,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        verbose=verbose)
-    out_labels = model.metrics_names or []
-    callbacks.on_train_begin()
-
-    assert steps_per_epoch is not None
-
-    for epoch in range(initial_epoch, epochs):
-      # Reset stateful metrics
-      for m in model.stateful_metric_functions:
-        m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      for step_index in range(steps_per_epoch):
-        batch_logs = {'batch': step_index, 'size': 1}
-        callbacks.on_batch_begin(step_index, batch_logs)
-        try:
-          outs = distributed_train_function(ins)
-        except errors.OutOfRangeError:
-          logging.warning('Your dataset iterator ran out of data; '
-                          'interrupting training. Make sure that your dataset '
-                          'can generate at least `steps_per_epoch * epochs` '
-                          'batches (in this case, %d batches).' %
-                          steps_per_epoch * epochs)
-          break
-
-        if not isinstance(outs, list):
-          outs = [outs]
-
-        outs = _aggregate_metrics_across_replicas(current_strategy.num_replicas,
-                                                  out_labels,
-                                                  model.stateful_metric_names,
-                                                  outs)
-        for l, o in zip(out_labels, outs):
-          batch_logs[l] = o
-        callbacks.on_batch_end(step_index, batch_logs)
-        if callbacks.model.stop_training:
-          break
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_iterator,
-            steps=validation_steps,
-            verbose=0)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(out_labels, val_outs):
-          epoch_logs['val_' + l] = o
-
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callbacks.model.stop_training:
-        break
-    callbacks.on_train_end()
-
-    # Copy the weights back from the replicated model to the original model.
-    updated_weights = current_strategy.unwrap(
-        model._grouped_model)[0].get_weights()
-    model.set_weights(updated_weights)
-    return model.history
-
-
-def _experimental_fit_loop(
-    model,
-    iterator,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    initial_epoch=0,
-    steps_per_epoch=None,
-    val_iterator=None,
-    validation_steps=None):
+def experimental_fit_loop(model,
+                          iterator,
+                          epochs=100,
+                          verbose=1,
+                          callbacks=None,
+                          initial_epoch=0,
+                          steps_per_epoch=None,
+                          val_iterator=None,
+                          validation_steps=None):
   """Fit loop for training with TPU DistributionStrategy.
 
   Arguments:
@@ -256,22 +85,21 @@ def _experimental_fit_loop(
 
   K.get_session().run(current_strategy.initialize())
 
-  def _per_device_train_function(model):
-    model._make_train_function()
-    return (model.train_function.inputs,
-            model.train_function.outputs,
-            model.train_function.updates_op,
-            model.train_function.session_kwargs)
+  def _per_device_fit_function(model):
+    model._make_fit_function()
+    return (model._fit_function.inputs, model._fit_function.outputs,
+            model._fit_function.updates_op, model._fit_function.session_kwargs)
 
   # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
   K.set_learning_phase(1)
   out_labels = model.metrics_names or []
 
-  def step_fn(ctx, inputs, targets):
-    """Clones the model and calls make_train_function."""
+  def step_fn(ctx, inputs):
+    """Clones the model and calls make_fit_function."""
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
+    inputs, targets = inputs
     clone_model_on_replicas(
         model,
         current_strategy,
@@ -281,26 +109,27 @@ def _experimental_fit_loop(
         mode=_Mode.TRAIN)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_train_function, model._grouped_model_train)
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
+         _per_device_fit_function, args=(model._grouped_model_train,))
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs,
          grouped_updates, grouped_session_args)
-    combined_fn = K.Function(
-        all_inputs, all_outputs,
+    combined_fn = K.function(
+        all_inputs,
+        all_outputs,
         updates=all_updates,
-        name='distributed_train_function',
+        name='distributed_fit_function',
         **all_session_args)
 
     for label, output in zip(out_labels, combined_fn.outputs):
       if label == 'loss':
-        aggregation = distribute_lib.get_loss_reduction()
+        reduce_op = distribute_lib.get_loss_reduction()
       else:
-        # We aggregate all other metrics using mean for now. This is temporary
+        # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
-        aggregation = variable_scope.VariableAggregation.MEAN
-      ctx.set_last_step_output(label, output, aggregation)
+        reduce_op = ds_reduce_util.ReduceOp.MEAN
+      ctx.set_last_step_output(label, output, reduce_op)
 
     # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
     # feed_dict, session kwargs, run options, run_metadata for now. These should
@@ -310,19 +139,20 @@ def _experimental_fit_loop(
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
-  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
+  for name in model.metrics_names[1:]:
+    tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   if steps_per_epoch is None:
     raise ValueError('`steps_per_epoch` should be specified when calling '
                      '`fit` on the model.')
   steps_per_run = K.variable(
-      value=min(steps_per_epoch, current_strategy.steps_per_run),
+      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
       dtype='int32',
       name='steps_per_run')
 
   with current_strategy.scope():
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=steps_per_run,
         initial_loop_values=initial_loop_values)
 
@@ -348,10 +178,11 @@ def _experimental_fit_loop(
       verbose=verbose)
 
   # Calculate the steps each time on the device.
-  steps_to_run = [current_strategy.steps_per_run] * (
-      steps_per_epoch // current_strategy.steps_per_run)
-  if steps_per_epoch % current_strategy.steps_per_run:
-    steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)
+  steps_to_run = [current_strategy.extended.steps_per_run] * (
+      steps_per_epoch // current_strategy.extended.steps_per_run)
+  if steps_per_epoch % current_strategy.extended.steps_per_run:
+    steps_to_run.append(
+        steps_per_epoch % current_strategy.extended.steps_per_run)
 
   callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
@@ -391,7 +222,7 @@ def _experimental_fit_loop(
             model._grouped_model_train)[0].get_weights()
         model.set_weights(updated_weights)
 
-      val_outs = _experimental_test_loop(
+      val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
           model,
           val_iterator,
           steps=validation_steps,
@@ -418,118 +249,11 @@ def _experimental_fit_loop(
   return model.history
 
 
-def test_loop(model, iterator, verbose=0, steps=None):
-  """Test loop for evaluating with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      verbose: Integer, Verbosity mode 0 or 1.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the outputs.
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_test_loop(model, iterator, verbose, steps)
-
-  if not model._grouped_model:
-    clone_model_on_replicas(model, current_strategy)
-
-  def _per_device_test_function(model):
-    model._make_test_function()
-    return (model.test_function.inputs,
-            model.test_function.outputs,
-            model.test_function.updates_op,
-            model.test_function.session_kwargs)
-
-  inputs, targets, sample_weights = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_test_function, model._grouped_model)
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args, with_loss_tensor=True)
-
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-    dataset_targets = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, targets)
-
-    distributed_test_function = K.Function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_test_function',
-        **all_session_args)
-
-    # We need to set sample_weights to None since there are sample weight
-    # placeholders that are created with default values.
-    sample_weights = [None for _ in range(len(model.outputs) *
-                                          current_strategy.num_replicas)]
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + dataset_targets + sample_weights + [0]
-    else:
-      ins = dataset_inputs + dataset_targets
-
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    stateful_metric_indices = [
-        i for i, name in enumerate(model.metrics_names)
-        if str(name) in model.stateful_metric_names
-    ]
-
-    outs = []
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    assert steps is not None
-    for step in range(steps):
-      batch_outs = distributed_test_function(ins)
-      batch_outs = _aggregate_metrics_across_replicas(
-          current_strategy.num_replicas, model.metrics_names,
-          model.stateful_metric_names, batch_outs)
-      if isinstance(batch_outs, list):
-        if step == 0:
-          outs = [0.] * len(batch_outs)
-        for i, batch_out in enumerate(batch_outs):
-          if i in stateful_metric_indices:
-            outs[i] = batch_out
-          else:
-            outs[i] += batch_out
-      else:
-        if step == 0:
-          outs.append(0.)
-        outs[0] += batch_outs
-      if verbose >= 1:
-        progbar.update(step + 1)
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        outs[i] /= steps
-
-    if len(outs) == 1:
-      return outs[0]
-    return outs
-
-
-def _experimental_test_loop(model, iterator, verbose=0, steps=None,
-                            initialize_finalize_strategy=True):
+def experimental_test_loop(model,
+                           iterator,
+                           verbose=0,
+                           steps=None,
+                           initialize_finalize_strategy=True):
   """Test loop for evaluating with TPU DistributionStrategy.
 
   Arguments:
@@ -552,21 +276,21 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
   if initialize_finalize_strategy:
     K.get_session().run(current_strategy.initialize())
 
-  def _per_device_test_function(model):
-    model._make_test_function()
-    return (model.test_function.inputs,
-            model.test_function.outputs,
-            model.test_function.updates_op,
-            model.test_function.session_kwargs)
+  def _per_device_eval_function(model):
+    model._make_eval_function()
+    return (model._eval_function.inputs, model._eval_function.outputs,
+            model._eval_function.updates_op,
+            model._eval_function.session_kwargs)
 
   # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
   K.set_learning_phase(0)
 
-  def step_fn(ctx, inputs, targets):
-    """Clones the model and calls make_test_function."""
+  def step_fn(ctx, inputs):
+    """Clones the model and calls make_eval_function."""
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
+    inputs, targets = inputs
     clone_model_on_replicas(
         model,
         current_strategy,
@@ -576,15 +300,15 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
         mode=_Mode.TEST)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_test_function, model._grouped_model_test)
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
+         _per_device_eval_function, args=(model._grouped_model_test,))
 
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
          grouped_session_args)
 
-    combined_fn = K.Function(
+    combined_fn = K.function(
         all_inputs, all_outputs,
         updates=all_updates,
         name='distributed_test_function',
@@ -592,25 +316,26 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
 
     for label, output in zip(model.metrics_names, combined_fn.outputs):
       if label == 'loss':
-        aggregation = distribute_lib.get_loss_reduction()
+        reduce_op = distribute_lib.get_loss_reduction()
       else:
-        # We aggregate all other metrics using mean for now. This is temporary
+        # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
-        aggregation = variable_scope.VariableAggregation.MEAN
-      ctx.set_last_step_output(label, output, aggregation)
+        reduce_op = ds_reduce_util.ReduceOp.MEAN
+      ctx.set_last_step_output(label, output, reduce_op)
 
     return combined_fn.updates_op
 
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
-  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
+  for name in model.metrics_names[1:]:
+    tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   with current_strategy.scope():
     # TODO(priyag): Use steps_per_run when we use new metrics as they will
     # allow handling metric computation at each step using variables.
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=1,
         initial_loop_values=initial_loop_values)
 
@@ -646,102 +371,7 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
   return outs
 
 
-def predict_loop(model, iterator, verbose=0, steps=None):
-  """Predict loop for predicting with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      verbose: Integer, Verbosity mode 0 or 1.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_predict_loop(model, iterator, verbose, steps)
-
-  if not model._grouped_model:
-    clone_model_on_replicas(model, current_strategy)
-
-  def _per_device_predict_function(model):
-    model._make_predict_function()
-    return (model.predict_function.inputs,
-            model.predict_function.outputs,
-            model.predict_function.updates_op,
-            model.predict_function.session_kwargs)
-
-  inputs, _, _ = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_predict_function, model._grouped_model)
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args)
-
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-
-    distributed_predict_function = K.Function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_predict_function',
-        **all_session_args)
-
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + [0]
-    else:
-      ins = dataset_inputs
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    num_towers = current_strategy.num_towers
-    # Since we do not know how many samples we will see, we cannot
-    # pre-allocate the returned Numpy arrays. Instead, we store one array per
-    # batch seen and concatenate them upon returning.
-    unconcatenated_outs = []
-    assert steps is not None
-    for step in range(steps):
-      batch_outs = distributed_predict_function(ins)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if step == 0:
-        # batch_outs gives you the number of model outputs. In the distributed
-        # case this will be number of model_outputs * num_towers.
-        for _ in range(len(model.outputs)):
-          unconcatenated_outs.append([])
-      for i in range(len(model.outputs)):
-        nested_outs = batch_outs[i * num_towers:i * num_towers + num_towers]
-        outs = nest.flatten(nested_outs)
-        unconcatenated_outs[i].extend(outs)
-      if verbose >= 1:
-        progbar.update(step + 1)
-    if len(unconcatenated_outs) == 1:
-      return np.concatenate(unconcatenated_outs[0], axis=0)
-    return [
-        np.concatenate(unconcatenated_outs[i], axis=0)
-        for i in range(len(unconcatenated_outs))
-    ]
-
-
-def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
+def experimental_predict_loop(model, iterator, verbose=0, steps=None):
   """Predict loop for predicting with TPU DistributionStrategy.
 
   Arguments:
@@ -770,7 +400,7 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
             model.predict_function.updates_op,
             model.predict_function.session_kwargs)
 
-  def step_fn(ctx, *inputs):
+  def step_fn(ctx, inputs):
     """Clones the model and calls make_predict_function."""
 
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
@@ -784,15 +414,15 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
         mode=_Mode.PREDICT)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_predict_function, model._grouped_model_predict)
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
+         _per_device_predict_function, args=(model._grouped_model_predict,))
 
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
          grouped_session_args)
 
-    combined_fn = K.Function(
+    combined_fn = K.function(
         all_inputs, all_outputs,
         updates=all_updates,
         name='distributed_predict_function',
@@ -815,7 +445,7 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
 
   with current_strategy.scope():
     # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=1,
         initial_loop_values=initial_loop_values)
 
@@ -885,10 +515,11 @@ def _clone_and_build_model(model, inputs=None, targets=None):
   cloned_model.compile(
       optimizer,
       model.loss,
-      metrics=metrics_module.clone_metrics(model.metrics),
+      metrics=metrics_module.clone_metrics(model._compile_metrics),
       loss_weights=model.loss_weights,
       sample_weight_mode=model.sample_weight_mode,
-      weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics),
+      weighted_metrics=metrics_module.clone_metrics(
+          model._compile_weighted_metrics),
       target_tensors=targets)
   return cloned_model
 
@@ -897,8 +528,8 @@ def clone_model_on_replicas(model, strategy, make_callback_model=False,
                             inputs=None, targets=None, mode=None):
   """Create a cloned model on each replica."""
   with strategy.scope():
-    grouped_model = strategy.call_for_each_replica(
-        _clone_and_build_model, model, inputs, targets)
+    grouped_model = strategy.extended.call_for_each_replica(
+        _clone_and_build_model, args=(model, inputs, targets))
     if mode is _Mode.TRAIN:
       model._grouped_model_train = grouped_model
     elif mode is _Mode.TEST:
@@ -911,45 +542,6 @@ def clone_model_on_replicas(model, strategy, make_callback_model=False,
     model._make_callback_model(grouped_model)
 
 
-def _aggregate_metrics_across_replicas(num_devices, out_labels,
-                                       stateful_metric_names, outs):
-  """Aggregates stateless metrics values across replicas.
-
-  When using `MirroredStrategy`, the number of replicas is equal to the
-  number of devices over which training is distributed. This may not always be
-  the case.
-
-  Args:
-    num_devices: Number of devices over which the model is being distributed.
-    out_labels: The list of metric names passed to `compile`.
-    stateful_metric_names: List of stateful metric names on the model.
-    outs: The output from all the replicas.
-
-  Returns:
-    The average value of each metric across the replicas.
-  """
-  # TODO(anjalisridhar): Temporary workaround for aggregating metrics
-  # across replicas. Replace with the new metrics module eventually.
-  merged_output = []
-  # The first output is the total loss.
-  merged_output.append(outs[0])
-  current_index = 1
-  # Each label in `out_labels` corresponds to one set of metrics. The
-  # number of metric values corresponds to the number of devices. We
-  # currently take the mean of the values.
-  for metric_name in out_labels[1:]:
-    if metric_name in stateful_metric_names:
-      # For stateful metrics, we get one aggregated result value.
-      merged_output.append(outs[current_index])
-      current_index += 1
-    else:
-      m = np.mean(outs[current_index:current_index + num_devices])
-      merged_output.append(m)
-      current_index += num_devices
-
-  return merged_output
-
-
 def _get_input_from_iterator(iterator, model):
   """Get elements from the iterator and verify the input shape and type."""
   next_element = iterator.get_next()
@@ -974,3 +566,111 @@ def _get_input_from_iterator(iterator, model):
   model._standardize_weights(x_values, y_values,
                              sample_weight=sample_weights_values)
   return x, y, sample_weights
+
+
+def _get_execution_function(model, mode):
+  """Get function to run one step of distributed model execution."""
+  strategy = model._distribution_strategy
+  if not model._grouped_model:
+    clone_model_on_replicas(
+        model, strategy, make_callback_model=(mode == 'train'))
+
+  def _per_device_function(model):
+    f = model._get_execution_function(mode)
+    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
+
+  with strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs, grouped_updates,
+     grouped_session_args) = strategy.extended.call_for_each_replica(
+         _per_device_function, args=(model._grouped_model,))
+
+    if mode == 'train':
+      # Initialize the variables in the replicated model. This is necessary for
+      # multi-worker training because on some workers, initialization is not
+      # needed. This method does initialization or waiting for initialization
+      # according to the context object of distribute coordinator.
+      distributed_training_utils.init_restore_or_wait_for_variables()
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of update ops on
+    # all the devices over which the model is distributed.
+    (all_inputs, all_outputs, all_updates,
+     all_session_args) = distributed_training_utils.unwrap_values(
+         strategy,
+         grouped_inputs,
+         grouped_outputs,
+         grouped_updates,
+         grouped_session_args,
+         with_loss_tensor=(mode != 'predict'))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        updates=all_updates,
+        name='distributed_{}_function'.format(mode),
+        **all_session_args)
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of 'train'/'test'/'predict'.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  strategy = model._distribution_strategy
+  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
+  inputs = distributed_training_utils.flatten_perdevice_values(strategy, inputs)
+  targets = distributed_training_utils.flatten_perdevice_values(
+      strategy, targets)
+  if mode == 'predict':
+    sample_weights = []
+    targets = []
+  else:
+    sample_weights = [
+        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
+    ]
+  ins = inputs + targets + sample_weights
+  if mode == 'train' and not isinstance(K.learning_phase(), int):
+    ins += [True]
+  return ins
+
+
+def _copy_weights_to_distributed_model(model):
+  """Copies weights from original model to distributed models."""
+  if model._distribution_strategy:
+    # Copy the weights from the original model to each of the replicated models.
+    orig_model_weights = model.get_weights()
+    distributed_model = model._distribution_strategy.unwrap(
+        model._grouped_model)[0]
+    distributed_training_utils.set_weights(
+        model._distribution_strategy, distributed_model, orig_model_weights)
+
+
+def _copy_weights_to_original_model(model, mode):
+  """Copies weights from first distributed model back to original model."""
+  if model._distribution_strategy and mode == 'train':
+    updated_weights = model._distribution_strategy.unwrap(
+        model._grouped_model)[0].get_weights()
+    model.set_weights(updated_weights)
+
+
+def _per_device_aggregate_batch(batch_outs, model, mode):
+  """Aggregates the per-device batch-level outputs from a distributed step."""
+  if model._distribution_strategy is not None and mode == 'predict':
+    total_batch_outs = []
+    for i in range(len(model.outputs)):
+      num_replicas = model._distribution_strategy.num_replicas_in_sync
+      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
+      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
+    return total_batch_outs
+  return batch_outs
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 955493a8123f01ec4498176a29aa862b1910812a..b2dace84aa343d076c343ff670d1397c71795a0a 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -19,36 +19,37 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
-import threading
 
 import numpy as np
 
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.eager import function as eager_function
 from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
-# A lock for assigning polymorphic functions to models in a thread-safe way
-_graph_function_building_lock = threading.Lock()
-
-
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
   with backend.name_scope(output_name + '_loss'):
     loss = loss_fn(targets, outputs)
   return loss
 
 
-def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
+def _eager_metrics_fn(model,
+                      outputs,
+                      targets,
+                      sample_weights=None,
+                      masks=None,
+                      return_stateful_result=True):
   """Calculates the metrics for each output of the given model.
 
   Arguments:
@@ -57,6 +58,8 @@ def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
       targets: The predictions or targets of the given model.
       sample_weights: Optional list of sample weights for each output.
       masks: Optional list of masks for each output.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
   Returns:
       Returns the metric results for each output of the model.
@@ -65,11 +68,20 @@ def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
   targets = generic_utils.to_list(targets)
   # TODO(psv): Consider supporting skip target indices in eager mode?
   metric_results = model._handle_metrics(
-      outputs, targets=targets, sample_weights=sample_weights, masks=masks)
+      outputs,
+      targets=targets,
+      sample_weights=sample_weights,
+      masks=masks,
+      return_stateful_result=return_stateful_result)
   return [backend.mean(t) for t in metric_results]
 
 
-def _model_loss(model, inputs, targets, sample_weights=None, training=False):
+def _model_loss(model,
+                inputs,
+                targets,
+                output_loss_metrics=None,
+                sample_weights=None,
+                training=False):
   """Calculates the loss for a given model.
 
   Arguments:
@@ -77,6 +89,8 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       inputs: Either a dictionary of inputs to the model or a list of input
         arrays.
       targets: List of target arrays.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
       sample_weights: Optional list of sample weight arrays.
       training: Whether the model should be run in inference or training mode.
 
@@ -106,6 +120,7 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
   targets = generic_utils.to_list(targets)
 
   loss_metrics = []
+  aggregated_loss_metrics = []
   with backend.name_scope('loss'):
     for i, loss_fn in enumerate(model.loss_functions):
       if sample_weights:
@@ -125,6 +140,16 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       if len(model.outputs) > 1:
         loss_metrics.append(backend.mean(output_loss))
 
+        if output_loss_metrics is not None:
+          # Keep track of the stateful loss result.
+          aggregated_loss_metrics.append(
+              training_utils.call_metric_function(
+                  output_loss_metrics[i],
+                  targets[i],
+                  outs[i],
+                  weights=weights,
+                  mask=mask))
+
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
         total_loss = loss_weight * output_loss
@@ -138,49 +163,7 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       total_loss += math_ops.add_n(custom_losses)
     model._clear_losses()
 
-  return outs, total_loss, loss_metrics, masks
-
-
-def _maybe_build_graph_functions(model):
-  """Constructs polymorphic functions to use for fit, evaluate and predict."""
-  # We lock this function to ensure thread-safety in case users are
-  # hypothetically trying to call '.predict' on a model in multiple threads
-  # at once when the graph functions were never previously built.
-  with _graph_function_building_lock:
-    if not model._built_graph_functions:
-      model._eager_process_single_batch_graph_function = eager_function.defun(
-          _process_single_batch
-      )
-      model._eager_model_loss_graph_function = eager_function.defun(_model_loss)
-      model._eager_call_graph_function = eager_function.defun(model.call)
-      model._built_graph_functions = True
-
-
-def _maybe_graph_function_model_loss(
-    model,
-    inputs,
-    targets,
-    sample_weights=None,
-    training=False):
-  """Compute model loss, using defun if the model supports it."""
-  if model._can_use_graph_functions:
-    _maybe_build_graph_functions(model)
-    return model._eager_model_loss_graph_function(
-        model, inputs, targets,
-        sample_weights=sample_weights, training=training)
-  else:
-    return _model_loss(model, inputs, targets,
-                       sample_weights=sample_weights,
-                       training=training)
-
-
-def _maybe_graph_function_model_call(model, *args, **kwargs):
-  """Compute model loss, using defun if the model supports it."""
-  if model._can_use_graph_functions:
-    _maybe_build_graph_functions(model)
-    return model._eager_call_graph_function(*args, **kwargs)
-  else:
-    return model.call(*args, **kwargs)
+  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
 
 
 def iterator_fit_loop(model,
@@ -196,7 +179,8 @@ def iterator_fit_loop(model,
                       callbacks=None,
                       validation_steps=None,
                       do_validation=False,
-                      batch_size=None):
+                      batch_size=None,
+                      output_loss_metrics=None):
   """Fit function for eager execution when input is given as dataset iterator.
 
   Updates the given epoch logs.
@@ -222,6 +206,8 @@ def iterator_fit_loop(model,
       do_validation: Boolean value indicating whether we should do validation.
       batch_size: int, val_inputs and val_targets will be evaled batch by
         batch with size batch_size if they are array.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Raises:
       ValueError: In case of mismatch between given number of inputs and
@@ -230,7 +216,7 @@ def iterator_fit_loop(model,
   assert isinstance(inputs, iterator_ops.EagerIterator)
 
   # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, (list, tuple)) or
+  if (not isinstance(inputs.output_shapes, collections.Sequence) or
       len(inputs.output_shapes) not in (2, 3)):
     raise ValueError('Please provide either inputs and targets '
                      'or inputs, targets, and sample_weights')
@@ -269,16 +255,25 @@ def iterator_fit_loop(model,
           if val is not None else None for val in sample_weights
       ]
 
-    # Set stateful_metrics in callbacks. We do not do this before the
-    # `steps_per_epoch` loop because model will be compiled only in the first
-    # iteration of this loop in the deferred build scenario.
+    # Train model.
+    outs, loss, _, aggregated_loss_metrics, masks = _process_single_batch(
+        model,
+        x,
+        y,
+        output_loss_metrics=output_loss_metrics,
+        sample_weights=sample_weights,
+        training=True)
+    outs = generic_utils.to_list(outs)
+
     if step_index == 0:
+      # Set stateful_metrics in callbacks. We do not do this before the
+      # `steps_per_epoch` loop because model will be compiled only in the first
+      # iteration of this loop in the deferred build scenario.
       for cbk in callbacks:
         if (isinstance(cbk, cbks.BaseLogger) or
             isinstance(cbk, cbks.ProgbarLogger)):
-          cbk.stateful_metrics = model.stateful_metric_names
+          cbk.stateful_metrics = model.metrics_names[1:]  # Exclude `loss`
 
-    if step_index == 0 and not callbacks.params['metrics']:
       callback_metrics = copy.copy(model.metrics_names)
       if do_validation:
         callback_metrics += ['val_' + n for n in model.metrics_names]
@@ -292,22 +287,16 @@ def iterator_fit_loop(model,
           'validation_steps': validation_steps
       })
 
-    # Train model.
-    outs, loss, loss_metrics, masks = (
-        _maybe_graph_function_process_single_batch(
-            model, x, y, sample_weights=sample_weights, training=True))
-    outs = generic_utils.to_list(outs)
-
     # Calculate metrics.
     for l, o in zip(model.metrics_names, outs):
       batch_logs[l] = o
-    # Required for eager execution
     metrics_results = _eager_metrics_fn(
         model, outs, y, sample_weights=sample_weights, masks=masks)
     batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
 
-    for k, v in zip(model.metrics_names,
-                    [backend.mean(loss)] + loss_metrics + metrics_results):
+    for k, v in zip(
+        model.metrics_names,
+        [backend.mean(loss)] + aggregated_loss_metrics + metrics_results):
       batch_logs[k] = tensor_util.constant_value(v)
     callbacks.on_batch_end(step_index, batch_logs)
     if callbacks.model.stop_training:
@@ -352,11 +341,20 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
   """
   assert isinstance(inputs, iterator_ops.EagerIterator)
   # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, (list, tuple)) or
+  if (not isinstance(inputs.output_shapes, collections.Sequence) or
       len(inputs.output_shapes) < 2 or len(inputs.output_shapes) > 3):
     raise ValueError('Please provide either inputs and targets'
                      'or inputs, targets, and sample_weights')
   outs = []
+
+  # Create metric wrapper for the losses.
+  output_loss_metrics = []
+  for i in range(len(model.outputs)):
+    loss_fn = model.loss_functions[i]
+    mean_wrapped_loss = metrics_module.MeanMetricWrapper(
+        loss_fn, name=loss_fn.__name__)
+    output_loss_metrics.append(mean_wrapped_loss)
+
   num_samples = 0
   if verbose == 1:
     progbar = generic_utils.Progbar(target=steps)
@@ -394,24 +392,26 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
       # Get stateful metrics indices. We do not do this before the `steps` loop
       # because model will be compiled only in the first iteration of this loop
       # in the deferred build scenario.
-      if hasattr(model, 'metrics'):
-        for m in model.stateful_metric_functions:
+      if hasattr(model, '_compile_metrics'):
+        for m in model.metrics:
           m.reset_states()
-        stateful_metric_indices = [
-            i for i, name in enumerate(model.metrics_names)
-            if str(name) in model.stateful_metric_names
-        ]
-      else:
-        stateful_metric_indices = []
+      for m in output_loss_metrics:
+        m.reset_states()
 
     # Calculate model output, loss values.
-    loss_outs, loss, loss_metrics, masks = _maybe_graph_function_model_loss(
-        model, x, y, sample_weights=sample_weights, training=False)
+    loss_outs, loss, _, aggregated_loss_metrics, masks = _model_loss(
+        model,
+        x,
+        y,
+        output_loss_metrics=output_loss_metrics,
+        sample_weights=sample_weights,
+        training=False)
     metrics_results = _eager_metrics_fn(
         model, loss_outs, y, sample_weights=sample_weights, masks=masks)
     batch_outs = []
-    for _, v in zip(model.metrics_names,
-                    [backend.mean(loss)] + loss_metrics + metrics_results):
+    for _, v in zip(
+        model.metrics_names,
+        [backend.mean(loss)] + aggregated_loss_metrics + metrics_results):
       batch_outs.append(tensor_util.constant_value(v))
 
     # Get current step size.
@@ -428,20 +428,15 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
     if step_index == 0:
       for _ in enumerate(batch_outs):
         outs.append(0.)
-    for i, batch_out in enumerate(batch_outs):
-      if i in stateful_metric_indices:
-        outs[i] = batch_out
-      else:
-        outs[i] += batch_out * step_size
+    outs[0] += batch_outs[0] * step_size  # index 0 = 'loss'
+    outs[1:] = batch_outs[1:]
 
     # Calculate sample size.
     num_samples += step_size
     if verbose == 1:
       progbar.update(step_index + 1)
 
-  for i in range(len(outs)):
-    if i not in stateful_metric_indices:
-      outs[i] /= num_samples
+  outs[0] /= num_samples  # index 0 = 'loss'
   if len(outs) == 1:
     return outs[0]
   return outs
@@ -467,7 +462,7 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
   """
   assert isinstance(inputs, iterator_ops.EagerIterator)
   if not isinstance(inputs.output_shapes,
-                    (list, tuple)) or len(inputs.output_shapes) > 3:
+                    collections.Sequence) or len(inputs.output_shapes) > 3:
     raise ValueError(
         'Please provide data as a list or tuple of 1, 2, or 3 elements '
         ' - `(input)`, or `(input, target)`, or `(input, target,'
@@ -500,9 +495,9 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
       x = x[0]
 
     if model._expects_training_arg:
-      batch_outs = _maybe_graph_function_model_call(model, x, training=False)
+      batch_outs = model.call(x, training=False)
     else:
-      batch_outs = _maybe_graph_function_model_call(model, x)
+      batch_outs = model.call(x)
     if not isinstance(batch_outs, list):
       batch_outs = [batch_outs]
 
@@ -527,6 +522,7 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
 def _process_single_batch(model,
                           inputs,
                           targets,
+                          output_loss_metrics=None,
                           sample_weights=None,
                           training=False):
   """Calculate the loss and gradient for one input batch.
@@ -537,6 +533,8 @@ def _process_single_batch(model,
       model: Model whose loss has to be calculated.
       inputs: List of input arrays.
       targets: List of target arrays.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
       sample_weights: Optional list of sample weight arrays.
       training: The boolean represents if the weights of the model are updated.
               'fit' methods will set this to True while 'evaluate' methods will
@@ -551,12 +549,14 @@ def _process_single_batch(model,
   """
   with backend.learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
-      outs, loss, loss_metrics, masks = _model_loss(
-          model,
-          inputs,
-          targets,
-          sample_weights=sample_weights,
-          training=training)
+      outs, loss, loss_metrics, aggregated_loss_metrics, masks\
+        = _model_loss(
+            model,
+            inputs,
+            targets,
+            output_loss_metrics=output_loss_metrics,
+            sample_weights=sample_weights,
+            training=training)
       if loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
@@ -569,25 +569,7 @@ def _process_single_batch(model,
         grads = tape.gradient(loss, model._collected_trainable_weights)
         model.optimizer.apply_gradients(zip(grads,
                                             model._collected_trainable_weights))
-    return outs, loss, loss_metrics, masks
-
-
-def _maybe_graph_function_process_single_batch(
-    model,
-    inputs,
-    targets,
-    sample_weights=None,
-    training=False):
-  """Process a single batch, using defun if the model supports it."""
-  if model._can_use_graph_functions:
-    _maybe_build_graph_functions(model)
-    return model._eager_process_single_batch_graph_function(
-        model, inputs, targets, sample_weights=sample_weights,
-        training=training)
-  else:
-    return _process_single_batch(model, inputs, targets,
-                                 sample_weights=sample_weights,
-                                 training=training)
+    return outs, loss, loss_metrics, aggregated_loss_metrics, masks
 
 
 def train_on_batch(model, inputs, targets, sample_weights=None):
@@ -602,28 +584,34 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss and the loss associated with each output.
   """
-  if len(inputs) and tensor_util.is_tensor(inputs[0]):
-    inputs = training_utils.cast_if_floating_dtype(inputs)
-    targets = training_utils.cast_if_floating_dtype(targets)
-  else:
-    inputs = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-    ]
-    targets = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-    ]
+  if isinstance(inputs, collections.Sequence):
+    if len(inputs) and tensor_util.is_tensor(inputs[0]):
+      inputs = training_utils.cast_if_floating_dtype(inputs)
+      targets = training_utils.cast_if_floating_dtype(targets)
+    else:
+      inputs = [
+          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
+      ]
+      targets = [
+          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
+      ]
   if sample_weights:
     sample_weights = [
         ops.convert_to_tensor(val, dtype=backend.floatx())
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, loss_metrics, masks = _maybe_graph_function_process_single_batch(
+  outs, loss, loss_metrics, _, masks = _process_single_batch(
       model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
-      model, outs, targets, sample_weights=sample_weights, masks=masks)
+      model,
+      outs,
+      targets,
+      sample_weights=sample_weights,
+      masks=masks,
+      return_stateful_result=False)
   loss = generic_utils.to_list(loss)
 
   return [
@@ -644,27 +632,33 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss, loss and metrics associated with each output.
   """
-  if len(inputs) and tensor_util.is_tensor(inputs[0]):
-    inputs = training_utils.cast_if_floating_dtype(inputs)
-    targets = training_utils.cast_if_floating_dtype(targets)
-  else:
-    inputs = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-    ]
-    targets = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-    ]
+  if isinstance(inputs, collections.Sequence):
+    if len(inputs) and tensor_util.is_tensor(inputs[0]):
+      inputs = training_utils.cast_if_floating_dtype(inputs)
+      targets = training_utils.cast_if_floating_dtype(targets)
+    else:
+      inputs = [
+          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
+      ]
+      targets = [
+          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
+      ]
   if sample_weights:
     sample_weights = [
         ops.convert_to_tensor(val, dtype=backend.floatx())
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics, masks = _maybe_graph_function_model_loss(
+  outs, loss, loss_metrics, _, masks = _model_loss(
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
-      model, outs, targets, sample_weights=sample_weights, masks=masks)
+      model,
+      outs,
+      targets,
+      sample_weights=sample_weights,
+      masks=masks,
+      return_stateful_result=False)
   loss = generic_utils.to_list(loss)
 
   return [
@@ -746,12 +740,24 @@ def fit_loop(model,
         validation_steps=validation_steps,
         verbose=verbose)
 
+    # Create metric wrapper for the losses.
+    output_loss_metrics = []
+    for i in range(len(model.outputs)):
+      loss_fn = model.loss_functions[i]
+      mean_wrapped_loss = metrics_module.MeanMetricWrapper(
+          loss_fn, name=loss_fn.__name__)
+      output_loss_metrics.append(mean_wrapped_loss)
+
     callbacks.on_train_begin()
     for epoch in range(initial_epoch, epochs):
       if model._is_compiled:  # Model may not be compiled the first time.
         # Reset stateful metrics
-        for m in model.stateful_metric_functions:
+        for m in model.metrics:
           m.reset_states()
+
+      for m in output_loss_metrics:
+        m.reset_states()
+
       callbacks.on_epoch_begin(epoch)
       epoch_logs = {}
       iterator_fit_loop(
@@ -768,7 +774,8 @@ def fit_loop(model,
           callbacks=callbacks,
           validation_steps=validation_steps,
           do_validation=do_validation,
-          batch_size=batch_size)
+          batch_size=batch_size,
+          output_loss_metrics=output_loss_metrics)
       callbacks.on_epoch_end(epoch, epoch_logs)
       if callbacks.model.stop_training:
         break
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 234cbab9ebff4ce8722b0fa35c673d29718f519b..d769143106a56c6079ab70dd4b1bfcbdf6d75483 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
@@ -51,6 +51,7 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=metrics,
         loss_weights=loss_weights,
+        run_eagerly=True,
         sample_weight_mode=None)
 
     input_a = keras.backend.zeros(shape=(10, 3))
@@ -111,7 +112,7 @@ class TrainingTest(test.TestCase):
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
 
     inputs = keras.backend.zeros(shape=(10, 3))
     targets = keras.backend.zeros(shape=(10, 4))
@@ -129,7 +130,9 @@ class TrainingTest(test.TestCase):
     x = keras.layers.Input(shape=(3,), name='input')
     y = keras.layers.Dense(4, name='dense')(x)
     model = keras.Model(x, y)
-    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse',
+                  run_eagerly=True)
 
     x = keras.backend.zeros(shape=(10, 3))
     y = keras.backend.zeros(shape=(10, 4))
@@ -142,16 +145,19 @@ class TrainingTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, r'specify .* `steps_per_epoch`'):
       model.fit(iterator, epochs=1, verbose=0)
+    if not context.executing_eagerly():
+      # In eager execution, `keras.backend.zeros` returns value tensors
+      # which can be used for validation without a `validation_steps` argument.
+      with self.assertRaisesRegexp(
+          ValueError, r'provide either `batch_size` or `validation_steps`'):
+        model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
+                  validation_data=(x, y))
     with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
-      model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
-                validation_data=(x, y))
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
+        ValueError, r'specify the number of steps'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_dataset)
     with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
+        ValueError, r'specify the number of steps'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_iterator)
 
@@ -160,25 +166,31 @@ class TrainingTest(test.TestCase):
     model.add(keras.layers.Dense(4, input_shape=(3,)))
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     model.compile(
-        optimizer, 'mse', metrics=['mae',
-                                   metrics_module.CategoricalAccuracy()])
+        optimizer,
+        loss='mse',
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=True)
 
     x = np.random.random((10, 3))
     y = np.random.random((10, 4))
 
-    def iterator():
+    def numpy_iterator():
       while True:
         yield x, y
 
-    model.fit_generator(iterator(), steps_per_epoch=3, epochs=1)
-    model.evaluate_generator(iterator(), steps=3)
-    out = model.predict_generator(iterator(), steps=3)
+    model.fit_generator(numpy_iterator(), steps_per_epoch=3, epochs=1)
+    model.evaluate_generator(numpy_iterator(), steps=3)
+
+    def inference_numpy_iterator():
+      while True:
+        yield x
+
+    out = model.predict_generator(inference_numpy_iterator(), steps=3)
     self.assertEqual(out.shape, (30, 4))
 
 
 class CorrectnessTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_loss_correctness(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
@@ -191,15 +203,14 @@ class CorrectnessTest(test.TestCase):
                                  activation='softmax',
                                  kernel_initializer='ones'))
     model.compile(loss='sparse_categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  run_eagerly=False)
     x = np.ones((100, 4))
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     history = model.fit(x, y, epochs=1, batch_size=10)
-    self.assertEqual(
-        np.around(history.history['loss'][-1], decimals=4), 0.6173)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_loss_correctness_with_iterator(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
@@ -211,7 +222,8 @@ class CorrectnessTest(test.TestCase):
         keras.layers.Dense(2, activation='softmax', kernel_initializer='ones'))
     model.compile(
         loss='sparse_categorical_crossentropy',
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=True)
     x = np.ones((100, 4), dtype=np.float32)
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
@@ -220,7 +232,7 @@ class CorrectnessTest(test.TestCase):
     dataset = dataset.batch(10)
     iterator = dataset.make_one_shot_iterator()
     history = model.fit(iterator, epochs=1, steps_per_epoch=10)
-    self.assertEqual(np.around(history.history['loss'][-1], decimals=4), 0.6173)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
 
   def test_loss_in_call(self):
 
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index 21f44423ec03c70a4929710fc81e07b28d5827d4..45247a27514f0a6da2e9eb3bac34e2a1794964d0 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -21,12 +21,12 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras.utils.data_utils import iter_sequence_infinite
-from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
-from tensorflow.python.keras.utils.data_utils import Sequence
+from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
 
@@ -54,7 +54,7 @@ def fit_generator(model,
     if do_validation:
       model._make_test_function()
 
-  is_sequence = isinstance(generator, Sequence)
+  is_sequence = isinstance(generator, data_utils.Sequence)
   if not is_sequence and use_multiprocessing and workers > 1:
     logging.warning(
         UserWarning('Using a generator with `use_multiprocessing=True`'
@@ -65,23 +65,16 @@ def fit_generator(model,
     if is_sequence:
       steps_per_epoch = len(generator)
     else:
-      raise ValueError('`steps_per_epoch=None` is only valid for a'
-                       ' generator based on the `keras.utils.Sequence`'
-                       ' class. Please specify `steps_per_epoch` or use'
-                       ' the `keras.utils.Sequence` class.')
-
-  # python 2 has 'next', 3 has '__next__'
-  # avoid any explicit version checks
-  val_gen = (
-      hasattr(validation_data, 'next') or
-      hasattr(validation_data, '__next__') or
-      isinstance(validation_data, Sequence))
-  if (val_gen and not isinstance(validation_data, Sequence) and
+      raise ValueError('Please specify the `steps_per_epoch` argument.')
+
+  if (isinstance(validation_data, dataset_ops.Dataset) and
+      context.executing_eagerly()):
+    validation_data = validation_data.make_one_shot_iterator()
+  val_gen = (data_utils.is_generator_or_sequence(validation_data) or
+             isinstance(validation_data, iterator_ops.EagerIterator))
+  if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
       not validation_steps):
-    raise ValueError('`validation_steps=None` is only valid for a'
-                     ' generator based on the `keras.utils.Sequence`'
-                     ' class. Please specify `validation_steps` or use'
-                     ' the `keras.utils.Sequence` class.')
+    raise ValueError('Please specify the `validation_steps` argument.')
 
   enqueuer = None
   val_enqueuer = None
@@ -117,19 +110,19 @@ def fit_generator(model,
 
     if workers > 0:
       if is_sequence:
-        enqueuer = OrderedEnqueuer(
+        enqueuer = data_utils.OrderedEnqueuer(
             generator,
             use_multiprocessing=use_multiprocessing,
             shuffle=shuffle)
       else:
-        enqueuer = GeneratorEnqueuer(
+        enqueuer = data_utils.GeneratorEnqueuer(
             generator,
             use_multiprocessing=use_multiprocessing)
       enqueuer.start(workers=workers, max_queue_size=max_queue_size)
       output_generator = enqueuer.get()
     else:
       if is_sequence:
-        output_generator = iter_sequence_infinite(generator)
+        output_generator = data_utils.iter_sequence_infinite(generator)
       else:
         output_generator = generator
 
@@ -137,14 +130,13 @@ def fit_generator(model,
     # Construct epoch logs.
     epoch_logs = {}
     while epoch < epochs:
-      for m in model.stateful_metric_functions:
+      for m in model.metrics:
         m.reset_states()
       callbacks.on_epoch_begin(epoch)
       steps_done = 0
       batch_index = 0
       while steps_done < steps_per_epoch:
         generator_output = next(output_generator)
-
         if not hasattr(generator_output, '__len__'):
           raise ValueError('Output of generator should be '
                            'a tuple `(x, y, sample_weight)` '
@@ -167,8 +159,8 @@ def fit_generator(model,
           batch_size = list(x.values())[0].shape[0]
         else:
           batch_size = x.shape[0]
-        batch_logs['batch'] = batch_index
-        batch_logs['size'] = batch_size
+        batch_logs['batch'] = int(batch_index)
+        batch_logs['size'] = int(batch_size)
         callbacks.on_batch_begin(batch_index, batch_logs)
 
         outs = model.train_on_batch(
@@ -217,6 +209,13 @@ def fit_generator(model,
       if callbacks.model.stop_training:
         break
 
+  except (errors.OutOfRangeError, StopIteration):
+    logging.warning(
+        'Your dataset iterator ran out of data interrupting testing. '
+        'Make sure that your dataset can generate at least `steps_per_epoch` '
+        'batches (in this case, %d batches). You may need to use the '
+        'repeat() function when building your dataset.', steps_per_epoch)
+
   finally:
     try:
       if enqueuer is not None:
@@ -240,19 +239,14 @@ def evaluate_generator(model,
   if not context.executing_eagerly():
     model._make_test_function()
 
-  if hasattr(model, 'metrics'):
-    for m in model.stateful_metric_functions:
+  if hasattr(model, '_compile_metrics'):
+    for m in model.metrics:
       m.reset_states()
-    stateful_metric_indices = [
-        i for i, name in enumerate(model.metrics_names)
-        if str(name) in model.stateful_metric_names]
-  else:
-    stateful_metric_indices = []
 
   steps_done = 0
   all_outs = []
   batch_sizes = []
-  is_sequence = isinstance(generator, Sequence)
+  is_sequence = isinstance(generator, data_utils.Sequence)
   if not is_sequence and use_multiprocessing and workers > 1:
     logging.warning(
         UserWarning('Using a generator with `use_multiprocessing=True`'
@@ -263,26 +257,23 @@ def evaluate_generator(model,
     if is_sequence:
       steps = len(generator)
     else:
-      raise ValueError('`steps=None` is only valid for a generator'
-                       ' based on the `keras.utils.Sequence` class.'
-                       ' Please specify `steps` or use the'
-                       ' `keras.utils.Sequence` class.')
+      raise ValueError('Please specify the `steps` argument.')
   enqueuer = None
 
   try:
     if workers > 0:
       if is_sequence:
-        enqueuer = OrderedEnqueuer(
+        enqueuer = data_utils.OrderedEnqueuer(
             generator, use_multiprocessing=use_multiprocessing)
       else:
-        enqueuer = GeneratorEnqueuer(
+        enqueuer = data_utils.GeneratorEnqueuer(
             generator,
             use_multiprocessing=use_multiprocessing)
       enqueuer.start(workers=workers, max_queue_size=max_queue_size)
       output_generator = enqueuer.get()
     else:
       if is_sequence:
-        output_generator = iter_sequence_infinite(generator)
+        output_generator = data_utils.iter_sequence_infinite(generator)
       else:
         output_generator = generator
 
@@ -307,11 +298,11 @@ def evaluate_generator(model,
       outs = model.test_on_batch(x, y, sample_weight=sample_weight)
 
       if isinstance(x, list):
-        batch_size = x[0].shape[0]
+        batch_size = int(x[0].shape[0])
       elif isinstance(x, dict):
-        batch_size = list(x.values())[0].shape[0]
+        batch_size = int(list(x.values())[0].shape[0])
       else:
-        batch_size = x.shape[0]
+        batch_size = int(x.shape[0])
       if batch_size == 0:
         raise ValueError('Received an empty batch. '
                          'Batches should at least contain one item.')
@@ -322,6 +313,13 @@ def evaluate_generator(model,
       if verbose == 1:
         progbar.update(steps_done)
 
+  except (errors.OutOfRangeError, StopIteration):
+    logging.warning(
+        'Your dataset iterator ran out of data interrupting testing. '
+        'Make sure that your dataset can generate at least `steps` '
+        'batches (in this case, %d batches). You may need to use the '
+        'repeat() function when building your dataset.', steps)
+
   finally:
     if enqueuer is not None:
       enqueuer.stop()
@@ -329,13 +327,12 @@ def evaluate_generator(model,
   if not isinstance(outs, list):
     return np.average(np.asarray(all_outs), weights=batch_sizes)
   else:
-    averages = []
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        averages.append(
-            np.average([out[i] for out in all_outs], weights=batch_sizes))
-      else:
-        averages.append(np.float64(all_outs[-1][i]))
+    averages = [float(all_outs[-1][0])]  # index 0 = 'loss'
+    averages.extend([
+        np.average([out[i]
+                    for out in all_outs], weights=batch_sizes)
+        for i in range(1, len(outs))
+    ])
     return averages
 
 
@@ -348,11 +345,11 @@ def predict_generator(model,
                       verbose=0):
   """See docstring for `Model.predict_generator`."""
   if not context.executing_eagerly():
-    model._make_test_function()
+    model._make_predict_function()
 
   steps_done = 0
   all_outs = []
-  is_sequence = isinstance(generator, Sequence)
+  is_sequence = isinstance(generator, data_utils.Sequence)
   if not is_sequence and use_multiprocessing and workers > 1:
     logging.warning(
         UserWarning('Using a generator with `use_multiprocessing=True`'
@@ -363,26 +360,23 @@ def predict_generator(model,
     if is_sequence:
       steps = len(generator)
     else:
-      raise ValueError('`steps=None` is only valid for a generator'
-                       ' based on the `keras.utils.Sequence` class.'
-                       ' Please specify `steps` or use the'
-                       ' `keras.utils.Sequence` class.')
+      raise ValueError('Please specify the `steps` argument.')
   enqueuer = None
 
   try:
     if workers > 0:
       if is_sequence:
-        enqueuer = OrderedEnqueuer(
+        enqueuer = data_utils.OrderedEnqueuer(
             generator, use_multiprocessing=use_multiprocessing)
       else:
-        enqueuer = GeneratorEnqueuer(
+        enqueuer = data_utils.GeneratorEnqueuer(
             generator,
             use_multiprocessing=use_multiprocessing)
       enqueuer.start(workers=workers, max_queue_size=max_queue_size)
       output_generator = enqueuer.get()
     else:
       if is_sequence:
-        output_generator = iter_sequence_infinite(generator)
+        output_generator = data_utils.iter_sequence_infinite(generator)
       else:
         output_generator = generator
 
@@ -421,6 +415,13 @@ def predict_generator(model,
       if verbose == 1:
         progbar.update(steps_done)
 
+  except (errors.OutOfRangeError, StopIteration):
+    logging.warning(
+        'Your dataset iterator ran out of data interrupting testing. '
+        'Make sure that your dataset can generate at least `steps` '
+        'batches (in this case, %d batches). You may need to use the '
+        'repeat() function when building your dataset.', steps)
+
   finally:
     if enqueuer is not None:
       enqueuer.stop()
diff --git a/tensorflow/python/keras/engine/training_gpu_test.py b/tensorflow/python/keras/engine/training_gpu_test.py
index 596d085f3fa4c49c7506c35fa1f4ce776bc8f691..45dcfe43995b280072395b11a573e20d57bcadc7 100644
--- a/tensorflow/python/keras/engine/training_gpu_test.py
+++ b/tensorflow/python/keras/engine/training_gpu_test.py
@@ -69,7 +69,7 @@ class TrainingGPUTest(test.TestCase):
       return simple_model
 
     if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
+      with test_util.use_gpu():
         losses_to_test = ['sparse_categorical_crossentropy',
                           'categorical_crossentropy', 'binary_crossentropy']
 
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index faafc60d428f36b27dbef521e0e60b179174d254..1009ef7138793d217b8633c42a3032047e0d3755 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -18,9 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import io
 import logging
+import sys
 
 import numpy as np
+import six
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
@@ -31,9 +34,10 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.callbacks import Callback
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
-from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -332,44 +336,16 @@ class TrainingTest(test.TestCase):
     })
     self.assertEqual(len(out), 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_invalid_loss(self):
-    num_classes = 5
-    train_samples = 1000
-    test_samples = 1000
-    input_dim = 5
-
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, loss='categorical_crossentropy')
-    np.random.seed(1337)
-    (x_train, y_train), (_, _) = testing_utils.get_test_data(
-        train_samples=train_samples,
-        test_samples=test_samples,
-        input_shape=(input_dim,),
-        num_classes=num_classes)
-
-    with self.assertRaises(ValueError):
-      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
-
-    if not context.executing_eagerly():
-      # TODO(psv): Investigate these use cases in eager mode.
-      with self.assertRaises(ValueError):
-        model.fit(x_train, y_train)
-
-      with self.assertRaises(ValueError):
-        model.compile(optimizer, loss=None)
-
   @tf_test_util.run_in_graph_and_eager_modes
   def test_activity_regularizer_fit(self):
     loss = {}
     for reg in [None, 'l2']:
       inputs = keras.layers.Input(shape=(10,))
       x = keras.layers.Dense(
-          10, activation='relu', activity_regularizer=reg)(
-              inputs)
-      outputs = keras.layers.Dense(1, activation='sigmoid')(x)
+          10, activation='relu', activity_regularizer=reg,
+          kernel_initializer='ones', use_bias=False)(inputs)
+      outputs = keras.layers.Dense(1, activation='sigmoid',
+                                   kernel_initializer='ones', use_bias=False)(x)
       model = keras.Model(inputs, outputs)
 
       x = np.ones((10, 10), 'float32')
@@ -511,6 +487,151 @@ class TrainingTest(test.TestCase):
       x2 = model.predict(val_a)
       self.assertAllClose(x1, x2, atol=1e-7)
 
+  def test_logs_passed_to_callbacks(self):
+    with self.cached_session():
+      input_dim = 5
+      num_classes = 1
+
+      class TestCallback(Callback):
+
+        def __init__(self):
+          super(TestCallback, self).__init__()
+          self.epoch_end_logs = None
+          self.batch_end_logs = None
+          self.epoch_end_call_count = 0
+          self.batch_end_call_count = 0
+
+        def on_epoch_end(self, epoch, logs=None):
+          self.epoch_end_logs = logs
+          self.epoch_end_call_count += 1
+
+        def on_batch_end(self, batch, logs=None):
+          self.batch_end_logs = logs
+          self.batch_end_call_count += 1
+
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+      model.compile(
+          loss='binary_crossentropy',
+          metrics=['acc'],
+          weighted_metrics=['mae'],
+          optimizer=RMSPropOptimizer(learning_rate=0.01))
+
+      np.random.seed(1337)
+      (x_train, y_train), (_, _) = testing_utils.get_test_data(
+          train_samples=10,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+
+      test_callback = TestCallback()
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=2,
+          epochs=2,
+          verbose=0,
+          callbacks=[test_callback],
+          validation_data=(x_train, y_train))
+      self.assertEqual(test_callback.batch_end_call_count, 10)
+      self.assertEqual(test_callback.epoch_end_call_count, 2)
+      self.assertSetEqual(
+          set(test_callback.batch_end_logs.keys()),
+          set(['batch', 'size', 'acc', 'loss', 'weighted_mean_absolute_error']))
+      self.assertSetEqual(
+          set(test_callback.epoch_end_logs.keys()),
+          set([
+              'acc', 'loss', 'weighted_mean_absolute_error', 'val_acc',
+              'val_loss', 'val_weighted_mean_absolute_error'
+          ]))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_mismatched_output_shape_and_target_shape(self):
+    model = keras.Sequential([
+        keras.layers.Dense(2, input_shape=(3, 4)),
+        keras.layers.Dense(5),
+    ])
+    model.compile(RMSPropOptimizer(learning_rate=0.001),
+                  loss='sparse_categorical_crossentropy')
+    # Test with Numpy data
+    x_train = np.random.random((10, 3, 4))
+    y_train = np.random.randint(0, 5, size=(10, 3))
+    model.fit(x_train, y_train, batch_size=5, epochs=1)
+
+    # Test with iterator
+    dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+    dataset = dataset.repeat(10)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    model.fit(iterator, epochs=1, steps_per_epoch=2)
+
+    if context.executing_eagerly():
+      # Test with eager execution
+      model.compile(RMSPropOptimizer(learning_rate=0.001),
+                    loss='sparse_categorical_crossentropy',
+                    run_eagerly=True)
+      model.fit(x_train, y_train, batch_size=5, epochs=1)
+
+      # Test with eager execution and iterator
+      model.fit(iterator, epochs=1, steps_per_epoch=2)
+
+  def test_losses_in_defun(self):
+    with context.eager_mode():
+      layer = keras.layers.Dense(1, kernel_regularizer='l1')
+      layer(array_ops.ones([1, 10]))
+
+      @function.defun
+      def get_losses():
+        return layer.losses
+
+      self.assertAllEqual(
+          self.evaluate(layer.losses), self.evaluate(get_losses()))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_logging(self):
+    mock_stdout = io.BytesIO() if six.PY2 else io.StringIO()
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, activation='relu'))
+    model.add(keras.layers.Dense(1, activation='sigmoid'))
+    model.compile(
+        RMSPropOptimizer(learning_rate=0.001), loss='binary_crossentropy')
+    with test.mock.patch.object(sys, 'stdout', mock_stdout):
+      model.fit(
+          np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'), epochs=10)
+    self.assertTrue('Epoch 5/10' in mock_stdout.getvalue())
+
+
+class TestExceptionsAndWarnings(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_invalid_loss(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(optimizer, loss='categorical_crossentropy')
+    np.random.seed(1337)
+    (x_train, y_train), (_, _) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+
+    with self.assertRaises(ValueError):
+      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+
+    if not context.executing_eagerly():
+      # TODO(psv): Investigate these use cases in eager mode.
+      with self.assertRaises(ValueError):
+        model.fit(x_train, y_train)
+
+      with self.assertRaises(ValueError):
+        model.compile(optimizer, loss=None)
+
   @tf_test_util.run_in_graph_and_eager_modes
   def test_compile_warning_for_loss_missing_output(self):
     with self.cached_session():
@@ -607,25 +728,6 @@ class LossWeightingTest(test.TestCase):
         x_test[test_ids, :], y_test[test_ids, :], verbose=0)
     self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_sequential_model_fails_with_dict_inputs(self):
-    num_classes = 5
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=10, num_classes=num_classes)
-    model.compile(
-        RMSPropOptimizer(learning_rate=0.001),
-        metrics=['acc'],
-        weighted_metrics=['mae'],
-        loss='categorical_crossentropy')
-
-    x = {'dense_input': np.random.random((10, 1))}
-    y = np.random.randint(num_classes, size=(10, 1))
-
-    with self.assertRaisesRegexp(
-        ValueError, 'Passing a dictionary input to a Sequential Model which '
-        'doesnt have FeatureLayer as the first layer is an error'):
-      model.fit(x, y, batch_size=5, epochs=1)
-
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sample_weights(self):
     num_classes = 5
@@ -1014,22 +1116,6 @@ class LossMaskingTest(test.TestCase):
               keras.backend.variable(weights), keras.backend.variable(mask)))
 
 
-class LearningPhaseTest(test.TestCase):
-
-  def test_empty_model_no_learning_phase(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      self.assertFalse(model.uses_learning_phase)
-
-  def test_dropout_has_learning_phase(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_dim=3))
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(2))
-      self.assertTrue(model.uses_learning_phase)
-
-
 class TestDynamicTrainability(test.TestCase):
 
   def test_trainable_warning(self):
@@ -1173,40 +1259,6 @@ class TestDynamicTrainability(test.TestCase):
       self.assertListEqual(outer_model.trainable_weights, [])
 
 
-class TestTrainingUtils(test.TestCase):
-
-  def test_check_array_lengths(self):
-    keras.engine.training_utils.check_array_lengths(None, None, None)
-    a_np = np.random.random((4, 3, 3))
-    keras.engine.training_utils.check_array_lengths(a_np, a_np, a_np)
-    keras.engine.training_utils.check_array_lengths(
-        [a_np, a_np], [a_np, a_np], [a_np, a_np])
-    keras.engine.training_utils.check_array_lengths([None], [None], [None])
-
-    b_np = np.random.random((3, 4))
-    with self.assertRaises(ValueError):
-      keras.engine.training_utils.check_array_lengths([a_np], [b_np], None)
-
-  def test_slice_arrays(self):
-    input_a = np.random.random((10, 3))
-    slice_arrays(input_a, 0)
-    slice_arrays(None)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = [None, [1, 1], None, [1, 1]]
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = [None]
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = None
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-
-
 class TestTrainingWithDataTensors(test.TestCase):
 
   def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
@@ -1778,264 +1830,6 @@ class TestTrainingWithDataTensors(test.TestCase):
                            [output_a_np, output_b_np])
 
 
-class TestTrainingWithDatasetIterators(test.TestCase):
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_iterators_single_io(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(iterator, steps=2, verbose=1)
-    model.predict(iterator, steps=2)
-    model.train_on_batch(iterator)
-    model.test_on_batch(iterator)
-    model.predict_on_batch(iterator)
-
-    # Test with validation data
-    model.fit(iterator,
-              epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=iterator, validation_steps=2)
-    # Test with validation split
-    with self.assertRaisesRegexp(
-        ValueError, '`validation_split` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(iterator,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_split=0.5, validation_steps=2)
-
-    # Test with sample weight.
-    sample_weight = np.random.random((10,))
-    with self.assertRaisesRegexp(
-        ValueError, '`sample_weight` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(
-          iterator,
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0,
-          sample_weight=sample_weight)
-
-    # Test invalid usage
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should not specify a target'):
-      model.fit(iterator, iterator,
-                epochs=1, steps_per_epoch=2, verbose=0)
-
-    with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
-      model.fit(iterator, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.evaluate(iterator, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.predict(iterator, verbose=0)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    # Finalize graph to make sure we are not appending another iterator
-    # get_next op in the graph.
-    ops.get_default_graph().finalize()
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_iterators_running_out_of_data(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(2)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
-      self.assertRegexpMatches(
-          str(mock_log.call_args),
-          'dataset iterator ran out of data')
-
-
-class TestTrainingWithDataset(test.TestCase):
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_calling_model_on_same_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    # Call fit with validation data
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-    # Finalize the graph to make sure new ops aren't added when calling on the
-    # same dataset
-    ops.get_default_graph().finalize()
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
-    model.train_on_batch(dataset)
-    model.predict_on_batch(dataset)
-
-    # Test with validation data
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-
-    # Test with validation split
-    with self.assertRaisesRegexp(
-        ValueError, '`validation_split` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(dataset,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_split=0.5, validation_steps=2)
-
-    # Test with sample weight.
-    sample_weight = np.random.random((10,))
-    with self.assertRaisesRegexp(
-        ValueError, '`sample_weight` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(
-          dataset,
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0,
-          sample_weight=sample_weight)
-
-    # Test invalid usage
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should not specify a target'):
-      model.fit(dataset, dataset,
-                epochs=1, steps_per_epoch=2, verbose=0)
-
-    with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
-      model.fit(dataset, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.evaluate(dataset, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.predict(dataset, verbose=0)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sample_weights(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    sample_weights = np.ones((10), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
-                                                      sample_weights))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
-    model.train_on_batch(dataset)
-    model.predict_on_batch(dataset)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sparse_labels(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'sparse_categorical_crossentropy'
-    model.compile(optimizer, loss)
-
-    inputs = np.zeros((10, 3))
-    targets = np.random.randint(0, 4, size=10, dtype=np.int32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-
-  def test_dataset_input_shape_validation(self):
-    with self.cached_session():
-      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
-
-      # User forgets to batch the dataset
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-
-      with self.assertRaisesRegexp(
-          ValueError,
-          r'expected (.*?) to have shape \(3,\) but got array with shape \(1,\)'
-      ):
-        model.train_on_batch(dataset)
-
-      # Wrong input shape
-      inputs = np.zeros((10, 5))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   r'expected (.*?) to have shape \(3,\)'):
-        model.train_on_batch(dataset)
-
-
 class TestTrainingWithMetrics(test.TestCase):
   """Training tests related to metrics."""
 
@@ -2059,12 +1853,7 @@ class TestTrainingWithMetrics(test.TestCase):
         'dense_binary_accuracy', 'dropout_mean_squared_error',
         'dropout_binary_accuracy'
     ]
-    reference_stateful_metric_names = [
-        'dense_binary_accuracy', 'dropout_binary_accuracy'
-    ]
     self.assertEqual(reference_metric_names, model.metrics_names)
-    self.assertEqual(reference_stateful_metric_names,
-                     model.stateful_metric_names)
 
     # Verify that model metric names are not altered during training.
     input_a_np = np.random.random((10, 3))
@@ -2077,8 +1866,6 @@ class TestTrainingWithMetrics(test.TestCase):
               epochs=1,
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
-    self.assertEqual(reference_stateful_metric_names,
-                     model.stateful_metric_names)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness(self):
@@ -2106,39 +1893,6 @@ class TestTrainingWithMetrics(test.TestCase):
     self.assertEqual(outs[1], 0.)
     self.assertEqual(outs[2], 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_metrics_correctness_with_iterator(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            8, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='binary_crossentropy',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-    np.random.seed(123)
-    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
-    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
-    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
-
-    y = np.zeros((100, 1), dtype=np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
-
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness_with_weighted_metrics(self):
     np.random.seed(1337)
@@ -2152,8 +1906,7 @@ class TestTrainingWithMetrics(test.TestCase):
         RMSPropOptimizer(learning_rate=0.001),
         loss='mse',
         sample_weight_mode='temporal',
-        weighted_metrics=['accuracy',
-                          metrics_module.BinaryAccuracy()])
+        weighted_metrics=['accuracy', 'mse'])
     y = np.array([[[1.], [1.]], [[1.], [1.]]])
 
     outs = model.evaluate(x, y)
@@ -2165,7 +1918,15 @@ class TestTrainingWithMetrics(test.TestCase):
 
     w = np.array([[3., 4.], [1., 2.]])
     outs = model.evaluate(x, y, sample_weight=w)
-    self.assertArrayNear(outs, [0.3, 0.7, 0.7], .001)
+    self.assertArrayNear(outs, [0.3, 0.7, 0.3], .001)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 2, 1))
+    y = np.random.random((50, 2, 1))
+    w = np.random.random((50, 2))
+    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[2]
+    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
+    self.assertNear(mse1, mse2, err=1e-7)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
@@ -2216,31 +1977,340 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='mse',
-          weighted_metrics=['accuracy',
-                            metrics_module.BinaryAccuracy()])
+          weighted_metrics=['accuracy'])
 
-      # verify that masking is applied for stateless and stateful metrics.
+      # verify that masking is applied.
       x = np.array([[[1], [1]], [[1], [1]], [[0], [0]]])
       y = np.array([[[1], [1]], [[0], [1]], [[1], [1]]])
       scores = model.train_on_batch(x, y)
-      self.assertArrayNear(scores, [0.25, 0.75, 0.75], 0.1)
+      self.assertArrayNear(scores, [0.25, 0.75], 0.1)
 
       # verify that masking is combined with sample weights.
       w = np.array([3, 2, 4])
       scores = model.train_on_batch(x, y, sample_weight=w)
-      self.assertArrayNear(scores, [0.2, 0.8, 0.8], 0.1)
+      self.assertArrayNear(scores, [0.2, 0.8], 0.1)
 
-  def test_losses_in_defun(self):
+  def test_add_metric_with_tensor_on_model_in_graph_mode(self):
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+
+      # test with a metric which does not have the standard signature:
+      # (y_true, y_pred, sample_Weight)
+      model.add_metric(metrics_module.Mean(name='metric_2')(y))
+      model.compile('sgd', loss='mse')
+
+      inputs = np.ones(shape=(10, 1))
+      targets = np.ones(shape=(10, 1))
+      history = model.fit(
+          inputs,
+          targets,
+          epochs=2,
+          batch_size=5,
+          validation_data=(inputs, targets))
+      self.assertEqual(history.history['metric_1'][-1], 5)
+      self.assertEqual(history.history['metric_2'][-1], 1)
+      self.assertEqual(history.history['val_metric_1'][-1], 5)
+      self.assertEqual(history.history['val_metric_2'][-1], 1)
+
+      eval_results = model.evaluate(inputs, targets, batch_size=5)
+      self.assertEqual(eval_results[-1], 1)
+      self.assertEqual(eval_results[-2], 5)
+
+      model.predict(inputs, batch_size=5)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_add_metric_in_model_call(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean = metrics_module.Mean(name='metric_1')
+
+      def call(self, x):
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_2', aggregation='mean')
+        # Provide same name as in the instance created in __init__
+        # for eager mode
+        self.add_metric(self.mean(x), name='metric_1')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['val_metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_2'][-1], 5, 0)
+    self.assertAlmostEqual(history.history['val_metric_2'][-1], 5, 0)
+
+    eval_results = model.evaluate(x, y, batch_size=5)
+    self.assertAlmostEqual(eval_results[1], 1, 0)
+    self.assertAlmostEqual(eval_results[2], 5, 0)
+
+    model.predict(x, batch_size=5)
+    model.train_on_batch(x, y)
+    model.test_on_batch(x, y)
+
+  def test_add_metric_in_model_call_run_eagerly(self):
     with context.eager_mode():
-      layer = keras.layers.Dense(1, kernel_regularizer='l1')
-      layer(array_ops.ones([1, 10]))
 
-      @function.defun
-      def get_losses():
-        return layer.losses
+      class TestModel(keras.Model):
+
+        def __init__(self):
+          super(TestModel, self).__init__(name='test_model')
+          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+          self.mean = metrics_module.Mean(name='metric_1')
+
+        def call(self, x):
+          self.add_metric(
+              math_ops.reduce_sum(x), name='metric_2', aggregation='mean')
+          # Provide same name as in the instance created in __init__
+          # for eager mode
+          self.add_metric(self.mean(x), name='metric_1')
+          return self.dense1(x)
+
+      model = TestModel()
+      model.compile(
+          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+      self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+      self.assertAlmostEqual(history.history['val_metric_1'][-1], 1, 0)
+      self.assertAlmostEqual(history.history['metric_2'][-1], 5, 0)
+      self.assertAlmostEqual(history.history['val_metric_2'][-1], 5, 0)
+
+      eval_results = model.evaluate(x, y, batch_size=5)
+      self.assertAlmostEqual(eval_results[1], 1, 0)
+      self.assertAlmostEqual(eval_results[2], 5, 0)
+
+      model.predict(x, batch_size=5)
+      model.train_on_batch(x, y)
+      model.test_on_batch(x, y)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_add_metric_in_layer_call(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable(
+            'a', (1, 1), initializer='ones', trainable=False)
+        self.built = True
+
+      def call(self, inputs):
+        self.add_metric(
+            math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
+        return inputs + 1
+
+    model = keras.Sequential()
+    model.add(TestLayer(input_shape=(1,)))
+    model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertEqual(history.history['metric_1'][-1], 5)
+    self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
+
+  def test_add_metric_in_layer_call_run_eagerly(self):
+    with context.eager_mode():
+
+      class TestLayer(keras.layers.Layer):
+
+        def build(self, input_shape):
+          self.a = self.add_variable(
+              'a', (1, 1), initializer='ones', trainable=False)
+          self.built = True
+
+        def call(self, inputs):
+          self.add_metric(
+              math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
+          return inputs + 1
+
+      model = keras.Sequential()
+      model.add(TestLayer(input_shape=(1,)))
+      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+      model.compile(
+          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+      self.assertEqual(history.history['metric_1'][-1], 5)
+      self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
+
+  def test_model_metrics_list(self):
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+      model.add_metric(metrics_module.Mean(name='metric_2')(y))
+      model.compile('sgd', loss='mse', metrics=['acc'])
+
+      # Verify that the metrics added using `compile` and `add_metric` API are
+      # included
+      self.assertEqual(model._compile_metrics, ['acc'])
+      names = []
+      for m in model.metrics:
+        if isinstance(m, metrics_module.Metric):
+          names.append(m.name)
+        else:
+          names.append(m.__name__)
+      self.assertEqual(names, ['binary_accuracy', 'metric_1', 'metric_2'])
+
+  def test_model_eager_metrics_list(self):
+    with context.eager_mode():
+
+      class TestModel(keras.Model):
+
+        def __init__(self):
+          super(TestModel, self).__init__(name='test_model')
+          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+        def call(self, x):
+          self.add_metric(
+              math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
+          return self.dense1(x)
+
+      model = TestModel()
+      model.compile(
+          loss='mse',
+          optimizer=RMSPropOptimizer(0.01),
+          metrics=['acc'],
+          run_eagerly=True)
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+      self.assertEqual(model._compile_metrics, ['acc'])
+      names = []
+      for m in model.metrics:
+        if isinstance(m, metrics_module.Metric):
+          names.append(m.name)
+        else:
+          names.append(m.__name__)
+      self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_multiple_add_metric_calls(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean1 = metrics_module.Mean(name='metric_1')
+        self.mean2 = metrics_module.Mean(name='metric_2')
+
+      def call(self, x):
+        self.add_metric(self.mean2(x), name='metric_2')
+        self.add_metric(self.mean1(x), name='metric_1')
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_3', aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_2'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_3'][-1], 5, 0)
+
+    eval_results = model.evaluate(x, y, batch_size=5)
+    self.assertArrayNear(eval_results[1:4], [1, 1, 5], 0.1)
+
+    model.predict(x, batch_size=5)
+    model.train_on_batch(x, y)
+    model.test_on_batch(x, y)
+
+  def test_invalid_metric_tensor_in_call(self):
+    with context.eager_mode():
+
+      class TestLayer(keras.layers.Layer):
+
+        def call(self, inputs):
+          self.add_metric(metrics_module.Mean(name='metric_1')(inputs))
+          return inputs + 1
+
+      model = keras.Sequential()
+      model.add(TestLayer(input_shape=(1,)))
+      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+      model.compile(
+          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We do not support adding an aggregated metric tensor in `call` in '
+          'eager execution.'):
+        model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_duplicate_metric_name_in_add_metric(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean = metrics_module.Mean(name='metric_1')
+        self.mean2 = metrics_module.Mean(name='metric_1')
+
+      def call(self, x):
+        self.add_metric(self.mean(x), name='metric_1')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Please provide different names for the metrics you have added. '
+        'We found 2 metrics with the name: "metric_1"'):
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_multiple_no_name_input_to_add_metric(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+      def call(self, x):
+        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
+        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
 
-      self.assertAllEqual(self.evaluate(layer.losses),
-                          self.evaluate(get_losses()))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 7034874ee8de746a2cbf37ba284cefcc62e19aed..8669daf99ef89c1a46c6ac7925e00301c1569df5 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
@@ -137,6 +138,9 @@ def convert_to_iterator(x=None,
 
   """
   if isinstance(x, iterator_ops.EagerIterator):
+    if steps_per_epoch is None:
+      raise ValueError('You must specify the number of steps (number of batches'
+                       ' to draw from the iterator).')
     return x, steps_per_epoch
 
   if not _nested_any(sample_weights, lambda x: x is None):
@@ -151,7 +155,7 @@ def convert_to_iterator(x=None,
   data = _convert_lists_to_tuples(data)
   if steps_per_epoch is None and batch_size is not None:
     num_samples = _get_batch_axis_size(data)
-    steps_per_epoch = int(math.ceil(num_samples / batch_size))
+    steps_per_epoch = int(math.ceil(num_samples / int(batch_size)))
 
   if steps_per_epoch is None:
     alternative_arg_name = (
@@ -223,9 +227,9 @@ def standardize_single_array(x):
     return None
   if x.shape is not None and len(x.shape) == 1:
     if tensor_util.is_tensor(x):
-      return array_ops.expand_dims(x, axis=1)
+      x = array_ops.expand_dims(x, axis=1)
     else:
-      return np.expand_dims(x, 1)
+      x = np.expand_dims(x, 1)
   return x
 
 
@@ -510,8 +514,15 @@ def collect_per_output_metric_info(metrics,
       For instance, if the model has 2 outputs, and for the first output
       we want to compute "binary_accuracy" and "binary_crossentropy",
       and just "binary_accuracy" for the second output,
-      the list would look like: `[[('acc', binary_accuracy()),
-      ('ce', binary_crossentropy())], [('acc', binary_accuracy())]]`
+      the list would look like: `[
+        {
+          'acc': (binary_accuracy(), mean_obj_1),
+          'ce': (binary_crossentropy(), mean_obj_2)
+        },
+        {
+          'acc': (binary_accuracy(), mean_obj_3)
+        }
+      ]`
 
   Raises:
       TypeError: if an incorrect type is passed for the `metrics` argument.
@@ -541,7 +552,19 @@ def collect_per_output_metric_info(metrics,
       metric_name = get_metric_name(metric, weighted)
       metric_fn = get_metric_function(
           metric, output_shape=output_shapes[i], loss_fn=loss_fns[i])
-      metrics_dict[metric_name] = metric_fn
+
+      # If the metric function is not stateful, we create a stateful version and
+      # return both the stateless and the stateful version together. For batch
+      # APIs like `train_on_batch` we will use the stateless version and for
+      # other APIs like `fit` we will use the stateful version.
+      is_stateful = isinstance(metric_fn,
+                               base_layer.Layer) and metric_fn.stateful
+      stateful_fn = metric_fn
+      if not is_stateful:
+        stateful_fn = metrics_module.MeanMetricWrapper(
+            metric_fn, name=metric_fn.__name__)
+
+      metrics_dict[metric_name] = (metric_fn, stateful_fn)
     per_output_metrics.append(metrics_dict)
 
   return per_output_metrics
@@ -608,19 +631,10 @@ def weighted_masked_objective(fn):
       if weights is None:
         weights = mask
       else:
-        # Update shape of weights if possible before adding mask.
         # Update dimensions of weights to match with mask if possible.
         mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
             mask, None, weights)
-        try:
-          # Broadcast weights if possible.
-          weights = weights_broadcast_ops.broadcast_weights(weights, mask)
-          weights *= mask
-        except ValueError:
-          score_array *= mask
-          score_array /= K.mean(mask)
-          # TODO(psv): Handle case when mask and weight shapes are not
-          # compatible.
+        weights *= mask
 
     # Apply sample weighting.
     if weights is not None:
@@ -640,7 +654,7 @@ def weighted_masked_objective(fn):
       score_array = math_ops.multiply(score_array, weights)
       score_array = math_ops.reduce_sum(score_array)
       weights = math_ops.reduce_sum(weights)
-      score_array = metrics_module.safe_div(score_array, weights)
+      score_array = math_ops.div_no_nan(score_array, weights)
     return K.mean(score_array)
 
   return weighted
@@ -813,6 +827,23 @@ def get_metric_function(metric, output_shape=None, loss_fn=None):
   return metrics_module.get(metric)
 
 
+def call_metric_function(metric_fn, y_true, y_pred, weights=None, mask=None):
+  """Invokes metric function and returns the metric result tensor."""
+  if mask is None:
+    return metric_fn(y_true, y_pred, sample_weight=weights)
+
+  mask = math_ops.cast(mask, y_pred.dtype)
+  if weights is None:
+    # Use mask as sample weight.
+    return metric_fn(y_true, y_pred, sample_weight=mask)
+
+  # Update dimensions of weights to match with mask.
+  mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
+      mask, None, weights)
+  weights *= mask
+  return metric_fn(y_true, y_pred, sample_weight=weights)
+
+
 def validate_iterator_input(x, y, sample_weight, validation_split=None):
   """Validates user input arguments when a dataset iterator is passed.
 
@@ -1025,9 +1056,11 @@ class ModelInputs(object):
     self._inputs = inputs
     self._is_dict = isinstance(self._inputs, dict)
     self._is_single_input = not isinstance(self._inputs, (list, tuple, dict))
+
     self._flattened_inputs = []
     self._input_names = []
-    if isinstance(self._inputs, dict):
+
+    if self._is_dict:
       for k in sorted(self._inputs.keys()):
         self._flattened_inputs.append(self._inputs[k])
         self._input_names.append(k)
@@ -1036,7 +1069,6 @@ class ModelInputs(object):
       self._input_names = [
           'input_%d' % (i + 1) for i in range(len(self._flattened_inputs))
       ]
-    assert len(self._input_names) == len(self._flattened_inputs)
 
   def get_input_names(self):
     """Returns keys to name inputs by.
@@ -1046,56 +1078,29 @@ class ModelInputs(object):
     """
     return self._input_names
 
-  def _get(self, return_single_as_list=False):
-    """Returns provided inputs, potentially transformed.
-
-    Inputs are returned in the same format they were provided i.e. lists
-    are returned as lists, single entries as single entries (unless
-    `return_single_as_list` is true), dictionaries as dictionaries.
-
-    Args:
-      return_single_as_list: Returns a list of size 1 for single entry case.
-    """
-    if self._is_dict:
-      return dict(zip(self._input_names, self._flattened_inputs))
-    if self._is_single_input and not return_single_as_list:
-      return self._flattened_inputs[0]
-    return self._flattened_inputs
-
-  def get_input_values(self):
-    """Returns input values passed in."""
-    if context.executing_eagerly():
-      for i in range(len(self._flattened_inputs)):
-        v = self._flattened_inputs[i]
-        if tensor_util.is_tensor(v):
-          v = cast_single_tensor(v)
-        else:
-          v = ops.convert_to_tensor(v, dtype=K.floatx())
-        self._flattened_inputs[i] = v
-    return self._get(return_single_as_list=False)
-
   def get_symbolic_inputs(self, return_single_as_list=False):
     """Returns inputs to be set as self.inputs for a model."""
     for i in range(len(self._flattened_inputs)):
       k = self._input_names[i]
       v = self._flattened_inputs[i]
-      if context.executing_eagerly():
-        v = K.placeholder((None,) + tuple(v.shape[1:]), name=k)
-      else:
-        if isinstance(v, list):
-          v = np.asarray(v)
-          if v.ndim == 1:
-            v = np.expand_dims(v, 1)
-        if isinstance(v, (np.ndarray)):
-          # We fix the placeholder shape except the batch size.
-          # This is suboptimal, but it is the best we can do with the info
-          # we have. The user should call `model._set_inputs(placeholders)`
-          # to specify custom placeholders if the need arises.
-          shape = (None,) + v.shape[1:]
-          v = K.placeholder(shape=shape, name=k)
+      if isinstance(v, (list, float, int)):
+        v = np.asarray(v)
+        if v.ndim == 1:
+          v = np.expand_dims(v, 1)
+      if isinstance(v, (np.ndarray, ops.EagerTensor)):
+        # We fix the placeholder shape except the batch size.
+        # This is suboptimal, but it is the best we can do with the info
+        # we have. The user should call `model._set_inputs(placeholders)`
+        # to specify custom placeholders if the need arises.
+        shape = (None,) + tuple(v.shape[1:])
+        v = K.placeholder(shape=shape, name=k)
       self._flattened_inputs[i] = v
 
-    return self._get(return_single_as_list)
+    if self._is_dict:
+      return dict(zip(self._input_names, self._flattened_inputs))
+    if self._is_single_input and not return_single_as_list:
+      return self._flattened_inputs[0]
+    return self._flattened_inputs
 
   def as_dict(self):
     """An iterable over a dictionary version of inputs."""
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index ff15c49a926324267ca330c2022f48da55b38722..d6a92dec7c445c49e07426d2d022055272ebeeb3 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -148,6 +148,18 @@ class TrainingUtilTest(test.TestCase):
     any_true = training_utils._nested_any(nested_data, lambda x: x)
     self.assertEquals(any_true, False)
 
+  def test_check_array_lengths(self):
+    training_utils.check_array_lengths(None, None, None)
+    a_np = np.random.random((4, 3, 3))
+    training_utils.check_array_lengths(a_np, a_np, a_np)
+    training_utils.check_array_lengths(
+        [a_np, a_np], [a_np, a_np], [a_np, a_np])
+    training_utils.check_array_lengths([None], [None], [None])
+
+    b_np = np.random.random((3, 4))
+    with self.assertRaises(ValueError):
+      training_utils.check_array_lengths([a_np], [b_np], None)
+
 
 class ModelInputsTest(test.TestCase):
 
@@ -155,9 +167,6 @@ class ModelInputsTest(test.TestCase):
     a = np.ones(10)
     model_inputs = training_utils.ModelInputs(a)
     self.assertEquals(['input_1'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertAllEqual(np.ones(10), vals)
-    self.assertFalse(tensor_util.is_tensor(vals))
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals))
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
@@ -169,9 +178,6 @@ class ModelInputsTest(test.TestCase):
       a = np.ones(10)
       model_inputs = training_utils.ModelInputs(a)
       self.assertEquals(['input_1'], model_inputs.get_input_names())
-      val = model_inputs.get_input_values()
-      self.assertAllEqual(np.ones(10), val)
-      self.assertTrue(tensor_util.is_tensor(val))
       val = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(val))
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
@@ -182,12 +188,6 @@ class ModelInputsTest(test.TestCase):
     a = [np.ones(10), np.ones(20)]
     model_inputs = training_utils.ModelInputs(a)
     self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertEqual(2, len(vals))
-    self.assertAllEqual(np.ones(10), vals[0])
-    self.assertAllEqual(np.ones(20), vals[1])
-    self.assertFalse(tensor_util.is_tensor(vals[0]))
-    self.assertFalse(tensor_util.is_tensor(vals[1]))
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals[0]))
     self.assertTrue(tensor_util.is_tensor(vals[1]))
@@ -197,12 +197,6 @@ class ModelInputsTest(test.TestCase):
       a = [np.ones(10), np.ones(20)]
       model_inputs = training_utils.ModelInputs(a)
       self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertEqual(2, len(vals))
-      self.assertAllEqual(np.ones(10), vals[0])
-      self.assertAllEqual(np.ones(20), vals[1])
-      self.assertTrue(tensor_util.is_tensor(vals[0]))
-      self.assertTrue(tensor_util.is_tensor(vals[1]))
       vals = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[1]))
@@ -211,11 +205,6 @@ class ModelInputsTest(test.TestCase):
     a = {'b': np.ones(10), 'a': np.ones(20)}
     model_inputs = training_utils.ModelInputs(a)
     self.assertEquals(['a', 'b'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertAllEqual(np.ones(20), vals['a'])
-    self.assertAllEqual(np.ones(10), vals['b'])
-    self.assertFalse(tensor_util.is_tensor(vals['a']))
-    self.assertFalse(tensor_util.is_tensor(vals['b']))
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals['a']))
     self.assertTrue(tensor_util.is_tensor(vals['b']))
@@ -225,11 +214,6 @@ class ModelInputsTest(test.TestCase):
       a = {'b': np.ones(10), 'a': np.ones(20)}
       model_inputs = training_utils.ModelInputs(a)
       self.assertEquals(['a', 'b'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertAllEqual(np.ones(20), vals['a'])
-      self.assertAllEqual(np.ones(10), vals['b'])
-      self.assertTrue(tensor_util.is_tensor(vals['a']))
-      self.assertTrue(tensor_util.is_tensor(vals['b']))
       vals = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['a']))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['b']))
diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py
index b244beb5b58cf339a4687216b87418c88b953c17..3c1a63d6dfdb3b4324e7f29b77d1bceb8d2bf9d1 100644
--- a/tensorflow/python/keras/estimator/__init__.py
+++ b/tensorflow/python/keras/estimator/__init__.py
@@ -24,23 +24,54 @@ from tensorflow.python.util.tf_export import tf_export
 # As long as you depend //third_party/py/tensorflow:tensorflow target
 # everything will work as normal.
 
-try:
-  from tensorflow.python.estimator import keras as keras_lib  # pylint: disable=g-import-not-at-top
-  model_to_estimator = tf_export('keras.estimator.model_to_estimator')(
-      keras_lib.model_to_estimator)
-except Exception:  # pylint: disable=broad-except
-
-  # pylint: disable=unused-argument
-  def stub_model_to_estimator(keras_model=None,
-                              keras_model_path=None,
-                              custom_objects=None,
-                              model_dir=None,
-                              config=None):
+
+# LINT.IfChange
+@tf_export('keras.estimator.model_to_estimator')
+def model_to_estimator(
+    keras_model=None,
+    keras_model_path=None,
+    custom_objects=None,
+    model_dir=None,
+    config=None):
+  """Constructs an `Estimator` instance from given keras model.
+
+  For usage example, please see:
+  [Creating estimators from Keras
+  Models](https://tensorflow.org/guide/estimators#model_to_estimator).
+
+  Args:
+    keras_model: A compiled Keras model object. This argument is mutually
+      exclusive with `keras_model_path`.
+    keras_model_path: Path to a compiled Keras model saved on disk, in HDF5
+      format, which can be generated with the `save()` method of a Keras model.
+      This argument is mutually exclusive with `keras_model`.
+    custom_objects: Dictionary for custom objects.
+    model_dir: Directory to save `Estimator` model parameters, graph, summary
+      files for TensorBoard, etc.
+    config: `RunConfig` to config `Estimator`.
+
+  Returns:
+    An Estimator from given keras model.
+
+  Raises:
+    ValueError: if neither keras_model nor keras_model_path was given.
+    ValueError: if both keras_model and keras_model_path was given.
+    ValueError: if the keras_model_path is a GCS URI.
+    ValueError: if keras_model has not been compiled.
+  """
+  try:
+    from tensorflow_estimator.python.estimator import keras as keras_lib  # pylint: disable=g-import-not-at-top
+  except ImportError:
     raise NotImplementedError(
         'tf.keras.estimator.model_to_estimator function not available in your '
         'installation.')
-  # pylint: enable=unused-argument
+  keras_lib.model_to_estimator(
+      keras_model=keras_model,
+      keras_model_path=keras_model_path,
+      custom_objects=custom_objects,
+      model_dir=model_dir,
+      config=config)
+
+# LINT.ThenChange(//third_party/tensorflow_estimator/python/estimator/keras.py)
 
-  model_to_estimator = tf_export('keras.estimator.model_to_estimator')(
-      stub_model_to_estimator)
 
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 3c0f73b1c3aab037164f612e0e9b3a2fc7b32385..25ca9e69e2e7ad663ce87ccb1f3dc342f2474b9b 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
@@ -312,6 +313,15 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=0)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  def test_regularizers_with_get_variable(self):
+    # Test case for GitHub issue 22470.
+    with self.cached_session():
+      v = variable_scope.get_variable(
+          'v',
+          shape=[4, 4],
+          initializer=keras.initializers.glorot_uniform(),
+          regularizer=keras.regularizers.l2(0.))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 7268040b0287fcfd7c0bd291b0ff7a75e154534e..49990b6bf4f617dff1f6dc827ba03aa66f41f568 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 # pylint: disable=g-bad-import-order
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 
 # Advanced activations.
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index b0dffced3eada824fc3c5656363c94deb00eaa96..35ac7830b2e2f37ffc270227d44450d730a9149c 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -22,8 +22,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -54,7 +54,6 @@ class LeakyReLU(Layer):
     super(LeakyReLU, self).__init__(**kwargs)
     self.supports_masking = True
     self.alpha = K.cast_to_floatx(alpha)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return K.relu(inputs, alpha=self.alpha)
@@ -118,7 +117,6 @@ class PReLU(Layer):
       self.shared_axes = [shared_axes]
     else:
       self.shared_axes = list(shared_axes)
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -193,7 +191,6 @@ class ELU(Layer):
     super(ELU, self).__init__(**kwargs)
     self.supports_masking = True
     self.alpha = K.cast_to_floatx(alpha)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return K.elu(inputs, self.alpha)
@@ -233,7 +230,6 @@ class ThresholdedReLU(Layer):
     super(ThresholdedReLU, self).__init__(**kwargs)
     self.supports_masking = True
     self.theta = K.cast_to_floatx(theta)
-    self._can_use_graph_functions = True
 
   def call(self, inputs, mask=None):
     return inputs * math_ops.cast(
@@ -269,7 +265,6 @@ class Softmax(Layer):
     super(Softmax, self).__init__(**kwargs)
     self.supports_masking = True
     self.axis = axis
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return K.softmax(inputs, axis=self.axis)
@@ -324,7 +319,6 @@ class ReLU(Layer):
     self.max_value = max_value
     self.negative_slope = K.cast_to_floatx(negative_slope)
     self.threshold = K.cast_to_floatx(threshold)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     # alpha is used for leaky relu slope in activations instead of
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 0671a5a36d6fc2a4c5a763505548c54eb7568039..6564d6e8fdba6d6f8b384b06125032d16f34e28a 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras.layers.pooling import AveragePooling1D
@@ -120,7 +120,6 @@ class Conv(Layer):
         name=name,
         activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self._can_use_graph_functions = True
     self.rank = rank
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(
@@ -1916,7 +1915,6 @@ class UpSampling1D(Layer):
     super(UpSampling1D, self).__init__(**kwargs)
     self.size = int(size)
     self.input_spec = InputSpec(ndim=3)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -1983,7 +1981,6 @@ class UpSampling2D(Layer):
                        'or `"bilinear"`.')
     self.interpolation = interpolation
     self.input_spec = InputSpec(ndim=4)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2054,7 +2051,6 @@ class UpSampling3D(Layer):
     self.size = conv_utils.normalize_tuple(size, 3, 'size')
     self.input_spec = InputSpec(ndim=5)
     super(UpSampling3D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2109,7 +2105,6 @@ class ZeroPadding1D(Layer):
 
   def __init__(self, padding=1, **kwargs):
     super(ZeroPadding1D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.padding = conv_utils.normalize_tuple(padding, 2, 'padding')
     self.input_spec = InputSpec(ndim=3)
 
@@ -2175,7 +2170,6 @@ class ZeroPadding2D(Layer):
 
   def __init__(self, padding=(1, 1), data_format=None, **kwargs):
     super(ZeroPadding2D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     if isinstance(padding, int):
       self.padding = ((padding, padding), (padding, padding))
@@ -2280,7 +2274,6 @@ class ZeroPadding3D(Layer):
 
   def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs):
     super(ZeroPadding3D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     if isinstance(padding, int):
       self.padding = ((padding, padding), (padding, padding), (padding,
@@ -2375,7 +2368,6 @@ class Cropping1D(Layer):
     super(Cropping1D, self).__init__(**kwargs)
     self.cropping = conv_utils.normalize_tuple(cropping, 2, 'cropping')
     self.input_spec = InputSpec(ndim=3)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2475,7 +2467,6 @@ class Cropping2D(Layer):
                        '((top_crop, bottom_crop), (left_crop, right_crop)). '
                        'Found: ' + str(cropping))
     self.input_spec = InputSpec(ndim=4)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2609,7 +2600,6 @@ class Cropping3D(Layer):
           ' (left_dim3_crop, right_dim2_crop)). '
           'Found: ' + str(cropping))
     self.input_spec = InputSpec(ndim=5)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
index e61dd3043d96e69f76cb5bb041de304f5c1c2642..cf3861da21858d0ef0ab4e7567795edbf41635b8 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _generate_dropout_mask
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.layers.recurrent import RNN
@@ -391,10 +391,6 @@ class ConvRNN2D(RNN):
     else:
       output = last_output
 
-    # Properly set learning phase
-    if getattr(last_output, '_uses_learning_phase', False):
-      output._uses_learning_phase = True
-
     if self.return_state:
       if not isinstance(states, (list, tuple)):
         states = [states]
@@ -723,11 +719,6 @@ class ConvLSTM2DCell(Layer):
     c = f * c_tm1 + i * self.activation(x_c + h_c)
     o = self.recurrent_activation(x_o + h_o)
     h = o * self.activation(c)
-
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None:
-        h._uses_learning_phase = True
-
     return h, [h, c]
 
   def input_conv(self, x, w, b=None, padding='valid'):
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index c257b25b3ac19d655834756f20a2a07f25324bf0..56dd70558cc6c1bf41211924ad5f8f9750ce8993 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -34,8 +34,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -81,7 +81,6 @@ class Masking(Layer):
     super(Masking, self).__init__(**kwargs)
     self.supports_masking = True
     self.mask_value = mask_value
-    self._can_use_graph_functions = True
 
   def compute_mask(self, inputs, mask=None):
     return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
@@ -125,7 +124,6 @@ class Dropout(Layer):
     self.noise_shape = noise_shape
     self.seed = seed
     self.supports_masking = True
-    self._can_use_graph_functions = True
 
   def _get_noise_shape(self, inputs):
     # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
@@ -136,7 +134,6 @@ class Dropout(Layer):
     return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
 
   def call(self, inputs, training=None):
-    original_training_value = training
     if training is None:
       training = K.learning_phase()
 
@@ -147,9 +144,6 @@ class Dropout(Layer):
     output = tf_utils.smart_cond(training,
                                  dropped_inputs,
                                  lambda: array_ops.identity(inputs))
-    # EagerTensor object has no attribute _uses_learning_phase
-    if not context.executing_eagerly() and original_training_value is None:
-      output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
   def compute_output_shape(self, input_shape):
@@ -330,7 +324,6 @@ class Activation(Layer):
     super(Activation, self).__init__(**kwargs)
     self.supports_masking = True
     self.activation = activations.get(activation)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return self.activation(inputs)
@@ -383,7 +376,6 @@ class Reshape(Layer):
   def __init__(self, target_shape, **kwargs):
     super(Reshape, self).__init__(**kwargs)
     self.target_shape = tuple(target_shape)
-    self._can_use_graph_functions = True
 
   def _fix_unknown_dimension(self, input_shape, output_shape):
     """Find and replace a missing dimension in an output shape.
@@ -492,7 +484,6 @@ class Permute(Layer):
           'The set of indices in `dims` must be consecutive and start from 1.' %
           (dims,))
     self.input_spec = InputSpec(ndim=len(self.dims) + 1)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -544,7 +535,6 @@ class Flatten(Layer):
     super(Flatten, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(min_ndim=2)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     if self.data_format == 'channels_first':
@@ -604,7 +594,6 @@ class RepeatVector(Layer):
     super(RepeatVector, self).__init__(**kwargs)
     self.n = n
     self.input_spec = InputSpec(ndim=2)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -933,7 +922,6 @@ class Dense(Layer):
 
     self.supports_masking = True
     self.input_spec = InputSpec(min_ndim=2)
-    self._can_use_graph_functions = True
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -1033,7 +1021,6 @@ class ActivityRegularization(Layer):
     self.supports_masking = True
     self.l1 = l1
     self.l2 = l2
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     return input_shape
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index beacdf2515633cae2cb16b49fbf8b66b11522e73..81f292817fd989ee0aa256ada64e09b32a79ac2b 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -25,7 +25,7 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index 682b5614394e4288010dad875f0f10f01dde2a4c..28d8ef252aa8968e1b5fbbe89a398373cf736afc 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -82,10 +82,10 @@ class Embedding(Layer):
         (without it, the shape of the dense outputs cannot be computed).
 
   Input shape:
-      2D tensor with shape: `(batch_size, sequence_length)`.
+      2D tensor with shape: `(batch_size, input_length)`.
 
   Output shape:
-      3D tensor with shape: `(batch_size, sequence_length, output_dim)`.
+      3D tensor with shape: `(batch_size, input_length, output_dim)`.
 
   """
 
@@ -116,7 +116,6 @@ class Embedding(Layer):
     self.mask_zero = mask_zero
     self.supports_masking = mask_zero
     self.input_length = input_length
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index 30b83eaf50c2d239503856298dd9a02ae1f1733c..d2c4aaa125e7f1415c4e33224056c18418670769 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -23,8 +23,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
@@ -154,7 +154,6 @@ class LocallyConnected1D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.implementation = implementation
     self.input_spec = InputSpec(ndim=3)
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -430,7 +429,6 @@ class LocallyConnected2D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.implementation = implementation
     self.input_spec = InputSpec(ndim=4)
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 8589b32b3c5bd942f0a78978e0ce3173c85950ac..2397a607da96b89676328c1a392b4f82705e99af 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -27,40 +27,43 @@ from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class LocallyConnectedLayersTest(test.TestCase):
+class LocallyConnected1DLayersTest(test.TestCase):
+  # TODO(fchollet): investigate why LocallyConnected1D
+  # fails inside a graph function in an eager context (fails with error
+  # "Incompatible shapes between op input and calculated input gradient").
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_locallyconnected_1d(self):
-    num_samples = 2
-    num_steps = 8
-    input_dim = 5
-    filter_length = 3
-    filters = 4
-
-    for padding in ['valid', 'same']:
-      for strides in [1]:
-        if padding == 'same' and strides != 1:
-          continue
-        for data_format in ['channels_first', 'channels_last']:
-          for implementation in [1, 2]:
-            kwargs = {
-                'filters': filters,
-                'kernel_size': filter_length,
-                'padding': padding,
-                'strides': strides,
-                'data_format': data_format,
-                'implementation': implementation
-            }
+    with self.cached_session():
+      num_samples = 2
+      num_steps = 8
+      input_dim = 5
+      filter_length = 3
+      filters = 4
 
-            if padding == 'same' and implementation == 1:
-              self.assertRaises(ValueError,
-                                keras.layers.LocallyConnected1D,
-                                **kwargs)
-            else:
-              testing_utils.layer_test(
-                  keras.layers.LocallyConnected1D,
-                  kwargs=kwargs,
-                  input_shape=(num_samples, num_steps, input_dim))
+      for padding in ['valid', 'same']:
+        for strides in [1]:
+          if padding == 'same' and strides != 1:
+            continue
+          for data_format in ['channels_first', 'channels_last']:
+            for implementation in [1, 2]:
+              kwargs = {
+                  'filters': filters,
+                  'kernel_size': filter_length,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': data_format,
+                  'implementation': implementation
+              }
+
+              if padding == 'same' and implementation == 1:
+                self.assertRaises(ValueError,
+                                  keras.layers.LocallyConnected1D,
+                                  **kwargs)
+              else:
+                testing_utils.layer_test(
+                    keras.layers.LocallyConnected1D,
+                    kwargs=kwargs,
+                    input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -111,29 +114,63 @@ class LocallyConnectedLayersTest(test.TestCase):
               self.assertEqual(layer.kernel.constraint, k_constraint)
               self.assertEqual(layer.bias.constraint, b_constraint)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+
+class LocallyConnected2DLayersTest(test.TestCase):
+  # TODO(fchollet): investigate why LocallyConnected2D
+  # fails inside a graph function in an eager context (fails with error
+  # "Incompatible shapes between op input and calculated input gradient").
+
   def test_locallyconnected_2d(self):
-    num_samples = 8
-    filters = 3
-    stack_size = 4
-    num_row = 6
-    num_col = 10
+    with self.cached_session():
+      num_samples = 8
+      filters = 3
+      stack_size = 4
+      num_row = 6
+      num_col = 10
 
-    for padding in ['valid', 'same']:
-      for strides in [(1, 1), (2, 2)]:
-        for implementation in [1, 2]:
-          if padding == 'same' and strides != (1, 1):
-            continue
+      for padding in ['valid', 'same']:
+        for strides in [(1, 1), (2, 2)]:
+          for implementation in [1, 2]:
+            if padding == 'same' and strides != (1, 1):
+              continue
+
+            kwargs = {
+                'filters': filters,
+                'kernel_size': 3,
+                'padding': padding,
+                'kernel_regularizer': 'l2',
+                'bias_regularizer': 'l2',
+                'strides': strides,
+                'data_format': 'channels_last',
+                'implementation': implementation
+            }
+
+            if padding == 'same' and implementation == 1:
+              self.assertRaises(ValueError,
+                                keras.layers.LocallyConnected2D,
+                                **kwargs)
+            else:
+              testing_utils.layer_test(
+                  keras.layers.LocallyConnected2D,
+                  kwargs=kwargs,
+                  input_shape=(num_samples, num_row, num_col, stack_size))
 
+  def test_locallyconnected_2d_channels_first(self):
+    with self.cached_session():
+      num_samples = 8
+      filters = 3
+      stack_size = 4
+      num_row = 6
+      num_col = 10
+
+      for implementation in [1, 2]:
+        for padding in ['valid', 'same']:
           kwargs = {
               'filters': filters,
               'kernel_size': 3,
-              'padding': padding,
-              'kernel_regularizer': 'l2',
-              'bias_regularizer': 'l2',
-              'strides': strides,
-              'data_format': 'channels_last',
-              'implementation': implementation
+              'data_format': 'channels_first',
+              'implementation': implementation,
+              'padding': padding
           }
 
           if padding == 'same' and implementation == 1:
@@ -146,40 +183,12 @@ class LocallyConnectedLayersTest(test.TestCase):
                 kwargs=kwargs,
                 input_shape=(num_samples, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_locallyconnected_2d_channels_first(self):
-    num_samples = 8
-    filters = 3
-    stack_size = 4
-    num_row = 6
-    num_col = 10
-
-    for implementation in [1, 2]:
-      for padding in ['valid', 'same']:
-        kwargs = {
-            'filters': filters,
-            'kernel_size': 3,
-            'data_format': 'channels_first',
-            'implementation': implementation,
-            'padding': padding
-        }
-
-        if padding == 'same' and implementation == 1:
-          self.assertRaises(ValueError,
-                            keras.layers.LocallyConnected2D,
-                            **kwargs)
-        else:
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected2D,
-              kwargs=kwargs,
-              input_shape=(num_samples, num_row, num_col, stack_size))
-
   def test_locallyconnected_2d_regularization(self):
-    num_samples = 8
+    num_samples = 2
     filters = 3
     stack_size = 4
     num_row = 6
-    num_col = 10
+    num_col = 7
     for implementation in [1, 2]:
       for padding in ['valid', 'same']:
         kwargs = {
@@ -220,63 +229,67 @@ class LocallyConnectedLayersTest(test.TestCase):
             self.assertEqual(layer.kernel.constraint, k_constraint)
             self.assertEqual(layer.bias.constraint, b_constraint)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_locallyconnected_implementation(self):
-    n_train = 4
-    n_classes = 3
-    n_epochs = 2
 
-    np.random.seed(1)
-    targets = np.random.randint(0, n_classes, (n_train,))
+class LocallyConnectedImplementationModeTest(test.TestCase):
 
-    for width in [1, 17]:
-      for height in [16]:
-        for filters in [2]:
-          for data_format in ['channels_first', 'channels_last']:
-            inputs = get_inputs(data_format, filters, height, n_train, width)
-
-            for kernel_x in [(3,)]:
-              for kernel_y in [()] if width == 1 else [(2,)]:
-                for stride_x in [(1,)]:
-                  for stride_y in [()] if width == 1 else [(3,)]:
-                    for layers in [2]:
-                      kwargs = {
-                          'layers': layers,
-                          'filters': filters,
-                          'kernel_size': kernel_x + kernel_y,
-                          'strides': stride_x + stride_y,
-                          'data_format': data_format,
-                          'n_classes': n_classes,
-                          'input_shape': inputs.shape
-                      }
-
-                      model_1 = get_model(implementation=1, **kwargs)
-                      model_2 = get_model(implementation=2, **kwargs)
-
-                      copy_model_weights(model_2, model_1)
-
-                      # Compare outputs at initialization.
-                      out_1 = model_1.call(inputs)
-                      out_2 = model_2.call(inputs)
-                      self.assertAllCloseAccordingToType(out_1, out_2,
-                                                         rtol=1e-5, atol=1e-5)
-
-                      # Train.
-                      model_1.fit(x=inputs,
-                                  y=targets,
-                                  epochs=n_epochs,
-                                  batch_size=n_train)
-
-                      model_2.fit(x=inputs,
-                                  y=targets,
-                                  epochs=n_epochs,
-                                  batch_size=n_train)
-
-                      # Compare outputs after a few training steps.
-                      out_1 = model_1.call(inputs)
-                      out_2 = model_2.call(inputs)
-                      self.assertAllCloseAccordingToType(out_1, out_2,
-                                                         rtol=1e-5, atol=1e-5)
+  def test_locallyconnected_implementation(self):
+    with self.cached_session():
+      num_samples = 4
+      num_classes = 3
+      num_epochs = 2
+
+      np.random.seed(1)
+      targets = np.random.randint(0, num_classes, (num_samples,))
+
+      for width in [1, 6]:
+        for height in [7]:
+          for filters in [2]:
+            for data_format in ['channels_first', 'channels_last']:
+              inputs = get_inputs(
+                  data_format, filters, height, num_samples, width)
+
+              for kernel_x in [(3,)]:
+                for kernel_y in [()] if width == 1 else [(2,)]:
+                  for stride_x in [(1,)]:
+                    for stride_y in [()] if width == 1 else [(3,)]:
+                      for layers in [2]:
+                        kwargs = {
+                            'layers': layers,
+                            'filters': filters,
+                            'kernel_size': kernel_x + kernel_y,
+                            'strides': stride_x + stride_y,
+                            'data_format': data_format,
+                            'num_classes': num_classes,
+                            'input_shape': inputs.shape
+                        }
+
+                        model_1 = get_model(implementation=1, **kwargs)
+                        model_2 = get_model(implementation=2, **kwargs)
+
+                        copy_model_weights(model_2, model_1)
+
+                        # Compare outputs at initialization.
+                        out_1 = model_1.call(inputs)
+                        out_2 = model_2.call(inputs)
+                        self.assertAllCloseAccordingToType(out_1, out_2,
+                                                           rtol=1e-5, atol=1e-5)
+
+                        # Train.
+                        model_1.fit(x=inputs,
+                                    y=targets,
+                                    epochs=num_epochs,
+                                    batch_size=num_samples)
+
+                        model_2.fit(x=inputs,
+                                    y=targets,
+                                    epochs=num_epochs,
+                                    batch_size=num_samples)
+
+                        # Compare outputs after a few training steps.
+                        out_1 = model_1.call(inputs)
+                        out_2 = model_2.call(inputs)
+                        self.assertAllCloseAccordingToType(out_1, out_2,
+                                                           rtol=1e-5, atol=1e-5)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_make_2d(self):
@@ -316,7 +329,7 @@ class LocallyConnectedLayersTest(test.TestCase):
       self.assertAllCloseAccordingToType(inputs_2d, inputs_2d_tf)
 
 
-def get_inputs(data_format, filters, height, n_train, width):
+def get_inputs(data_format, filters, height, num_samples, width):
   if data_format == 'channels_first':
     if width == 1:
       input_shape = (filters, height)
@@ -333,7 +346,7 @@ def get_inputs(data_format, filters, height, n_train, width):
     raise NotImplementedError(data_format)
 
   inputs = np.random.normal(0, 1,
-                            (n_train,) + input_shape).astype(np.float32)
+                            (num_samples,) + input_shape).astype(np.float32)
   return inputs
 
 
@@ -352,7 +365,7 @@ def get_model(implementation,
               kernel_size,
               strides,
               layers,
-              n_classes,
+              num_classes,
               data_format,
               input_shape):
   model = keras.Sequential()
@@ -377,7 +390,7 @@ def get_model(implementation,
         implementation=implementation))
 
   model.add(keras.layers.Flatten())
-  model.add(keras.layers.Dense(n_classes))
+  model.add(keras.layers.Dense(num_classes))
   model.compile(
       optimizer=RMSPropOptimizer(0.01),
       metrics=[keras.metrics.categorical_accuracy],
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index e0094d99f450d96cfa6e56db346027ef2d2dda8f..9db697871fe27af65cb697f47b8cccf434ad72cd 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -126,6 +126,18 @@ class LSTMLayerTest(test.TestCase):
                   optimizer=RMSPropOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
+  def test_masking_with_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)]
+    model.add(keras.layers.RNN(lstm_cells, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
   def test_from_config_LSTM(self):
     layer_class = keras.layers.LSTM
     for stateful in (False, True):
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index 0ded0e42ed329ed1abb7a7c60362e366ec130e63..f295af3fe04d87d260e4f6a98762dcfb90883531 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -40,7 +40,6 @@ class _Merge(Layer):
 
   def __init__(self, **kwargs):
     super(_Merge, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.supports_masking = True
 
   def _merge_function(self, inputs):
@@ -369,7 +368,6 @@ class Concatenate(_Merge):
 
   def __init__(self, axis=-1, **kwargs):
     super(Concatenate, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.axis = axis
     self.supports_masking = True
     self._reshape_required = False
@@ -467,7 +465,6 @@ class Dot(_Merge):
 
   def __init__(self, axes, normalize=False, **kwargs):
     super(Dot, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     if not isinstance(axes, int):
       if not isinstance(axes, (list, tuple)):
         raise TypeError('Invalid type for `axes` - '
diff --git a/tensorflow/python/keras/layers/noise.py b/tensorflow/python/keras/layers/noise.py
index e7c0478513d2974b853497be0fa221aca96567ee..cb7cee3ebc3ebd2413836b876f2aaf21985f1d9c 100644
--- a/tensorflow/python/keras/layers/noise.py
+++ b/tensorflow/python/keras/layers/noise.py
@@ -55,7 +55,6 @@ class GaussianNoise(Layer):
     super(GaussianNoise, self).__init__(**kwargs)
     self.supports_masking = True
     self.stddev = stddev
-    self._can_use_graph_functions = True
 
   def call(self, inputs, training=None):
 
@@ -100,7 +99,6 @@ class GaussianDropout(Layer):
     super(GaussianDropout, self).__init__(**kwargs)
     self.supports_masking = True
     self.rate = rate
-    self._can_use_graph_functions = True
 
   def call(self, inputs, training=None):
     if 0 < self.rate < 1:
@@ -155,7 +153,6 @@ class AlphaDropout(Layer):
     self.noise_shape = noise_shape
     self.seed = seed
     self.supports_masking = True
-    self._can_use_graph_functions = True
 
   def _get_noise_shape(self, inputs):
     return self.noise_shape if self.noise_shape else array_ops.shape(inputs)
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 991a06e0bfaf66700ad5b535d0346785f03f7100..aa8598d7319948e5e68a48cd50973b23a0764b70 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -145,7 +145,6 @@ class BatchNormalization(Layer):
                **kwargs):
     super(BatchNormalization, self).__init__(
         name=name, trainable=trainable, **kwargs)
-    self._can_use_graph_functions = True
     if isinstance(axis, list):
       self.axis = axis[:]
     else:
@@ -493,7 +492,6 @@ class BatchNormalization(Layer):
     return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=None):
-    original_training_value = training
     if training is None:
       training = K.learning_phase()
 
@@ -517,8 +515,6 @@ class BatchNormalization(Layer):
         # Currently never reaches here since fused_batch_norm does not support
         # virtual batching
         outputs = undo_virtual_batching(outputs)
-      if not context.executing_eagerly() and original_training_value is None:
-        outputs._uses_learning_phase = True  # pylint: disable=protected-access
       return outputs
 
     # Compute the axes along which to reduce the mean / variance
@@ -635,8 +631,6 @@ class BatchNormalization(Layer):
 
     if self.virtual_batch_size is not None:
       outputs = undo_virtual_batching(outputs)
-    if not context.executing_eagerly() and original_training_value is None:
-      outputs._uses_learning_phase = True  # pylint: disable=protected-access
     return outputs
 
   def compute_output_shape(self, input_shape):
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index b11a350dbf08ca04b7e70f8eebe64e5c1939c883..92e412870773f3b89751df30a3b9250e016fb60c 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -134,8 +134,6 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
   def test_batchnorm_convnet_channel_last(self):
-    # keras.backend.set_learning_phase(True)
-
     model = keras.models.Sequential()
     norm = keras.layers.BatchNormalization(
         axis=-1, input_shape=(4, 4, 3), momentum=0.8)
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index b8d6b03664f48a8aa699cab7cb5e372dfd71830f..a0744cddad682fdcae18f571413b668d7767cb2f 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -22,8 +22,8 @@ import functools
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -58,7 +58,6 @@ class Pooling1D(Layer):
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
     super(Pooling1D, self).__init__(name=name, **kwargs)
-    self._can_use_graph_functions = True
     if data_format is None:
       data_format = backend.image_data_format()
     if strides is None:
@@ -231,7 +230,6 @@ class Pooling2D(Layer):
                padding='valid', data_format=None,
                name=None, **kwargs):
     super(Pooling2D, self).__init__(name=name, **kwargs)
-    self._can_use_graph_functions = True
     if data_format is None:
       data_format = backend.image_data_format()
     if strides is None:
@@ -427,7 +425,6 @@ class Pooling3D(Layer):
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
     super(Pooling3D, self).__init__(name=name, **kwargs)
-    self._can_use_graph_functions = True
     if data_format is None:
       data_format = backend.image_data_format()
     if strides is None:
@@ -599,7 +596,6 @@ class GlobalPooling1D(Layer):
 
   def __init__(self, data_format='channels_last', **kwargs):
     super(GlobalPooling1D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.input_spec = InputSpec(ndim=3)
     self.data_format = conv_utils.normalize_data_format(data_format)
 
@@ -705,7 +701,6 @@ class GlobalPooling2D(Layer):
 
   def __init__(self, data_format=None, **kwargs):
     super(GlobalPooling2D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
@@ -804,7 +799,6 @@ class GlobalPooling3D(Layer):
 
   def __init__(self, data_format=None, **kwargs):
     super(GlobalPooling3D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index d22c38f19e8d74b12d3450ab5f7a2f2c0e52539d..5d0efc2f16c3367bd08c76e7c9ea88f7bcb729d0 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -28,8 +28,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
@@ -87,18 +87,8 @@ class StackedRNNCells(Layer):
 
   @property
   def state_size(self):
-    # States are a flat list of the individual cell state size.
-    # e.g. states of a 2-layer LSTM would be `[h1, c1, h2, c2]`.
-    # (assuming one LSTM has states [h, c])
-    # In the case of reverse_state_order=True, the state_size will be
-    # [h2, c2, h1, c1].
-    state_size = []
-    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
-      if _is_multiple_state(cell.state_size):
-        state_size += list(cell.state_size)
-      else:
-        state_size.append(cell.state_size)
-    return tuple(state_size)
+    return tuple(c.state_size for c in
+                 (self.cells[::-1] if self.reverse_state_order else self.cells))
 
   @property
   def output_size(self):
@@ -110,8 +100,6 @@ class StackedRNNCells(Layer):
       return self.cells[-1].state_size
 
   def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
-    # The init state is flattened into a list because state_size is a flattened
-    # list.
     initial_states = []
     for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
       get_initial_state_fn = getattr(cell, 'get_initial_state', None)
@@ -122,39 +110,27 @@ class StackedRNNCells(Layer):
         initial_states.append(_generate_zero_filled_state_for_cell(
             cell, inputs, batch_size, dtype))
 
-    return nest.flatten(initial_states)
+    return tuple(initial_states)
 
   def call(self, inputs, states, constants=None, **kwargs):
     # Recover per-cell states.
-    nested_states = []
-    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
-      if _is_multiple_state(cell.state_size):
-        nested_states.append(states[:len(cell.state_size)])
-        states = states[len(cell.state_size):]
-      else:
-        nested_states.append([states[0]])
-        states = states[1:]
-    if self.reverse_state_order:
-      nested_states = nested_states[::-1]
+    state_size = (self.state_size[::-1]
+                  if self.reverse_state_order else self.state_size)
+    nested_states = nest.pack_sequence_as(state_size, nest.flatten(states))
 
     # Call the cells in order and store the returned states.
     new_nested_states = []
     for cell, states in zip(self.cells, nested_states):
+      states = states if nest.is_sequence(states) else [states]
       if generic_utils.has_arg(cell.call, 'constants'):
         inputs, states = cell.call(inputs, states, constants=constants,
                                    **kwargs)
       else:
         inputs, states = cell.call(inputs, states, **kwargs)
-
       new_nested_states.append(states)
 
-    # Format the new states as a flat list
-    new_states = []
-    if self.reverse_state_order:
-      new_nested_states = new_nested_states[::-1]
-    for cell_states in new_nested_states:
-      new_states += cell_states
-    return inputs, new_states
+    return inputs, nest.pack_sequence_as(state_size,
+                                         nest.flatten(new_nested_states))
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -470,6 +446,9 @@ class RNN(Layer):
                        'an attribute `state_size` '
                        '(tuple of integers, '
                        'one integer per RNN state).')
+    # If True, the output for masked timestep will be zeros, whereas in the
+    # False case, output from previous timestep is returned for masked timestep.
+    self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
     if isinstance(cell, checkpointable.CheckpointableBase):
@@ -853,7 +832,8 @@ class RNN(Layer):
         mask=mask,
         unroll=self.unroll,
         input_length=timesteps,
-        time_major=self.time_major)
+        time_major=self.time_major,
+        zero_output_for_mask=self.zero_output_for_mask)
     if self.stateful:
       updates = []
       for i in range(len(states)):
@@ -865,12 +845,6 @@ class RNN(Layer):
     else:
       output = last_output
 
-    # Properly set learning phase
-    if getattr(last_output, '_uses_learning_phase', False):
-      output._uses_learning_phase = True
-      for state in states:
-        state._uses_learning_phase = True
-
     if self.return_state:
       if not isinstance(states, (list, tuple)):
         states = [states]
@@ -953,6 +927,8 @@ class RNN(Layer):
     }
     if self._num_constants is not None:
       config['num_constants'] = self._num_constants
+    if self.zero_output_for_mask:
+      config['zero_output_for_mask'] = self.zero_output_for_mask
 
     cell_config = self.cell.get_config()
     config['cell'] = {
@@ -1132,12 +1108,6 @@ class SimpleRNNCell(Layer):
     if self.activation is not None:
       output = self.activation(output)
 
-    # Properly set learning phase on output tensor.
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None and not context.executing_eagerly():
-        # This would be harmless to set in eager mode, but eager tensors
-        # disallow setting arbitrary attributes.
-        output._uses_learning_phase = True
     return output, [output]
 
   def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
@@ -1640,12 +1610,6 @@ class GRUCell(Layer):
       hh = self.activation(x_h + recurrent_h)
     # previous and candidate state mixed by update gate
     h = z * h_tm1 + (1 - z) * hh
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None and not context.executing_eagerly():
-        # This would be harmless to set in eager mode, but eager tensors
-        # disallow setting arbitrary attributes.
-        h._uses_learning_phase = True
-
     return h, [h]
 
   def get_config(self):
@@ -2030,7 +1994,7 @@ class LSTMCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
-    self.state_size = (self.units, self.units)
+    self.state_size = [self.units, self.units]
     self.output_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
@@ -2171,11 +2135,6 @@ class LSTMCell(Layer):
       c, o = self._compute_carry_and_output_fused(z, c_tm1)
 
     h = o * self.activation(c)
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None and not context.executing_eagerly():
-        # This would be harmless to set in eager mode, but eager tensors
-        # disallow setting arbitrary attributes.
-        h._uses_learning_phase = True
     return h, [h, c]
 
   def get_config(self):
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index 6346b171802ecc8a757a85f226d30f2a4a44c9bb..b1449069e3279e27b08ecc383e72aed63525e521 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -120,7 +120,7 @@ class RNNTest(test.TestCase):
              MinimalRNNCell(16, 8),
              MinimalRNNCell(32, 16)]
     layer = keras.layers.RNN(cells)
-    self.assertEqual(layer.cell.state_size, (8, 8, 16, 16, 32, 32))
+    self.assertEqual(layer.cell.state_size, ((8, 8), (16, 16), (32, 32)))
     self.assertEqual(layer.cell.output_size, 32)
     y = layer(x)
     model = keras.models.Model(x, y)
@@ -1013,8 +1013,8 @@ class RNNTest(test.TestCase):
         inputs, _ = cell(inputs, initial_state)
         output = inputs
         if not context.executing_eagerly():
-          sess.run(variables_lib.global_variables_initializer())
-          output = sess.run(output)
+          self.evaluate(variables_lib.global_variables_initializer())
+          output = self.evaluate(output)
         return output
 
     random_seed.set_random_seed(12345)
@@ -1044,6 +1044,67 @@ class RNNTest(test.TestCase):
                         second_implementation_output)
     self.assertAllClose(first_implementation_output, tf_lstm_cell_output)
 
+  def test_masking_rnn_with_output_and_states(self):
+
+    class Cell(keras.layers.Layer):
+
+      def __init__(self):
+        self.state_size = None
+        self.output_size = None
+        super(Cell, self).__init__()
+
+      def build(self, input_shape):
+        self.state_size = input_shape[-1]
+        self.output_size = input_shape[-1]
+
+      def call(self, inputs, states):
+        return inputs, [s + 1 for s in states]
+
+    x = keras.Input((3, 1), name='x')
+    x_masked = keras.layers.Masking()(x)
+    s_0 = keras.Input((1,), name='s_0')
+    y, s = keras.layers.RNN(
+        Cell(), return_state=True)(x_masked, initial_state=s_0)
+    model = keras.models.Model([x, s_0], [y, s])
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+
+    # last time step masked
+    x_np = np.array([[[1.], [2.], [0.]]])
+    s_0_np = np.array([[10.]])
+    y_np, s_np = model.predict([x_np, s_0_np])
+
+    # 1 is added to initial state two times
+    self.assertAllClose(s_np, s_0_np + 2)
+    # Expect last output to be the same as last output before masking
+    self.assertAllClose(y_np, x_np[:, 1, :])
+
+  def test_zero_output_for_masking(self):
+
+    for unroll in [True, False]:
+      cell = keras.layers.SimpleRNNCell(5)
+      x = keras.Input((5, 5))
+      mask = keras.layers.Masking()
+      layer = keras.layers.RNN(
+          cell, return_sequences=True, zero_output_for_mask=True, unroll=unroll)
+      masked_input = mask(x)
+      y = layer(masked_input)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                    loss='mse')
+
+      np_x = np.ones((6, 5, 5))
+      result_1 = model.predict(np_x)
+
+      # set the time 4 and 5 for last record to be zero (masked).
+      np_x[5, 3:] = 0
+      result_2 = model.predict(np_x)
+
+      # expect the result_2 has same output, except the time 4,5 for last
+      # record.
+      result_1[5, 3:] = 0
+      self.assertAllClose(result_1, result_2)
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
diff --git a/tensorflow/python/keras/layers/unified_rnn_test.py b/tensorflow/python/keras/layers/unified_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..744d51824ba998dcfe2fe0b5139497f1a974074b
--- /dev/null
+++ b/tensorflow/python/keras/layers/unified_rnn_test.py
@@ -0,0 +1,635 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnifiedLSTM layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import time
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
+from tensorflow.python.client import session
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine.input_spec import InputSpec
+from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNLSTM
+from tensorflow.python.keras.layers.recurrent import RNN
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import gradient_descent
+
+
+class RNNTest(test.TestCase):
+
+  def setUp(self):
+    rewrites = rewriter_config_pb2.RewriterConfig()
+    rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
+    customer_optimizer = rewrites.custom_optimizers.add()
+    customer_optimizer.name = 'ExperimentalImplementationSelector'
+    rewrites.min_graph_nodes = -1
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewrites)
+    self.config = config_pb2.ConfigProto(graph_options=graph_options)
+
+  def test_unifiedRNN(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with ops.Graph().as_default(), session.Session(config=self.config) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = UnifiedLSTM(rnn_state_size)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEquals(runtime_value, b'cudnn')
+        else:
+          self.assertEquals(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  def test_keras_model_with_lstm(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 10
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    y_train = keras.utils.to_categorical(y_train, output_shape)
+
+    K.set_session(session.Session(config=self.config))
+    layer = UnifiedLSTM(rnn_state_size)
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs, unused_runtime = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('rmsprop', loss='mse')
+    model.fit(x_train, y_train, epochs=epoch)
+
+  def test_unifiedRNN_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with ops.Graph().as_default(), session.Session(config=self.config) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = UnifiedLSTM(rnn_state_size)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the lstm layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEquals(runtime_value, b'cudnn')
+        else:
+          self.assertEquals(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train):
+    # Get the performance number for standard Cudnn LSTM
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+    epoch = test_config['epoch']
+    warmup_epoch = test_config['warmup_epoch']
+
+    ops.reset_default_graph()
+    with self.test_session(use_gpu=True):
+      cudnn_lstm_layer = CuDNNLSTM(rnn_state_size)
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+
+      outputs = cudnn_lstm_layer(inputs)
+      model = keras.models.Model(inputs, outputs)
+      model.compile('sgd', 'mse')
+
+      total_duration = 0
+      for i in range(epoch):
+        start_time = time.time()
+        model.fit(x_train, y_train)
+        end_time = time.time()
+        if i >= warmup_epoch:
+          duration_per_epoch = end_time - start_time
+          total_duration += duration_per_epoch
+          logging.vlog(2, '%s: Time consumed for epoch %d is: %s',
+                       'CuDNN LSTM', i, duration_per_epoch)
+      logging.info('Average performance for %s per epoch is: %s',
+                   'CuDNN LSTM', (total_duration / epoch))
+      return total_duration / epoch
+
+  def _time_performance_run_unifed_lstm_gpu(
+      self, test_config, x_train, y_train):
+    # Get performance number for Unified_LSTM with grappler swap the impl
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+    epoch = test_config['epoch']
+    warmup_epoch = test_config['warmup_epoch']
+
+    ops.reset_default_graph()
+    K.set_session(session.Session(config=self.config))
+    layer = UnifiedLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs, _ = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    total_duration = 0
+    for i in range(epoch):
+      start_time = time.time()
+      model.fit(x_train, y_train)
+      end_time = time.time()
+      if i >= warmup_epoch:
+        duration_per_epoch = end_time - start_time
+        total_duration += duration_per_epoch
+        logging.vlog(2, '%s: Time consumed for epoch %d is: %s',
+                     'Unified LSTM', i, duration_per_epoch)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Unified LSTM', (total_duration / epoch))
+    return total_duration / epoch
+
+  def _time_performance_run_normal_lstm(
+      self, test_config, x_train, y_train):
+    # Get performance number for standard LSTM on GPU.
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+    epoch = test_config['epoch']
+    warmup_epoch = test_config['warmup_epoch']
+
+    ops.reset_default_graph()
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.LSTM(rnn_state_size)
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+
+      outputs = layer(inputs)
+      model = keras.models.Model(inputs, outputs)
+      model.compile('sgd', 'mse')
+
+      total_duration = 0
+      for i in range(epoch):
+        start_time = time.time()
+        model.fit(x_train, y_train)
+        end_time = time.time()
+        if i >= warmup_epoch:
+          duration_per_epoch = end_time - start_time
+          total_duration += duration_per_epoch
+          logging.vlog(2, '%s: Time consumed for epoch %d is: %s',
+                       'Normal LSTM', i, duration_per_epoch)
+      logging.info('Average performance for %s per epoch is: %s',
+                   'Normal LSTM', (total_duration / epoch))
+      return total_duration / epoch
+
+  def test_performance_with_standard_cudnn_impl(self):
+    if not test.is_gpu_available():
+      self.skipTest('performance test will only run on GPU')
+
+    test_config = {
+        'input_shape': 128,
+        'rnn_state_size': 64,
+        'output_shape': 64,
+        'timestep': 50,
+        'epoch': 20,
+        # The performance for warmup epoch is ignored.
+        'warmup_epoch': 1,
+    }
+    batch = 64
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(test_config['timestep'], test_config['input_shape']),
+        num_classes=test_config['output_shape'])
+    y_train = keras.utils.to_categorical(y_train, test_config['output_shape'])
+
+    cudnn_duration = self._time_performance_run_cudnn_lstm(
+        test_config, x_train, y_train)
+    unified_lstm_gpu_duration = self._time_performance_run_unifed_lstm_gpu(
+        test_config, x_train, y_train)
+    normal_lstm_duration = self._time_performance_run_normal_lstm(
+        test_config, x_train, y_train)
+
+    cudnn_vs_unified = cudnn_duration / unified_lstm_gpu_duration
+    unified_vs_normal = normal_lstm_duration / unified_lstm_gpu_duration
+    # Assert the performance diff should be within 80% of the native cudnn impl.
+    self.assertGreaterEqual(
+        cudnn_vs_unified, 0.80,
+        'Expect the performance of Unified LSTM is within 80% of CuDNN LSTM, '
+        'but got {}'.format(cudnn_vs_unified * 100))
+    # Assert the performance diff between CPU impl and GPU impl should be more
+    # than 5 times.
+    self.assertGreaterEqual(
+        unified_vs_normal, 5,
+        'Expect the performance of Unified LSTM is more than 5 times of normal '
+        'LSTM, but got {}'.format(unified_vs_normal))
+
+
+class UnifiedLSTM(RNN):
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               time_major=False,
+               **kwargs):
+    super(RNN, self).__init__(**kwargs)  # pylint: disable=bad-super-call
+    self.units = units
+    cell_spec = collections.namedtuple('cell', ['state_size', 'output_size'])
+    self.cell = cell_spec(
+        state_size=(self.units, self.units), output_size=self.units)
+    self.activation = activations.get(activation)
+    self.recurrent_activation = activations.get(recurrent_activation)
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.unit_forget_bias = unit_forget_bias
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.return_sequences = return_sequences
+    self.return_state = return_state
+    self.go_backwards = go_backwards
+    self.stateful = stateful
+    self.time_major = time_major
+    self._num_constants = None
+    self._num_inputs = None
+    self._states = None
+    self.input_spec = [InputSpec(ndim=3)]
+    self.state_spec = [
+        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
+    ]
+
+  @tf_utils.shape_type_conversion
+  def build(self, input_shape):
+    super(UnifiedLSTM, self).build(input_shape)
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_dim = int(input_shape[-1])
+
+    self.kernel = self.add_weight(
+        shape=(input_dim, self.units * 4),
+        name='kernel',
+        dtype=dtypes.float32,
+        use_resource=True,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    self.recurrent_kernel = self.add_weight(
+        shape=(self.units, self.units * 4),
+        name='recurrent_kernel',
+        dtype=dtypes.float32,
+        use_resource=True,
+        initializer=self.recurrent_initializer,
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+
+    # Normal LSTM has 4 bias instead of 8.
+    if self.unit_forget_bias:
+
+      def bias_initializer(_, *args, **kwargs):
+        return array_ops.concat([
+            self.bias_initializer((self.units * 5,), *args, **kwargs),
+            initializers.Ones()((self.units,), *args, **kwargs),
+            self.bias_initializer((self.units * 2,), *args, **kwargs),
+        ],
+                                axis=0)
+    else:
+      bias_initializer = self.bias_initializer
+    self.bias = self.add_weight(
+        shape=(self.units * 8,),
+        name='bias',
+        dtype=dtypes.float32,
+        use_resource=True,
+        initializer=bias_initializer,
+        regularizer=self.bias_regularizer,
+        constraint=self.bias_constraint)
+    self.built = True
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    if isinstance(inputs, list):
+      initial_state = inputs[1:]
+      inputs = inputs[0]
+    elif initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
+                       ' initial states.')
+
+    if self.go_backwards:
+      # Reverse time axis.
+      inputs = K.reverse(inputs, 1)
+
+    outputs, [new_h, new_c], runtime = normal_lstm(
+        inputs, initial_state[0], initial_state[1], self.kernel,
+        self.recurrent_kernel, self.bias, self.units, self.activation,
+        self.recurrent_activation)
+
+    function.register(cudnn_lstm, inputs, initial_state[0], initial_state[1],
+                      self.kernel, self.recurrent_kernel, self.bias, self.units)
+
+    states = [new_h, new_c]
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = outputs[:, -1, :]
+
+    if self.return_state:
+      return [output] + states
+    else:
+      return output, runtime
+
+  @tf_utils.shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+
+    if _is_multiple_state(self.cell.state_size):
+      state_size = self.cell.state_size
+    else:
+      state_size = [self.cell.state_size]
+
+    if getattr(self.cell, 'output_size', None) is not None:
+      output_dim = tensor_shape.as_shape(self.cell.output_size).as_list()
+    else:
+      # Note that state_size[0] could be a tensor_shape or int.
+      output_dim = tensor_shape.as_shape(state_size[0]).as_list()
+
+    if self.return_sequences:
+      output_shape = tuple([input_shape[0], input_shape[1]] + output_dim)
+    else:
+      output_shape = tuple([input_shape[0]] + output_dim)
+
+    if self.return_state:
+      state_shape = [
+          tuple([input_shape[0]] + tensor_shape.as_shape(dim).as_list())
+          for dim in state_size
+      ]
+      return [output_shape] + state_shape
+    else:
+      return output_shape
+
+  @property
+  def trainable_weights(self):
+    if self.trainable and self.built:
+      return [self.kernel, self.recurrent_kernel, self.bias]
+    return []
+
+  @property
+  def non_trainable_weights(self):
+    if not self.trainable and self.built:
+      return [self.kernel, self.recurrent_kernel, self.bias]
+    return []
+
+  @property
+  def losses(self):
+    return super(RNN, self).losses
+
+  def get_losses_for(self, inputs=None):
+    return super(RNN, self).get_losses_for(inputs=inputs)   # pylint: disable=bad-super-call
+
+  def get_weights(self):
+    return super(RNN, self).get_weights()  # pylint: disable=bad-super-call
+
+
+def _canonical_to_params(weights, biases, shape):
+  weights = [array_ops.reshape(x, shape) for x in weights]
+  biases = [array_ops.reshape(x, shape) for x in biases]
+  return array_ops.concat(weights + biases, axis=0)
+
+
+def _is_multiple_state(state_size):
+  """Check whether the state_size contains multiple states."""
+  return (hasattr(state_size, '__len__') and
+          not isinstance(state_size, tensor_shape.TensorShape))
+
+
+@function.defun_with_attributes(
+    attributes={
+        'experimental_api_implements': 'lstm',
+        'experimental_api_preferred_device': 'CPU'
+    })
+def normal_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, units,
+                activation, recurrent_activation):
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[1]
+
+  def step(cell_inputs, cell_states):
+    h_tm1 = cell_states[0]  # previous memory state
+    c_tm1 = cell_states[1]  # previous carry state
+
+    # Only use the second half of the bias weights.
+    _, real_bias = array_ops.split(bias, 2)
+
+    z = K.dot(cell_inputs, kernel)
+    z += K.dot(h_tm1, recurrent_kernel)
+    z = K.bias_add(z, real_bias)
+
+    z0 = z[:, :units]
+    z1 = z[:, units:2 * units]
+    z2 = z[:, 2 * units:3 * units]
+    z3 = z[:, 3 * units:]
+
+    i = recurrent_activation(z0)
+    f = recurrent_activation(z1)
+    c = f * c_tm1 + i * activation(z2)
+    o = recurrent_activation(z3)
+
+    h = o * activation(c)
+    return h, [h, c]
+
+  _, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h, init_c],
+      constants=None,
+      unroll=False,
+      input_length=timesteps)
+  return outputs, new_states, constant_op.constant(
+      'cpu', dtype=dtypes.string, name='runtime')
+
+
+@function.defun_with_attributes(
+    attributes={
+        'experimental_api_implements': 'lstm',
+        'experimental_api_preferred_device': 'GPU'
+    })
+def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias, units):
+  inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  input_h = array_ops.expand_dims(input_h, axis=0)
+  input_c = array_ops.expand_dims(input_c, axis=0)
+
+  params = _canonical_to_params(
+      weights=[
+          kernel[:, :units],
+          kernel[:, units:units * 2],
+          kernel[:, units * 2:units * 3],
+          kernel[:, units * 3:],
+          recurrent_kernel[:, :units],
+          recurrent_kernel[:, units:units * 2],
+          recurrent_kernel[:, units * 2:units * 3],
+          recurrent_kernel[:, units * 3:],
+      ],
+      biases=[
+          bias[:units],
+          bias[units:units * 2],
+          bias[units * 2:units * 3],
+          bias[units * 3:units * 4],
+          bias[units * 4:units * 5],
+          bias[units * 5:units * 6],
+          bias[units * 6:units * 7],
+          bias[units * 7:],
+      ],
+      shape=constant_op.constant([-1]))
+
+  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs, input_h=input_h, input_c=input_c, params=params)
+  outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  c = c[0]
+  return outputs, [h, c], constant_op.constant(
+      'cudnn', dtype=dtypes.string, name='runtime')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index c795b2aa7eb97e8e112837e6c4a690a63c01d83b..67b154141efc036b5fa7920c8179b35f5eb38cc1 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -23,8 +23,8 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -230,17 +230,12 @@ class TimeDistributed(Wrapper):
     kwargs = {}
     if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
-    uses_learning_phase = False  # pylint: disable=redefined-outer-name
 
     input_shape = K.int_shape(inputs)
     if input_shape[0]:
       # batch size matters, use rnn-based implementation
       def step(x, _):
-        global uses_learning_phase  # pylint: disable=global-variable-undefined
         output = self.layer.call(x, **kwargs)
-        if hasattr(output, '_uses_learning_phase'):
-          uses_learning_phase = (output._uses_learning_phase or
-                                 uses_learning_phase)
         return output, []
 
       _, outputs, _ = K.rnn(
@@ -268,8 +263,6 @@ class TimeDistributed(Wrapper):
         inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
         kwargs['mask'] = K.reshape(mask, inner_mask_shape)
       y = self.layer.call(inputs, **kwargs)
-      if hasattr(y, '_uses_learning_phase'):
-        uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
       output_shape = self.compute_output_shape(input_shape).as_list()
       output_shape = self._get_shape_tuple(
@@ -281,9 +274,6 @@ class TimeDistributed(Wrapper):
         self.layer.activity_regularizer is not None):
       regularization_loss = self.layer.activity_regularizer(y)
       self.add_loss(regularization_loss, inputs)
-
-    if uses_learning_phase:
-      y._uses_learning_phase = True
     return y
 
   def compute_mask(self, inputs, mask=None):
@@ -399,6 +389,10 @@ class Bidirectional(Wrapper):
       raise ValueError('Invalid merge mode. '
                        'Merge mode should be one of '
                        '{"sum", "mul", "ave", "concat", None}')
+    if getattr(layer, 'zero_output_for_mask', None) is not None:
+      # Force the zero_output_for_mask to be True if it presents.
+      layer.zero_output_for_mask = True
+
     self.forward_layer = copy.copy(layer)
     config = layer.get_config()
     config['go_backwards'] = not config['go_backwards']
@@ -595,15 +589,6 @@ class Bidirectional(Wrapper):
       raise ValueError(
           'Unrecognized value for `merge_mode`: %s' % (self.merge_mode))
 
-    # Properly set learning phase
-    if (getattr(y, '_uses_learning_phase', False) or
-        getattr(y_rev, '_uses_learning_phase', False)):
-      if self.merge_mode is None:
-        for out in output:
-          out._uses_learning_phase = True
-      else:
-        output._uses_learning_phase = True
-
     if self.return_state:
       if self.merge_mode is None:
         return output + states
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 965960917cc6b54cc9c81c09cb3fe5c4fdeeccc0..bbafa96aab8b10509005829fb3bbd0cea6ba02c5 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -452,16 +452,13 @@ class BidirectionalTest(test.TestCase):
       wrapped = keras.layers.Bidirectional(
           rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode)
       outputs = _to_list(wrapped(inputs, training=True))
-      assert all(not getattr(x, '_uses_learning_phase') for x in outputs)
 
       inputs = keras.Input((timesteps, dim))
       wrapped = keras.layers.Bidirectional(
           rnn(units, dropout=0.2, return_state=True), merge_mode=merge_mode)
       outputs = _to_list(wrapped(inputs))
-      assert all(x._uses_learning_phase for x in outputs)
 
       model = keras.Model(inputs, outputs)
-      assert model.uses_learning_phase
       y1 = _to_list(model.predict(x))
       y2 = _to_list(model.predict(x))
       for x1, x2 in zip(y1, y2):
@@ -638,6 +635,34 @@ class BidirectionalTest(test.TestCase):
       y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
+  def test_Bidirectional_with_masking(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'concat'
+    x = np.random.rand(samples, timesteps, dim)
+    # clear the first record's timestep 2, and expect the output of timestep 2
+    # is also 0s.
+    x[0, 2] = 0
+
+    with self.cached_session():
+      inputs = keras.Input((timesteps, dim))
+      masked_inputs = keras.layers.Masking()(inputs)
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, return_sequences=True),
+          merge_mode=merge_mode)
+      outputs = _to_list(wrapped(masked_inputs, training=True))
+      self.assertEqual(len(outputs), 1)
+      self.assertEqual(outputs[0].get_shape().as_list(),
+                       [None, timesteps, units * 2])
+
+      model = keras.Model(inputs, outputs)
+      y = _to_list(model.predict(x))
+      self.assertEqual(len(y), 1)
+      self.assertAllClose(y[0][0, 2], np.zeros(units * 2))
+
 
 def _to_list(ls):
   if isinstance(ls, list):
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 9f548bfe0408d5c053c25b9ae14810d582b83e1e..f871ee409ecf8b3dd2a8b88ded561e00798af0b4 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -197,3 +198,9 @@ def get(identifier):
   else:
     raise ValueError('Could not interpret '
                      'loss function identifier:', identifier)
+
+
+LABEL_DTYPES_FOR_LOSSES = {
+    losses_impl.sparse_softmax_cross_entropy: 'int32',
+    sparse_categorical_crossentropy: 'int32'
+}
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 33e526352fae1634a3577f13d89d4aeb0d186524..de99d47d5e6a70591220d7a70ad0de05d17b4fe6 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -24,9 +24,9 @@ import functools
 import sys
 import types
 import weakref
+from enum import Enum
 import six
 
-from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -49,6 +49,7 @@ from tensorflow.python.keras.losses import squared_hinge
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
@@ -147,7 +148,8 @@ def result_wrapper(result_fn):
 
       # Wrapping result in merge_call. merge_call is used when we want to leave
       # replica mode and compute a value in cross replica mode.
-      result_t = replica_context.merge_call(merge_fn_wrapper, result_fn, *args)
+      result_t = replica_context.merge_call(
+          merge_fn_wrapper, args=(result_fn,) + args)
     check_is_tensor_or_operation(result_t,
                                  'Metric {0}\'s result'.format(metric_obj.name))
     return result_t
@@ -170,32 +172,6 @@ def weakmethod(method):
   return inner
 
 
-def safe_div(numerator, denominator):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator))
-
-
 def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
   """Squeeze or expand last dimension if needed.
 
@@ -267,11 +243,165 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
   return y_pred, y_true, sample_weight
 
 
+class _ConfusionMatrix(Enum):
+  TRUE_POSITIVES = 'tp'
+  FALSE_POSITIVES = 'fp'
+  TRUE_NEGATIVES = 'tn'
+  FALSE_NEGATIVES = 'fn'
+
+
+def _assert_thresholds_range(thresholds):
+  invalid_thresholds = [t for t in thresholds if t < 0 or t > 1]
+  if any(invalid_thresholds):
+    raise ValueError('Threshold values must be in [0, 1]. Invalid values: {}'
+                     .format(invalid_thresholds))
+
+
+def _update_confusion_matrix_variables(variables_to_update,
+                                       y_true,
+                                       y_pred,
+                                       thresholds,
+                                       sample_weight=None):
+  """Returns op to update the given confusion matrix variables.
+
+  For every pair of values in y_true and y_pred:
+
+  true_positive: y_true == True and y_pred > thresholds
+  false_negatives: y_true == True and y_pred <= thresholds
+  true_negatives: y_true == False and y_pred <= thresholds
+  false_positive: y_true == False and y_pred > thresholds
+
+  The results will be weighted and added together. When multiple thresholds are
+  provided, we will repeat the same for every threshold.
+
+  For estimation of these metrics over a stream of data, the function creates an
+  `update_op` operation that updates the given variables.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use weights of 0 to mask values.
+
+  Args:
+    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
+      and corresponding variables to update as values.
+    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
+    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
+      the range `[0, 1]`.
+    thresholds: A python list or tuple of float thresholds in `[0, 1]`.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `y_true` dimension).
+
+  Returns:
+    Update op.
+
+  Raises:
+    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
+      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
+      `variables_to_update` contains invalid keys.
+  """
+  if variables_to_update is None:
+    return
+  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+
+  if not any(
+      key for key in variables_to_update if key in list(_ConfusionMatrix)):
+    raise ValueError(
+        'Please provide at least one valid confusion matrix '
+        'variable to update. Valid variable key options are: "{}". '
+        'Received: "{}"'.format(
+            list(_ConfusionMatrix), variables_to_update.keys()))
+
+  invalid_keys = [
+      key for key in variables_to_update if key not in list(_ConfusionMatrix)
+  ]
+  if invalid_keys:
+    raise ValueError(
+        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
+            invalid_keys, list(_ConfusionMatrix)))
+
+  with ops.control_dependencies([
+      check_ops.assert_greater_equal(
+          y_pred,
+          math_ops.cast(0.0, dtype=y_pred.dtype),
+          message='predictions must be >= 0'),
+      check_ops.assert_less_equal(
+          y_pred,
+          math_ops.cast(1.0, dtype=y_pred.dtype),
+          message='predictions must be <= 1')
+  ]):
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        math_ops.cast(y_pred, dtype=dtypes.float32),
+        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
+
+  num_thresholds = len(thresholds)
+  num_predictions = array_ops.size(y_pred)
+
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(y_pred, [1, -1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
+
+  # Tile the thresholds for every prediction.
+  thresh_tiled = array_ops.tile(
+      array_ops.expand_dims(array_ops.constant(thresholds), 1),
+      array_ops.stack([1, num_predictions]))
+
+  # Tile the predictions for every threshold.
+  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
+
+  # Compare predictions and threshold.
+  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
+
+  # Tile labels by number of thresholds
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
+
+  if sample_weight is not None:
+    weights = weights_broadcast_ops.broadcast_weights(
+        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
+  else:
+    weights_tiled = None
+
+  update_ops = []
+
+  def weighted_assign_add(label, pred, weights, var):
+    label_and_pred = math_ops.cast(
+        math_ops.logical_and(label, pred), dtype=dtypes.float32)
+    if weights is not None:
+      label_and_pred *= weights
+    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
+
+  loop_vars = {
+      _ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
+  }
+  update_tn = _ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
+  update_fp = _ConfusionMatrix.FALSE_POSITIVES in variables_to_update
+  update_fn = _ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
+
+  if update_fn or update_tn:
+    pred_is_neg = math_ops.logical_not(pred_is_pos)
+    loop_vars[_ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
+
+  if update_fp or update_tn:
+    label_is_neg = math_ops.logical_not(label_is_pos)
+    loop_vars[_ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
+    if update_tn:
+      loop_vars[_ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
+
+  for matrix_cond, (label, pred) in loop_vars.items():
+    if matrix_cond in variables_to_update:
+      update_ops.append(
+          weighted_assign_add(label, pred, weights_tiled,
+                              variables_to_update[matrix_cond]))
+  return control_flow_ops.group(update_ops)
+
+
 @six.add_metaclass(abc.ABCMeta)
 class Metric(Layer):
   """Encapsulates metric logic and state.
 
-  Usage with eager execution:
+  Usage:
 
   ```python
   m = SomeMetric(...)
@@ -280,19 +410,6 @@ class Metric(Layer):
   print('Final result: ', m.result().numpy())
   ```
 
-  Usage with graph execution:
-
-  ```python
-  m = SomeMetric(...)
-  init_op = tf.variables_initializer(m.variables)  # Initialize variables
-  with tf.Session() as sess:
-    sess.run(init_op)
-    for input in ...:
-      update_op = m.update_state(input)
-      sess.run(update_op)
-    print('Final result: ', sess.run(m.result()))
-  ```
-
   Usage with tf.keras API:
 
   ```python
@@ -388,9 +505,20 @@ class Metric(Layer):
     Returns:
       The metric value tensor.
     """
-    update_op = self.update_state(*args, **kwargs)  # pylint: disable=not-callable
+    update_op = self.update_state(*args, **kwargs)
     with ops.control_dependencies([update_op]):
-      return self.result()  # pylint: disable=not-callable
+      result_t = self.result()
+
+      # We are adding the metric object as metadata on the result tensor.
+      # This is required when we want to use a metric with `add_metric` API on
+      # a Model/Layer in graph mode. This metric instance will later be used
+      # to reset variable state after each epoch of training.
+      # Example:
+      #   model = Model()
+      #   model.add_metric(Mean()(values), name='mean')
+      if not context.executing_eagerly():
+        result_t._metric_obj = self  # pylint: disable=protected-access
+      return result_t
 
   def reset_states(self):
     """Resets all of the metric state variables.
@@ -459,15 +587,35 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
+@tf_export('metrics.Mean', 'keras.metrics.Mean', v1=[])
 class Mean(Metric):
   """Computes the (weighted) mean of the given values.
 
+  For example, if values is [1, 3, 5, 7] then the mean is 4.
+  If the weights were specified as [1, 1, 0, 0] then the mean would be 2.
+
   This metric creates two variables, `total` and `count` that are used to
   compute the average of `values`. This average is ultimately returned as `mean`
   which is an idempotent operation that simply divides `total` by `count`.
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.metrics.Mean()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 4.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.add_metric(metrics_module.Mean(name='mean_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
   """
 
   def __init__(self, name='mean', dtype=None):
@@ -521,13 +669,15 @@ class Mean(Metric):
       values = math_ops.multiply(values, sample_weight)
     values = math_ops.reduce_sum(values)
 
-    # Update state variables
+    # Update state variables. Count should be updated only when total is
+    # updated.
     update_total_op = state_ops.assign_add(self.total, values)
-    update_count_op = state_ops.assign_add(self.count, num_values)
-    return control_flow_ops.group(update_total_op, update_count_op)
+    with ops.control_dependencies([update_total_op]):
+      update_count_op = state_ops.assign_add(self.count, num_values)
+      return ops.convert_to_tensor(update_count_op)
 
   def result(self):
-    return safe_div(self.total, self.count)
+    return math_ops.div_no_nan(self.total, self.count)
 
 
 class MeanMetricWrapper(Mean):
@@ -572,14 +722,62 @@ class MeanMetricWrapper(Mean):
         matches, sample_weight=sample_weight)
 
   def get_config(self):
-    config = self._fn_kwargs
+    config = {'fn': self._fn}
+    config.update(self._fn_kwargs)
     base_config = super(MeanMetricWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('metrics.Accuracy', 'keras.metrics.Accuracy', v1=[])
+class Accuracy(MeanMetricWrapper):
+  """Calculates how often predictions matches labels.
+
+  For example, if `y_true` is [1, 2, 3, 4] and `y_pred` is [0, 2, 3, 4]
+  then the accuracy is 3/4 or .75.  If the weights were specified as
+  [1, 1, 0, 0] then the accuracy would be 1/2 or .5.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the frequency with which `y_pred` matches `y_true`. This frequency is
+  ultimately returned as `binary accuracy`: an idempotent operation that simply
+  divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.metrics.Accuracy()
+  m.update_state([1, 2, 3, 4], [0, 2, 3, 4])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.metrics.Accuracy()])
+  ```
+  """
+
+  def __init__(self, name='accuracy', dtype=None):
+    super(Accuracy, self).__init__(accuracy, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(Accuracy, cls).from_config(config)
+
+
+@tf_export('metrics.BinaryAccuracy', 'keras.metrics.BinaryAccuracy', v1=[])
 class BinaryAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
+  For example, if `y_true` is [1, 1, 0, 0] and `y_pred` is [0.98, 1, 0, 0.6]
+  then the binary accuracy is 3/4 or .75.  If the weights were specified as
+  [1, 0, 0, 1] then the binary accuracy would be 1/2 or .5.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `binary accuracy`: an idempotent operation that simply
@@ -587,6 +785,21 @@ class BinaryAccuracy(MeanMetricWrapper):
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.metrics.BinaryAccuracy()
+  m.update_state([1, 1, 0, 0], [0.98, 1, 0, 0.6])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.metrics.BinaryAccuracy()])
+  ```
   """
 
   def __init__(self, name='binary_accuracy', dtype=None, threshold=0.5):
@@ -601,17 +814,48 @@ class BinaryAccuracy(MeanMetricWrapper):
     super(BinaryAccuracy, self).__init__(
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(BinaryAccuracy, cls).from_config(config)
 
+
+@tf_export(
+    'metrics.CategoricalAccuracy', 'keras.metrics.CategoricalAccuracy', v1=[])
 class CategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
+  For example, if `y_true` is [[0, 0, 1], [0, 1, 0]] and `y_pred` is
+  [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
+  If the weights were specified as [0.7, 0.3] then the categorical accuracy
+  would be .3.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `categorical accuracy`: an idempotent operation that
   simply divides `total` by `count`.
 
+  `y_pred` and `y_true` should be passed in as vectors of probabilities, rather
+  than as labels. If necessary, use `tf.one_hot` to expand `y_true` as a vector.
+
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.metrics.CategoricalAccuracy()
+  m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.metrics.CategoricalAccuracy()])
+  ```
   """
 
   def __init__(self, name='categorical_accuracy', dtype=None):
@@ -624,10 +868,25 @@ class CategoricalAccuracy(MeanMetricWrapper):
     super(CategoricalAccuracy, self).__init__(
         categorical_accuracy, name, dtype=dtype)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(CategoricalAccuracy, cls).from_config(config)
+
 
+@tf_export(
+    'metrics.SparseCategoricalAccuracy',
+    'keras.metrics.SparseCategoricalAccuracy',
+    v1=[])
 class SparseCategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches integer labels.
 
+  For example, if `y_true` is [[2], [1]] and `y_pred` is
+  [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
+  If the weights were specified as [0.7, 0.3] then the categorical accuracy
+  would be .3.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `sparse categorical accuracy`: an idempotent operation
@@ -635,12 +894,333 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.metrics.SparseCategoricalAccuracy()
+  m.update_state([[2], [1]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.metrics.SparseCategoricalAccuracy()])
+  ```
   """
 
   def __init__(self, name='sparse_categorical_accuracy', dtype=None):
     super(SparseCategoricalAccuracy, self).__init__(
         sparse_categorical_accuracy, name, dtype=dtype)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(SparseCategoricalAccuracy, cls).from_config(config)
+
+
+class _ConfusionMatrixConditionCount(Metric):
+  """Calculates the number of the given confusion matrix condition."""
+
+  def __init__(self,
+               confusion_matrix_cond,
+               thresholds=None,
+               name=None,
+               dtype=None):
+    """Creates a `_ConfusionMatrixConditionCount` instance.
+
+    Args:
+      confusion_matrix_cond: One of `_ConfusionMatrix` conditions.
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
+    self._confusion_matrix_cond = confusion_matrix_cond
+    self.thresholds = [0.5] if thresholds is None else thresholds
+    _assert_thresholds_range(self.thresholds)
+    self.accumulator = self.add_weight(
+        'accumulator',
+        shape=(len(self.thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates the given confusion matrix condition statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        self._confusion_matrix_cond: self.accumulator
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    return ops.convert_to_tensor(self.accumulator)
+
+
+class FalsePositives(_ConfusionMatrixConditionCount):
+  """Calculates the number of false positives.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  false positives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of false positives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `FalsePositives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(FalsePositives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.FALSE_POSITIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+class FalseNegatives(_ConfusionMatrixConditionCount):
+  """Calculates the number of false negatives.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  false negatives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of false negatives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `FalseNegatives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(FalseNegatives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.FALSE_NEGATIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+class TrueNegatives(_ConfusionMatrixConditionCount):
+  """Calculates the number of true negatives.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  true negatives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of true negatives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `TrueNegatives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TrueNegatives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.TRUE_NEGATIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+class TruePositives(_ConfusionMatrixConditionCount):
+  """Calculates the number of true positives.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  true positives. This metric creates one local variable, `true_positives`
+  that is used to keep track of the number of true positives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `TruePositives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TruePositives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.TRUE_POSITIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+class Precision(Metric):
+  """Computes the precision of the predictions with respect to the labels.
+
+  The metric creates two local variables, `true_positives` and `false_positives`
+  that are used to compute the precision. This value is ultimately returned as
+  `precision`, an idempotent operation that simply divides `true_positives`
+  by the sum of `true_positives` and `false_positives`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `Precision` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Precision, self).__init__(name=name, dtype=dtype)
+    self.thresholds = [0.5] if thresholds is None else thresholds
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(len(self.thresholds),),
+        initializer=init_ops.zeros_initializer)
+    self.fp = self.add_weight(
+        'false_positives',
+        shape=(len(self.thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates true positive and false positive statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.FALSE_POSITIVES: self.fp
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    return array_ops.where(
+        math_ops.greater(self.tp + self.fp, 0),
+        math_ops.div(self.tp, self.tp + self.fp),
+        array_ops.zeros_like(self.thresholds))
+
+
+class Recall(Metric):
+  """Computes the recall of the predictions with respect to the labels.
+
+  This metric creates two local variables, `true_positives` and
+  `false_negatives`, that are used to compute the recall. This value is
+  ultimately returned as `recall`, an idempotent operation that simply divides
+  `true_positives` by the sum of `true_positives` and `false_negatives`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `Recall` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to [0.5]. A python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Recall, self).__init__(name=name, dtype=dtype)
+    self.thresholds = [0.5] if thresholds is None else thresholds
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(len(self.thresholds),),
+        initializer=init_ops.zeros_initializer)
+    self.fn = self.add_weight(
+        'false_negatives',
+        shape=(len(self.thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates true positive and false negative statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.FALSE_NEGATIVES: self.fn
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    return array_ops.where(
+        math_ops.greater(self.tp + self.fn, 0),
+        math_ops.div(self.tp, self.tp + self.fn),
+        array_ops.zeros_like(self.thresholds))
+
+
+def accuracy(y_true, y_pred):
+  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+  if y_true.dtype != y_pred.dtype:
+    y_pred = math_ops.cast(y_pred, y_true.dtype)
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred, threshold=0.5):
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 5f5565d4d5a547d640217cf799a20d0050584ed6..eeade4f37dc9667fd614e3327fa73987ceb795dd 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -22,6 +22,7 @@ import os
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -31,6 +32,7 @@ from tensorflow.python.keras import metrics
 from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -47,7 +49,7 @@ class KerasMetricsTest(test.TestCase):
         output = metric(y_a, y_b)
         self.assertEqual(K.eval(output).shape, (6,))
 
-  def test_sparse_categorical_accuracy(self):
+  def test_sparse_categorical_accuracy_int(self):
     with self.cached_session():
       metric = metrics.sparse_categorical_accuracy
       y_true = K.variable(np.random.randint(0, 7, (6,)))
@@ -365,6 +367,28 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_accuracy(self):
+    acc_obj = metrics.Accuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[1], [2], [3], [4]], [[1], [2], [3], [4]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
+
   @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy(self):
     acc_obj = metrics.BinaryAccuracy(name='my acc')
@@ -435,6 +459,30 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_sparse_categorical_accuracy(self):
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[2], [1]],
+                                     [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([[2], [1]], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                       [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
   @test_util.run_in_graph_and_eager_modes
   def test_invalid_result(self):
 
@@ -478,5 +526,502 @@ class KerasMetricsTest(test.TestCase):
       invalid_update_obj.update_state()
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class FalsePositivesTest(test.TestCase):
+
+  def test_config(self):
+    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
+    self.assertEqual(fp_obj.name, 'my_fp')
+    self.assertEqual(len(fp_obj.variables), 1)
+    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose([7.], result)
+
+  def test_weighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([14.], self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose([7., 4., 2.], result)
+
+  def test_weighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
+                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
+
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([125., 42., 12.], self.evaluate(result))
+
+  def test_threshold_limit(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
+      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalseNegativesTest(test.TestCase):
+
+  def test_config(self):
+    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
+    self.assertEqual(fn_obj.name, 'my_fn')
+    self.assertEqual(len(fn_obj.variables), 1)
+    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose([3.], result)
+
+  def test_weighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([5.], self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose([1., 4., 6.], result)
+
+  def test_weighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TrueNegativesTest(test.TestCase):
+
+  def test_config(self):
+    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
+    self.assertEqual(tn_obj.name, 'my_tn')
+    self.assertEqual(len(tn_obj.variables), 1)
+    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose([3.], result)
+
+  def test_weighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([4.], self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose([2., 5., 7.], result)
+
+  def test_weighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TruePositivesTest(test.TestCase):
+
+  def test_config(self):
+    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
+    self.assertEqual(tp_obj.name, 'my_tp')
+    self.assertEqual(len(tp_obj.variables), 1)
+    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose([7.], result)
+
+  def test_weighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([12.], self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose([6., 3., 1.], result)
+
+  def test_weighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    result = tp_obj(y_true, y_pred, sample_weight=37.)
+    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PrecisionTest(test.TestCase):
+
+  def test_config(self):
+    p_obj = metrics.Precision(name='my_precision', thresholds=[0.4, 0.9])
+    self.assertEqual(p_obj.name, 'my_precision')
+    self.assertLen(p_obj.variables, 2)
+    self.assertEqual([v.name for v in p_obj.variables],
+                     ['true_positives:0', 'false_positives:0'])
+    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
+
+  def test_value_is_idempotent(self):
+    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = p_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_precision = self.evaluate(p_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
+                           1e-3)
+
+  def test_unweighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    p_obj = metrics.Precision(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 4.0
+    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.1])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
+
+  def test_extreme_thresholds(self):
+    p_obj = metrics.Precision(thresholds=[-1.0, 2.0])  # beyond values range
+    y_pred = math_ops.cast(
+        constant_op.constant([1, 0, 1, 0], shape=(1, 4)), dtype=dtypes.float32)
+    y_true = math_ops.cast(
+        constant_op.constant([0, 1, 1, 1], shape=(1, 4)), dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertArrayNear([0.75, 0.], self.evaluate(result), 0)
+
+  def test_multiple_updates(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.1])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
+                         1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RecallTest(test.TestCase):
+
+  def test_config(self):
+    r_obj = metrics.Recall(name='my_recall', thresholds=[0.4, 0.9])
+    self.assertEqual(r_obj.name, 'my_recall')
+    self.assertLen(r_obj.variables, 2)
+    self.assertEqual([v.name for v in r_obj.variables],
+                     ['true_positives:0', 'false_negatives:0'])
+    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
+
+  def test_value_is_idempotent(self):
+    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = r_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_recall = self.evaluate(r_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+
+  def test_unweighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    r_obj = metrics.Recall(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 1.0
+    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
+    expected_recall = weighted_tp / weighted_t
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.1])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
+
+  def test_extreme_thresholds(self):
+    r_obj = metrics.Recall(thresholds=[-1.0, 2.0])  # beyond values range
+    y_pred = math_ops.cast(
+        constant_op.constant([1, 0, 1, 0], shape=(1, 4)), dtype=dtypes.float32)
+    y_true = math_ops.cast(
+        constant_op.constant([0, 1, 1, 1], shape=(1, 4)), dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertArrayNear([1.0, 0.], self.evaluate(result), 0)
+
+  def test_multiple_updates(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.1])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
+                         1e-3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index aca058b1111f64ddfdcbc16cab355ca1f33a2a7e..87802d8df0fadbc5d82bc3bebabd613d81f7383b 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -819,6 +820,69 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
                      m.non_trainable_variables)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_add_weight_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+    class MyModelCustomBuild(keras.Model):
+
+      def build(self, input_shape):
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModelCustomBuild()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+  def test_add_update_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,))
+
+      def call(self, inputs):
+        # Unconditional
+        self.add_update(self.b.assign(self.b * 2))
+        # Conditional
+        self.add_update(self.c.assign(inputs[1, :]), inputs)
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+
+    if context.executing_eagerly():
+      self.assertEqual(0, len(model.updates))
+    else:
+      self.assertEqual(2, len(model.updates))
+      self.assertEqual(1, len(model.get_updates_for(None)))
+      self.assertEqual(1, len(model.get_updates_for(x)))
+
 
 class GraphSpecificModelSubclassingTests(test.TestCase):
 
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 11054b5b08686cc329c9c7b13d299922e730282f..3a0c51b49704cfab87f9eab354b980048bbf8a12 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -206,10 +206,17 @@ def _clone_sequential_model(model, input_tensors=None):
   def clone(layer):
     return layer.__class__.from_config(layer.get_config())
 
-  layers = [clone(layer) for layer in model.layers]
+  # Use model._layers to ensure that all layers are cloned. The model's layers
+  # property will exclude the initial InputLayer (if it exists) in the model,
+  # resulting in a different Sequential model structure.
+  layers = [clone(layer) for layer in model._layers]
   if input_tensors is None:
     return Sequential(layers=layers, name=model.name)
   else:
+    # If input tensors are provided, the original model's InputLayer is
+    # overwritten with a different InputLayer.
+    if isinstance(layers[0], InputLayer):
+      layers = layers[1:]
     if len(generic_utils.to_list(input_tensors)) != 1:
       raise ValueError('To clone a `Sequential` model, we expect '
                        ' at most one tensor '
@@ -297,8 +304,9 @@ def _in_place_subclassed_model_reset(model):
       attributes_cache[name] = value
       assert value in model._layers
     elif isinstance(
-        value, (list, tuple)) and name not in ('layers', '_layers',
-                                               'stateful_metric_functions'):
+        value,
+        (list, tuple)) and name not in ('layers', '_layers', 'metrics',
+                                        '_compile_stateful_metric_functions'):
       # Handle case: list/tuple of layers (also tracked by the Network API).
       if value and all(isinstance(val, Layer) for val in value):
         raise ValueError('We do not support the use of list-of-layers '
@@ -338,14 +346,11 @@ def _in_place_subclassed_model_reset(model):
           'targets',
           '_feed_targets',
           'sample_weight_modes',
-          'weighted_metrics',
-          'metrics_names',
-          'metrics_tensors',
-          'metrics_updates',
-          'stateful_metric_names',
           'total_loss',
           'sample_weights',
           '_feed_sample_weights',
+          '_fit_function',
+          '_eval_function',
           'train_function',
           'test_function',
           'predict_function',
@@ -452,7 +457,7 @@ def clone_and_build_model(
 
     if all([isinstance(clone, Sequential),
             not clone._is_graph_network,
-            model.built]):
+            getattr(model, '_build_input_shape', None) is not None]):
       # Set model inputs to build the model and add input/output properties.
       # TODO(kathywu): Add multiple placeholders to handle edge case where
       # sequential model has multiple inputs.
@@ -488,10 +493,11 @@ def clone_and_build_model(
     clone.compile(
         optimizer,
         model.loss,
-        metrics=metrics_module.clone_metrics(model.metrics),
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
         loss_weights=model.loss_weights,
         sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics),
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
         target_tensors=target_tensors)
 
   return clone
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 36875cf984f87be964a4dd089b5d4bcf9ffdcb1e..4b6bb74ef96b4fd00c89dea1142c8dcb6ee7e6b2 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -50,6 +50,21 @@ class TestModel(keras.Model):
     return self.layer1(x)
 
 
+def sequential_model(add_input_layer, include_input_shape=True):
+  model = keras.models.Sequential()
+  if add_input_layer:
+    model.add(keras.layers.InputLayer(input_shape=(4,)))
+    model.add(keras.layers.Dense(4))
+  elif include_input_shape:
+    model.add(keras.layers.Dense(4, input_shape=(4,)))
+  else:
+    model.add(keras.layers.Dense(4))
+  model.add(keras.layers.BatchNormalization())
+  model.add(keras.layers.Dropout(0.5))
+  model.add(keras.layers.Dense(4))
+  return model
+
+
 class TestModelCloning(test.TestCase):
 
   def test_clone_sequential_model(self):
@@ -57,11 +72,7 @@ class TestModelCloning(test.TestCase):
       val_a = np.random.random((10, 4))
       val_out = np.random.random((10, 4))
 
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
+      model = sequential_model(False)
 
     # Everything should work in a new session.
     keras.backend.clear_session()
@@ -76,20 +87,55 @@ class TestModelCloning(test.TestCase):
 
       # On top of new tensor
       input_a = keras.Input(shape=(4,))
-      new_model = keras.models.clone_model(
-          model, input_tensors=input_a)
+      new_model = keras.models.clone_model(model, input_tensors=input_a)
       self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new, non-Keras tensor
       input_a = keras.backend.variable(val_a)
-      new_model = keras.models.clone_model(
-          model, input_tensors=input_a)
+      new_model = keras.models.clone_model(model, input_tensors=input_a)
       self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
+  def test_clone_sequential_model_input_layer(self):
+    def test_input_layer(include_inputs):
+      with self.cached_session():
+        val_a = np.random.random((10, 4))
+        model = sequential_model(include_inputs, include_inputs)
+        # Sanity check
+        self.assertEqual(
+            isinstance(model._layers[0], keras.layers.InputLayer),
+            include_inputs)
+        self.assertEqual(model._is_graph_network, include_inputs)
+
+      keras.backend.clear_session()
+      with self.cached_session():
+        # With placeholder creation -- clone model should have an InputLayer
+        # if the original model has one.
+        new_model = keras.models.clone_model(model)
+        self.assertEqual(
+            isinstance(new_model._layers[0], keras.layers.InputLayer),
+            include_inputs)
+        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
+
+        # On top of new tensor  -- clone model should always have an InputLayer.
+        input_a = keras.Input(shape=(4,))
+        new_model = keras.models.clone_model(model, input_tensors=input_a)
+        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+        self.assertTrue(new_model._is_graph_network)
+
+        # On top of new, non-Keras tensor  -- clone model should always have an
+        # InputLayer.
+        input_a = keras.backend.variable(val_a)
+        new_model = keras.models.clone_model(model, input_tensors=input_a)
+        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+        self.assertTrue(new_model._is_graph_network)
+
+    test_input_layer(True)
+    test_input_layer(False)
+
   def test_clone_functional_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
@@ -285,7 +331,8 @@ class TestCloneAndBuildModel(test.TestCase):
     self.assertEqual('mse', model.loss)
     self.assertTrue(
         isinstance(model.optimizer, keras.optimizers.RMSprop))
-    self.assertEqual(['acc', metrics.categorical_accuracy], model.metrics)
+    self.assertEqual(['acc', metrics.categorical_accuracy],
+                     model._compile_metrics)
 
   def _clone_and_build_test_helper(self, model, is_subclassed=False):
     inp = np.random.random((10, 4))
@@ -401,11 +448,7 @@ class TestCloneAndBuildModel(test.TestCase):
 
   def test_clone_and_build_sequential_model_without_inputs_defined(self):
     with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
+      model = sequential_model(False, False)
       model.compile('rmsprop', 'mse',
                     metrics=['acc', metrics.categorical_accuracy])
     self._clone_and_build_test_helper(model, False)
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index b4b84fad0cecd89173740693e32f98429547ac7f..6b805781f0b9e9b34ebd7bb80b4aa0075caf4db8 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -7,14 +7,21 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 py_library(
     name = "optimizer_v2",
     srcs = [
+        "adadelta.py",
+        "adagrad.py",
         "adam.py",
+        "adamax.py",
+        "ftrl.py",
         "gradient_descent.py",
+        "nadam.py",
         "optimizer_v2.py",
+        "rmsprop.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -24,12 +31,107 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:reduce_util",
     ],
 )
 
+cuda_py_test(
+    name = "adagrad_test",
+    size = "medium",
+    srcs = ["adagrad_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "medium",
+    srcs = ["adam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adamax_test",
+    size = "medium",
+    srcs = ["adamax_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["adadelta_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "ftrl_test",
+    size = "medium",
+    srcs = ["ftrl_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
 cuda_py_test(
     name = "gradient_descent_test",
     size = "medium",
@@ -46,12 +148,54 @@ cuda_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
+    name = "nadam_test",
+    size = "medium",
+    srcs = ["nadam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+py_test(
     name = "optimizer_v2_test",
     size = "medium",
     srcs = ["optimizer_v2_test.py"],
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:def_function",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "medium",
+    srcs = ["rmsprop_test.py"],
     additional_deps = [
         ":optimizer_v2",
         "//tensorflow/python/eager:def_function",
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..21a3f06f4fd546299c6ce11ec7500478f631cd5c
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -0,0 +1,128 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class Adadelta(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the Adadelta algorithm.
+
+  Adadelta optimization is a stochastic gradient descent method that is based on
+  adaptive learning rate per dimension to address two drawbacks:
+    1) the continual decay of learning rates throughout training
+    2) the need for a manually selected global learning rate
+
+  Two accumulation steps are required:
+    1) the accumulation of gradients squared,
+    2) the accumulation of updates squared.
+
+  Initialization:
+
+  $$accum_g_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
+  $$accum_x_0 := 0 \text{(Initialize variable update 2nd order moment vector)}$$
+
+  $$t := t + 1$$
+  $$accum_g_t := rho * accum_g_{t-1} + (1 - rho) * g * g$$
+  $$delta = -\sqrt{accum_x_{t-1}} / (\sqrt{accum_g_{t-1}} + \epsilon)$$
+  $$accum_x_t := rho * accum_x_{t-1} + (1 - rho) * delta * delta$$
+
+  References
+    See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+      ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.95,
+               epsilon=1e-7,
+               name='Adadelta'):
+    """Construct a new Adadelta optimizer.
+
+    Adadelta is a more robust extension of Adagrad that adapts learning rates
+    based on a moving window of gradient updates, instead of accumulating all
+    past gradients. This way, Adadelta continues learning even when many updates
+    have been done. Compared to Adagrad, in the original version of Adadelta you
+    don't have to set an initial learning rate. In this version, initial
+    learning rate can be set, as in most other Keras optimizers.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value. The learning rate.
+        To match the exact form in the original paper use 1.0.
+      rho: A `Tensor` or a floating point value. The decay rate.
+      epsilon: A `Tensor` or a floating point value.  A constant epsilon used
+               to better conditioning the grad update.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adadelta".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
+    each be a callable that takes no arguments and returns the actual value to
+    use. This can be useful for changing these values across different
+    invocations of optimizer functions.
+    @end_compatibility
+    """
+    super(Adadelta, self).__init__(name)
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('rho', rho)
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      self.add_slot(v, 'accum_grad')
+      self.add_slot(v, 'accum_var')
+
+  def _resource_apply_dense(self, grad, var):
+    accum_grad = self.get_slot(var, 'accum_grad')
+    accum_var = self.get_slot(var, 'accum_var')
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum_grad.handle,
+        accum_var.handle,
+        math_ops.cast(self._get_hyper('learning_rate'), grad.dtype.base_dtype),
+        math_ops.cast(self._get_hyper('rho'), grad.dtype.base_dtype),
+        math_ops.cast(self._get_hyper('epsilon'), grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    accum_grad = self.get_slot(var, 'accum_grad')
+    accum_var = self.get_slot(var, 'accum_var')
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum_grad.handle,
+        accum_var.handle,
+        math_ops.cast(self._get_hyper('learning_rate'), grad.dtype.base_dtype),
+        math_ops.cast(self._get_hyper('rho'), grad.dtype.base_dtype),
+        math_ops.cast(self._get_hyper('epsilon'), grad.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(Adadelta, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'rho': self._serialize_hyperparameter('rho'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef95d27abff7efbb7b11c870d1600e8ec97ddb46
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -0,0 +1,169 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          var0_init = [1.0, 2.0]
+          var1_init = [3.0, 4.0]
+          if use_resource:
+            var0 = resource_variable_ops.ResourceVariable(
+                var0_init, dtype=dtype)
+            var1 = resource_variable_ops.ResourceVariable(
+                var1_init, dtype=dtype)
+          else:
+            var0 = variables.Variable(var0_init, dtype=dtype)
+            var1 = variables.Variable(var1_init, dtype=dtype)
+
+          grads = constant_op.constant([grad, grad], dtype=dtype)
+
+          accum = 0.0
+          accum_update = 0.0
+
+          # ADADELTA gradient optimizer
+          rho = 0.95
+          epsilon = 1e-8
+          if use_callable_params:
+            adadelta_opt = adadelta.Adadelta(
+                learning_rate=lambda: lr,  # pylint: disable=cell-var-from-loop
+                rho=lambda: rho,  # pylint: disable=cell-var-from-loop
+                epsilon=lambda: epsilon)  # pylint: disable=cell-var-from-loop
+          else:
+            adadelta_opt = adadelta.Adadelta(
+                learning_rate=lr, rho=rho, epsilon=epsilon)
+          if not context.executing_eagerly():
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+            self.evaluate(variables.global_variables_initializer())
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            slot[0] = adadelta_opt.get_slot(var0, "accum_grad")
+            self.assertEquals(slot[0].get_shape(), var0.get_shape())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_var")
+            self.assertEquals(slot_update[0].get_shape(), var0.get_shape())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum_grad")
+            self.assertEquals(slot[1].get_shape(), var1.get_shape())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_var")
+            self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
+
+          # Fetch params to validate initial values
+          self.assertAllClose(var0_init, self.evaluate(var0))
+          self.assertAllClose(var1_init, self.evaluate(var1))
+
+          update = [None] * num_updates
+          tot_update = 0
+          for step in range(num_updates):
+            # Run adadelta update for comparison
+            if not context.executing_eagerly():
+              self.evaluate(adadelta_update)
+            else:
+              adadelta_opt.apply_gradients(zip([grads, grads], [var0, var1]))
+
+            # Perform initial update without previous accum values
+            accum = accum * rho + (grad**2) * (1 - rho)
+            update[step] = (
+                np.sqrt(accum_update + epsilon) *
+                (1. / np.sqrt(accum + epsilon)) * grad)
+            accum_update = (
+                accum_update * rho + (update[step]**2) * (1.0 - rho))
+            tot_update += update[step] * lr
+
+            if not context.executing_eagerly():
+              # Check that the accumulators have been updated
+              # TODO(lxuechen): This is hard to test in eager mode
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    self.evaluate(slot[slot_idx]),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    self.evaluate(slot_update[slot_idx]),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  self.evaluate(var0),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  self.evaluate(var1),
+                  rtol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize(
+            loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d090e8b842c1e4aecb5c5109c6c48db8cf1b23d
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -0,0 +1,144 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+
+
+class Adagrad(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the Adagrad algorithm.
+
+  Adagrad is an optimizer with parameter-specific learning rates,
+  which are adapted relative to how frequently a parameter gets
+  updated during training. The more updates a parameter receives,
+  the smaller the updates.
+
+  Initialization:
+
+  $$accum_g_0 := initial_accumulator_value$$
+
+  $$t := t + 1$$
+  $$accum_g_t := accum_g_{t-1} + g * g$$
+  $$theta_t := theta_{t-1} - lr * g / (\sqrt{accum_g_t} + \epsilon)$$
+
+  References
+    See [paper]
+      (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+    or this
+      [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               initial_accumulator_value=0.1,
+               epsilon=1e-7,
+               name='Adagrad'):
+    """Construct a new Adagrad optimizer.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value.  The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      epsilon: A floating point value.
+        Starting value for the accumulators, must be positive.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adagrad".
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate` can be a callable that
+    takes no arguments and returns the actual value to use. This can be useful
+    for changing these values across different invocations of optimizer
+    functions.
+    @end_compatibility
+    """
+    if initial_accumulator_value <= 0.0:
+      raise ValueError('initial_accumulator_value must be positive: %s' %
+                       initial_accumulator_value)
+    if epsilon < 1e-7:
+      raise ValueError('epsilon must be larger than 1e-7: %s' % epsilon)
+    super(Adagrad, self).__init__(name)
+    self._set_hyper('learning_rate', learning_rate)
+    self._initial_accumulator_value = initial_accumulator_value
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      dtype = var.dtype.base_dtype
+      init = init_ops.constant_initializer(
+          self._initial_accumulator_value, dtype=dtype)
+      self.add_slot(var, 'accumulator', init)
+
+  def _init_constant_op(self, v, dtype):
+    def init():
+      # Use a Tensor instead of initializer if variable does not have
+      # static shape.
+      init_constant = gen_array_ops.fill(array_ops.shape(v),
+                                         self._initial_accumulator_value)
+      return math_ops.cast(init_constant, dtype)
+    return init
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    learning_rate = math_ops.cast(self._get_hyper('learning_rate'), var_dtype)
+    epsilon = math_ops.cast(self._get_hyper('epsilon'), var_dtype)
+    acc = self.get_slot(var, 'accumulator')
+
+    acc_t = state_ops.assign_add(
+        acc, math_ops.square(grad), use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, learning_rate * grad / (math_ops.sqrt(acc_t) + epsilon))
+    return var_update
+
+  def _resource_apply_sparse(self, grad, var, indices):
+
+    def _resource_scatter_add(x, i, v):
+      with ops.control_dependencies(
+          [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+        return x.value()
+
+    var_dtype = var.dtype.base_dtype
+    learning_rate = math_ops.cast(self._get_hyper('learning_rate'), var_dtype)
+    epsilon = math_ops.cast(self._get_hyper('epsilon'), var_dtype)
+    acc = self.get_slot(var, 'accumulator')
+
+    acc_t = _resource_scatter_add(acc, indices, math_ops.square(grad))
+    acc_t_slice = array_ops.gather(acc_t, indices)
+    var_update = _resource_scatter_add(
+        var, indices,
+        -learning_rate * grad / (math_ops.sqrt(acc_t_slice) + epsilon))
+    return var_update
+
+  def get_config(self):
+    config = super(Adagrad, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'initial_accumulator_value': self._initial_accumulator_value,
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d0f55c7d7a026ddb16e411199089ea2262408af
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -0,0 +1,349 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adagrad_update_numpy(param, accum, g_t, lr=0.001, epsilon=1e-7):
+  accum_t = accum + g_t * g_t
+  param_t = param - lr * g_t / (np.sqrt(accum_t) + epsilon)
+  return param_t, accum_t
+
+
+def sparse_adagrad_update_numpy(param,
+                                accum,
+                                gindexs,
+                                gvalues,
+                                lr=0.001,
+                                epsilon=1e-7):
+  accum_t = copy.deepcopy(accum)
+  param_t = copy.deepcopy(param)
+  # first loop accumulates repeated indices if necessary.
+  for i in range(len(gindexs)):
+    gindex = gindexs[i]
+    gvalue = gvalues[i]
+    accum_t[gindex] = accum_t[gindex] + gvalue * gvalue
+  for i in range(len(gindexs)):
+    gindex = gindexs[i]
+    gvalue = gvalues[i]
+    param_t[gindex] = param_t[gindex] - lr * gvalue / (
+        np.sqrt(accum_t[gindex]) + epsilon)
+  return param_t, accum_t
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_callable_params=False):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 3.0
+        if not use_callable_params:
+          learning_rate = learning_rate()
+
+        ada_opt = adagrad.Adagrad(learning_rate)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, 3.0)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, 3.0)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasic(self):
+    self.doTestBasic()
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_callable_params=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = constant_op.constant(3.0)
+        ada_opt = adagrad.Adagrad(learning_rate)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, learning_rate)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, learning_rate)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        learning_rate = 3.0
+        ada_opt = adagrad.Adagrad(learning_rate)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
+
+        accum0_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+
+          var0_np, accum0_np = sparse_adagrad_update_numpy(
+              var0_np, accum0_np, grads0_np_indices,
+              grads0_np[grads0_np_indices], learning_rate)
+          var1_np, accum1_np = sparse_adagrad_update_numpy(
+              var1_np, accum1_np, grads1_np_indices,
+              grads1_np[grads1_np_indices], learning_rate)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
+
+        repeated_index_update_var = resource_variable_ops.ResourceVariable(
+            var_np, dtype=dtype)
+        aggregated_update_var = resource_variable_ops.ResourceVariable(
+            var_np, dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesByEmbeddingLookUp(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.Adagrad(2.0).minimize(
+            loss_repeated, var_list=[var_repeated])
+        update_op_aggregated = adagrad.Adagrad(2.0).minimize(
+            loss_aggregated, var_list=[var_aggregated])
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half]:
+      with self.cached_session():
+        shape = [1, 6]
+        var0_np = np.array([[
+            0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257, -0.0105945
+        ]],
+                           dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        grads0_np = np.array([[
+            -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, -8.4877e-05,
+            -9.48906e-05
+        ]],
+                             dtype=dtype.as_numpy_dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np), constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.Adagrad(1.0)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        ada_opt = adagrad.Adagrad(learning_rate)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        for _ in range(3):
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, learning_rate)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, learning_rate)
+        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index b05811c419fa8ee435e635feb66408a4cd2ab06a..962680fad68fb024178dfee2e34fbba2f4dd055c 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,8 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
 
 
@@ -40,7 +44,7 @@ class Adam(optimizer_v2.OptimizerV2):
                learning_rate=0.001,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=1e-7,
                name='Adam'):
     r"""Construct a new Adam optimizer.
 
@@ -106,23 +110,62 @@ class Adam(optimizer_v2.OptimizerV2):
       self.add_slot(var, 'v')
 
   def _resource_apply_dense(self, grad, var):
+    grad_dtype = grad.dtype.base_dtype
     m = self.get_slot(var, 'm')
     v = self.get_slot(var, 'v')
-    # TODO(tanzheny): let optimizer have its own step counter, and let
-    # beta1_power and beta2_power depend on it.
+    local_step = math_ops.cast(self.iterations + 1, grad_dtype)
+    beta_1_t = math_ops.cast(self._get_hyper('beta_1'), grad_dtype)
+    beta_2_t = math_ops.cast(self._get_hyper('beta_2'), grad_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
     return training_ops.resource_apply_adam(
         var.handle,
         m.handle,
         v.handle,
-        math_ops.cast(self._get_hyper('beta_1'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('beta_2'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('learning_rate'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('beta_1'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('beta_2'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('epsilon'), grad.dtype.base_dtype),
+        beta_1_power,
+        beta_2_power,
+        math_ops.cast(self._get_hyper('learning_rate'), grad_dtype),
+        beta_1_t,
+        beta_2_t,
+        math_ops.cast(self._get_hyper('epsilon'), grad_dtype),
         grad,
         use_locking=self._use_locking)
 
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_t = math_ops.cast(self._get_hyper('beta_1'), var_dtype)
+    beta_2_t = math_ops.cast(self._get_hyper('beta_2'), var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    lr_t = math_ops.cast(self._get_hyper('learning_rate'), var_dtype)
+    epsilon_t = math_ops.cast(self._get_hyper('epsilon'), var_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
+
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
+
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
   def get_config(self):
     config = super(Adam, self).get_config()
     config.update({
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..46a45af224b9a3f5de3e4e0022fa85a89fce01d9
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -0,0 +1,303 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-7):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype)
+  beta_2_power = math_ops.pow(beta_2_t, local_step)
+  return (beta_1_power, beta_2_power)
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adam.Adam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.Adam(3.0)
+        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.Adam().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.Adam().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            self.evaluate(repeated_index_update_var))
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              self.evaluate(repeated_index_update_var))
+
+  def doTestBasic(self, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = adam.Adam(learning_rate=learning_rate)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic()
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_callable_params=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.Adam(1.)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, hyper variables, and two unique slot
+      # variables for v1 and v2 respectively.
+      self.assertEqual(9, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..7530e629d16c3b613479a936074920fb5e0f25ed
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -0,0 +1,149 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adamax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class Adamax(adam.Adam):
+  """Optimizer that implements the Adamax algorithm.
+
+  It is a variant of Adam based on the infinity norm.
+  Default parameters follow those provided in the paper.
+  Adamax is sometimes superior to adam, specially in models with embeddings.
+
+  References
+    see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+      ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               name='Adamax'):
+    """Construct a new Adamax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    Contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior is contrast to the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adamax".
+    """
+    # pylint: disable=useless-super-delegation
+    super(Adamax, self).__init__(learning_rate, beta_1, beta_2, epsilon, name)
+    # pylint: enable=useless-super-delegation
+
+  def _resource_apply_dense(self, grad, var):
+    grad_dtype = grad.dtype.base_dtype
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    local_step = math_ops.cast(self.iterations + 1, grad_dtype)
+    beta_1_t = math_ops.cast(self._get_hyper('beta_1'), grad_dtype)
+    beta_2_t = math_ops.cast(self._get_hyper('beta_2'), grad_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    return training_ops.resource_apply_ada_max(
+        var.handle,
+        m.handle,
+        v.handle,
+        beta_1_power,
+        math_ops.cast(self._get_hyper('learning_rate'), grad_dtype),
+        beta_1_t,
+        beta_2_t,
+        math_ops.cast(self._get_hyper('epsilon'), grad_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    grad_dtype = grad.dtype.base_dtype
+
+    local_step = math_ops.cast(self.iterations + 1, grad_dtype)
+    beta_1_t = math_ops.cast(self._get_hyper('beta_1'), grad_dtype)
+    beta_2_t = math_ops.cast(self._get_hyper('beta_2'), grad_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    lr_t = math_ops.cast(self._get_hyper('learning_rate'), grad_dtype)
+    epsilon_t = math_ops.cast(self._get_hyper('epsilon'), grad_dtype)
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta_1_t + grad * (1 - beta_1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = self._resource_scatter_update(m, indices, m_t_slice)
+
+    # u_t = max(beta2 * u, abs(g_t))
+    v = self.get_slot(var, 'v')
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta_2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = self._resource_scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
+    var_slice = -lr_t / (1 - beta_1_power) * (
+        m_t_slice / (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = self._resource_scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax_test.py b/tensorflow/python/keras/optimizer_v2/adamax_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6b45ccbe95ec3899c93e06551068e7f07363080
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adamax_test.py
@@ -0,0 +1,309 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adamax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adamax
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - ((alpha / (1 - beta1**t)) *
+                                    (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  return beta_1_power
+
+
+class AdamaxOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)  # pylint: disable=cell-var-from-loop
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adamax.Adamax()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adamax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.Adamax(3.0)
+        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.Adamax().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.Adamax().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.Adamax()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adamax
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          beta_1_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
+                                             rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
+                                             rtol=1e-2)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.Adamax(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adamax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.Adamax()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adamax1 and Adamax2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.Adamax(1.)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, hyper variables, and two unique slot
+      # variables for v1 and v2 respectively.
+      self.assertEqual(9, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..2faf65eab3dcb95e8ae726b693f171f8c244ef7c
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ftrl-proximal for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class Ftrl(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the FTRL algorithm.
+
+  See this [paper](
+  https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
+  This version has support for both online L2 (the L2 penalty given in the paper
+  above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
+  loss function).
+  """
+
+  def __init__(self,
+               learning_rate,
+               learning_rate_power=-0.5,
+               initial_accumulator_value=0.1,
+               l1_regularization_strength=0.0,
+               l2_regularization_strength=0.0,
+               name='Ftrl',
+               l2_shrinkage_regularization_strength=0.0):
+    r"""Construct a new FTRL optimizer.
+
+    Args:
+      learning_rate: A float value or a constant float `Tensor`.
+      learning_rate_power: A float value, must be less or equal to zero.
+        Controls how the learning rate decreases during training. Use zero for
+        a fixed learning rate.
+      initial_accumulator_value: The starting value for accumulators.
+        Only zero or positive values are allowed.
+      l1_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      l2_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Ftrl".
+      l2_shrinkage_regularization_strength: A float value, must be greater than
+        or equal to zero. This differs from L2 above in that the L2 above is a
+        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
+        The FTRL formulation can be written as:
+        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
+        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
+        function w.r.t. the weights w.
+        Specifically, in the absence of L1 regularization, it is equivalent to
+        the following update rule:
+        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
+                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
+        where lr_t is the learning rate at t.
+        When input is sparse shrinkage will only happen on the active weights.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+
+    References
+      See [paper]
+        (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+    """
+    super(Ftrl, self).__init__(name)
+
+    if initial_accumulator_value < 0.0:
+      raise ValueError(
+          'initial_accumulator_value %f needs to be positive or zero' %
+          initial_accumulator_value)
+    if learning_rate_power > 0.0:
+      raise ValueError('learning_rate_power %f needs to be negative or zero' %
+                       learning_rate_power)
+    if l1_regularization_strength < 0.0:
+      raise ValueError(
+          'l1_regularization_strength %f needs to be positive or zero' %
+          l1_regularization_strength)
+    if l2_regularization_strength < 0.0:
+      raise ValueError(
+          'l2_regularization_strength %f needs to be positive or zero' %
+          l2_regularization_strength)
+    if l2_shrinkage_regularization_strength < 0.0:
+      raise ValueError(
+          'l2_shrinkage_regularization_strength %f needs to be positive'
+          ' or zero' % l2_shrinkage_regularization_strength)
+
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate_power', learning_rate_power)
+    self._set_hyper('l1_regularization_strength', l1_regularization_strength)
+    self._set_hyper('l2_regularization_strength', l2_regularization_strength)
+    self._initial_accumulator_value = initial_accumulator_value
+    self._l2_shrinkage_regularization_strength = (
+        l2_shrinkage_regularization_strength)
+
+  def _create_slots(self, var_list):
+    # Create the "accum" and "linear" slots.
+    for var in var_list:
+      dtype = var.dtype.base_dtype
+      init = init_ops.constant_initializer(
+          self._initial_accumulator_value, dtype=dtype)
+      self.add_slot(var, 'accumulator', init)
+      self.add_slot(var, 'linear')
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    learning_rate = math_ops.cast(self._get_hyper('learning_rate'), var_dtype)
+    learning_rate_power = math_ops.cast(
+        self._get_hyper('learning_rate_power'), var_dtype)
+    l1_regularization_strength = math_ops.cast(
+        self._get_hyper('l1_regularization_strength'), var_dtype)
+    l2_regularization_strength = math_ops.cast(
+        self._get_hyper('l2_regularization_strength'), var_dtype)
+    accum = self.get_slot(var, 'accumulator')
+    linear = self.get_slot(var, 'linear')
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          learning_rate,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          learning_rate_power,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          learning_rate,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          math_ops.cast(self._l2_shrinkage_regularization_strength, var_dtype),
+          learning_rate_power,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    learning_rate = math_ops.cast(self._get_hyper('learning_rate'), var_dtype)
+    learning_rate_power = math_ops.cast(
+        self._get_hyper('learning_rate_power'), var_dtype)
+    l1_regularization_strength = math_ops.cast(
+        self._get_hyper('l1_regularization_strength'), var_dtype)
+    l2_regularization_strength = math_ops.cast(
+        self._get_hyper('l2_regularization_strength'), var_dtype)
+    accum = self.get_slot(var, 'accumulator')
+    linear = self.get_slot(var, 'linear')
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_sparse_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          learning_rate,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          learning_rate_power,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          learning_rate,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          math_ops.cast(self._l2_shrinkage_regularization_strength, var_dtype),
+          learning_rate_power,
+          use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(Ftrl, self).get_config()
+    config.update({
+        'learning_rate':
+            self._serialize_hyperparameter('learning_rate'),
+        'initial_accumulator_value':
+            self._initial_accumulator_value,
+        'learning_rate_power':
+            self._serialize_hyperparameter('learning_rate_power'),
+        'l1_regularization_strength':
+            self._serializer_hyperparameter('l1_regularization_strength'),
+        'l2_regularization_strength':
+            self._serializer_hyperparameter('l2_regularization_strength'),
+        'l2_shrinkage_regularization_strength':
+            self._l2_shrinkage_regularization_strength,
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl_test.py b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c14cf75c26904e89b057e662cd97a112e3e67767
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
@@ -0,0 +1,426 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for Ftrl operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import ftrl
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import gradient_descent
+
+
+class FtrlOptimizerTest(test.TestCase):
+
+  def doTestFtrlwithoutRegularization(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([0.0, 0.0], dtype=dtype)
+          var1 = variables.Variable([0.0, 0.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.0,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllClose([0.0, 0.0], v0_val)
+        self.assertAllClose([0.0, 0.0], v1_val)
+
+        # Run 3 steps FTRL
+        for _ in range(3):
+          update.run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.60260963, -4.29698515]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.28432083, -0.56694895]), v1_val)
+
+  def testFtrlWithoutRegularization(self):
+    self.doTestFtrlwithoutRegularization(use_resource=False)
+
+  def testResourceFtrlWithoutRegularization(self):
+    self.doTestFtrlwithoutRegularization(use_resource=True)
+
+  def testFtrlwithoutRegularization2(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.0,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 3 steps FTRL
+        for _ in range(3):
+          update.run()
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.55607247, -3.98729396]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.28232238, -0.56096673]), v1_val)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  def testFtrlWithL1(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-7.66718769, -10.91273689]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.93460727, -1.86147261]), v1_val)
+
+  def testFtrlWithL1_L2(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.24059935, -0.46829352]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02406147, -0.04830509]), v1_val)
+
+  def testFtrlWithL1_L2_L2Shrinkage(self):
+    """Test the new FTRL op with support for l2 shrinkage.
+
+    The addition of this parameter which places a constant pressure on weights
+    towards the origin causes the gradient descent trajectory to differ. The
+    weights will tend to have smaller magnitudes with this parameter set.
+    """
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.22578995, -0.44345796]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.14378493, -0.13229476]), v1_val)
+
+  def testFtrlWithL1_L2_L2ShrinkageSparse(self):
+    """Tests the new FTRL op with support for l2 shrinkage on sparse grads."""
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[4.0], [3.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.02], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
+        self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([[-0.22578995], [2.]], v0_val)
+        self.assertAllCloseAccordingToType([[4.], [-0.13229476]], v1_val)
+
+  def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
+    """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([1.0, 2.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.1, 0.2], dtype=dtype)
+
+        opt0 = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        opt1 = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0)
+        update0 = opt0.apply_gradients([(grads0, var0)])
+        update1 = opt1.apply_gradients([(grads1, var1)])
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update0.run()
+          update1.run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        # var0 is experiencing L2 shrinkage so it should be smaller than var1
+        # in magnitude.
+        self.assertTrue((v0_val**2 < v1_val**2).all())
+        accum0 = sess.run(opt0.get_slot(var0, "accumulator"))
+        accum1 = sess.run(opt1.get_slot(var1, "accumulator"))
+        # L2 shrinkage should not change how we update grad accumulator.
+        self.assertAllCloseAccordingToType(accum0, accum1)
+
+  def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
+    if is_sparse:
+      var0 = variables.Variable([[0.0], [0.0]], dtype=dtype)
+      var1 = variables.Variable([[0.0], [0.0]], dtype=dtype)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+          constant_op.constant([0]), constant_op.constant([2, 1]))
+      grads1 = ops.IndexedSlices(
+          constant_op.constant([0.02], shape=[1, 1], dtype=dtype),
+          constant_op.constant([1]), constant_op.constant([2, 1]))
+    else:
+      var0 = variables.Variable([0.0, 0.0], dtype=dtype)
+      var1 = variables.Variable([0.0, 0.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    variables.global_variables_initializer().run()
+
+    sess = ops.get_default_session()
+    v0_val, v1_val = sess.run([var0, var1])
+    if is_sparse:
+      self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
+      self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
+    else:
+      self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
+      self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)
+
+    # Run Ftrl for a few steps
+    for _ in range(steps):
+      update.run()
+
+    v0_val, v1_val = sess.run([var0, var1])
+    return v0_val, v1_val
+
+  # When variables are initialized with Zero, FTRL-Proximal has two properties:
+  # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
+  # with GradientDescent.
+  # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is identical
+  # with Adagrad.
+  # So, basing on these two properties, we test if our implementation of
+  # FTRL-Proximal performs same updates as Adagrad or GradientDescent.
+  def testEquivAdagradwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Adagrad learning rate
+                learning_rate_power=-0.5,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1), dtype)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  def testEquivSparseAdagradwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Adagrad learning rate
+                learning_rate_power=-0.5,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype,
+            is_sparse=True)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1),
+            dtype,
+            is_sparse=True)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  def testEquivSparseGradientDescentwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Fixed learning rate
+                learning_rate_power=-0.0,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype,
+            is_sparse=True)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            gradient_descent.GradientDescentOptimizer(3.0),
+            dtype,
+            is_sparse=True)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  def testEquivGradientDescentwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Fixed learning rate
+                learning_rate_power=-0.0,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            gradient_descent.GradientDescentOptimizer(3.0), dtype)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index 3ee1982af95c64c729f87eff0189c29208a2400e..90106c941cca5b1a22d81e5492b750deafea33b0 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -25,12 +25,21 @@ from tensorflow.python.training import training_ops
 
 
 class SGD(optimizer_v2.OptimizerV2):
-  """Stochastic gradient descent optimizer.
+  """Stochastic gradient descent and momentum optimizer.
 
   Computes:
+  ```
+  theta(t+1) = theta(t) - learning_rate * gradient
+  gradient is evaluated at theta(t).
+  ```
 
+  or Computes (if `use_nesterov = False`):
   ```
-  variable -= learning_rate * gradient
+  v(t+1) = momentum * v(t) - learning_rate * gradient
+  theta(t+1) = theta(t) + v(t+1)
+  if `nesterov` is False, gradient is evaluated at theta(t).
+  if `nesterov` is True, gradient is evaluated at theta(t) + momentum * v(t),
+    and the variables always store theta + m v instead of theta
   ```
 
   Some of the args below are hyperparameters, where a hyperparameter is
@@ -44,49 +53,90 @@ class SGD(optimizer_v2.OptimizerV2):
   changing these values across different invocations of optimizer functions.
   @end_compatibility
 
-  Arguments:
-      learning_rate: float hyperparameter >= 0. Learning rate.
-      name: Optional name prefix for the operations created when applying
-        gradients.  Defaults to 'SGD'.
+  # References
+      nesterov = True, See [Sutskever et al., 2013](
+        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
   """
 
   def __init__(self,
                learning_rate=0.001,
-               momentum=None,
+               momentum=0.0,
                nesterov=False,
                name="SGD"):
+    """Construct a new Stochastic Gradient Descent or Momentum optimizer.
+
+    Arguments:
+      learning_rate: float hyperparameter >= 0. Learning rate.
+      momentum: float hyperparameter >= 0 that accelerates SGD in the relevant
+        direction and dampens oscillations.
+      nesterov: boolean. Whether to apply Nesterov momentum.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to 'SGD'.
+    """
     super(SGD, self).__init__(name)
     self._set_hyper("learning_rate", learning_rate)
 
-  def _apply_dense(self, grad, var):
-    return training_ops.apply_gradient_descent(
-        var,
-        math_ops.cast(self._get_hyper("learning_rate"), var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking).op
+    self._momentum = False
+    if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
+      self._momentum = True
+    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
+      raise ValueError("`momentum` must be between [0, 1].")
+    self._set_hyper("momentum", momentum)
+
+    self._nesterov = nesterov
+
+  def _create_slots(self, var_list):
+    if self._momentum:
+      for var in var_list:
+        self.add_slot(var, "momentum")
 
   def _resource_apply_dense(self, grad, var):
-    return training_ops.resource_apply_gradient_descent(
-        var.handle,
-        math_ops.cast(self._get_hyper("learning_rate"), var.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking)
+    learning_rate = self._get_hyper("learning_rate")
+    if self._momentum:
+      momentum_var = self.get_slot(var, "momentum")
+      return training_ops.resource_apply_momentum(
+          var.handle,
+          momentum_var.handle,
+          math_ops.cast(learning_rate, grad.dtype.base_dtype),
+          grad,
+          math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
+          use_locking=self._use_locking,
+          use_nesterov=self._nesterov)
+    else:
+      return training_ops.resource_apply_gradient_descent(
+          var.handle,
+          math_ops.cast(learning_rate, grad.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
 
   def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
-    return resource_variable_ops.resource_scatter_add(
-        var.handle, indices, -grad * math_ops.cast(
-            self._get_hyper("learning_rate"), var.dtype.base_dtype))
+    if self._momentum:
+      return super(SGD, self)._resource_apply_sparse_duplicate_indices(
+          grad, var, indices)
+    else:
+      return resource_variable_ops.resource_scatter_add(
+          var.handle, indices, -grad * math_ops.cast(
+              self._get_hyper("learning_rate"), grad.dtype.base_dtype))
 
-  def _apply_sparse_duplicate_indices(self, grad, var):
-    delta = ops.IndexedSlices(
-        grad.values * math_ops.cast(
-            self._get_hyper("learning_rate"), var.dtype.base_dtype),
-        grad.indices, grad.dense_shape)
-    return var.scatter_sub(delta, use_locking=self._use_locking)
+  def _resource_apply_sparse(self, grad, var, indices):
+    # This method is only needed for momentum optimization.
+    learning_rate = self._get_hyper("learning_rate")
+    momentum_var = self.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_momentum(
+        var.handle,
+        momentum_var.handle,
+        math_ops.cast(learning_rate, grad.dtype.base_dtype),
+        grad,
+        indices,
+        math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._nesterov)
 
   def get_config(self):
     config = super(SGD, self).get_config()
     config.update({
         "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "momentum": self._serialize_hyperparameter("momentum"),
+        "nesterov": self._nesterov,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
index 3fb15c51d04b8b833701b231a37074048f80c47a..fa7cca142006e0d60474e34a9fc765ce9af97a1b 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -18,46 +18,28 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
 class GradientDescentOptimizerTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes
   def testBasic(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        optimizer = gradient_descent.SGD(3.0)
-        sgd_op = optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        self.assertEqual(0, len(optimizer.variables()))
-
-  def testBasicResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
@@ -65,19 +47,18 @@ class GradientDescentOptimizerTest(test.TestCase):
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
         grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
         sgd = gradient_descent.SGD(3.0)
+        # self.assertFalse(sgd._initial_decay)
         sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_in_graph_and_eager_modes
   def testBasicCallableParams(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -88,44 +69,34 @@ class GradientDescentOptimizerTest(test.TestCase):
         lr = lambda: 3.0
         sgd = gradient_descent.SGD(lr)
         sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        # TODO(apassos) calling initialize_resources on all resources here
-        # doesn't work because the sessions and graph are reused across unit
-        # tests and this would mean trying to reinitialize variables. Figure out
-        # a long-term solution for this.
-        resources.initialize_resources([var0, var1, sgd.iteration]).run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_in_graph_and_eager_modes
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(var0, x) + var1
-        loss = pred * pred
+        loss = lambda: math_ops.matmul(var0, x) + var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
         sgd = gradient_descent.SGD(1.0)
         sgd_op = sgd.minimize(loss, [var0, var1])
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
-        np_grad = 2 * np_pred
-        self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0 - 4.0, 2.0 - 5.0]],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1))
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -137,18 +108,15 @@ class GradientDescentOptimizerTest(test.TestCase):
         pred += var1
         loss = pred * pred
         sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -160,17 +128,14 @@ class GradientDescentOptimizerTest(test.TestCase):
         lrate = constant_op.constant(3.0)
         sgd_op = gradient_descent.SGD(lrate).apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -179,31 +144,9 @@ class GradientDescentOptimizerTest(test.TestCase):
         values = [1.0, 3.0]
         vars_ = [variables.Variable([v], dtype=dtype) for v in values]
         grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], grad.eval())
-
-  def testWithGlobalStep(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        sgd = gradient_descent.SGD(3.0)
-        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params and optimizer iterations.
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        self.assertAllCloseAccordingToType(1, sgd.iteration.eval())
+          self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
 
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -218,17 +161,14 @@ class GradientDescentOptimizerTest(test.TestCase):
             constant_op.constant([1]), constant_op.constant([2, 1]))
         sgd_op = gradient_descent.SGD(3.0).apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
         self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testCapturingInDefunWhileExecutingEagerly(self):
     with context.eager_mode():
@@ -251,5 +191,424 @@ class GradientDescentOptimizerTest(test.TestCase):
       self.assertEqual(float(step()), -1.0)
 
 
+class MomentumOptimizerTest(test.TestCase):
+
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    var = var + accum * lr * momentum
+    accum = accum * momentum + g
+    var = var - lr * accum
+    var = var - accum * lr * momentum
+    return var, accum
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                      dtype=dtype,
+                                                      name="var0")
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                      dtype=dtype,
+                                                      name="var1")
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        learning_rate = 2.0
+        momentum = 0.9
+        mom_opt = gradient_descent.SGD(
+            learning_rate=learning_rate, momentum=momentum)
+        # self.assertFalse(mom_opt._initial_decay)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+
+        # Check we have slots
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(mom_update)
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
+        # Step 2: the momentum accumulators contain the previous update.
+        self.evaluate(mom_update)
+        if context.executing_eagerly():
+          mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
+
+  def testNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                      dtype=dtype,
+                                                      name="var0")
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                      dtype=dtype,
+                                                      name="var1")
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = gradient_descent.SGD(
+            learning_rate=2.0, momentum=0.9, nesterov=True)
+        opt_op = mom_op.minimize(loss, [var0, var1])
+        variables.global_variables_initializer().run()
+        for _ in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
+
+  def testSparseNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, dtype=dtype, name="var0")
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, dtype=dtype, name="var1")
+        mom_op = gradient_descent.SGD(
+            learning_rate=2.0, momentum=0.9, nesterov=True)
+        x_feed = array_ops.placeholder(dtype)
+        y_feed = ops.IndexedSlices(x_feed, constant_op.constant([0, 1]),
+                                   constant_op.constant([2]))
+        grads_and_vars = [(y_feed, var0),
+                          (constant_op.constant([3.0, 3.0], dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        return pred * pred
+
+      # pylint: enable=cell-var-from-loop
+
+      opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss, [var0])
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndicesForEmbeddingLookup(self):
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss, [var0])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
+
+  def testTensorLearningRateAndMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = gradient_descent.SGD(
+            learning_rate=constant_op.constant(2.0),
+            momentum=constant_op.constant(0.9))
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Check we have slots
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
+
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([[.1, .1]], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([4, 2]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([[.01, .01], [.01, .01]], dtype=dtype),
+            constant_op.constant([2, 3]), constant_op.constant([4, 2]))
+        mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Check we have slots
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([0, 0], self.evaluate(var0)[0])
+        self.assertAllClose([0, 0], self.evaluate(var0)[1])
+        self.assertAllClose([1, 1], self.evaluate(var1)[2])
+
+        # Step 1: the momentum accumulators are 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0, 0]),
+            self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([.1, .1]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]),
+            self.evaluate(slot1)[2])
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0, 0]),
+            self.evaluate(var0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
+            self.evaluate(var1)[2])
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1)[2])
+        # Check that the parameters have been updated.
+        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]),
+            self.evaluate(var1)[2])
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+        mom_update1 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        mom_update2 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update1.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
+        # Step 2: the second momentum accumulators contain the previous update.
+        mom_update2.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConfig(self):
+    with self.cached_session():
+      opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
+      config = opt.get_config()
+      opt2 = gradient_descent.SGD.from_config(config)
+      # assert both are equal float values.
+      self.assertEqual(
+          opt._get_hyper("learning_rate"), opt2._get_hyper("learning_rate"))
+      self.assertEqual(opt._get_hyper("momentum"), opt2._get_hyper("momentum"))
+      # self.assertEqual(opt._get_hyper("decay"), opt2._get_hyper("decay"))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
+      loss = lambda: 3 * var0
+      # learning rate variable created when calling minimize.
+      opt.minimize(loss, [var0])
+      self.evaluate(variables.global_variables_initializer())
+      config = opt.get_config()
+      opt3 = gradient_descent.SGD.from_config(config)
+      self.assertEqual(
+          self.evaluate(opt._get_hyper("learning_rate")),
+          opt3._get_hyper("learning_rate"))
+      self.assertEqual(
+          self.evaluate(opt._get_hyper("momentum")),
+          opt3._get_hyper("momentum"))
+      # self.assertEqual(
+      #     self.evaluate(opt._get_hyper("decay")), opt3._get_hyper("decay"))
+      self.assertTrue(opt3._nesterov)
+
+  def testNesterovWithoutMomentum(self):
+    with self.assertRaisesRegexp(ValueError, "must be between"):
+      gradient_descent.SGD(learning_rate=1.0, momentum=2.0)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
new file mode 100644
index 0000000000000000000000000000000000000000..4be421a73fad4129d3f7ef09a4e8db8201e01f36
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -0,0 +1,110 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Nadam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class Nadam(adam.Adam):
+  r"""Optimizer that implements the NAdam algorithm.
+
+  Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
+  Nesterov momentum.
+
+  Initialization:
+
+  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$t := 0 \text{(Initialize timestep)}$$
+
+  Computes:
+  $$t := t + 1$$
+  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+  $$m_bar_t := beta_1 * v_t + (1 - beta_1) * g$$
+  $$theta_t := theta_{t-1} - lr_t * m_bar_t / (\sqrt{v_t} + \epsilon)$$
+
+  gradient is evaluated at theta(t) + momentum * v(t), and the variables always
+  store theta + beta_1 * m / sqrt(v) instead of theta.
+
+  References
+    See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  """
+
+  def _resource_apply_dense(self, grad, var):
+    grad_dtype = grad.dtype.base_dtype
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    local_step = math_ops.cast(self.iterations + 1, grad_dtype)
+    beta_1_t = math_ops.cast(self._get_hyper('beta_1'), grad_dtype)
+    beta_2_t = math_ops.cast(self._get_hyper('beta_2'), grad_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        beta_1_power,
+        beta_2_power,
+        math_ops.cast(self._get_hyper('learning_rate'), grad_dtype),
+        beta_1_t,
+        beta_2_t,
+        math_ops.cast(self._get_hyper('epsilon'), grad_dtype),
+        grad,
+        use_locking=self._use_locking,
+        use_nesterov=True)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_t = math_ops.cast(self._get_hyper('beta_1'), var_dtype)
+    beta_2_t = math_ops.cast(self._get_hyper('beta_2'), var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    lr_t = math_ops.cast(self._get_hyper('learning_rate'), var_dtype)
+    epsilon_t = math_ops.cast(self._get_hyper('epsilon'), var_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
+      # m_bar = (1 - beta1) * g_t + beta1 * m_t
+      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)
+
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
+
+    v_t_slice = array_ops.gather(v_t, indices)
+    v_sqrt = math_ops.sqrt(v_t_slice)
+    var_update = self._resource_scatter_add(var, indices,
+                                            -lr * m_bar / (v_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cc81b1d118c21f5a161e07164b29b0fd1814f4b
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -0,0 +1,169 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nadam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype)
+  beta_2_power = math_ops.pow(beta_2_t, local_step)
+  return (beta_1_power, beta_2_power)
+
+
+def nadam_update_numpy(param,
+                       g_t,
+                       t,
+                       m,
+                       v,
+                       alpha=0.001,
+                       beta1=0.9,
+                       beta2=0.999,
+                       epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  m_bar = (1 - beta1) * g_t + beta1 * m_t
+
+  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class NadamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    sparse_epsilon = 1e-7
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = nadam.Nadam(epsilon=sparse_epsilon)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, epsilon=sparse_epsilon)
+          var1_np, m1, v1 = nadam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, epsilon=sparse_epsilon)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def doTestBasic(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = nadam.Nadam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index 26e6dc294c0a901d3b0538d22923cd7b5462daeb..fa7cfa5b8a2dd56f0bbbb9105685dac2b9ac7499 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -24,6 +24,7 @@ import abc
 
 import six
 
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -31,10 +32,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras.engine import base_layer
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as optimizer_v1
@@ -324,9 +323,15 @@ class OptimizerV2(optimizer_v1.Optimizer):
                       "_" + var.op.name)
         with ops.name_scope("update" + scope_name), ops.colocate_with(var):
           update_ops.append(update_grad_to_var(grad, var))
-      with ops.colocate_with(self._iterations):
-        update_ops.append(self._iterations.assign_add(1))
-      return control_flow_ops.group(*update_ops)
+      # control dependencies does not work in per replica mode, please change
+      # this once b/118841692 is fixed.
+      # with ops.control_dependencies(update_ops):
+      #   apply_updates = self._iterations.assign_add(1).op
+      apply_updates = merge_update_step(update_ops, self.iterations)
+      return apply_updates
+
+  def get_updates(self, loss, params):
+    return [self.minimize(loss, params)]
 
   def _set_hyper(self, name, value):
     """set hyper `name` to value. value can be callable, tensor, numeric."""
@@ -344,19 +349,41 @@ class OptimizerV2(optimizer_v1.Optimizer):
     value = self._hyper[name]
     return self._call_if_callable(value)
 
+  def __getattribute__(self, name):
+    """Overridden to support hyperparameter access."""
+    try:
+      return super(OptimizerV2, self).__getattribute__(name)
+    except AttributeError as e:
+      # Needed to avoid infinite recursion with __setattr__.
+      if name == "_hyper":
+        raise e
+      # Backwards compatibility with Keras optimizers.
+      if name == "lr":
+        name = "learning_rate"
+      if name in self._hyper:
+        return self._hyper[name]
+      raise e
+
   def __setattr__(self, name, value):
     """Override setattr to support dynamic hyperparameter setting."""
+    # Backwards compatibility with Keras optimizers.
+    if name == "lr":
+      name = "learning_rate"
     if hasattr(self, "_hyper") and name in self._hyper:
       self._set_hyper(name, value)
     else:
       super(OptimizerV2, self).__setattr__(name, value)
 
-  def add_slot(self, var, slot_name):
+  def add_slot(self, var, slot_name, initializer="zeros"):
     var_key = _var_key(var)
     slot_dict = self._slots.setdefault(var_key, {})
     if slot_name not in slot_dict:
       slot_key = _get_slot_key_from_var(var, slot_name)
-      weight = self.add_weight(name=slot_key, shape=var.shape, dtype=var.dtype)
+      weight = self.add_weight(
+          name=slot_key,
+          shape=var.shape,
+          dtype=var.dtype,
+          initializer=initializer)
       slot_dict[slot_name] = weight
       self._weights.append(weight)
 
@@ -372,8 +399,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
       self._iterations = self.add_weight(
           "iter",
           shape=[],
+          dtype=dtypes.int64,
           trainable=False,
-          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+          aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self._weights.append(self._iterations)
     for name, value in self._hyper.items():
       if isinstance(value, ops.Tensor) or callable(value):
         pass
@@ -383,11 +412,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
             shape=[],
             trainable=False,
             initializer=value,
-            aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+            aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+        self._weights.append(self._hyper[name])
     self._prepared = True
 
   @property
-  def iteration(self):
+  def iterations(self):
     if not self._prepared:
       self._prepare()
     return self._iterations
@@ -430,10 +460,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
     value = self._get_hyper(hyperparameter_name)
     if callable(value):
       return value()
-    if isinstance(value, (ops.Tensor, variables.Variable)):
+    if isinstance(value, (ops.Tensor, tf_variables.Variable)):
       return backend.get_value(value)
     return value
 
+  def variables(self):
+    """Returns variables of this Optimizer based on the order created."""
+    return self._weights
+
   @property
   def weights(self):
     """Returns variables of this Optimizer based on the order created."""
@@ -470,15 +504,15 @@ class OptimizerV2(optimizer_v1.Optimizer):
                  dtype=None,
                  initializer="zeros",
                  trainable=None,
-                 synchronization=variables.VariableSynchronization.AUTO,
-                 aggregation=variables.VariableAggregation.NONE):
+                 synchronization=tf_variables.VariableSynchronization.AUTO,
+                 aggregation=tf_variables.VariableAggregation.NONE):
 
     if dtype is None:
       dtype = dtypes.float32
     if isinstance(initializer, six.string_types) or callable(initializer):
       initializer = initializers.get(initializer)
 
-    if synchronization == variables.VariableSynchronization.ON_READ:
+    if synchronization == tf_variables.VariableSynchronization.ON_READ:
       if trainable:
         raise ValueError(
             "Synchronization value can be set to "
@@ -502,6 +536,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
         use_resource=True,
         synchronization=synchronization,
         aggregation=aggregation)
+    backend.track_variable(variable)
 
     return variable
 
@@ -521,7 +556,7 @@ def _filter_grads(grads_and_vars):
   filtered = tuple(filtered)
   if not filtered:
     raise ValueError("No gradients provided for any variable: %s." %
-                     ([v.name for _, v in filtered],))
+                     ([v.name for _, v in grads_and_vars],))
   if vars_with_empty_grads:
     logging.warning(
         ("Gradients does not exist for variables %s when minimizing the loss."),
@@ -529,16 +564,31 @@ def _filter_grads(grads_and_vars):
   return filtered
 
 
+def merge_update_step(update_ops, local_step):
+  """Merge local step counter update from different replicas."""
+
+  def merge_update_step_fn(strategy, update_ops, local_step):
+    merged_ops = []
+    for update_op in update_ops:
+      merged_ops.append(strategy.group(update_op))
+    with ops.control_dependencies(merged_ops):
+      incre_op = local_step.assign_add(1).op
+    return incre_op
+
+  return distribution_strategy_context.get_replica_context().merge_call(
+      merge_update_step_fn, args=(update_ops, local_step))
+
+
 def merge_grads(grads_and_vars):
   """Merge gradients from different replicas."""
 
   def merge_grad_fn(strategy, grads_and_vars):
     reduced_grads = strategy.batch_reduce(
-        variable_scope.VariableAggregation.MEAN, grads_and_vars)
+        ds_reduce_util.ReduceOp.MEAN, grads_and_vars)
     return reduced_grads
 
-  return distribution_strategy_context.get_tower_context().merge_call(
-      merge_grad_fn, grads_and_vars)
+  return distribution_strategy_context.get_replica_context().merge_call(
+      merge_grad_fn, args=(grads_and_vars,))
 
 
 def _var_key(var):
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index e5d1a104ca408dc978a652c1dab0edc95f45735e..305267d73e5d852778333e827d0b1ed089502d02 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
@@ -25,6 +27,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.ops import array_ops
@@ -279,8 +287,8 @@ class OptimizerTest(test.TestCase):
   def testIterationWithoutMinimize(self):
     with self.cached_session():
       sgd = gradient_descent.SGD(3.0)
-      self.evaluate(sgd.iteration.initializer)
-      self.assertEqual(0, self.evaluate(sgd.iteration))
+      self.evaluate(sgd.iterations.initializer)
+      self.assertEqual(0, self.evaluate(sgd.iterations))
 
   @test_util.run_in_graph_and_eager_modes
   def testSerializationWithinDefun(self):
@@ -341,8 +349,8 @@ class OptimizerTest(test.TestCase):
       opt2.set_weights(weights)
       self.evaluate([opt_op_1, opt_op_2])
       self.assertAllClose(self.evaluate(var1), self.evaluate(var2))
-      self.assertEqual(1, self.evaluate(opt1.iteration))
-      self.assertEqual(1, self.evaluate(opt2.iteration))
+      self.assertEqual(1, self.evaluate(opt1.iterations))
+      self.assertEqual(1, self.evaluate(opt2.iterations))
 
       var3 = resource_variable_ops.ResourceVariable([1.0, 2.0, 3.0],
                                                     dtype=dtypes.float32)
@@ -370,7 +378,109 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose(
           self.evaluate([var3, var4]), self.evaluate([var5, var6]))
 
-  def testOptimizerWithFunction(self):
+  @test_util.run_in_graph_and_eager_modes
+  def testGettingHyperParameters(self):
+    opt = adam.Adam(learning_rate=1.0)
+    var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                 dtype=dtypes.float32)
+    loss = lambda: 3 * var
+    opt_op = opt.minimize(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt_op)
+
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(1.0, lr)
+
+    opt.lr = 2.0
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(2.0, lr)
+
+    self.evaluate(opt.lr.assign(3.0))
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(3.0, lr)
+
+    with self.assertRaises(AttributeError):
+      opt.not_an_attr += 3
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOptimizerWithKerasModel(self):
+    a = input_layer.Input(shape=(3,), name='input_a')
+    b = input_layer.Input(shape=(3,), name='input_b')
+
+    dense = core.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = core.Dropout(0.5, name='dropout')(c)
+
+    model = training.Model([a, b], [d, e])
+
+    optimizer = gradient_descent.SGD(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, metrics=['mae'])
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    model.fit([input_a_np, input_b_np], [output_d_np, output_e_np],
+              epochs=1,
+              batch_size=5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOptimizerWithCallbacks(self):
+    input_np = np.random.random((10, 3))
+    output_np = np.random.random((10, 4))
+    a = input_layer.Input(shape=(3,), name='input_a')
+    model = sequential.Sequential()
+    model.add(core.Dense(4, name='dense'))
+    model.add(core.Dropout(0.5, name='dropout'))
+    model(a)
+    optimizer = gradient_descent.SGD(learning_rate=0.1)
+    model.compile(optimizer, loss='mse', metrics=['mae'])
+    # This does not reduce the LR after the first epoch (due to low delta).
+    cbks = [
+        callbacks.ReduceLROnPlateau(
+            monitor='val_loss', factor=0.1, min_delta=0, patience=1, cooldown=5)
+    ]
+    model.fit(
+        input_np,
+        output_np,
+        batch_size=10,
+        validation_data=(input_np, output_np),
+        callbacks=cbks,
+        epochs=5,
+        verbose=0)
+    self.assertAllClose(
+        float(backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+
+    # This should reduce the LR after the first epoch (due to high delta).
+    cbks = [
+        callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.1,
+            min_delta=10,
+            patience=1,
+            cooldown=5)
+    ]
+    model.fit(
+        input_np,
+        output_np,
+        batch_size=10,
+        validation_data=(input_np, output_np),
+        callbacks=cbks,
+        epochs=5,
+        verbose=2)
+    self.assertAllClose(
+        float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
+
+
+# Note: These tests are kept in a separate class to avoid bugs in some
+# distributions of Python that break AutoGraph which is used by tf.function.
+class OptimizerWithFunctionTest(test.TestCase):
+
+  def testBasic(self):
     with context.eager_mode():
       var = resource_variable_ops.ResourceVariable([1.0, 2.0],
                                                    dtype=dtypes.float32)
@@ -382,10 +492,8 @@ class OptimizerTest(test.TestCase):
         opt.minimize(loss, [var])
         return var
 
-      self.assertAllClose([0., 1.], fn())
-      # This is just to test tf.function. The values needs to be updated
-      # when adam updates beta_1_power.
-      self.assertAllClose([-1.343838, -0.343838], fn())
+      self.assertAllClose([0., 1.], fn(), atol=1e-4)
+      self.assertAllClose([-1, 0.], fn(), atol=1e-4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..eae5620349be5576baa0743b0bb8ec24f837bb59
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -0,0 +1,193 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class RMSprop(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the RMSprop algorithm.
+
+  A detailed description of rmsprop.
+
+    - maintain a moving (discounted) average of the square of gradients
+    - divide gradient by the root of this average
+
+  $$mean_square_t = rho * mean_square{t-1} + (1-rho) * gradient ** 2$$
+  $$mom_t = momentum * mom_{t-1} + learning_rate * gradient / \sqrt{ /
+      mean_square_t + \epsilon}$$
+  $$variable_t := variable_{t-1} - mom_t
+
+  This implementation of RMSprop uses plain momentum, not Nesterov momentum.
+
+  The centered version additionally maintains a moving average of the
+  gradients, and uses that average to estimate the variance:
+
+  $$mean_grad_t = rho * mean_grad_{t-1} + (1-rho) * gradient$$
+  $$mean_square_t = rho * mean_square_{t-1} + (1-rho) * gradient ** 2$$
+  $$mom_t = momentum * mom_{t-1} + learning_rate * gradient /
+      sqrt(mean_square_t - mean_grad_t**2 + epsilon)$$
+  $$variable_t := variable_{t-1} - mom_t
+
+  References
+    See ([pdf]
+      http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.9,
+               momentum=0.0,
+               epsilon=1e-7,
+               centered=False,
+               name="RMSprop"):
+    """Construct a new RMSprop optimizer.
+
+    Note that in the dense implementation of this algorithm, variables and their
+    corresponding accumulators (momentum, gradient moving average, square
+    gradient moving average) will be updated even if the gradient is zero
+    (i.e. accumulators will decay, momentum will be applied). The sparse
+    implementation (used when the gradient is an `IndexedSlices` object,
+    typically because of `tf.gather` or an embedding lookup in the forward pass)
+    will not update variable slices or their accumulators unless those slices
+    were used in the forward pass (nor is there an "eventual" correction to
+    account for these omitted updates). This leads to more efficient updates for
+    large embedding lookup tables (where most of the slices are not accessed in
+    a particular graph execution), but differs from the published algorithm.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      rho: Discounting factor for the history/coming gradient
+      momentum: A scalar tensor.
+      epsilon: Small value to avoid zero denominator.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSprop".  @compatibility(eager) When eager
+        execution is enabled, `learning_rate`, `decay`, `momentum`, and
+        `epsilon` can each be a callable that takes no arguments and returns the
+        actual value to use. This can be useful for changing these values across
+        different invocations of optimizer functions. @end_compatibility
+    """
+    super(RMSprop, self).__init__(name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("rho", rho)
+
+    self._momentum = False
+    if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
+      self._momentum = True
+    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
+      raise ValueError("`momentum` must be between [0, 1].")
+    self._set_hyper("momentum", momentum)
+
+    self._set_hyper("epsilon", epsilon)
+    self._centered = centered
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      self.add_slot(var, "rms")
+      self.add_slot(var, "momentum")
+      if self._centered:
+        self.add_slot(var, "mg")
+
+  def _resource_apply_dense(self, grad, var):
+    rms = self.get_slot(var, "rms")
+    mom = self.get_slot(var, "momentum")
+    learning_rate = math_ops.cast(
+        self._get_hyper("learning_rate"), grad.dtype.base_dtype)
+    rho = math_ops.cast(self._get_hyper("rho"), grad.dtype.base_dtype)
+    momentum = math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype)
+    epsilon = math_ops.cast(self._get_hyper("epsilon"), grad.dtype.base_dtype)
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          learning_rate,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          learning_rate,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    rms = self.get_slot(var, "rms")
+    mom = self.get_slot(var, "momentum")
+    learning_rate = math_ops.cast(
+        self._get_hyper("learning_rate"), grad.dtype.base_dtype)
+    rho = math_ops.cast(self._get_hyper("rho"), grad.dtype.base_dtype)
+    momentum = math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype)
+    epsilon = math_ops.cast(self._get_hyper("epsilon"), grad.dtype.base_dtype)
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          learning_rate,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          learning_rate,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(RMSprop, self).get_config()
+    config.update({
+        "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "rho": self._serialize_hyperparameter("rho"),
+        "momentum": self._serialize_hyperparameter("momentum"),
+        "epsilon": self._serialize_hyperparameter("epsilon"),
+        "centered": self._centered,
+    })
+    return config
+
+
+RMSProp = RMSprop
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..62b64d5cf9eb53ea07db4606e8258217c942aac7
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,338 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, rho, momentum, epsilon, centered
+    [0.05, 0.9, 0.0, 1e-3, True],
+    [0.05, 0.9, 0.0, 1e-3, False],
+    [0.1, 0.9, 0.0, 1e-3, True],
+    [0.01, 0.9, 0.0, 1e-5, True],
+    [0.01, 0.9, 0.9, 1e-5, True],
+]
+
+_TESTPARAMS = [
+    [data_type] + values
+    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
+]
+
+
+class RMSpropOptimizerTest(test.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
+                            epsilon, centered):
+    rms_t = rms * rho + (1 - rho) * g * g
+    denom_t = rms_t + epsilon
+    if centered:
+      mg_t = mg * rho + (1 - rho) * g
+      denom_t -= mg_t * mg_t
+    else:
+      mg_t = mg
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, rho, momentum, epsilon, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
+      denom_t = rms_t[gindex] + epsilon
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  def testDense(self):
+    for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS:
+      with test_util.use_gpu():
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np, dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable(var1_np, dtype=dtype)
+        grads0 = constant_op.constant(grads0_np, dtype=dtype)
+        grads1 = constant_op.constant(grads1_np, dtype=dtype)
+        opt = rmsprop.RMSprop(
+            learning_rate=learning_rate,
+            rho=rho,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+
+        if centered:
+          mg0 = opt.get_slot(var0, "mg")
+          mg1 = opt.get_slot(var1, "mg")
+        else:
+          mg0 = None
+          mg1 = None
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 4 steps of RMSprop
+        for _ in range(1, 5):
+          self.evaluate(update)
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate, rho,
+              momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate, rho,
+              momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSprop(
+            learning_rate=1.0,
+            rho=0.0,
+            momentum=0.0,
+            epsilon=0.0,
+            centered=False).minimize(
+                loss, var_list=[var0])
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[0., 1.]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  def testMinimizeSparseResourceVariableCentered(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSprop(
+            learning_rate=1.0,
+            rho=0.0,
+            momentum=0.0,
+            epsilon=1.0,
+            centered=True).minimize(
+                loss, var_list=[var0])
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  def testSparse(self):
+    for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS:
+      with test_util.use_gpu():
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+        grads1_np_indices = np.array([1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+        opt = rmsprop.RMSprop(
+            learning_rate=learning_rate,
+            rho=rho,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+
+        if centered:
+          mg0 = opt.get_slot(var0, "mg")
+          self.assertEqual(mg0 is not None, centered)
+          mg1 = opt.get_slot(var1, "mg")
+          self.assertEqual(mg1 is not None, centered)
+        else:
+          mg0 = None
+          mg1 = None
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 4 steps of RMSprop
+        for _ in range(1, 5):
+          self.evaluate(update)
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+              learning_rate, rho, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+              learning_rate, rho, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testCallableParams(self):
+    with context.eager_mode():
+      for dtype in [dtypes.half, dtypes.float32]:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        learning_rate = lambda: 2.0
+        rho = lambda: 0.9
+        momentum = lambda: 0.0
+        epsilon = lambda: 1.0
+        opt = rmsprop.RMSprop(learning_rate, rho, momentum, epsilon)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the rms accumulators where 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0))
+            ]), self.evaluate(var1))
+        # Step 2: the root mean square accumulators contain the previous update.
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0))
+            ]), self.evaluate(var1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index 715d80a116c0869291d0ce2d7514a31f07114fe3..9c8020dc05abbd86bcaae01dc87b47ed0bd610d6 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -22,7 +22,16 @@ from __future__ import print_function
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
+from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import clip_ops
@@ -795,16 +804,27 @@ def deserialize(config, custom_objects=None):
   Returns:
       A Keras Optimizer instance.
   """
-  all_classes = {
-      'sgd': SGD,
-      'rmsprop': RMSprop,
-      'adagrad': Adagrad,
-      'adadelta': Adadelta,
-      'adam': Adam,
-      'adamax': Adamax,
-      'nadam': Nadam,
-      'tfoptimizer': TFOptimizer,
-  }
+  if tf2.enabled():
+    all_classes = {
+        'adadelta': adadelta_v2.Adadelta,
+        'adagrad': adagrad_v2.Adagrad,
+        'adam': adam_v2.Adam,
+        'adamax': adamax_v2.Adamax,
+        'nadam': nadam_v2.Nadam,
+        'rmsprop': rmsprop_v2.RMSprop,
+        'sgd': gradient_descent_v2.SGD
+    }
+  else:
+    all_classes = {
+        'adadelta': Adadelta,
+        'adagrad': Adagrad,
+        'adam': Adam,
+        'adamax': Adamax,
+        'nadam': Nadam,
+        'rmsprop': RMSprop,
+        'sgd': SGD,
+        'tfoptimizer': TFOptimizer
+    }
   # Make deserialization case-insensitive for built-in optimizers.
   if config['class_name'].lower() in all_classes:
     config['class_name'] = config['class_name'].lower()
@@ -833,17 +853,17 @@ def get(identifier):
   Raises:
       ValueError: If `identifier` cannot be interpreted.
   """
+  if isinstance(identifier, (Optimizer, optimizer_v2.OptimizerV2)):
+    return identifier
   # Wrap TF optimizer instances
-  if isinstance(identifier, tf_optimizer_module.Optimizer):
+  elif isinstance(identifier, tf_optimizer_module.Optimizer):
     opt = TFOptimizer(identifier)
     K.track_tf_optimizer(opt)
     return opt
-  if isinstance(identifier, dict):
+  elif isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
     config = {'class_name': str(identifier), 'config': {}}
     return deserialize(config)
-  if isinstance(identifier, Optimizer):
-    return identifier
   else:
     raise ValueError('Could not interpret optimizer identifier:', identifier)
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 9664f09fff0ad872c40b58e3ff2347a2a595d429..46bb0274c6e754874460584c51ea76722ee69e11 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import os
 import weakref
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -208,5 +211,40 @@ class KerasOptimizersTest(test.TestCase):
       _ = keras.optimizers.Adam(clipnorm=-2.0)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class KerasV2OptimizersTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('adadelta_tf2', 'adadelta', True), ('adadelta_tf1', 'adadelta', False),
+      ('adagrad_tf2', 'adagrad', True), ('adagrad_tf1', 'adagrad', False),
+      ('adam_tf2', 'adam', True), ('adam_tf1', 'adam', False),
+      ('adamax_tf2', 'adamax', True), ('adamax_tf1', 'adamax', False),
+      ('sgd_tf2', 'sgd', True), ('sgd_tf1', 'sgd', False),
+      ('nadam_tf2', 'nadam', True), ('nadam_tf1', 'nadam', False),
+      ('rmsprop_tf2', 'rmsprop', True), ('rmsprop_tf1', 'rmsprop', False))
+  def test_load_from_string(self, optimizer_string, tf2mode):
+    old_mode = os.environ.get('TF2_BEHAVIOR', None)
+    if tf2mode:
+      os.environ['TF2_BEHAVIOR'] = 'enabled'
+    else:
+      if 'TF2_BEHAVIOR' in os.environ:
+        del os.environ['TF2_BEHAVIOR']
+
+    # Sanity check.
+    self.assertEqual(tf2.enabled(), tf2mode)
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_shape=(10,)))
+    model.compile(optimizer_string, 'binary_crossentropy')
+
+    self.assertEqual(optimizer_string,
+                     model.optimizer.__class__.__name__.lower())
+
+    model.fit(np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'))
+
+    if old_mode is not None:
+      os.environ['TF2_BEHAVIOR'] = old_mode
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/regularizers.py b/tensorflow/python/keras/regularizers.py
index 28b6ad4c65a2919323b81c89de6e5a3d4b5d3ff3..cbcdae214f97ef7a8a37468145aeaaa182d11737 100644
--- a/tensorflow/python/keras/regularizers.py
+++ b/tensorflow/python/keras/regularizers.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import six
 
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
@@ -54,12 +55,14 @@ class L1L2(Regularizer):
     self.l2 = K.cast_to_floatx(l2)
 
   def __call__(self, x):
-    regularization = 0.
-    if self.l1:
-      regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
-    if self.l2:
-      regularization += math_ops.reduce_sum(self.l2 * math_ops.square(x))
-    return regularization
+    if self.l1 or self.l2:
+      regularization = ops.convert_to_tensor(0., dtype=K.floatx())
+      if self.l1:
+        regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
+      if self.l2:
+        regularization += math_ops.reduce_sum(self.l2 * math_ops.square(x))
+      return regularization
+    return None
 
   def get_config(self):
     return {'l1': float(self.l1), 'l2': float(self.l2)}
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 2fae094a1ef93e6dd5031a9445377f1a9759be2b..d342131a521a90399090e48cf578f37c2a2e566c 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -149,7 +149,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     np.testing.assert_allclose(output, actual_output, rtol=1e-3)
 
   # test training mode (e.g. useful for dropout tests)
-  model.compile(RMSPropOptimizer(0.01), 'mse')
+  model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index c442b31116091955335423d2e60eaacf464c568e..8939044f71d05d762869d3123eab379362781242 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -23,11 +23,13 @@ from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer
+from tensorflow.python.keras.utils.generic_utils import class_and_config_for_serialized_keras_object
 from tensorflow.python.keras.utils.generic_utils import custom_object_scope
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import get_custom_objects
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_class_and_config
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index 5af82f369115788c263983769334d06ef4bdc35d..375bd9d196c6296e627b968ff2006fd216e3c68e 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -125,22 +125,48 @@ def get_custom_objects():
   return _GLOBAL_CUSTOM_OBJECTS
 
 
+def serialize_keras_class_and_config(cls_name, cls_config):
+  """Returns the serialization of the class with the given config."""
+  return {'class_name': cls_name, 'config': cls_config}
+
+
 @tf_export('keras.utils.serialize_keras_object')
 def serialize_keras_object(instance):
   _, instance = tf_decorator.unwrap(instance)
   if instance is None:
     return None
   if hasattr(instance, 'get_config'):
-    return {
-        'class_name': instance.__class__.__name__,
-        'config': instance.get_config()
-    }
+    return serialize_keras_class_and_config(instance.__class__.__name__,
+                                            instance.get_config())
   if hasattr(instance, '__name__'):
     return instance.__name__
   else:
     raise ValueError('Cannot serialize', instance)
 
 
+def class_and_config_for_serialized_keras_object(
+    config,
+    module_objects=None,
+    custom_objects=None,
+    printable_module_name='object'):
+  """Returns the class name and config for a serialized keras object."""
+  if (not isinstance(config, dict) or 'class_name' not in config or
+      'config' not in config):
+    raise ValueError('Improper config format: ' + str(config))
+
+  class_name = config['class_name']
+  if custom_objects and class_name in custom_objects:
+    cls = custom_objects[class_name]
+  elif class_name in _GLOBAL_CUSTOM_OBJECTS:
+    cls = _GLOBAL_CUSTOM_OBJECTS[class_name]
+  else:
+    module_objects = module_objects or {}
+    cls = module_objects.get(class_name)
+    if cls is None:
+      raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
+  return (cls, config['config'])
+
+
 @tf_export('keras.utils.deserialize_keras_object')
 def deserialize_keras_object(identifier,
                              module_objects=None,
@@ -151,37 +177,28 @@ def deserialize_keras_object(identifier,
   if isinstance(identifier, dict):
     # In this case we are dealing with a Keras config dictionary.
     config = identifier
-    if 'class_name' not in config or 'config' not in config:
-      raise ValueError('Improper config format: ' + str(config))
-    class_name = config['class_name']
-    if custom_objects and class_name in custom_objects:
-      cls = custom_objects[class_name]
-    elif class_name in _GLOBAL_CUSTOM_OBJECTS:
-      cls = _GLOBAL_CUSTOM_OBJECTS[class_name]
-    else:
-      module_objects = module_objects or {}
-      cls = module_objects.get(class_name)
-      if cls is None:
-        raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
+    (cls, cls_config) = class_and_config_for_serialized_keras_object(
+        config, module_objects, custom_objects, printable_module_name)
+
     if hasattr(cls, 'from_config'):
       arg_spec = tf_inspect.getfullargspec(cls.from_config)
       custom_objects = custom_objects or {}
 
       if 'custom_objects' in arg_spec.args:
         return cls.from_config(
-            config['config'],
+            cls_config,
             custom_objects=dict(
                 list(_GLOBAL_CUSTOM_OBJECTS.items()) +
                 list(custom_objects.items())))
       with CustomObjectScope(custom_objects):
-        return cls.from_config(config['config'])
+        return cls.from_config(cls_config)
     else:
       # Then `cls` may be a function returning a class.
       # in this case by convention `config` holds
       # the kwargs of the function.
       custom_objects = custom_objects or {}
       with CustomObjectScope(custom_objects):
-        return cls(**config['config'])
+        return cls(**cls_config)
   elif isinstance(identifier, six.string_types):
     function_name = identifier
     if custom_objects and function_name in custom_objects:
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 81980b95f4e38d0e41bfc0d5e29510fb4efbd0e0..19facca5a6014b5270041a7d738940cdd3fe03c8 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -118,10 +118,13 @@ cuda_py_test(
     size = "small",
     srcs = ["list_ops_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/eager:context",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -267,7 +270,7 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "ctc_loss_op_test",
     size = "small",
     srcs = ["ctc_loss_op_test.py"],
@@ -1053,9 +1056,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "summary_ops_test",
+    name = "summary_v1_ops_test",
     size = "small",
-    srcs = ["summary_ops_test.py"],
+    srcs = ["summary_v1_ops_test.py"],
     additional_deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -1066,9 +1069,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "summary_tensor_op_test",
+    name = "summary_v1_tensor_op_test",
     size = "small",
-    srcs = ["summary_tensor_op_test.py"],
+    srcs = ["summary_v1_tensor_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -1077,7 +1080,7 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:summary_ops",
+        "//tensorflow/python:summary",
     ],
 )
 
@@ -1153,6 +1156,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "unicode_decode_op_test",
+    size = "small",
+    srcs = ["unicode_decode_op_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "unique_op_test",
     size = "small",
@@ -1337,6 +1352,7 @@ cuda_py_test(
         "//tensorflow/python:test_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
     ],
     shard_count = 10,
     tags = [
@@ -1448,6 +1464,7 @@ cuda_py_test(
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:math_ops",
@@ -1735,9 +1752,11 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python:while_v2",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
     grpc_enabled = True,
+    shard_count = 2,
     tags = ["no_windows"],
 )
 
@@ -2321,9 +2340,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "summary_audio_op_test",
+    name = "summary_v1_audio_op_test",
     size = "small",
-    srcs = ["summary_audio_op_test.py"],
+    srcs = ["summary_v1_audio_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
@@ -2334,9 +2353,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "summary_image_op_test",
+    name = "summary_v1_image_op_test",
     size = "small",
-    srcs = ["summary_image_op_test.py"],
+    srcs = ["summary_v1_image_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
@@ -2370,6 +2389,8 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:cond_v2",
+        "//tensorflow/python:while_v2",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
     ],
@@ -2562,6 +2583,8 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = [
+        # TODO(b/118887316): Re-enable this test in Kokoro.
+        "no_oss",
         "optonly",  # times out
     ],
 )
@@ -2611,34 +2634,6 @@ cuda_py_test(
     tags = ["manual"],
 )
 
-cuda_py_test(
-    name = "dct_ops_test",
-    srcs = ["dct_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-)
-
-cuda_py_test(
-    name = "fft_ops_test",
-    size = "medium",
-    srcs = ["fft_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-    shard_count = 4,
-    tags = ["optonly"],
-)
-
 cuda_py_test(
     name = "pooling_ops_3d_test",
     size = "medium",
@@ -2885,7 +2880,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
-    shard_count = 20,
+    shard_count = 30,
 )
 
 cuda_py_test(
@@ -3327,7 +3322,9 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
 )
diff --git a/tensorflow/python/kernel_tests/accumulate_n_test.py b/tensorflow/python/kernel_tests/accumulate_n_test.py
index 7889edc198f48a0a91ad3c3153b0eb1ecbad76b8..c7f11f854d1c4c6d8a8e5c1c32a6355f37ea6d7d 100644
--- a/tensorflow/python/kernel_tests/accumulate_n_test.py
+++ b/tensorflow/python/kernel_tests/accumulate_n_test.py
@@ -65,7 +65,7 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
             for _ in range(0, num_inputs)
         ]
         accum_n = math_ops.accumulate_n(input_vars)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
         self.assertAllEqual(
             np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
@@ -88,13 +88,13 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testZeroArgs(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         tf_val = math_ops.accumulate_n([])
-        tf_val.eval()
+        self.evaluate(tf_val)
 
   def testWrongShape(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index fa370c17b462b899b44a9ec8c5970526222b5eaa..d34a1dc9b299371d46cbe56db4747ec6ba7a53ab 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -37,14 +37,14 @@ class ArgMaxTest(test.TestCase):
     with self.session(use_gpu=use_gpu):
       ans = method(x, axis=axis)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         # Defaults to int64 output.
         self.assertEqual(np.int64, tf_ans.dtype)
         self.assertAllEqual(tf_ans, expected_values)
         self.assertShapeEqual(expected_values, ans)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBothArg(self,
                    method,
@@ -79,7 +79,7 @@ class ArgMaxTest(test.TestCase):
     expected_values = x.argmax()
     with self.session(use_gpu=True):
       ans = math_ops.argmax(x, axis=0, output_type=dtypes.int32)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
       self.assertEqual(np.int32, tf_ans.dtype)
       # The values are equal when comparing int32 to int64 because
       # the values don't have a range that exceeds 32-bit integers.
@@ -87,7 +87,7 @@ class ArgMaxTest(test.TestCase):
     expected_values = x.argmin()
     with self.session(use_gpu=True):
       ans = math_ops.argmin(x, axis=0, output_type=dtypes.int32)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
       self.assertEqual(np.int32, tf_ans.dtype)
       self.assertAllEqual(tf_ans, expected_values)
 
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 9d2c2362457694e45a9656af766b63b61b4e1201..afc158f6975d38ecc5900a1d831fa39998cd2730 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
 import time
 import unittest
 
@@ -24,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -31,6 +33,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -45,24 +48,23 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
 
   def testNonBatchMatrix(self):
     matrix = [[1, 2, 3], [4, 5, 6]]  # Shape (2, 3)
     expected_transposed = [[1, 4], [2, 5], [3, 6]]  # Shape (3, 2)
-    with self.cached_session():
-      transposed = array_ops.matrix_transpose(matrix)
-      self.assertEqual((3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    transposed = array_ops.matrix_transpose(matrix)
+    self.assertEqual((3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testConjugate(self):
     m = [[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, 6 + 6j]]
     expected_transposed = [[1 - 1j, 4 - 4j], [2 - 2j, 5 - 5j], [3 - 3j, 6 - 6j]]
-    with self.cached_session():
-      matrix = ops.convert_to_tensor(m)
-      transposed = array_ops.matrix_transpose(matrix, conjugate=True)
-      self.assertEqual((3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    matrix = ops.convert_to_tensor(m)
+    transposed = array_ops.matrix_transpose(matrix, conjugate=True)
+    self.assertEqual((3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testBatchMatrix(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
@@ -71,43 +73,44 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
     matrix_1_t = [[11, 44], [22, 55], [33, 66]]
     batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
     expected_transposed = [matrix_0_t, matrix_1_t]  # Shape (2, 3, 2)
-    with self.cached_session():
-      transposed = array_ops.matrix_transpose(batch_matrix)
-      self.assertEqual((2, 3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    transposed = array_ops.matrix_transpose(batch_matrix)
+    self.assertEqual((2, 3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testNonBatchMatrixDynamicallyDefined(self):
-    matrix = [[1, 2, 3], [4, 5, 6]]  # Shape (2, 3)
+    # needs explicit `constant` because lists are not automatically
+    # converted to sensors when applying `transpose` below
+    matrix = constant_op.constant([[1, 2, 3], [4, 5, 6]])  # Shape (2, 3)
     expected_transposed = [[1, 4], [2, 5], [3, 6]]  # Shape (3, 2)
-    with self.cached_session():
-      matrix_ph = array_ops.placeholder(dtypes.int32)
-      transposed = array_ops.matrix_transpose(matrix_ph)
-      self.assertAllEqual(
-          expected_transposed, transposed.eval(feed_dict={
-              matrix_ph: matrix
-          }))
+    @def_function.function(input_signature=
+                           [tensor_spec.TensorSpec
+                            (shape=None, dtype=dtypes.int32)])
+    def transpose(matrix):
+      self.assertIs(matrix.shape.ndims, None)
+      return array_ops.matrix_transpose(matrix)
+    self.assertAllEqual(expected_transposed, transpose(matrix))
 
   def testBatchMatrixDynamicallyDefined(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
     matrix_0_t = [[1, 4], [2, 5], [3, 6]]
     matrix_1 = [[11, 22, 33], [44, 55, 66]]
     matrix_1_t = [[11, 44], [22, 55], [33, 66]]
-    batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
+    # needs explicit `constant` because lists are not automatically
+    # converted to sensors when applying `transpose` below
+    batch_matrix = constant_op.constant([matrix_0, matrix_1])  # Shape (2, 2, 3)
     expected_transposed = [matrix_0_t, matrix_1_t]  # Shape (2, 3, 2)
-    with self.cached_session():
-      batch_matrix_ph = array_ops.placeholder(dtypes.int32)
-      transposed = array_ops.matrix_transpose(batch_matrix_ph)
-      self.assertAllEqual(
-          expected_transposed,
-          transposed.eval(feed_dict={
-              batch_matrix_ph: batch_matrix
-          }))
+    @def_function.function(input_signature=
+                           [tensor_spec.TensorSpec
+                            (shape=None, dtype=dtypes.int32)])
+    def transpose(matrix):
+      self.assertIs(matrix.shape.ndims, None)
+      return array_ops.matrix_transpose(matrix)
+    self.assertAllEqual(expected_transposed, transpose(batch_matrix))
 
   def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     vector = [1, 2, 3]
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, "should be a "):
-        array_ops.matrix_transpose(vector)
+    with self.assertRaisesRegexp(ValueError, "should be a "):
+      array_ops.matrix_transpose(vector)
 
 
 class BooleanMaskTest(test_util.TensorFlowTestCase):
@@ -553,7 +556,8 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
   def testInt64GPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
-    with self.session(use_gpu=True, force_gpu=True):
+
+    with test_util.force_gpu():
       x = constant_op.constant([1., 2., 3.])
       begin = constant_op.constant([2], dtype=dtypes.int64)
       end = constant_op.constant([3], dtype=dtypes.int64)
@@ -634,12 +638,21 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       bar2 = constant_op.constant(3)
       _ = checker[..., bar:bar2]
       _ = checker[..., bar]
-      with self.assertRaisesRegexp(
-          TypeError,
-          "Value passed to parameter 'begin' has DataType float32 not in "
-          "list of allowed values"):
-        _ = checker[..., 3.0]
       _ = checker[..., 3]
+      _ = checker[..., 2 ** 64 // 2**63]  # Test longs in Python 2
+
+  def testTensorIndexingTypeError(self):
+    with self.session(use_gpu=True):
+      checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
+      expected = re.escape(array_ops._SLICE_TYPE_ERROR)
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker["foo"]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[constant_op.constant("foo")]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[0.0]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[constant_op.constant(0.0)]
 
   def testExpand(self):
     with self.session(use_gpu=True):
@@ -820,7 +833,7 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       index = constant_op.constant(1, dtype=dtypes.int64)
       b = 2. * a[index]
       grad, = gradients_impl.gradients(b, a)
-      self.assertAllEqual(sess.run(grad), [0., 2., 0.])
+      self.assertAllEqual(self.evaluate(grad), [0., 2., 0.])
 
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
@@ -833,7 +846,7 @@ class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
               math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
               shape=(4, 1, 1)))
       varshape = variables.Variable([6, 4, 4], dtype=dtypes.int32)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0])
       end = constant_op.constant([4, 1, 1])
       strides = constant_op.constant([1, 1, 1])
@@ -846,7 +859,7 @@ class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
           math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
           shape=(4, 1, 1))
       original_shape = constant_op.constant([6, 4, 4], dtype=dtypes.int64)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0], dtype=dtypes.int64)
       end = constant_op.constant([4, 1, 1], dtype=dtypes.int64)
       strides = constant_op.constant([1, 1, 1], dtype=dtypes.int64)
@@ -860,7 +873,7 @@ class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
           math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
           shape=(4, 1, 1))
       original_shape = constant_op.constant([6, 4, 4], dtype=dtypes.int64)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0], dtype=dtypes.int32)
       end = constant_op.constant([4, 1, 1], dtype=dtypes.int64)
       strides = constant_op.constant([1, 1, 1], dtype=dtypes.int64)
@@ -1029,7 +1042,7 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
     v = resource_variable_ops.ResourceVariable(init_val)
     with self.cached_session() as sess:
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
       with self.assertRaises(ValueError):
         sess.run(v[:].assign(too_large_val))
       with self.assertRaises(ValueError):
@@ -1088,7 +1101,6 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[True, False, False, False, False], [True, True, True, False, False],
            [True, True, False, False, False]])
 
-  @test_util.enable_c_shapes
   def testOneDimensionalDtypeWithoutMaxlen(self):
     with self.cached_session():
       # test dtype and default maxlen:
@@ -1099,7 +1111,6 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           res.eval(),
           [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
-  @test_util.enable_c_shapes
   def testOneDimensionalWithoutMaxlen(self):
     with self.cached_session():
       res = array_ops.sequence_mask(
@@ -1111,7 +1122,6 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
            [True, False, False, False],
            [True, True, True, True]])
 
-  @test_util.enable_c_shapes
   def testTwoDimensional(self):
     with self.cached_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
@@ -1178,18 +1188,18 @@ class IdentityTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(x.numpy(), y.numpy())
         self.assertTrue(device in y.device.lower())
 
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         a = constant_op.constant([[2], [3]], dtype=dtypes.float32)
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         b = array_ops.identity(a)
         _test(a, b, "gpu")
-      with ops.device("cpu:0"):
+      with test_util.force_cpu():
         c = array_ops.identity(b)
         _test(b, c, "cpu")
-      with ops.device("cpu:0"):
+      with test_util.force_cpu():
         d = array_ops.identity(c)
         _test(c, d, "cpu")
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         e = array_ops.identity(d)
         _test(d, e, "gpu")
 
@@ -1259,7 +1269,7 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
             initializer=init_ops.constant_initializer(10.0),
             use_resource=use_resource)
         guarantee_a = array_ops.guarantee_const(a)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertEqual(10.0, guarantee_a.eval())
 
   def testResourceRejection(self):
@@ -1269,7 +1279,7 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
           initializer=init_ops.constant_initializer(10.0),
           use_resource=True)
       guarantee_a = array_ops.guarantee_const(a.handle)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
                                                "cannot be a resource variable"):
         guarantee_a.eval()
diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
index 1d82b3d058834c7d56668e975a0969e32283a69b..fefb79799566fe5641bf75954f01b4d0d33a2121 100644
--- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py
+++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
@@ -79,7 +79,8 @@ class AtrousConv2DTest(test.TestCase):
                 y1 = nn_ops.atrous_conv2d(x, f, rate, padding=padding)
                 y2 = nn_ops.conv2d(
                     x, f_up, strides=[1, 1, 1, 1], padding=padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
   def testAtrousSequence(self):
     """Tests optimization of sequence of atrous convolutions.
@@ -131,7 +132,8 @@ class AtrousConv2DTest(test.TestCase):
               y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
               y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
               y2 = array_ops.batch_to_space(y2, crops=pad, block_size=rate)
-              self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-2, atol=1e-2)
+              self.assertAllClose(
+                  y1.eval(), self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
   def testGradient(self):
     with self.session(use_gpu=True):
@@ -193,7 +195,8 @@ class AtrousConv2DTransposeTest(test.TestCase):
                                                     padding)
                 y2 = nn_ops.conv2d_transpose(
                     x, f_up, y_shape, strides=[1, 1, 1, 1], padding=padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
 
 class AtrousDepthwiseConv2DTest(test.TestCase):
@@ -220,7 +223,8 @@ class AtrousDepthwiseConv2DTest(test.TestCase):
                 y1 = nn_impl.depthwise_conv2d(
                     x, f, strides, padding, rate=[rate, rate])
                 y2 = nn_impl.depthwise_conv2d(x, f_up, strides, padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py
index 1e09ba5b65cee3b74d350e0d2433c6a459517e5e..14db06b7837b9e4975a5cf98277214d15d610426 100644
--- a/tensorflow/python/kernel_tests/attention_ops_test.py
+++ b/tensorflow/python/kernel_tests/attention_ops_test.py
@@ -121,8 +121,7 @@ class ExtractGlimpseTest(test.TestCase):
     with self.cached_session():
       result = image_ops.extract_glimpse(empty_image, [1, 1], offsets)
       self.assertAllEqual(
-          np.zeros(
-              (0, 1, 1, 0), dtype=np.float32), result.eval())
+          np.zeros((0, 1, 1, 0), dtype=np.float32), self.evaluate(result))
 
   def testLargeCenterGlimpse(self):
     self._VerifyValues(
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 225c1b35ae5fb9c5f30fa4966d691c6274a2120d..cd330481214900aea7686160450e203c12eb4aee 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -44,13 +44,13 @@ class GPUBinaryOpsTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = sess.run(out)
+      tf_gpu = self.evaluate(out)
 
     with self.cached_session(use_gpu=False) as sess:
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_cpu = sess.run(out)
+      tf_cpu = self.evaluate(out)
 
     self.assertAllClose(tf_cpu, tf_gpu)
 
@@ -96,7 +96,7 @@ class MathBuiltinUnaryTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       inx = ops.convert_to_tensor(x)
       ofunc = tf_func(inx)
-      tf_out = sess.run(ofunc)
+      tf_out = self.evaluate(ofunc)
     self.assertAllClose(np_out, tf_out)
 
   def _inv(self, x):
@@ -148,7 +148,7 @@ class MathBuiltinUnaryTest(test.TestCase):
       iny = ops.convert_to_tensor(y + 0.1)
       ofunc = inx / iny
       out_func2 = math_ops.floor(ofunc)
-      tf_out = sess.run(out_func2)
+      tf_out = self.evaluate(out_func2)
 
     self.assertAllClose(np_out, tf_out)
 
@@ -214,7 +214,7 @@ class BroadcastSimpleTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = out.eval()
+      tf_gpu = self.evaluate(out)
     self.assertAllClose(np_ans, tf_gpu)
     self.assertShapeEqual(np_ans, out)
     # TODO(zhifengc/ke): make gradient checker work on GPU.
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
index 547506d844d3d453f79af895046d51b57721cb73..ad4e87913145a98f7e6c47737b9664f905e39c2c 100644
--- a/tensorflow/python/kernel_tests/batch_gather_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -52,7 +52,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([3, 7])
         np_val = self._buildParams(expected_result, dtype)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
@@ -68,7 +68,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([[3], [15]])
         np_val = self._buildParams(expected_result, dtype)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
@@ -81,7 +81,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         params = constant_op.constant(params_np)
         indices_tf = constant_op.constant(indices)
         gather_t = array_ops.batch_gather(params, indices_tf)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         expected_result = np.array([[[2, 0], [7, 5]], [[10, 8], [11, 15]]])
         np_val = self._buildParams(expected_result, dtype)
         self.assertAllEqual(np_val, gather_val)
diff --git a/tensorflow/python/kernel_tests/batch_matmul_op_test.py b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
index 8f6c089b423632aa9acec746cbdd76cb691e3700..a0ad8151b26e399b0a2bebfe89bca82f19249df3 100644
--- a/tensorflow/python/kernel_tests/batch_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
@@ -86,7 +86,7 @@ class BatchMatmulOpTest(test.TestCase):
     with self.cached_session(use_gpu=is_floating) as sess:
       if static_shape:
         z0 = math_ops.matmul(x, y, adjoint_a=adjoint_a, adjoint_b=adjoint_b)
-        z0_val = z0.eval()
+        z0_val = self.evaluate(z0)
       else:
         x_ph = array_ops.placeholder(x.dtype)
         y_ph = array_ops.placeholder(y.dtype)
diff --git a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
index 742a204883128b2af3abf91b27f089f8b4410e7c..a4b461bc87b3ddd766c762a5bc2d19a46df44c18 100644
--- a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
@@ -91,7 +91,7 @@ class ScatterTest(test.TestCase):
 
       session.run([update0, update1])
 
-      self.assertAllEqual([False, True], var.eval())
+      self.assertAllEqual([False, True], self.evaluate(var))
 
   def testScatterOutOfRange(self):
     params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py
index 92d21462d52f40c22aa60dac1a0c3d6b74ab2f3f..5d7446042e13e1fe406a8f11084ab3f289a74f24 100644
--- a/tensorflow/python/kernel_tests/betainc_op_test.py
+++ b/tensorflow/python/kernel_tests/betainc_op_test.py
@@ -48,7 +48,7 @@ class BetaincTest(test.TestCase):
       tf_x_s = constant_op.constant(x_s, dtype=dtype)
       tf_out_t = math_ops.betainc(tf_a_s, tf_b_s, tf_x_s)
       with self.cached_session():
-        tf_out = tf_out_t.eval()
+        tf_out = self.evaluate(tf_out_t)
       scipy_out = special.betainc(a_s, b_s, x_s).astype(np_dt)
 
       # the scipy version of betainc uses a double-only implementation.
diff --git a/tensorflow/python/kernel_tests/bitcast_op_test.py b/tensorflow/python/kernel_tests/bitcast_op_test.py
index 79e0f36d242bdc828d4216d0e7a868bbccc849a9..5ceffcfeda3488e3664c927c247aa1550fd43e2a 100644
--- a/tensorflow/python/kernel_tests/bitcast_op_test.py
+++ b/tensorflow/python/kernel_tests/bitcast_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -28,9 +29,9 @@ from tensorflow.python.platform import test
 class BitcastTest(test.TestCase):
 
   def _testBitcast(self, x, datatype, shape):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       tf_ans = array_ops.bitcast(x, datatype)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       buff_after = memoryview(out).tobytes()
       buff_before = memoryview(x).tobytes()
       self.assertEqual(buff_before, buff_after)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index e0d46bae83a2c731d98f199c1af74196f5956201..adfb094971757a7ce43fb90a2778012c41d2cc8e 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -18,14 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
+
+import numpy as np
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import boosted_trees_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_handle_op as resource_handle_op
 from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantile_stream_resource_initialized as resource_initialized
 from tensorflow.python.platform import googletest
+from tensorflow.python.training import saver
 
 
 class QuantileOpsTest(test_util.TensorFlowTestCase):
@@ -57,18 +64,16 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     | 5        |     1            |   2.2     |   0.8
     """
 
-    self._feature_0 = constant_op.constant(
-        [[1.2], [12.1], [0.3], [0.5], [0.6], [2.2]], dtype=dtypes.float32)
-    self._feature_1 = constant_op.constant(
-        [[2.3], [1.2], [1.1], [2.6], [3.2], [0.8]], dtype=dtypes.float32)
-    self._feature_0_boundaries = constant_op.constant(
-        [0.3, 0.6, 1.2, 12.1], dtype=dtypes.float32)
-    self._feature_1_boundaries = constant_op.constant(
-        [0.8, 1.2, 2.3, 3.2], dtype=dtypes.float32)
-    self._feature_0_quantiles = constant_op.constant(
-        [[2], [3], [0], [1], [1], [3]], dtype=dtypes.int32)
-    self._feature_1_quantiles = constant_op.constant(
-        [[2], [1], [1], [3], [3], [0]], dtype=dtypes.int32)
+    self._feature_0 = constant_op.constant([1.2, 12.1, 0.3, 0.5, 0.6, 2.2],
+                                           dtype=dtypes.float32)
+    self._feature_1 = constant_op.constant([2.3, 1.2, 1.1, 2.6, 3.2, 0.8],
+                                           dtype=dtypes.float32)
+    self._feature_0_boundaries = np.array([0.3, 0.6, 1.2, 12.1])
+    self._feature_1_boundaries = np.array([0.8, 1.2, 2.3, 3.2])
+    self._feature_0_quantiles = constant_op.constant([2, 3, 0, 1, 1, 3],
+                                                     dtype=dtypes.int32)
+    self._feature_1_quantiles = constant_op.constant([2, 1, 1, 3, 3, 0],
+                                                     dtype=dtypes.int32)
 
     self._example_weights = constant_op.constant(
         [10, 1, 1, 1, 1, 1], dtype=dtypes.float32)
@@ -93,8 +98,8 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
           quantile_accumulator_handle, num_features=2)
       quantiles = boosted_trees_ops.boosted_trees_bucketize(
           [self._feature_0, self._feature_1], buckets)
-      sess.run(summary_op)
-      sess.run(flush_op)
+      self.evaluate(summary_op)
+      self.evaluate(flush_op)
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
@@ -135,6 +140,69 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
+  def testSaveRestoreAfterFlush(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.test_session() as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+
+      save = saver.Saver()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose([], buckets[0].eval())
+      self.assertAllClose([], buckets[1].eval())
+      summaries = accumulator.add_summaries([self._feature_0, self._feature_1],
+                                            self._example_weights)
+      with ops.control_dependencies([summaries]):
+        flush = accumulator.flush()
+      sess.run(flush)
+      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
+      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
+      save.save(sess, save_path)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+      save = saver.Saver()
+      save.restore(sess, save_path)
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
+      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
+
+  def testSaveRestoreBeforeFlush(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.test_session() as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+
+      save = saver.Saver()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      summaries = accumulator.add_summaries([self._feature_0, self._feature_1],
+                                            self._example_weights)
+      sess.run(summaries)
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose([], buckets[0].eval())
+      self.assertAllClose([], buckets[1].eval())
+      save.save(sess, save_path)
+      sess.run(accumulator.flush())
+      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
+      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+      save = saver.Saver()
+      save.restore(sess, save_path)
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose([], buckets[0].eval())
+      self.assertAllClose([], buckets[1].eval())
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index 65bb9ab55f00c0ad9506122bf357484c7a4acd5f..493cad80f3c8731e85f5682a3ad2c2dec523b4b7 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -35,13 +35,13 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
-      self.assertEqual(0, stamp_token.eval())
+      self.assertEqual(0, self.evaluate(stamp_token))
       (_, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(0, num_trees.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertEqual(0, num_attempted_layers.eval())
-      self.assertAllEqual([0, 1], nodes_range.eval())
+      self.assertEqual(0, self.evaluate(num_trees))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertEqual(0, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([0, 1], self.evaluate(nodes_range))
 
   def testCreateWithProto(self):
     with self.cached_session():
@@ -154,11 +154,11 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(7, stamp_token.eval())
-      self.assertEqual(2, num_trees.eval())
-      self.assertEqual(1, num_finalized_trees.eval())
-      self.assertEqual(6, num_attempted_layers.eval())
-      self.assertAllEqual([16, 19], nodes_range.eval())
+      self.assertEqual(7, self.evaluate(stamp_token))
+      self.assertEqual(2, self.evaluate(num_trees))
+      self.assertEqual(1, self.evaluate(num_finalized_trees))
+      self.assertEqual(6, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([16, 19], self.evaluate(nodes_range))
 
   def testSerializeDeserialize(self):
     with self.cached_session():
@@ -167,11 +167,11 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(5, stamp_token.eval())
-      self.assertEqual(0, num_trees.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertEqual(0, num_attempted_layers.eval())
-      self.assertAllEqual([0, 1], nodes_range.eval())
+      self.assertEqual(5, self.evaluate(stamp_token))
+      self.assertEqual(0, self.evaluate(num_trees))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertEqual(0, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([0, 1], self.evaluate(nodes_range))
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
@@ -219,18 +219,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       ]):
         (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
          nodes_range) = ensemble.get_states()
-      self.assertEqual(3, stamp_token.eval())
-      self.assertEqual(1, num_trees.eval())
+      self.assertEqual(3, self.evaluate(stamp_token))
+      self.assertEqual(1, self.evaluate(num_trees))
       # This reads from metadata, not really counting the layers.
-      self.assertEqual(5, num_attempted_layers.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertAllEqual([3, 7], nodes_range.eval())
+      self.assertEqual(5, self.evaluate(num_attempted_layers))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertAllEqual([3, 7], self.evaluate(nodes_range))
 
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
       new_stamp_token, new_serialized = ensemble.serialize()
-      self.assertEqual(3, new_stamp_token.eval())
+      self.assertEqual(3, self.evaluate(new_stamp_token))
       new_ensemble_proto.ParseFromString(new_serialized.eval())
       self.assertProtoEquals(ensemble_proto, new_ensemble_proto)
 
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index 09e9cfa3affb9750938f2292e6e2dc3edddecedb..e4c5431c26c658ed3a220d4abbbe29ef34300023 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -65,10 +65,10 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[0.004775, 0.41184], [0.02823, 0.41184]],
                           sess.run(gains_list))
-      self.assertAllEqual([[1, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[1, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[-.416667], [.568966]], [[-.6], [-.75]]],
@@ -113,10 +113,10 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[0., 0.33931375], [0.01879096, 0.33931375]],
                           sess.run(gains_list))
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
@@ -162,9 +162,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[[0.0], [0.3965517]], [[-0.4], [-0.5]]],
                           sess.run(left_node_contribs_list))
 
@@ -214,12 +214,12 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
 
       self.assertAllClose([[-3., -2.66068625], [-2.98120904, -2.66068625]],
                           sess.run(gains_list))
 
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
@@ -266,9 +266,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
 
       # We can't split node 1 on feature 1 and node 2 on feature 2 because of
       # the min node weight.
-      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
-      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
-      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllEqual([[2], [1]], self.evaluate(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], self.evaluate(gains_list))
+      self.assertAllEqual([[1], [1]], self.evaluate(thresholds_list))
       self.assertAllClose([[[0.4852941]], [[-.6]]],
                           sess.run(left_node_contribs_list))
       self.assertAllClose([[[-0.75]], [[-0.014925]]],
@@ -311,9 +311,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
            max_splits=max_splits)
 
       # We can't split either of the nodes on the first feature
-      self.assertEqual(2, len(sess.run(node_ids_list)))
-      self.assertAllEqual([], sess.run(node_ids_list)[0])
-      self.assertAllEqual([1], sess.run(node_ids_list)[1])
+      self.assertEqual(2, len(self.evaluate(node_ids_list)))
+      self.assertAllEqual([], self.evaluate(node_ids_list)[0])
+      self.assertAllEqual([1], self.evaluate(node_ids_list)[1])
 
       # Now check when we can't split on any feature
       (node_ids_list, _, _, _,
@@ -325,7 +325,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
            tree_complexity=0.0,
            min_node_weight=10,
            max_splits=max_splits)
-      self.assertAllEqual([[], []], sess.run(node_ids_list))
+      self.assertAllEqual([[], []], self.evaluate(node_ids_list))
 
   def testMakeStatsSummarySimple(self):
     """Simple test for MakeStatsSummary."""
@@ -359,7 +359,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
               [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
               [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
           ]],
-          result.eval())
+          self.evaluate(result))
 
   def testMakeStatsSummaryMultipleFeatures(self):
     """Tests that MakeStatsSummary works for multiple features."""
@@ -389,7 +389,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
                   [[.3, .4], [0., 0.], [-.4, .5], [.07, .08]],  # node 2
               ],  # feature 1
           ],
-          result.eval())
+          self.evaluate(result))
 
   def _verify_precision(self, length):
     with self.cached_session():
@@ -408,7 +408,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           node_ids, gradients, hessians, [bucketized_features], max_splits,
           num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
 
-      self.assertAllClose([[[[2., 0.2]]]], result.eval())
+      self.assertAllClose([[[[2., 0.2]]]], self.evaluate(result))
 
   def testMakeStatsSummaryNumericalPrecisionSmallBatch(self):
     """Tests numeric precision."""
diff --git a/tensorflow/python/kernel_tests/bucketize_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
index 57413e6af500f1c8fbecbfa46e3bb5e846d02d95..9575b28899fdf8ac708c0f22d2bff669b90c4e0f 100644
--- a/tensorflow/python/kernel_tests/bucketize_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -32,7 +32,7 @@ class BucketizationOpTest(test.TestCase):
         boundaries=[0, 3, 8, 11])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
     with self.session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def testFloat(self):
     op = math_ops._bucketize(
@@ -40,7 +40,7 @@ class BucketizationOpTest(test.TestCase):
         boundaries=[0., 3., 8., 11.])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
     with self.session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def test2DInput(self):
     op = math_ops._bucketize(
@@ -48,7 +48,7 @@ class BucketizationOpTest(test.TestCase):
         boundaries=[0, 3, 8, 11])
     expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
     with self.session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def testInvalidBoundariesOrder(self):
     op = math_ops._bucketize(
diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
index b19077db560363a22ab3c4c5400541edb9ab4600..031accee553a35ccce203a2d5b03fb31d2d76d19 100644
--- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
+++ b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
@@ -55,7 +55,7 @@ class RangeSamplerOpsTest(test.TestCase):
           [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
       sampled_candidates, _, _ = candidate_sampling_ops.all_candidate_sampler(
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
-      result = sampled_candidates.eval()
+      result = self.evaluate(sampled_candidates)
 
     expected_ids = [0, 1, 2, 3, 4]
     self.assertAllEqual(result, expected_ids)
@@ -68,7 +68,7 @@ class RangeSamplerOpsTest(test.TestCase):
       _, true_expected_count, _ = candidate_sampling_ops.all_candidate_sampler(
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       true_log_expected_count = math_ops.log(true_expected_count)
-      result = true_log_expected_count.eval()
+      result = self.evaluate(true_log_expected_count)
 
     self.assertAllEqual(result, [[0.0] * self.NUM_TRUE] * self.BATCH_SIZE)
     self.assertEqual(true_expected_count.get_shape(),
@@ -83,7 +83,7 @@ class RangeSamplerOpsTest(test.TestCase):
       _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(  # pylint: disable=line-too-long
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       sampled_log_expected_count = math_ops.log(sampled_expected_count)
-      result = sampled_log_expected_count.eval()
+      result = self.evaluate(sampled_log_expected_count)
 
     self.assertAllEqual(result, [0.0] * self.NUM_SAMPLED)
     self.assertEqual(sampled_expected_count.get_shape(), [self.NUM_SAMPLED])
@@ -97,7 +97,7 @@ class RangeSamplerOpsTest(test.TestCase):
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       accidental_hits = candidate_sampling_ops.compute_accidental_hits(
           true_classes, sampled_candidates, self.NUM_TRUE)
-      indices, ids, weights = sess.run(accidental_hits)
+      indices, ids, weights = self.evaluate(accidental_hits)
 
     self.assertEqual(1, accidental_hits[0].get_shape().ndims)
     self.assertEqual(1, accidental_hits[1].get_shape().ndims)
@@ -114,7 +114,7 @@ class RangeSamplerOpsTest(test.TestCase):
             [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
         sampled, _, _ = candidate_sampling_ops.log_uniform_candidate_sampler(
             true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True, 5, seed=seed)
-        return sampled.eval()
+        return self.evaluate(sampled)
 
     # Non-zero seed. Repeatable.
     for seed in [1, 12, 123, 1234]:
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index a5dff5df629900c5d6848d5cd10f5b727b96aaf0..cdeaf7b69677694ac347aa718934b8e58cbbe7d7 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -107,10 +107,10 @@ class CastOpTest(test.TestCase):
     a = np.random.uniform(-100, 100, 100).astype(np.float32)
     with self.cached_session(use_gpu=False):
       b = math_ops.cast(math_ops.cast(a, dtypes.bfloat16), dtypes.float32)
-      self.assertAllClose(a, b.eval(), rtol=1 / 128.)
+      self.assertAllClose(a, self.evaluate(b), rtol=1 / 128.)
     with self.cached_session(use_gpu=True):
       b = math_ops.cast(math_ops.cast(a, dtypes.bfloat16), dtypes.float32)
-      self.assertAllClose(a, b.eval(), rtol=1 / 128.)
+      self.assertAllClose(a, self.evaluate(b), rtol=1 / 128.)
 
   def testRandom(self):
     self._testAll(np.random.normal(0, 10, 210).reshape([2, 3, 5, 7]))
@@ -187,7 +187,7 @@ class CastOpTest(test.TestCase):
       y = variables.Variable(True, dtype=dtypes.bool)
       cast = math_ops.cast(y, x.dtype)
       variables.global_variables_initializer().run()
-      self.assertEqual(1.0, sess.run(cast))
+      self.assertEqual(1.0, self.evaluate(cast))
 
   def testGradients(self):
     t = [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 88f5cd6f22339dbac4c6f9ec6ea2490e9bd8e7c1..15124a19a2775bd8abf69e67b98639dd79850c58 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -39,6 +40,69 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
+class AssertV2Asserts(test.TestCase):
+
+  def test_passes_when_it_should(self):
+    # This is a v2 test and need to run eagerly
+    with context.eager_mode():
+      c1 = constant_op.constant(-1, name="minus_one", dtype=dtypes.int32)
+      c2 = constant_op.constant(2, name="two", dtype=dtypes.int32)
+      c3 = constant_op.constant([3., 3.], name="three", dtype=dtypes.float32)
+      c4 = constant_op.constant([3., 3.5], name="three_and_a_half",
+                                dtype=dtypes.float32)
+      scalar = c1
+      non_scalar = c3
+      integer = c1
+      non_integer = c3
+      positive = c2
+      negative = c1
+      cases = [
+          (check_ops.assert_equal_v2, (c1, c1), (c1, c2)),
+          (check_ops.assert_less_v2, (c1, c2), (c1, c1)),
+          (check_ops.assert_near_v2, (c3, c3), (c3, c4)),
+          (check_ops.assert_greater_v2, (c2, c1), (c1, c1)),
+          (check_ops.assert_negative_v2, (negative,), (positive,)),
+          (check_ops.assert_positive_v2, (positive,), (negative,)),
+          (check_ops.assert_less_equal_v2, (c1, c1), (c2, c1)),
+          (check_ops.assert_none_equal_v2, (c1, c2), (c3, c4)),
+          (check_ops.assert_non_negative_v2, (positive,), (negative,)),
+          (check_ops.assert_non_positive_v2, (negative,), (positive,)),
+          (check_ops.assert_greater_equal_v2, (c1, c1), (c1, c2)),
+          (check_ops.assert_type_v2, (c1, dtypes.int32), (c1, dtypes.float32),
+           TypeError),
+          (check_ops.assert_integer_v2, (integer,), (non_integer,),
+           TypeError),
+          (check_ops.assert_scalar_v2, (scalar,), (non_scalar,),
+           ValueError),
+          (check_ops.assert_rank_v2, (c1, 0), (c3, 2), ValueError),
+          (check_ops.assert_rank_in_v2, (c1, [0, 1]), (c1, [1, 2]),
+           ValueError),
+          (check_ops.assert_rank_at_least_v2, (non_scalar, 1), (scalar, 1),
+           ValueError),
+      ]
+
+      for case in cases:
+        fn = case[0]
+        passing_args = case[1]
+        failing_args = case[2]
+        error = errors.InvalidArgumentError if len(case) < 4 else case[3]
+
+        print("Testing %s passing properly." % fn)
+
+        fn(*passing_args)
+
+        print("Testing %s failing properly." % fn)
+
+        @def_function.function
+        def failing_fn():
+          fn(*failing_args, message="fail")  # pylint: disable=cell-var-from-loop
+
+        with self.assertRaisesRegexp(error, "fail"):
+          failing_fn()
+
+        del failing_fn
+
+
 class AssertProperIterableTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index 51611b75afb051b2f69abb1749c18b3cbf1f66a0..213ac292d3eb561540f79db7cc0669c3c276b352 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -58,8 +58,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = range(0, 3)
     expected_num_present = 3
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_shifted_vocab(self):
     """Tests where vocab is the same, but shifted / ordered differently."""
@@ -71,8 +71,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [2, 0, 1]
     expected_num_present = 3
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_offset(self):
     """Tests offset and num_new_vocab logic."""
@@ -84,8 +84,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [0]
     expected_num_present = 1
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_old_vocab_size(self):
     """Tests where old_vocab_size is specified."""
@@ -99,8 +99,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [-1, 0, 1]
     expected_num_present = 2
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
 
 class LoadAndRemapMatrixTest(test.TestCase):
@@ -142,7 +142,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=self.old_num_cols)
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
     # No row remapping, new weight matrix has third col, then first col.
     row_remapping = list(range(self.old_num_rows))
@@ -157,7 +157,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=len(col_remapping))
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping][:, col_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
     # Both row and column remappings.
     row_remapping = [1, 0, 4]
@@ -172,7 +172,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=len(col_remapping))
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping][:, col_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_with_init(self):
     """Tests the op's load and remap where there are missing entries."""
@@ -190,7 +190,8 @@ class LoadAndRemapMatrixTest(test.TestCase):
         [33, init_val, init_val, init_val, 1, init_val], [3, 2])
 
     with self.cached_session():
-      self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval())
+      self.assertAllClose(expected_remapped_matrix,
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_all_missing_rows(self):
     """Tests when all the rows are missing and need to be initialized."""
@@ -207,7 +208,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session():
       self.assertAllClose(
           np.reshape(initializing_values, (num_rows, self.old_num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_all_missing_rows_and_cols(self):
     """Tests when all the rows & cols are missing and need to be initialized."""
@@ -225,7 +226,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session():
       self.assertAllClose(
           np.reshape(initializing_values, (num_rows, num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_invalid_remapping(self):
     """Tests that errors are raised when an ID maps to multiple new IDs.
@@ -244,7 +245,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=len(invalid_remapping),
         num_cols=self.old_num_cols)
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
     # Invalid column remapping.
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
@@ -256,7 +257,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=self.old_num_rows,
         num_cols=len(invalid_remapping))
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
   def test_load_and_remap_incorrect_initializing_values(self):
     """Tests that errors are raised with incorrect number of init values."""
@@ -273,7 +274,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=3,
         num_cols=2)
     with self.cached_session(), self.assertRaises(errors.InvalidArgumentError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
@@ -285,7 +286,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=3,
         num_cols=2)
     with self.cached_session(), self.assertRaises(errors.InvalidArgumentError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
 
 class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
@@ -324,7 +325,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           num_rows=num_rows,
           num_cols=num_cols,
           max_rows_in_memory=max_rows_in_memory)
-      self.assertAllClose(np_value[::-1], remapped_matrix.eval())
+      self.assertAllClose(np_value[::-1], self.evaluate(remapped_matrix))
 
       # Tests loading the tensor (except for the first and last rows), with
       # uninitialized values. Requires num_rows to be at least 3 since we're
@@ -348,7 +349,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           np.vstack([
               np.tile(42, [prefix_rows, num_cols]), np_value[1:-1],
               np.tile(42, [suffix_rows, num_cols])
-          ]), remapped_matrix.eval())
+          ]), self.evaluate(remapped_matrix))
 
       # Tests when everything is taken from initializing_values.
       new_rows = 7
@@ -365,7 +366,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           max_rows_in_memory=max_rows_in_memory)
       self.assertAllClose(
           np.reshape(initializing_values, (new_rows, num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_loading_rows_divisible_by_max_rows(self):
     """Tests loading normal var when rows are evenly divisible by max_rows."""
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index efd7eee84743f36bae7ed224759b5c7a5a2bcb9d..d0cd7eb3029f870aaecc1b32c755bfacc4fdaf09 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -55,7 +55,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
       ans = clip_ops.clip_by_value(x, -clip_value, clip_value)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -71,7 +71,7 @@ class ClipTest(test.TestCase):
         clip_value_min = 2
         clip_value_max = 4
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -88,7 +88,7 @@ class ClipTest(test.TestCase):
             [2, 2, 2, 3, 3, 3], shape=[2, 3], dtype=dtype)
         clip_value_max = 4
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -105,7 +105,7 @@ class ClipTest(test.TestCase):
         clip_value_max = constant_op.constant(
             [6, 6, 6, 6, 6, 6], shape=[2, 3], dtype=dtype)
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -123,7 +123,7 @@ class ClipTest(test.TestCase):
         clip_value_max = constant_op.constant(
             [5, 5, 5, 7, 7, 7], shape=[2, 3], dtype=dtype)
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -144,7 +144,7 @@ class ClipTest(test.TestCase):
       np_ans = [float('NaN'), 4.0, -4.0]
       clip_value = 4.0
       ans = clip_ops.clip_by_value(x, -clip_value, clip_value)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -157,10 +157,10 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans_tensor = ans.eval()
+      tf_ans_tensor = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
     self.assertAllClose(np_ans, tf_ans_tensor)
@@ -188,7 +188,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -200,7 +200,7 @@ class ClipTest(test.TestCase):
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -212,7 +212,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [0])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -224,7 +224,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [1])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -236,7 +236,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [1])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -256,7 +256,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -277,7 +277,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -300,7 +300,7 @@ class ClipTest(test.TestCase):
       self.assertTrue(ans[3] is None)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[2].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -322,7 +322,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].values.eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -352,7 +352,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -371,7 +371,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 0.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -386,7 +386,7 @@ class ClipTest(test.TestCase):
 
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
-        norm.eval()
+        self.evaluate(norm)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
         ans[0].eval()
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
@@ -400,7 +400,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
       clip_norm = 0.8
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -412,7 +412,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
       clip_norm = constant_op.constant(0.8)
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -424,7 +424,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
       clip_norm = 0.9
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -436,7 +436,7 @@ class ClipTest(test.TestCase):
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
       clip_norm = 0.9
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
index f27a0fc47221d7b200e91b5510c99d9dde3f7d57..215ea97f36d5fc72581f1ad96e7e68166e12e08c 100644
--- a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
+++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -30,15 +31,15 @@ class CompareAndBitpackTest(test.TestCase):
                              x, threshold,
                              truth,
                              expected_err_re=None):
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       ans = math_ops.compare_and_bitpack(x, threshold)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertShapeEqual(truth, ans)
         self.assertAllEqual(tf_ans, truth)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBasic(self, dtype):
     rows = 371
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 92d09986e6cf191acaf956fa5c4606155b9cfd0d..6944d73c5fec4a49b79f5416442ac1c6841f2fb2 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -71,7 +71,7 @@ class ConcatOpTest(test.TestCase):
       x1 = constant_op.constant(p1)
       x2 = constant_op.constant(p2)
       c = array_ops.concat([x1, x2], 0)
-      result = c.eval()
+      result = self.evaluate(c)
     self.assertAllEqual(result[:2, :], p1)
     self.assertAllEqual(result[2:, :], p2)
 
@@ -83,7 +83,7 @@ class ConcatOpTest(test.TestCase):
       v2 = variables.Variable(p2)
       c = array_ops.concat([v1, v2], 0)
       variables.global_variables_initializer().run()
-      result = c.eval()
+      result = self.evaluate(c)
 
     self.assertEqual(result.shape, c.get_shape())
     self.assertAllEqual(result[:4, :], p1)
@@ -195,7 +195,7 @@ class ConcatOpTest(test.TestCase):
             grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
     self.assertAllEqual(result, grad_inp)
 
   def testGradientsSimple(self):
@@ -222,7 +222,7 @@ class ConcatOpTest(test.TestCase):
           grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, 0)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -249,7 +249,7 @@ class ConcatOpTest(test.TestCase):
             grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -279,7 +279,7 @@ class ConcatOpTest(test.TestCase):
       grad_tensor = constant_op.constant(grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, concat_dim)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -476,7 +476,7 @@ class ConcatOpTest(test.TestCase):
     with self.cached_session():
       concat_list_t = array_ops.concat([c1, c2], 0)
       concat_tuple_t = array_ops.concat((c1, c2), 0)
-      self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
+      self.assertAllEqual(concat_list_t.eval(), self.evaluate(concat_tuple_t))
 
   def testConcatNoScalars(self):
     with self.cached_session():
@@ -543,13 +543,13 @@ class ConcatOpTest(test.TestCase):
 
       c = gen_array_ops.concat_v2([t1, t2], -2)
       self.assertEqual([4, 3], c.get_shape().as_list())
-      output = c.eval()
+      output = self.evaluate(c)
       self.assertAllEqual([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
                           output)
 
       c = gen_array_ops.concat_v2([t1, t2], -1)
       self.assertEqual([2, 6], c.get_shape().as_list())
-      output = c.eval()
+      output = self.evaluate(c)
       self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
   def _testGradientsForAxis(
@@ -615,7 +615,7 @@ class ConcatOpTest(test.TestCase):
         c = gen_array_ops.concat_v2([t1, t2],
                                     constant_op.constant(1, dtype=dtype))
         self.assertEqual([2, 6], c.get_shape().as_list())
-        output = c.eval()
+        output = self.evaluate(c)
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 class ConcatOffsetTest(test.TestCase):
@@ -627,7 +627,7 @@ class ConcatOffsetTest(test.TestCase):
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
   def testNotVector(self):
@@ -679,7 +679,7 @@ class ConcatOffsetTest(test.TestCase):
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
       cdim = constant_op.constant(-3, dtypes.int32)
@@ -687,7 +687,7 @@ class ConcatOffsetTest(test.TestCase):
       s1 = constant_op.constant([1, 3, 5], dtypes.int32)
       s2 = constant_op.constant([3, 3, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [2, 0, 0], [3, 0, 0]])
 
 
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index b077b853edb5ca725bf41d04577b153e15b17924..ace18dbc44f4f4f849165a8a6fccb02ba46dc10f 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,6 +33,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -666,6 +668,121 @@ class CondV2Test(test.TestCase):
           if_found,
           "An `If` op was not found, but the graph should not be lowered.")
 
+  def testLoweringDisabledWithSingleThreadedExecutorContext(self):
+    with self.session(graph=ops.Graph()) as sess:
+      @function.defun
+      def _add_cond(x):
+        return cond_v2.cond_v2(
+            constant_op.constant(True, name="pred"),
+            lambda: x,
+            lambda: x + 1)
+
+      x = array_ops.placeholder(shape=None, dtype=dtypes.float32)
+      with context.function_executor_type("SINGLE_THREADED_EXECUTOR"):
+        out_cond = _add_cond(x)
+
+      # The fact that sess.run() succeeds means lowering is disabled, because
+      # the single threaded executor does not support cond v1 ops.
+      sess.run(out_cond, feed_dict={x: 1.0})
+
+  @test_util.enable_control_flow_v2
+  def testStructuredOutputs(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return ((x * y,), y)
+
+    def false_fn():
+      return ((x,), y * 3.0)
+
+    output = control_flow_ops.cond(
+        constant_op.constant(False), true_fn, false_fn)
+    self.assertEqual(self.evaluate(output[0][0]), 1.)
+    self.assertEqual(self.evaluate(output[1]), 9.)
+
+  @test_util.enable_control_flow_v2
+  def testRaisesOutputStructuresMismatch(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return x * y, y
+
+    def false_fn():
+      return ((x,), y * 3.0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Outputs of true_fn and false_fn must"
+        " have the same structure"):
+      control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
+
+  @test_util.enable_control_flow_v2
+  def testCondAndTensorArray(self):
+    if test_util.is_gpu_available():
+      old_enable_tensor_array_v2 = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
+      # TODO(b/119689663): Enable this.
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = False
+    x = math_ops.range(-5, 5)
+    output = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=x.shape[0])
+
+    def loop_body(i, output):
+
+      def if_true():
+        return output.write(i, x[i]**2)
+
+      def if_false():
+        return output.write(i, x[i])
+
+      output = control_flow_ops.cond(x[i] > 0, if_true, if_false)
+      return i + 1, output
+
+    _, output = control_flow_ops.while_loop(
+        lambda i, arr: i < x.shape[0],
+        loop_body,
+        loop_vars=(constant_op.constant(0), output))
+    output_t = output.stack()
+    self.assertAllEqual(
+        self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
+    if test_util.is_gpu_available():
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = old_enable_tensor_array_v2
+
+  @test_util.enable_control_flow_v2
+  def testCondAndTensorArrayInDefun(self):
+    if test_util.is_gpu_available():
+      old_enable_tensor_array_v2 = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
+      # TODO(b/119689663): Enable this.
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = False
+
+    @function.defun
+    def f():
+      x = math_ops.range(-5, 5)
+      output = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=x.shape[0])
+
+      def loop_body(i, output):
+
+        def if_true():
+          return output.write(i, x[i]**2)
+
+        def if_false():
+          return output.write(i, x[i])
+
+        output = control_flow_ops.cond(x[i] > 0, if_true, if_false)
+        return i + 1, output
+
+      _, output = control_flow_ops.while_loop(
+          lambda i, arr: i < x.shape[0],
+          loop_body,
+          loop_vars=(constant_op.constant(0), output))
+      return output.stack()
+
+    output_t = f()
+    self.assertAllEqual(
+        self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
+
+    if test_util.is_gpu_available():
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = old_enable_tensor_array_v2
+
 
 class CondV2CollectionTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/conditional_accumulator_test.py b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
index 97ab23fe49b6eea388b61876b99495486e17d9f9..8388070c63a83f4072cd9dee9b8d85b2e3b6ecea 100644
--- a/tensorflow/python/kernel_tests/conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
@@ -149,7 +149,7 @@ class ConditionalAccumulatorTest(test.TestCase):
         accum_op.run()
 
       is_all_equal = True
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       for i in range(len(val)):
         for j in range(len(val[i])):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
@@ -184,7 +184,7 @@ class ConditionalAccumulatorTest(test.TestCase):
         sess.run(accum_op, feed_dict={x: elem})
 
       is_all_equal = True
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       for i in range(len(val)):
         for j in range(len(val[i])):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
@@ -259,7 +259,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(15.0, val)
 
       accum_ops = [q.apply_grad((x,), local_step=1) for x in elems]
@@ -268,7 +268,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(15.0, val)
 
   def testAccumulatorTakeGradSum(self):
@@ -286,7 +286,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(30.0, val)
 
       accum_ops = [q.apply_grad((x,), local_step=1) for x in elems]
@@ -295,7 +295,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(30.0, val)
 
   def testAccumulatorTakeGradInvalidReductionType(self):
@@ -319,7 +319,7 @@ class ConditionalAccumulatorTest(test.TestCase):
         accum_op.run()
 
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        takeg_t.eval()
+        self.evaluate(takeg_t)
 
   def testAccumulatorRepeatedTakeGradMean(self):
     with self.cached_session():
@@ -334,7 +334,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_ave, val)
 
       elems = [20.0, 30.0]
@@ -345,7 +345,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_ave + 0.0, val)
 
   def testAccumulatorRepeatedTakeGradSum(self):
@@ -364,7 +364,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_sum, val)
 
       elems = [20.0, 30.0]
@@ -375,7 +375,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_sum, val)
 
   def testAccumulatorIncrementGlobalStep(self):
@@ -392,7 +392,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       variables.global_variables_initializer().run()
       for _ in range(3):
         set_global_step_op.run()
-        inc_global_step.eval()
+        self.evaluate(inc_global_step)
 
   def testAccumulatorSetGlobalStepPreventsAccumulation(self):
     with self.cached_session():
@@ -410,7 +410,7 @@ class ConditionalAccumulatorTest(test.TestCase):
           accum_op.run()
         takeg_t = q.take_grad(1)
 
-        val = takeg_t.eval()
+        val = self.evaluate(takeg_t)
         self.assertEqual(0.0 + sum(x for x in local_steps
                                    if x >= ls) / sum(1 for x in local_steps
                                                      if x >= ls), val)
@@ -424,7 +424,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_grad(1)
 
       def apply_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(
@@ -436,7 +436,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
 
       self.assertEqual(val, sum(elems) / len(elems))
 
@@ -451,7 +451,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       def apply_grad():
         for accum_op in accum_ops:
           time.sleep(1.0)
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       apply_grad_thread = self.checkedThread(target=apply_grad)
 
@@ -485,7 +485,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       def apply_grad():
         time.sleep(1.0)
         for accum_op in accum_ops:
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       return_array = []
 
@@ -503,7 +503,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
   def _blocking_takeg(self, sess, takeg_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(takeg_op)
+      self.evaluate(takeg_op)
 
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index bc24345261e5bb7beaa0aa2273ec277b53ea01fb..b001341c03d4bd043bf860370437abc63272ef8b 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -232,7 +232,7 @@ class ConfusionMatrixTest(test.TestCase):
     with self.cached_session():
       cm = confusion_matrix.confusion_matrix(
           labels, predictions, dtype=dtypes.int32)
-      tf_cm = cm.eval()
+      tf_cm = self.evaluate(cm)
     self.assertEqual(tf_cm.dtype, np.int32)
 
   def testOutputIsInt64(self):
@@ -241,7 +241,7 @@ class ConfusionMatrixTest(test.TestCase):
     with self.cached_session():
       cm = confusion_matrix.confusion_matrix(
           labels, predictions, dtype=dtypes.int64)
-      tf_cm = cm.eval()
+      tf_cm = self.evaluate(cm)
     self.assertEqual(tf_cm.dtype, np.int64)
 
 
@@ -261,8 +261,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -286,8 +286,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -311,8 +311,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder, expected_rank_diff=0))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -337,8 +337,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_label_values = np.reshape(label_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(expected_label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(expected_label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -363,8 +363,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_label_values = np.reshape(label_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(expected_label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(expected_label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -389,8 +389,9 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(expected_prediction_values,
+                          self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -416,8 +417,9 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(expected_prediction_values,
+                          self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 38b8c0c146f0e8137240d67e2c6de4831a90543f..112e201c88befd043272722ea1d804ccca77cc9e 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -282,29 +282,29 @@ class AsTensorTest(test.TestCase):
     with self.cached_session():
       x = ops.convert_to_tensor(tensor_shape.TensorShape([]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([], x.eval())
+      self.assertAllEqual([], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([1, 2, 3]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([1, 2, 3], x.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+      self.assertAllEqual([2**31 - 1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]),
                                 dtype=dtypes_lib.int32)
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+      self.assertAllEqual([2**31 - 1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]))
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([2**31, 2, 3], x.eval())
+      self.assertAllEqual([2**31, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]),
                                 dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([2**31, 2, 3], x.eval())
+      self.assertAllEqual([2**31, 2, 3], self.evaluate(x))
 
       with self.assertRaisesRegexp(
           ValueError, "a dimension is too large .2147483648."):
@@ -314,11 +314,11 @@ class AsTensorTest(test.TestCase):
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3]), dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([1, 2, 3], x.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(x))
 
       x = array_ops.reshape(
           array_ops.zeros([6]), tensor_shape.TensorShape([2, 3]))
-      self.assertAllEqual([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], x.eval())
+      self.assertAllEqual([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], self.evaluate(x))
 
     with self.assertRaisesRegexp(ValueError, "partially known"):
       ops.convert_to_tensor(tensor_shape.TensorShape(None))
@@ -334,12 +334,12 @@ class AsTensorTest(test.TestCase):
     with self.cached_session():
       x = ops.convert_to_tensor(tensor_shape.TensorShape([1, 2, 3])[1])
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual(2, x.eval())
+      self.assertAllEqual(2, self.evaluate(x))
 
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3])[1], dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual(2, x.eval())
+      self.assertAllEqual(2, self.evaluate(x))
 
     shape = tensor_shape.TensorShape(None)
     if shape._v2_behavior:
@@ -372,7 +372,7 @@ class ZerosTest(test.TestCase):
     with self.cached_session():
       ret = array_ops.zeros(shape)
       self.assertEqual(shape, ret.get_shape())
-      return ret.eval()
+      return self.evaluate(ret)
 
   def testConst(self):
     self.assertTrue(
@@ -383,7 +383,7 @@ class ZerosTest(test.TestCase):
     self.assertEqual(0, self._Zeros(()))
     with self.cached_session():
       scalar = array_ops.zeros(constant_op.constant([], dtype=dtypes_lib.int32))
-      self.assertEqual(0, scalar.eval())
+      self.assertEqual(0, self.evaluate(scalar))
 
   def testDynamicSizes(self):
     np_ans = np.array([[0] * 3] * 2)
@@ -392,7 +392,7 @@ class ZerosTest(test.TestCase):
       d = array_ops.fill([2, 3], 12., name="fill")
       # Constructs a tensor of zeros of the same dimensions as "d".
       z = array_ops.zeros(array_ops.shape(d))
-      out = z.eval()
+      out = self.evaluate(z)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, d)
     self.assertShapeEqual(np_ans, z)
@@ -420,13 +420,13 @@ class ZerosTest(test.TestCase):
         z = array_ops.zeros([2, 3], dtype=dtype)
         self.assertEqual(z.dtype, dtype)
         self.assertEqual([2, 3], z.get_shape())
-        z_value = z.eval()
+        z_value = self.evaluate(z)
         self.assertFalse(np.any(z_value))
         self.assertEqual((2, 3), z_value.shape)
         z = array_ops.zeros(array_ops.shape(d), dtype=dtype)
         self.assertEqual(z.dtype, dtype)
         self.assertEqual([2, 3], z.get_shape())
-        z_value = z.eval()
+        z_value = self.evaluate(z)
         self.assertFalse(np.any(z_value))
         self.assertEqual((2, 3), z_value.shape)
 
@@ -538,7 +538,7 @@ class OnesTest(test.TestCase):
     with self.cached_session():
       ret = array_ops.ones(shape)
       self.assertEqual(shape, ret.get_shape())
-      return ret.eval()
+      return self.evaluate(ret)
 
   def testConst(self):
     self.assertTrue(np.array_equal(self._Ones([2, 3]), np.array([[1] * 3] * 2)))
@@ -548,7 +548,7 @@ class OnesTest(test.TestCase):
     self.assertEqual(1, self._Ones(()))
     with self.cached_session():
       scalar = array_ops.ones(constant_op.constant([], dtype=dtypes_lib.int32))
-      self.assertEqual(1, scalar.eval())
+      self.assertEqual(1, self.evaluate(scalar))
 
   def testDynamicSizes(self):
     np_ans = np.array([[1] * 3] * 2)
@@ -557,7 +557,7 @@ class OnesTest(test.TestCase):
       d = array_ops.fill([2, 3], 12., name="fill")
       # Constructs a tensor of ones of the same dimensions as "d".
       z = array_ops.ones(array_ops.shape(d))
-      out = z.eval()
+      out = self.evaluate(z)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, d)
     self.assertShapeEqual(np_ans, z)
@@ -617,7 +617,7 @@ class OnesLikeTest(test.TestCase):
         z_var = array_ops.ones_like(d)
         # Test that the type is correct
         self.assertEqual(z_var.dtype, dtype)
-        z_value = z_var.eval()
+        z_value = self.evaluate(z_var)
 
       # Test that the value is correct
       self.assertTrue(np.array_equal(z_value, np.array([[1] * 3] * 2)))
@@ -634,7 +634,7 @@ class FillTest(test.TestCase):
   def _compare(self, dims, val, np_ans, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.fill(dims, val, name="fill")
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     # Fill does not set the shape.
     # self.assertShapeEqual(np_ans, tf_ans)
@@ -726,7 +726,7 @@ class PlaceholderTest(test.TestCase):
 
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
   def testShape(self):
     with self.cached_session():
@@ -739,7 +739,7 @@ class PlaceholderTest(test.TestCase):
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float and "
           r"shape \[10,10\]"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
       with self.assertRaisesWithPredicateMatch(
           ValueError, lambda e: "Cannot feed value of shape" in str(e)):
@@ -783,7 +783,7 @@ class PlaceholderTest(test.TestCase):
       # Should trigger an operator error, not a shape error.
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
   def testControlDependency(self):
     with self.cached_session():
@@ -896,7 +896,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
     with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([[2, 2], [2, 2]], shape=[2, 2])
       a = array_ops.identity(p)
-      self.assertAllEqual([[2, 2], [2, 2]], a.eval())
+      self.assertAllEqual([[2, 2], [2, 2]], self.evaluate(a))
       self.assertAllEqual(
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
@@ -907,7 +907,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
     with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([1, 2, 3], shape=[None])
       a = array_ops.identity(p)
-      self.assertAllEqual([1, 2, 3], a.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(a))
       self.assertAllEqual([3, 37], a.eval(feed_dict={p: [3, 37]}))
 
       with self.assertRaises(ValueError):
@@ -917,7 +917,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
     with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([17], shape=None)
       a = array_ops.identity(p)
-      self.assertAllEqual([17], a.eval())
+      self.assertAllEqual([17], self.evaluate(a))
       self.assertAllEqual([3, 37], a.eval(feed_dict={p: [3, 37]}))
       self.assertAllEqual(
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 3a5d817e9d6d9550f29e3b2bfac58aa7ffd5df96..3b8f917282cb1aafacbc0fc4901c68a5fc9e7e55 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -139,7 +139,7 @@ class ControlFlowTest(test.TestCase):
 
       self.assertTrue(isinstance(v2, ops.Tensor))
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v2.eval())
+      self.assertEqual(9, self.evaluate(v2))
 
   def testRefEnter(self):
     with self.cached_session():
@@ -152,7 +152,7 @@ class ControlFlowTest(test.TestCase):
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v3.eval())
+      self.assertEqual(9, self.evaluate(v3))
 
   def testRefSwitch(self):
     with self.cached_session():
@@ -162,7 +162,7 @@ class ControlFlowTest(test.TestCase):
       v1 = control_flow_ops._SwitchRefOrTensor(v._ref(), p)  # pylint: disable=protected-access
       v2 = state_ops.assign(v1[1], 9)
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v2.eval())
+      self.assertEqual(9, self.evaluate(v2))
 
   def testEnterMulExit(self):
     with self.cached_session():
@@ -173,7 +173,7 @@ class ControlFlowTest(test.TestCase):
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
-      result = exit_op.eval()
+      result = self.evaluate(exit_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
   def testEnterShapePropagation(self):
@@ -214,7 +214,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           lambda e: "Retval[0] does not have value" in str(e)):
-        dead_branch.eval()
+        self.evaluate(dead_branch)
 
   def testSwitchMergeLess(self):
     with self.cached_session():
@@ -225,7 +225,7 @@ class ControlFlowTest(test.TestCase):
       switch_op = control_flow_ops.switch(data, less_op)
       merge_op = control_flow_ops.merge(switch_op)[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.arange(1, 7), result)
 
   def testSwitchMergeAddIdentity(self):
@@ -238,7 +238,7 @@ class ControlFlowTest(test.TestCase):
       id_op = array_ops.identity(switch_op[1])
       merge_op = control_flow_ops.merge([add_op, id_op])[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x + 1 for x in [1, 2, 3, 4, 5, 6]]), result)
 
   def testSwitchMergeAddMul(self):
@@ -252,7 +252,7 @@ class ControlFlowTest(test.TestCase):
       mul_op = math_ops.multiply(switch_op[1], five)
       merge_op = control_flow_ops.merge([add_op, mul_op])[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
   def testLoop_false(self):
@@ -269,7 +269,7 @@ class ControlFlowTest(test.TestCase):
       next_n = control_flow_ops.next_iteration(switch_n[0])
       merge_n.op._update_input(1, next_n)
 
-      result = exit_n.eval()
+      result = self.evaluate(exit_n)
     self.assertAllEqual(10, result)
 
   def testLoop_1(self):
@@ -295,7 +295,7 @@ class ControlFlowTest(test.TestCase):
       merge_i.op._update_input(1, next_i)
 
       exit_i = control_flow_ops.exit(switch_i[0])
-      result = exit_i.eval()
+      result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
   def testLoop_2(self):
@@ -321,7 +321,7 @@ class ControlFlowTest(test.TestCase):
       merge_i.op._update_input(1, next_i)
 
       exit_i = control_flow_ops.exit(switch_i[0])
-      result = exit_i.eval()
+      result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
   def testDifferentFrame(self):
@@ -389,7 +389,6 @@ class ControlFlowTest(test.TestCase):
             with self.assertRaisesRegexp(ValueError, "may not be fed"):
               sess.run(r, feed_dict={t: 3})
 
-  @test_util.disable_control_flow_v2("b/113296180 (IndexedSlices)")
   def testCondIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -405,7 +404,6 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
-  @test_util.disable_control_flow_v2("b/113296161 (SparseTensors)")
   def testCondSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -437,6 +435,20 @@ class ControlFlowTest(test.TestCase):
 
       self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
 
+  def testCondWithTensorArrayGrad(self):
+    with self.cached_session() as sess:
+      with ops.device(test.gpu_device_name()):
+        pred = array_ops.placeholder(dtypes.bool, [])
+        x = constant_op.constant([1.0, 2.0, 3.0])
+        y = control_flow_ops.cond(
+            pred, lambda: functional_ops.map_fn(lambda z: z * 2.0, x),
+            lambda: constant_op.constant([1.0, 1.0, 1.0]))
+        g = gradients_impl.gradients(y, x)[0]
+
+      self.assertAllEqual(sess.run(g, {pred: True}), [2.0, 2.0, 2.0])
+      # TODO(b/119791601): Enable this.
+      # self.assertAllEqual(sess.run(g, {pred: False}), [0.0, 0.0, 0.0])
+
   @test_util.disable_control_flow_v2("b/113293074")
   def testCondIndexedSlicesDifferentTypes(self):
     with self.cached_session():
@@ -478,7 +490,7 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: math_ops.subtract(x, 1)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
@@ -494,7 +506,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(1, 0), lambda: math_ops.add(x, 1),
           lambda: math_ops.subtract(x, 1))
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
@@ -507,7 +519,7 @@ class ControlFlowTest(test.TestCase):
       fn3 = lambda: math_ops.add(control_flow_ops.cond(pred, fn1, fn2), 1)
       r = control_flow_ops.cond(pred, fn3, fn2)
 
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(12, result)
 
   @test_util.run_in_graph_and_eager_modes
@@ -534,9 +546,9 @@ class ControlFlowTest(test.TestCase):
         result = f().eval()
         self.assertEqual(True, result)
         # Only second cond result was fetched, so v1 assign shouldn't run.
-        self.assertEqual(7, v1.eval())
-        self.assertEqual(2, v2.eval())
-        self.assertEqual(7, v3.eval())
+        self.assertEqual(7, self.evaluate(v1))
+        self.assertEqual(2, self.evaluate(v2))
+        self.assertEqual(7, self.evaluate(v3))
 
     result = f_defun()
     self.assertEqual(True, self.evaluate(result))
@@ -557,10 +569,9 @@ class ControlFlowTest(test.TestCase):
 
       for i in range(10):
         alive, count = body(i)
-      self.assertAllEqual(4, count.eval())
+      self.assertAllEqual(4, self.evaluate(count))
 
   def testCond_6(self):
-
     with self.cached_session():
       v1 = variables.Variable([7])
 
@@ -571,7 +582,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       variables.global_variables_initializer().run()
-      result = r.eval()
+      result = self.evaluate(r)
       self.assertAllEqual(np.array([7]), result)
 
   def testCond_7(self):
@@ -582,7 +593,91 @@ class ControlFlowTest(test.TestCase):
       fn1 = lambda: [math_ops.add(x, 1), math_ops.add(x, 2)]
       fn2 = lambda: [y, y]
       r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([11, 12], sess.run(r))
+      self.assertAllEqual([11, 12], self.evaluate(r))
+
+  def testCondListOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: [math_ops.add(x, y), math_ops.add(x, y)]
+      fn2 = lambda: [y, y]
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertListEqual([210, 210], test_result)
+
+  def testTupleOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: (math_ops.add(x, y), math_ops.add(x, y))
+      fn2 = lambda: (y, y)
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertTupleEqual((210, 210), test_result)
+
+  def testDictOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
+      fn2 = lambda: {"a": y, "b": y}
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertDictEqual({"a": 210, "b": 210}, test_result)
+
+  def testEmbeddedListOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: [[math_ops.add(x, y), math_ops.add(x, y)]]
+      fn2 = lambda: [[y, y]]
+      # Pass strict=True flag as cond_v2 allows for tensors to be
+      # in nested output structures as singletons
+      r = control_flow_ops.cond(pred, fn1, fn2, strict=True)
+      test_result = self.evaluate(r)
+      self.assertListEqual([[210, 210]], test_result)
+
+  def testEmbeddedTupleOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: ((math_ops.add(x, y), math_ops.add(x, y)))
+      fn2 = lambda: ((y, y))
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertTupleEqual(((210, 210)), test_result)
+
+  def testEmbeddedDictOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": {"c": math_ops.add(x, y)},
+                     "b": {"d": math_ops.add(x, y)}}
+      fn2 = lambda: {"a": {"c": y},
+                     "b": {"d": y}}
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertDictEqual({"a": {"c": 210}, "b": {"d": 210}}, test_result)
+
+  def testCheckNestedOutputStruct(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
+      fn2 = lambda: {"c": y, "d": y}
+      v1_msg = "The two structures don't have the same nested structure"
+      v2_msg = "Outputs of true_fn and false_fn must have the same structure"
+      with self.assertRaisesRegexp(
+          ValueError, v2_msg if control_flow_ops.ENABLE_COND_V2 else v1_msg):
+        r = control_flow_ops.cond(pred, fn1, fn2)
+        self.evaluate(r)
 
   def testCondRef(self):
 
@@ -596,7 +691,7 @@ class ControlFlowTest(test.TestCase):
       true_fn = lambda: x
       false_fn = lambda: constant_op.constant([2.0])
       r = control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
-      self.assertAllEqual([2.0], r.eval())
+      self.assertAllEqual([2.0], self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   def testCondWithControl(self):
@@ -612,7 +707,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           constant_op.constant(True), true_branch,
           lambda: constant_op.constant(1))
-      self.assertEqual(5, r.eval())
+      self.assertEqual(5, self.evaluate(r))
 
   def testUninitializedRefIdentity(self):
     with self.cached_session() as sess:
@@ -677,7 +772,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       grad = gradients_impl.gradients(r, [x])[0]
-      self.assertAllEqual(1.0, grad.eval())
+      self.assertAllEqual(1.0, self.evaluate(grad))
 
   def testCondGrad_2(self):
     with self.cached_session():
@@ -711,6 +806,34 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(980.0, r.eval(feed_dict={c: 1}))
       self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
 
+  def testCondGradMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 2},
+                                    allow_soft_placement=True)
+    with self.cached_session(use_gpu=True, config=config) as sess:
+      pred = array_ops.placeholder(dtypes.bool, [])
+      x = array_ops.placeholder(dtypes.float32)
+      y = array_ops.placeholder(dtypes.float32)
+
+      with ops.device("/cpu:0"):
+        z = control_flow_ops.cond(pred, lambda: x * y * 2.0, lambda: 2.0)
+
+      with ops.device("/cpu:1"):
+        grad = gradients_impl.gradients(z, x)[0]
+
+      self.assertEqual(sess.run(grad, {pred: True, x: 1.0, y: 2.0}), 4.0)
+      self.assertEqual(sess.run(grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x)[0]
+
+      # v1 control flow gets None second derivative for some reason.
+      if not control_flow_ops.ENABLE_COND_V2:
+        self.assertIsNone(grad_grad)
+        return
+
+      self.assertEqual(sess.run(grad_grad, {pred: True, x: 1.0, y: 2.0}), 0.0)
+      self.assertEqual(sess.run(grad_grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
   def testNestedCond_Simple(self):
     with self.cached_session():
       x = constant_op.constant(0., name="X")
@@ -718,13 +841,13 @@ class ControlFlowTest(test.TestCase):
           constant_op.constant(True), lambda: x,
           lambda: control_flow_ops.cond(x < 1., lambda: x, lambda: x))
       result = gradients_impl.gradients(y, x)[0]
-      self.assertEqual(1.0, result.eval())
+      self.assertEqual(1.0, self.evaluate(result))
 
       z = control_flow_ops.cond(
           constant_op.constant(False), lambda: x,
           lambda: control_flow_ops.cond(x < 1., lambda: x, lambda: x))
       result = gradients_impl.gradients(z, x)[0]
-      self.assertEqual(1.0, result.eval())
+      self.assertEqual(1.0, self.evaluate(result))
 
   @test_util.disable_control_flow_v2("b/113327884")
   def testCondGrad_Gather(self):
@@ -750,6 +873,16 @@ class ControlFlowTest(test.TestCase):
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  def testCondPredicateTensor(self):
+    """Regression test for lowering predicate from non-first output of an op."""
+
+    @eager_function.defun
+    def foo():
+      return constant_op.constant("foo"), constant_op.constant(True)
+
+    r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
+    self.assertEqual(self.evaluate(r), 1.0)
+
   # TODO(b/117945658): reenable
   @test_util.run_in_graph_and_eager_modes
   def DISABLED_testCondAutoControlDeps(self):
@@ -863,7 +996,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10000)
       b = lambda x: math_ops.add(x, 1)
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual(10000, r.eval())
+      self.assertEqual(10000, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   def testWhileExternalControlDependencies(self):
@@ -894,7 +1027,7 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(cond=lambda i: i < 5,
                                            body=body_fn, loop_vars=[0])
-      result.eval()
+      self.evaluate(result)
       self.assertAllEqual(v.eval(), 1.0)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
@@ -917,7 +1050,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.int32_ref)
 
-      value_i, value_x = sess.run(r)
+      value_i, value_x = self.evaluate(r)
 
     self.assertEqual(100, value_i)
     self.assertEqual(0, value_x)
@@ -926,20 +1059,19 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       s = constant_op.constant(0)
       r = isum(s)
-      self.assertAllEqual(45, r.eval())
+      self.assertAllEqual(45, self.evaluate(r))
 
   def testWhileWithMaximumIterations(self):
     with self.cached_session():
       s = constant_op.constant([1, 2, 3, 4, 5])
       r = isum(s, maximum_iterations=3)
-      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
+      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.cached_session():
       r = control_flow_ops.while_loop(
           lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
-      self.assertEqual(1, r.eval())
+      self.assertEqual(1, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
@@ -967,7 +1099,6 @@ class ControlFlowTest(test.TestCase):
     # Should execute without issue.
     self.assertEqual(3, self.evaluate(loop_execute))
 
-  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -989,29 +1120,45 @@ class ControlFlowTest(test.TestCase):
     gs = gradients_impl.gradients(loop_no_xla, v)
     self.evaluate(gs)  # This should execute without error.
 
-    xla_context = control_flow_ops.XLAControlFlowContext()
-    xla_context.Enter()
-    loop_no_maxiter = create_while_loop()
-    loop_with_maxiter = create_while_loop(maximum_iterations=2)
-    xla_context.Exit()
+    if control_flow_ops.ENABLE_WHILE_V2:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"maximum_iterations is None. It is required and must be statically "
+          r"known \(e.g. a constant value or known shape dimension\) when "
+          r"building while_loop in XLA context."):
+        loop_no_maxiter = create_while_loop()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"maximum_iterations must be statically "
+          r"known \(e.g. a constant value or known shape dimension\) when "
+          r"building while_loop in XLA context."):
+        loop_with_maxiter = create_while_loop(maximum_iterations=2)
+      xla_context.Exit()
+    else:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      loop_no_maxiter = create_while_loop()
+      loop_with_maxiter = create_while_loop(maximum_iterations=2)
+      xla_context.Exit()
 
-    with self.assertRaisesRegexp(
-        ValueError,
-        r"Cannot create a gradient accumulator for tensor '.+' inside "
-        r"XLA while_loop because maximum_iterations was not passed to "
-        r"the tf.while_loop call \('.+'\)."):
-      _ = gradients_impl.gradients(loop_no_maxiter, v)
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Cannot create a gradient accumulator for tensor '.+' inside "
+          r"XLA while_loop because maximum_iterations was not passed to "
+          r"the tf.while_loop call \('.+'\)."):
+        _ = gradients_impl.gradients(loop_no_maxiter, v)
 
-    with self.assertRaisesRegexp(
-        ValueError,
-        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-        r"while_loop. maximum_iterations tensor '.+' for while_loop context "
-        r"'.+' must be statically known \(e.g. a constant value or known "
-        r"shape dimension\), or be defined at or outside the while loop "
-        r"context '.*' \(currently defined in '.*'\)"):
-      _ = gradients_impl.gradients(loop_with_maxiter, v)
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+          r"while_loop. maximum_iterations tensor '.+' for while_loop context "
+          r"'.+' must be statically known \(e.g. a constant value or known "
+          r"shape dimension\), or be defined at or outside the while loop "
+          r"context '.*' \(currently defined in '.*'\)"):
+        _ = gradients_impl.gradients(loop_with_maxiter, v)
 
-  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1030,19 +1177,29 @@ class ControlFlowTest(test.TestCase):
           lambda i, x: (i + 1, v * x), (0, 1.0),
           maximum_iterations=max_iter_holder[0])
 
-    xla_context = control_flow_ops.XLAControlFlowContext()
-    xla_context.Enter()
-    loop = create_while_loop()
-    xla_context.Exit()
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-        r"while_loop. maximum_iterations tensor '.*Placeholder:0' for "
-        r"while_loop context '.+' must be statically known \(e.g. a constant "
-        r"value or known shape dimension\), or be defined at or outside the "
-        r"while loop context '' \(currently defined in 'cond/.+'\)"):
-      _ = gradients_impl.gradients(loop, v)
+    if control_flow_ops.ENABLE_WHILE_V2:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"maximum_iterations must be statically known \(e.g. a constant value"
+          r" or known shape dimension\) when building while_loop in XLA "
+          r"context."):
+        loop = create_while_loop()
+      xla_context.Exit()
+    else:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      loop = create_while_loop()
+      xla_context.Exit()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+          r"while_loop. maximum_iterations tensor '.*Placeholder:0' for "
+          r"while_loop context '.+' must be statically known \(e.g. a constant "
+          r"value or known shape dimension\), or be defined at or outside the "
+          r"while loop context '' \(currently defined in 'cond/.+'\)"):
+        _ = gradients_impl.gradients(loop, v)
 
   @test_util.disable_control_flow_v2("b/118457764")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
@@ -1207,7 +1364,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10.0)
       b = lambda x: math_ops.add(x, 1.0)
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllClose(10.0, r.eval())
+      self.assertAllClose(10.0, self.evaluate(r))
 
   def testWhile_Gpu_1(self):
     self._testWhile_Gpu_1(use_gpu=False)
@@ -1223,7 +1380,7 @@ class ControlFlowTest(test.TestCase):
           return math_ops.add(x, 1.0)
 
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllClose(10.0, r.eval())
+      self.assertAllClose(10.0, self.evaluate(r))
 
   def testWhile_Gpu_2(self):
     self._testWhile_Gpu_2(use_gpu=False)
@@ -1244,7 +1401,7 @@ class ControlFlowTest(test.TestCase):
           c, _b, [i, m],
           [i.get_shape(), tensor_shape.unknown_shape()])
       r = r[1] * array_ops.ones([8, 8])
-      self.assertAllEqual(np.ones((8, 8)), r.eval())
+      self.assertAllEqual(np.ones((8, 8)), self.evaluate(r))
 
   def testWhileWithNonTensorInput_Scalar(self):
     with self.cached_session():
@@ -1252,7 +1409,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: x < 10000
       b = lambda x: x + 1
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual(10000, r.eval())
+      self.assertEqual(10000, self.evaluate(r))
 
   def testWhileWithNonTensorInput_Vector(self):
     with self.cached_session():
@@ -1260,7 +1417,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: x[0] < 10000
       b = lambda x: array_ops.stack([x[0] + 1])
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual([10000], r.eval())
+      self.assertEqual([10000], self.evaluate(r))
 
   def testWhileShapeInference(self):
     with self.cached_session():
@@ -1372,7 +1529,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 200)
       b = lambda x: math_ops.add(x, cpu_sum(n))
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertEqual(225, r.eval())
+      self.assertEqual(225, self.evaluate(r))
 
   def testNestedWhile_1(self):
     self._testNestedWhile_1(use_gpu=False)
@@ -1404,7 +1561,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           outer_c, outer_b, [s0], parallel_iterations=1)
-      self.assertEqual(1048576.0, r.eval())
+      self.assertEqual(1048576.0, self.evaluate(r))
 
   def testNestedWhile_2(self):
     self._testNestedWhile_2(use_gpu=False)
@@ -1438,7 +1595,7 @@ class ControlFlowTest(test.TestCase):
 
       res = control_flow_ops.while_loop(
           condition, body, [r], parallel_iterations=1)
-      self.assertAllEqual(12, res.eval())
+      self.assertAllEqual(12, self.evaluate(res))
 
   def testWhileWithControl_3(self):
     with self.cached_session() as sess:
@@ -1485,7 +1642,7 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([control_flow_ops.no_op()]):
         loop = control_flow_ops.while_loop(cond, body,
                                            (constant_op.constant(5),))
-      self.assertEqual(0, sess.run(loop))
+      self.assertEqual(0, self.evaluate(loop))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
   def testWhileCondWithControl_1(self):
@@ -1507,8 +1664,8 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
       variables.global_variables_initializer().run()
-      self.assertEqual(4, r.eval())
-      self.assertAllClose(65536.0, v.eval())
+      self.assertEqual(4, self.evaluate(r))
+      self.assertAllClose(65536.0, self.evaluate(v))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
   def testWhileCondExitControl(self):
@@ -1532,8 +1689,8 @@ class ControlFlowTest(test.TestCase):
           constant_op.constant(False), lambda: constant_op.constant(1.0),
           false_branch)
       variables.global_variables_initializer().run()
-      self.assertEqual(6.0, r.eval())
-      self.assertEqual(99, v.eval())
+      self.assertEqual(6.0, self.evaluate(r))
+      self.assertEqual(99, self.evaluate(v))
 
   def testCondWhile_1(self):
 
@@ -1544,7 +1701,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(0, 1), lambda: control_flow_ops.while_loop(c, b, [n]),
           lambda: n)
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testCondWhile_2(self):
 
@@ -1555,7 +1712,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(1, 0), lambda: math_ops.add(n, 1),
           lambda: control_flow_ops.while_loop(c, b, [n]))
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def _testCondWhile_3(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu) as sess:
@@ -1598,7 +1755,7 @@ class ControlFlowTest(test.TestCase):
           lambda: math_ops.add(x, one), lambda: math_ops.subtract(x, one))
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [i])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testWhileCond_2(self):
 
@@ -1607,7 +1764,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10)
       b = lambda x: control_flow_ops.cond(constant_op.constant(True), lambda: math_ops.add(x, 1), lambda: n)
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testWhileCond_3(self):
 
@@ -1621,7 +1778,36 @@ class ControlFlowTest(test.TestCase):
                                           lambda: math_ops.subtract(x, 1))
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
+
+  def testWhileCondGradMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 2},
+                                    allow_soft_placement=True)
+    with self.cached_session(use_gpu=True, config=config) as sess:
+      pred = array_ops.placeholder(dtypes.bool, [])
+      x_init = constant_op.constant(1.0)
+
+      with ops.device("/cpu:0"):
+        z = control_flow_ops.while_loop(
+            lambda i, _: i < 3,
+            lambda i, x: (i + 1, control_flow_ops.cond(
+                pred, lambda: x * 2.0, lambda: 10.0)),
+            [0, x_init])
+
+      with ops.device("/cpu:1"):
+        grad = gradients_impl.gradients(z, x_init)[0]
+
+      self.assertEqual(sess.run(grad, {pred: True}), 8.0)
+      self.assertEqual(sess.run(grad, {pred: False}), 0.0)
+
+      if not control_flow_ops.ENABLE_WHILE_V2:
+        return
+
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x_init)[0]
+
+      self.assertEqual(sess.run(grad_grad, {pred: True}), 0.0)
+      self.assertEqual(sess.run(grad_grad, {pred: False}), 0.0)
 
   # NOTE: It is ok to have parallel_iterations > 1
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
@@ -1643,8 +1829,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
       variables.global_variables_initializer().run()
-      self.assertEqual(3, r.eval())
-      result = select.eval()
+      self.assertEqual(3, self.evaluate(r))
+      result = self.evaluate(select)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
@@ -1668,10 +1854,10 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
       variables.global_variables_initializer().run()
-      self.assertEqual(3, r.eval())
-      result1 = select1.eval()
+      self.assertEqual(3, self.evaluate(r))
+      result1 = self.evaluate(select1)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result1)
-      result2 = select2.eval()
+      result2 = self.evaluate(select2)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result2)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
@@ -1720,9 +1906,9 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [c], parallel_iterations=1)
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(10, var_b.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_5(self):
@@ -1749,10 +1935,10 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [var_b], parallel_iterations=1, name="loop")
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(10, var_a.eval())
-      self.assertEqual(10, var_b.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(10, self.evaluate(var_a))
+      self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileUpdateVariable_6(self):
@@ -1779,10 +1965,10 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [c], parallel_iterations=1, name="loop")
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(55, var_b.eval())
-      self.assertEqual(10, var_a.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(55, self.evaluate(var_b))
+      self.assertEqual(10, self.evaluate(var_a))
 
   def testWhileQueue_1(self):
     with self.cached_session():
@@ -1798,7 +1984,7 @@ class ControlFlowTest(test.TestCase):
         return ni
 
       r = control_flow_ops.while_loop(c, b, [i], parallel_iterations=1)
-      self.assertEqual([10], r.eval())
+      self.assertEqual([10], self.evaluate(r))
       for i in xrange(10):
         self.assertEqual([i], q.dequeue().eval())
 
@@ -1834,7 +2020,7 @@ class ControlFlowTest(test.TestCase):
           b1, [r, x],
           [r.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
-      self.assertEqual(45, rx.eval())
+      self.assertEqual(45, self.evaluate(rx))
 
   def _testWhileGrad_ColocateGradients(self, colocate):
     gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
@@ -1869,7 +2055,7 @@ class ControlFlowTest(test.TestCase):
         self.assertFalse(gpu_dev_name in dev)
 
     with self.session(graph=graph) as sess:
-      self.assertAllClose(1024.0, sess.run(r))
+      self.assertAllClose(1024.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116351701 (colocation)")
   def testWhileGrad_ColocateGradients(self):
@@ -1885,7 +2071,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(math_ops.less(1, 2), lambda: r, lambda: v)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(1024.0, r.eval())
+      self.assertAllClose(1024.0, self.evaluate(r))
 
   def testWhileGrad_Shape(self):
     with self.cached_session():
@@ -1925,7 +2111,7 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.multiply(r, r)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertEqual(524288.0, r.eval())
+      self.assertEqual(524288.0, self.evaluate(r))
 
   def testWhileGrad_LoopAdd(self):
     with self.cached_session():
@@ -1936,7 +2122,7 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.add(r, r)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(2048.0, r.eval())
+      self.assertAllClose(2048.0, self.evaluate(r))
 
   def _testWhileGrad_Mul(self, use_gpu, p_iters):
     with self.cached_session(use_gpu=use_gpu) as sess:
@@ -1979,7 +2165,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(512.0, r.eval())
+      self.assertAllClose(512.0, self.evaluate(r))
 
   def testNestedWhileCondWhileGrad(self):
     if control_flow_ops.ENABLE_WHILE_V2 and test_util.is_gpu_available():
@@ -2025,7 +2211,7 @@ class ControlFlowTest(test.TestCase):
       def fn1():
         r = control_flow_ops.while_loop(c, b, [n],
                                         [tensor_shape.unknown_shape()])
-        return gradients_impl.gradients(r, x)
+        return gradients_impl.gradients(r, x)[0]
 
       r = control_flow_ops.cond(math_ops.less(1, 2), fn1, lambda: x)
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
@@ -2271,10 +2457,11 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([x_f]):
         y_f_d = array_ops.identity(y_f, name="y_f_d")
 
-      self.assertAllClose(2.0, y_f_d.eval())  # y_f_d = 1.0 + 1.0
+      self.assertAllClose(2.0, self.evaluate(y_f_d))  # y_f_d = 1.0 + 1.0
       g = gradients_impl.gradients([y_f_d], [x])[0]
       self.assertTrue(g is not None)
-      self.assertAllClose(1.0, g.eval())  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
+      self.assertAllClose(1.0,
+                          self.evaluate(g))  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
 
   def _testNestedWhileGrad_Simple(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -2290,7 +2477,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(8.0, r.eval())
+      self.assertAllClose(8.0, self.evaluate(r))
 
   def testNestedWhileGrad_Simple(self):
     self._testNestedWhileGrad_Simple(use_gpu=False)
@@ -2317,7 +2504,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(256.0, r.eval())
+      self.assertAllClose(256.0, self.evaluate(r))
 
   def testNestedWhileGrad_ParallelInner(self):
     with self.cached_session():
@@ -2340,10 +2527,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(512.0, r.eval())
+      self.assertAllClose(512.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("unsupported: resource creation in body. "
-                                     "Enable with new TAs b/117675481")
   def testNestedWhileGrad_ParallelIterations(self):
     # Make sure the stack pushes and pops of an inner loop are executed in
     # the sequential order of the iterations of its outer loop.
@@ -2362,9 +2547,9 @@ class ControlFlowTest(test.TestCase):
       res = outer_loop(inp)
       optimizer = adam.AdamOptimizer(learning_rate=0.001)
       train_op = optimizer.minimize(math_ops.reduce_mean(math_ops.square(res)))
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(2.999, var.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+      self.assertAllClose(2.999, self.evaluate(var))
 
   def _testWhileCondGrad_Simple(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -2380,7 +2565,7 @@ class ControlFlowTest(test.TestCase):
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(1024.0, r.eval())
+      self.assertAllClose(1024.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/117519152")
   def testWhileCondGrad_Simple(self):
@@ -2422,11 +2607,11 @@ class ControlFlowTest(test.TestCase):
           [i0.get_shape(), tensor_shape.TensorShape([None, 2])])
       s = math_ops.reduce_sum(h)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       optimizer = gradient_descent.GradientDescentOptimizer(0.01)
       op = optimizer.minimize(s)
       sess.run(op)
-      self.assertAllClose([[0.98000002, 1.98000002]], sess.run(x))
+      self.assertAllClose([[0.98000002, 1.98000002]], self.evaluate(x))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileWithRefsWithGradients_1(self):
@@ -2477,7 +2662,7 @@ class ControlFlowTest(test.TestCase):
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       r = gradients_impl.gradients(r.values, values)[0]
-      self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
+      self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
   def testWhileGrad_SparseTensor(self):
@@ -2500,7 +2685,7 @@ class ControlFlowTest(test.TestCase):
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       r = gradients_impl.gradients(r.values, values)[0]
-      self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
+      self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   def testCallGradInLoop(self):
@@ -2520,10 +2705,8 @@ class ControlFlowTest(test.TestCase):
 
       output_grad = control_flow_ops.while_loop(
           c, b, [i0, constant_op.constant(0.0)])
-      self.assertAllClose(600.0, sess.run(output_grad)[1])
+      self.assertAllClose(600.0, self.evaluate(output_grad)[1])
 
-  @test_util.disable_control_flow_v2("unsupported: resource creation in body. "
-                                     "Enable with new TAs b/117675481")
   def testWhileAndTensorArray(self):
     with self.cached_session() as sess:
       param = constant_op.constant(2.0)
@@ -2541,7 +2724,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [n0, y0], parallel_iterations=1)
       r = gradients_impl.gradients(r, param)[0]
-      self.assertAllClose(107520.0, sess.run(r))
+      self.assertAllClose(107520.0, self.evaluate(r))
 
   def testWhileGrad_StopGrad(self):
     with self.cached_session():
@@ -2558,9 +2741,9 @@ class ControlFlowTest(test.TestCase):
       rx, ry = control_flow_ops.while_loop(c, b, [x, y])
 
       r = gradients_impl.gradients(rx, y)[0]
-      self.assertEqual(136.0, r.eval())
+      self.assertEqual(136.0, self.evaluate(r))
       r = gradients_impl.gradients(ry, y)[0]
-      self.assertEqual(32.0, r.eval())
+      self.assertEqual(32.0, self.evaluate(r))
 
       r = gradients_impl.gradients(array_ops.stop_gradient(rx), y)[0]
       self.assertEqual(r, None)
@@ -2578,13 +2761,13 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r, None)
 
       r = gradients_impl.gradients(math_ops.add(rx, ry), y)[0]
-      self.assertEqual(168.0, r.eval())
+      self.assertEqual(168.0, self.evaluate(r))
       r = gradients_impl.gradients(
           math_ops.add(rx, array_ops.stop_gradient(ry)), y)[0]
-      self.assertEqual(136.0, r.eval())
+      self.assertEqual(136.0, self.evaluate(r))
       r = gradients_impl.gradients(
           math_ops.add(array_ops.stop_gradient(rx), ry), y)[0]
-      self.assertEqual(32.0, r.eval())
+      self.assertEqual(32.0, self.evaluate(r))
 
   def testWhileGrad_StopGradInside(self):
     with self.cached_session():
@@ -2601,9 +2784,9 @@ class ControlFlowTest(test.TestCase):
       rx, _ = control_flow_ops.while_loop(c, b, [x, y])
 
       r = gradients_impl.gradients(rx, y)[0]
-      self.assertAllClose(0.0, r.eval())
+      self.assertAllClose(0.0, self.evaluate(r))
       r = gradients_impl.gradients(rx, x)[0]
-      self.assertAllClose(156.0, r.eval())
+      self.assertAllClose(156.0, self.evaluate(r))
 
   def testWhileGrad_StopGradInsideNoShape(self):
     with self.cached_session() as sess:
@@ -2657,7 +2840,7 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.add(math_ops.square(y), rx)
       r = math_ops.add(r, rg)
       r = gradients_impl.gradients(r, y)[0]
-      self.assertEqual(388.0, r.eval())
+      self.assertEqual(388.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileGradientWithNontrainablePath1(self):
@@ -2674,8 +2857,8 @@ class ControlFlowTest(test.TestCase):
     dy_dq, = gradients_impl.gradients(y, q)
     self.assertIsNotNone(dy_dq)
     with self.cached_session() as sess:
-      sess.run(q.initializer)
-      self.assertAllClose([0., 0.], sess.run(dy_dq))
+      self.evaluate(q.initializer)
+      self.assertAllClose([0., 0.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   def testWhileGradientWithNontrainablePath2(self):
@@ -2692,8 +2875,8 @@ class ControlFlowTest(test.TestCase):
     dy_dq, = gradients_impl.gradients(y, q)
     self.assertIsNotNone(dy_dq)
     with self.cached_session() as sess:
-      sess.run(q.initializer)
-      self.assertAllClose([1., 1.], sess.run(dy_dq))
+      self.evaluate(q.initializer)
+      self.assertAllClose([1., 1.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   def testIssue16504(self):
@@ -2743,7 +2926,7 @@ class ControlFlowTest(test.TestCase):
       z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
       result = gradients_impl.gradients(z, vars_)[0]
       variables.global_variables_initializer().run()
-      self.assertEqual(5.0, result.eval())
+      self.assertEqual(5.0, self.evaluate(result))
 
   def testOneValueCond(self):
 
@@ -2804,7 +2987,7 @@ class ControlFlowTest(test.TestCase):
       r4 = control_flow_ops.case(
           [(x < y, f1), (x < y, f2)], default=f3, exclusive=True)
       with self.assertRaisesOpError("Input error:"):
-        r4.eval()
+        self.evaluate(r4)
 
       # Check that the default is called if none of the others are
       r5 = control_flow_ops.case({x > y: f1}, default=f3)
@@ -2851,17 +3034,17 @@ class ControlFlowTest(test.TestCase):
 
       variables.global_variables_initializer().run()
       self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(2, r2.eval())
+      self.assertEqual(2, self.evaluate(r2))
       self.assertAllEqual(sess.run([v0, v1, v2]), [-1, -1, 2])
 
       variables.global_variables_initializer().run()
       self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(1, r1.eval())
+      self.assertEqual(1, self.evaluate(r1))
       self.assertAllEqual(sess.run([v0, v1, v2]), [-1, 1, -1])
 
       variables.global_variables_initializer().run()
       self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(0, r0.eval())
+      self.assertEqual(0, self.evaluate(r0))
       self.assertAllEqual(sess.run([v0, v1, v2]), [0, -1, -1])
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
@@ -2883,15 +3066,15 @@ class ControlFlowTest(test.TestCase):
       self.assertTrue(isinstance(i, ops.Tensor))
       variables.global_variables_initializer().run()
 
-      self.assertEqual(0, v.eval())
+      self.assertEqual(0, self.evaluate(v))
 
       # True case: c = 2 is >= 1, v is set to 1.
       self.assertEqual(1, i.eval(feed_dict={c.name: 2}))
-      self.assertEqual(1, v.eval())
+      self.assertEqual(1, self.evaluate(v))
 
       # False case: c = 0 is not >= 1, v is set to 2.
       self.assertEqual(2, i.eval(feed_dict={c.name: 0}))
-      self.assertEqual(2, v.eval())
+      self.assertEqual(2, self.evaluate(v))
 
   def testWithOpsDependencies(self):
     with self.cached_session() as sess:
@@ -2933,14 +3116,14 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching v directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        v.eval()
+        self.evaluate(v)
 
       # Get the value of 'c2_with_c1_dep', which should cause 'v'
       # to be initialized.
-      self.assertAllEqual(20, c2_with_c1_dep.eval())
+      self.assertAllEqual(20, self.evaluate(c2_with_c1_dep))
 
       # Ensure that 'v' is initialized
-      self.assertAllClose(0.0, v.eval())
+      self.assertAllClose(0.0, self.evaluate(v))
 
   def testWithIndexedSlicesDependencies(self):
     with self.cached_session():
@@ -2955,13 +3138,15 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching gather_v_at_1 will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        gather_v_at_1.eval()
+        self.evaluate(gather_v_at_1)
 
       # Getting gather_v_at_1_after_init will work, and initialize v.
-      self.assertAllEqual([[10.0, 11.0]], gather_v_at_1_after_init.eval())
+      self.assertAllEqual([[10.0, 11.0]],
+                          self.evaluate(gather_v_at_1_after_init))
 
       # Double check that 'v' is initialized
-      self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]], v.eval())
+      self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
+                          self.evaluate(v))
 
   def testDependenciesDevice(self):
     with ops.Graph().as_default():
@@ -2995,7 +3180,7 @@ class ControlFlowTest(test.TestCase):
       init = control_flow_ops.group(v1.initializer, v2.initializer)
       # Fetching v1 directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        v1.eval()
+        self.evaluate(v1)
 
       # Runs "init" before fetching v1 and v2.
       init.run()
@@ -3110,7 +3295,7 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(condition, body,
                                            [constant_op.constant(4)])
-      self.assertEqual(10, sess.run(result))
+      self.assertEqual(10, self.evaluate(result))
 
       # Ensure that we cannot run a tensor that escapes the loop body
       # accidentally.
@@ -3154,6 +3339,47 @@ class ControlFlowTest(test.TestCase):
                 ]), 1)
 
 
+  def testQIntSwitchMerge(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      constant_qint = constant_op.constant(np.array([42]), dtypes.qint8)
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.switch(constant_qint, cond)
+      result = control_flow_ops.merge([v_f, v_t])
+      sess.run(result)
+
+  def testQIntRefSwitchMerge(self):
+    with self.cached_session(use_gpu=test.is_gpu_available()) as sess:
+      var_qint = gen_state_ops.variable(
+          shape=[1], dtype=dtypes.qint8, name="v", container="", shared_name="")
+      assign_op = state_ops.assign(
+          var_qint, constant_op.constant(np.array([42]), dtypes.qint8))
+      self.evaluate(assign_op)
+
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.ref_switch(var_qint, cond)
+      result = control_flow_ops.ref_merge([v_f, v_t])
+      sess.run(result)
+
+  def testUInt64SwitchMerge(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      constant_uint64 = constant_op.constant(np.array([42]), dtypes.uint64)
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.switch(constant_uint64, cond)
+      result = control_flow_ops.merge([v_f, v_t])
+      sess.run(result)
+
+  def testQIntArgAndRet(self):
+
+    @function.Defun(dtypes.qint8)
+    def func(x):
+      return x
+
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      qint = constant_op.constant(np.array([42]), dtypes.qint8)
+      result = func(qint)
+      sess.run(result)
+
+
 class ControlFlowContextCheckTest(test.TestCase):
 
   def _getWhileTensor(self):
@@ -3290,20 +3516,20 @@ class TupleTest(test.TestCase):
 
         # v1 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v1.eval()
+          self.evaluate(v1)
 
         # v2 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v2.eval()
+          self.evaluate(v2)
 
         if v1_first:
           # Getting t1 initializes v2.
-          self.assertAllClose([3.0], t1.eval())
-          self.assertAllClose([10.0], v2.eval())
+          self.assertAllClose([3.0], self.evaluate(t1))
+          self.assertAllClose([10.0], self.evaluate(v2))
         else:
           # Getting t2 initializes v1.
-          self.assertAllClose([30.0], t2.eval())
-          self.assertAllClose([1.0], v1.eval())
+          self.assertAllClose([30.0], self.evaluate(t2))
+          self.assertAllClose([1.0], self.evaluate(v1))
 
   def testIndexedSlices(self):
     for v1_first in [True, False]:
@@ -3328,22 +3554,22 @@ class TupleTest(test.TestCase):
 
         # v1 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v1.eval()
+          self.evaluate(v1)
 
         # v2 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v2.eval()
+          self.evaluate(v2)
 
         if v1_first:
           # Getting g1 initializes v2.
-          self.assertAllClose([[10.0, 11.0]], g1.eval())
+          self.assertAllClose([[10.0, 11.0]], self.evaluate(g1))
           self.assertAllClose([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]],
-                              v2.eval())
+                              self.evaluate(v2))
         else:
           # Getting g2 initializes v1.
-          self.assertAllClose([[10.1, 11.1]], g2.eval())
+          self.assertAllClose([[10.1, 11.1]], self.evaluate(g2))
           self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
-                              v1.eval())
+                              self.evaluate(v1))
 
   def testAcceptTensorsAsControlInputs(self):
     with self.cached_session():
@@ -3353,9 +3579,9 @@ class TupleTest(test.TestCase):
           [constant_op.constant(0)], control_inputs=[assign])
 
       # Should trigger the assign.
-      t.eval()
+      self.evaluate(t)
 
-      self.assertEquals(1, var.eval())
+      self.assertEquals(1, self.evaluate(var))
 
 
 class AssertTest(test.TestCase):
@@ -3456,7 +3682,7 @@ class WhileOpBenchmark(test.Benchmark):
     with session.Session() as sess, ops.device(default_device):
       # Get the initial id i, input x, and kernel.
       i, x, kernel = self._getInitVariables()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       if static_unroll:
         for _ in xrange(steps):
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index 8540875d75e19b967aa4da9b4499b030df10dd7e..e8463323df90bd37d927f88bd41b09bef45de541 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -43,7 +43,7 @@ class Conv1DTest(test.TestCase):
         with self.cached_session(use_gpu=test.is_gpu_available()):
           c = nn_ops.conv1d(x, filters, stride, padding="VALID")
           reduced = array_ops.squeeze(c)
-          output = reduced.eval()
+          output = self.evaluate(reduced)
           if stride == 1:
             self.assertEqual(len(output), 3)
             self.assertAllClose(output,
@@ -69,7 +69,7 @@ class Conv1DTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv1d_transpose(
           x, f, y_shape, stride=stride, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index 6f9992a317f44268c772aa6a3316120f0577eeb3..d9aa4ab96725323ca6c6d0bb01e050fe3c926ed3 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -23,7 +23,6 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -53,7 +52,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells=kernel_height * kernel_width
@@ -91,7 +90,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[2]):
@@ -124,7 +123,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
@@ -195,7 +194,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="SAME", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         for n in xrange(x_shape[0]):
           for k in xrange(f_shape[2]):
             for w in xrange(y_shape[3]):
@@ -230,7 +229,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="SAME", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         for n in xrange(x_shape[0]):
           for k in xrange(f_shape[2]):
             for w in xrange(y_shape[3]):
@@ -265,7 +264,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="VALID", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         cache_values = np.zeros(y_shape, dtype=np.float32)
         # The amount of padding added
         pad = 1
@@ -293,7 +292,6 @@ class Conv2DTransposeTest(test.TestCase):
 
         self.assertAllClose(cache_values, value)
 
-  @test_util.enable_c_shapes
   def testConv2DTransposeShapeInference(self):
     # Test case for 8972
     initializer = random_ops.truncated_normal(
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index 2527b837692b5e31126499db85224d2a8d3b5321..d4e7ec14da304969cc7a811e7117a9f7c2516dde 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -48,7 +48,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells = kernel_depth * kernel_height * kernel_width
@@ -98,7 +98,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[3]):
@@ -146,7 +146,7 @@ class Conv3DTransposeTest(test.TestCase):
         output = nn_ops.conv3d_transpose(
             x_value, f_value, constant_op.constant(y_shape, dtype=dtype),
             strides=strides, padding="SAME")
-        output.eval()
+        self.evaluate(output)
 
   def testConv3DTransposeValid(self):
     with self.cached_session():
@@ -165,7 +165,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 57b09dc167fc8560ada595ad0d342ba76987ed13..3ec5c29df7daf19127d1b99b4b2113d42d048e7f 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -52,11 +52,11 @@ class Conv3DTest(test.TestCase):
   def _DtypesToTest(self, use_gpu):
     if use_gpu:
       if not test_util.CudaSupportsHalfMatMulAndConv():
-        return [dtypes.float32]
+        return [dtypes.float64, dtypes.float32]
       else:
         # It is important that float32 comes before float16 here,
         # as we will be using its gradients as reference for fp16 gradients.
-        return [dtypes.float32, dtypes.float16]
+        return [dtypes.float64, dtypes.float32, dtypes.float16]
     else:
       return [dtypes.float64, dtypes.float32, dtypes.float16]
 
@@ -109,7 +109,7 @@ class Conv3DTest(test.TestCase):
         results.append(result)
 
       with self.cached_session() as sess:
-        values = sess.run(results)
+        values = self.evaluate(results)
         for value in values:
           print("expected = ", expected)
           print("actual = ", value)
@@ -184,8 +184,8 @@ class Conv3DTest(test.TestCase):
         computed_results.append(computed)
         tolerance = 1e-2 if use_gpu else 1e-5
         with self.cached_session() as sess:
-          expected_values = sess.run(expected_results)
-          computed_values = sess.run(computed_results)
+          expected_values = self.evaluate(expected_results)
+          computed_values = self.evaluate(computed_results)
           for e_value, c_value in zip(expected_values, computed_values):
             print("expected = ", e_value)
             print("actual = ", c_value)
@@ -638,6 +638,30 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  # Test the fast path in gemm_pack_rhs/mkldnn_gemm_pack, when channel
+  # dimension is a multiple of packet size.
+  def testInputGradientValidPaddingStrideOneFastPath(self):
+    self.ConstructAndTestGradient(
+        batch=2,
+        input_shape=(3, 5, 4),
+        filter_shape=(2, 2, 2),
+        in_depth=8,
+        out_depth=2,
+        stride=1,
+        padding="VALID",
+        test_input=True)
+
+  def testFilterGradientValidPaddingStrideOneFastPath(self):
+    self.ConstructAndTestGradient(
+        batch=2,
+        input_shape=(4, 6, 5),
+        filter_shape=(2, 2, 2),
+        in_depth=8,
+        out_depth=2,
+        stride=1,
+        padding="VALID",
+        test_input=False)
+
   # Testing for backprops
   def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
                             strides, dilations, padding, data_format, use_gpu,
@@ -691,8 +715,8 @@ class Conv3DTest(test.TestCase):
         expected_grad = gradients_impl.gradients(expected, t1
                                                  if mode == "input" else t2)[0]
         # "values" consists of two tensors for two backprops
-        actual_value = sess.run(actual_grad)
-        expected_value = sess.run(expected_grad)
+        actual_value = self.evaluate(actual_grad)
+        expected_value = self.evaluate(expected_grad)
         self.assertShapeEqual(actual_value, actual_grad)
         self.assertShapeEqual(expected_value, expected_grad)
       print("expected = ", expected_value)
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 0ccbbf155c53196043c2c4597168c3b2ace72b20..2d21f6f4ae5100dbda413891d37545e37bf5207d 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -908,8 +908,8 @@ class Conv2DTest(test.TestCase):
         conv = gradients_impl.gradients(conv_forward, t1)[0]
         conv_2 = gradients_impl.gradients(conv_forward_2, t1)[0]
         # "values" consists of two tensors for two backprops
-        value = sess.run(conv)
-        value_2 = sess.run(conv_2)
+        value = self.evaluate(conv)
+        value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
       tf_logging.info("expected = ", value_2)
@@ -961,8 +961,8 @@ class Conv2DTest(test.TestCase):
           conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
         conv = gradients_impl.gradients(conv_forward, t2)[0]
         conv_2 = gradients_impl.gradients(conv_forward, t2)[0]
-        value = sess.run(conv)
-        value_2 = sess.run(conv_2)
+        value = self.evaluate(conv)
+        value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
       tf_logging.info("expected = ", value_2)
@@ -1545,7 +1545,7 @@ class DepthwiseConv2DTest(test.TestCase):
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
@@ -1667,9 +1667,9 @@ class SeparableConv2DTest(test.TestCase):
       if data_format == "NCHW":
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = ", value)
-    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertArrayNear(expected, np.ravel(value), 1e-3)
     self.assertShapeEqual(value, conv)
 
   def _testSeparableConv2D(self, data_format):
diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index cfc7cb98aa678190ae81a2d9ee40ef4984453a91..b38776ec5bb03badaa98983c02be70b53f141666 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -23,9 +23,15 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ctc_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
@@ -52,6 +58,24 @@ def SimpleSparseTensorFrom(x):
   return sparse_tensor.SparseTensor(x_ix, x_val, x_shape)
 
 
+def _ctc_loss_v2(labels, inputs, sequence_length,
+                 preprocess_collapse_repeated=False,
+                 ctc_merge_repeated=True,
+                 ignore_longer_outputs_than_inputs=False,
+                 time_major=True):
+  """Call ctc_loss_v2 with v1 args."""
+  assert not preprocess_collapse_repeated
+  assert ctc_merge_repeated
+  assert not ignore_longer_outputs_than_inputs
+  return ctc_ops.ctc_loss_v2(
+      labels=labels,
+      logits=inputs,
+      logit_length=sequence_length,
+      label_length=None,
+      blank_index=-1,
+      logits_time_major=time_major)
+
+
 class CTCLossTest(test.TestCase):
 
   def _testCTCLoss(self,
@@ -66,7 +90,7 @@ class CTCLossTest(test.TestCase):
     inputs_t = constant_op.constant(inputs)
 
     with self.cached_session(use_gpu=False) as sess:
-      loss = ctc_ops.ctc_loss(
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
       grad = gradients_impl.gradients(loss, [inputs_t])[0]
 
@@ -234,9 +258,9 @@ class CTCLossTest(test.TestCase):
     inputs_t_transposed = constant_op.constant(inputs.transpose(1, 0, 2))
 
     with self.session(use_gpu=False) as sess:
-      loss = ctc_ops.ctc_loss(
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
-      loss_transposed = ctc_ops.ctc_loss(
+      loss_transposed = _ctc_loss_v2(
           inputs=inputs_t_transposed,
           labels=labels,
           sequence_length=seq_lens,
@@ -253,7 +277,7 @@ class CTCLossTest(test.TestCase):
     v = [1.0]
 
     with self.session(use_gpu=False):
-      loss = ctc_ops.ctc_loss(
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
       # Taking ths second gradient should fail, since it is not
       # yet supported.
@@ -272,7 +296,519 @@ class CTCLossTest(test.TestCase):
     with self.session(use_gpu=False) as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "batch_size must not be 0"):
-        sess.run(ctc_ops.ctc_loss(labels, inputs, sequence_lengths))
+        sess.run(_ctc_loss_v2(labels, inputs, sequence_lengths))
+
+
+class CTCLossTestV2(test.TestCase):
+
+  def testCtcLossV2(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    max_label_length = 5
+    num_frames = 12
+
+    labels = random_ops.random_uniform(
+        [batch_size, max_label_length], minval=1, maxval=num_labels,
+        dtype=dtypes.int64)
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+
+    label_length = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=max_label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_length, maxlen=max_label_length, dtype=label_length.dtype)
+    labels *= label_mask
+    logit_length = [num_frames] * batch_size
+
+    ref_loss = ctc_ops.ctc_loss_v2(
+        labels=labels,
+        logits=logits,
+        label_length=label_length,
+        logit_length=logit_length)
+    ref_grad = gradients_impl.gradients(ref_loss, [logits])
+
+    sparse_labels = ctc_ops.dense_labels_to_sparse(labels, label_length)
+
+    def assert_same_loss_and_grads(loss):
+      with self.cached_session() as sess:
+        self.assertAllClose(*sess.run([loss, ref_loss]))
+        grad = gradients_impl.gradients(loss, [logits])
+        self.assertAllClose(*sess.run([grad, ref_grad]), rtol=2e-06, atol=2e-06)
+
+    assert_same_loss_and_grads(
+        ctc_ops.ctc_loss_v2(
+            labels=sparse_labels,
+            logits=logits,
+            label_length=label_length,
+            logit_length=logit_length,
+            blank_index=0))
+
+  def testCtcLossDenseIsSameAsCtcLoss(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      random_seed.set_random_seed(5)
+
+      batch_size = 8
+      num_labels = 6
+      label_length = 5
+      num_frames = 12
+      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+      labels = random_ops.random_uniform(
+          [batch_size, label_length], minval=1, maxval=num_labels,
+          dtype=dtypes.int64)
+
+      label_lengths = random_ops.random_uniform(
+          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+      label_mask = array_ops.sequence_mask(
+          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+      labels *= label_mask
+
+      logit_lengths = [num_frames] * batch_size
+
+      ctc_loss = ctc_ops.ctc_loss_dense(
+          labels=labels,
+          logits=logits,
+          label_length=label_lengths,
+          logit_length=logit_lengths)
+      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+      # Shift labels down by one (move blank from 0 to num_labels -1)
+      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
+      tf_nn_ctc_logits = array_ops.concat([
+          logits[:, :, 1:],
+          logits[:, :, 0:1],
+      ], axis=2)
+
+      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+          tf_ctc_loss_labels, label_lengths)
+
+      tf_nn_ctc_loss = ctc_ops.ctc_loss(
+          labels=tf_ctc_loss_labels,
+          inputs=tf_nn_ctc_logits,
+          sequence_length=logit_lengths,
+          time_major=True)
+      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+      with self.cached_session() as sess:
+        for _ in range(32):
+          self.assertAllClose(*sess.run([ctc_loss, tf_nn_ctc_loss]))
+          self.assertAllClose(*sess.run([ctc_loss_grads, tf_nn_ctc_grads]),
+                              rtol=2e-06, atol=2e-06)
+
+  def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    label_length = 5
+    num_frames = 12
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+    labels = random_ops.random_uniform(
+        [batch_size, label_length], minval=1, maxval=num_labels,
+        dtype=dtypes.int64)
+
+    label_lengths = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+    labels *= label_mask
+
+    logit_lengths = [num_frames] * batch_size
+
+    ctc_loss = ctc_ops.ctc_loss_dense(
+        labels=labels,
+        logits=logits,
+        label_length=label_lengths,
+        logit_length=logit_lengths,
+        unique=ctc_ops.ctc_unique_labels(labels))
+    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+    # Shift labels down by one (move blank from 0 to num_labels -1)
+    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
+    tf_nn_ctc_logits = array_ops.concat([
+        logits[:, :, 1:],
+        logits[:, :, 0:1],
+    ], axis=2)
+
+    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+        tf_ctc_loss_labels, label_lengths)
+
+    tf_nn_ctc_loss = ctc_ops.ctc_loss(
+        labels=tf_ctc_loss_labels,
+        inputs=tf_nn_ctc_logits,
+        sequence_length=logit_lengths,
+        time_major=True)
+    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+    with self.cached_session() as sess:
+      for _ in range(32):
+        self.assertAllClose(*sess.run([ctc_loss, tf_nn_ctc_loss]))
+        self.assertAllClose(*sess.run([ctc_loss_grads, tf_nn_ctc_grads]),
+                            rtol=2e-06, atol=2e-06)
+
+  def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    label_length = 5
+    num_frames = 12
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+    labels = random_ops.random_uniform(
+        [batch_size, label_length], minval=0, maxval=num_labels-1,
+        dtype=dtypes.int64)
+
+    label_lengths = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+    labels *= label_mask
+
+    logit_lengths = [num_frames] * batch_size
+
+    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
+    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+        tf_ctc_loss_labels, label_lengths)
+
+    tf_nn_ctc_loss = ctc_ops.ctc_loss(
+        labels=tf_ctc_loss_labels,
+        inputs=logits,
+        sequence_length=logit_lengths,
+        time_major=True)
+    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+    # Shift the blank logits/labels to be somewhere in the middle.
+    blank_index = 2
+    shifted_logits = array_ops.concat([
+        logits[:, :, :blank_index],
+        logits[:, :, -1:],
+        logits[:, :, blank_index:-1],
+    ], axis=2)
+    shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)
+
+    ctc_loss = ctc_ops.ctc_loss_dense(
+        labels=shifted_labels,
+        logits=shifted_logits,
+        label_length=label_lengths,
+        logit_length=logit_lengths,
+        blank_index=blank_index)
+    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+    with self.cached_session() as sess:
+      for _ in range(32):
+        self.assertAllClose(*sess.run([ctc_loss, tf_nn_ctc_loss]))
+        self.assertAllClose(*sess.run([ctc_loss_grads, tf_nn_ctc_grads]),
+                            rtol=2e-06, atol=2e-06)
+
+  def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      random_seed.set_random_seed(5)
+
+      batch_size = 8
+      num_labels = 6
+      label_length = 5
+      num_frames = 12
+      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+      labels = random_ops.random_uniform(
+          [batch_size, label_length], minval=0, maxval=num_labels-1,
+          dtype=dtypes.int64)
+
+      label_lengths = random_ops.random_uniform(
+          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+      label_mask = array_ops.sequence_mask(
+          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+      labels *= label_mask
+
+      logit_lengths = [num_frames] * batch_size
+
+      ctc_loss = ctc_ops.ctc_loss_dense(
+          labels=labels,
+          logits=logits,
+          label_length=label_lengths,
+          logit_length=logit_lengths,
+          blank_index=-1)
+      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
+      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+          tf_ctc_loss_labels, label_lengths)
+
+      tf_nn_ctc_loss = ctc_ops.ctc_loss(
+          labels=tf_ctc_loss_labels,
+          inputs=logits,
+          sequence_length=logit_lengths,
+          time_major=True)
+      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+      with self.cached_session() as sess:
+        for _ in range(32):
+          self.assertAllClose(*sess.run([ctc_loss, tf_nn_ctc_loss]))
+          self.assertAllClose(*sess.run([ctc_loss_grads, tf_nn_ctc_grads]),
+                              rtol=2e-06, atol=2e-06)
+
+  def testCollapseRepeated(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 3, 3, 3, 0],
+                [1, 4, 4, 4, 0],
+                [4, 2, 2, 9, 4]],
+        seq_length=[4, 5, 5])
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  def testCollapseRepeatedPreservesDtypes(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=constant_op.constant(
+            [[1, 3, 3, 3, 0],
+             [1, 4, 4, 4, 0],
+             [4, 2, 2, 9, 4]],
+            dtype=dtypes.int64),
+        seq_length=constant_op.constant([4, 5, 5], dtype=dtypes.int64))
+    self.assertEqual(new_seq_lengths.dtype, dtypes.int64)
+    self.assertEqual(collapsed.dtype, dtypes.int64)
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  def testCollapseRepeatedExtraPadding(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 3, 3, 3, 0, 0, 0],
+                [1, 4, 4, 4, 0, 1, 2],
+                [4, 2, 2, 9, 4, 0, 0]],
+        seq_length=[4, 5, 5])
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  def testCollapseRepeatedFrontRepeats(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 1, 1, 2, 2],
+                [1, 1, 1, 2, 2],
+                [1, 1, 1, 2, 2]],
+        seq_length=[5, 4, 3])
+    self.assertAllEqual(new_seq_lengths, [2, 2, 1])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 2],
+         [1, 2],
+         [1, 0]])
+
+  def testCollapseRepeatedAllLabelsTheSame(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 1, 1, 1, 1],
+                [1, 1, 1, 1, 1],
+                [1, 1, 1, 1, 1]],
+        seq_length=[4, 5, 1])
+    self.assertAllEqual(new_seq_lengths, [1, 1, 1])
+    self.assertAllEqual(
+        collapsed,
+        [[1],
+         [1],
+         [1]])
+
+  def testDenseSequencesToSparse(self):
+    labels = [[1, 3, 3, 3, 0],
+              [1, 4, 4, 4, 0],
+              [4, 2, 2, 9, 4]]
+    length = [4, 5, 5]
+    sparse = ctc_ops.dense_labels_to_sparse(labels, length)
+    new_dense = sparse_ops.sparse_tensor_to_dense(sparse)
+
+    self.assertAllEqual(labels, new_dense)
+
+    padded_labels = [[1, 3, 3, 3, 0, 0, 0, 0],
+                     [1, 4, 4, 4, 0, 0, 0, 0],
+                     [4, 2, 2, 9, 4, 0, 0, 0]]
+    length = [4, 5, 5]
+    sparse = ctc_ops.dense_labels_to_sparse(padded_labels, length)
+    padded_dense = sparse_ops.sparse_tensor_to_dense(sparse)
+
+    self.assertAllEqual(padded_dense, new_dense)
+
+  def testUnique(self):
+    labels = [
+        [3, 4, 4, 3],
+        [1, 1, 1, 0],
+    ]
+    unique, idx = ctc_ops.ctc_unique_labels(labels)
+    self.assertAllEqual([
+        [3, 4, 0, 0],
+        [1, 0, 0, 0],
+    ], unique)
+    self.assertAllEqual([
+        [0, 1, 1, 0],
+        [0, 0, 0, 1],
+    ], idx)
+
+  def testSumStates(self):
+    idx = [
+        [0, 1, 0, 1],
+        [0, 0, 0, 1],
+    ]
+    states = math_ops.log([
+        [[1.0, 2.0, 3.0, 4.0],
+         [5.0, 6.0, 7.0, 8.0]],
+        [[0.1, 0.2, 0.3, 0.4],
+         [0.5, 0.6, 0.7, 0.8]],
+    ])
+    sum_of_states = math_ops.exp(ctc_ops._sum_states(idx, states))
+    self.assertAllClose([
+        [[4.0, 6.0, 0.0, 0.0],
+         [18.0, 8.0, 0.0, 0.0]],
+        [[0.4, 0.6, 0.0, 0.0],
+         [1.8, 0.8, 0.0, 0.0]]
+    ], sum_of_states)
+
+  def testStateToOlabel(self):
+    labels = [
+        [3, 4, 3, 4],
+        [1, 1, 1, 0],
+    ]
+    num_labels = 8
+
+    # 3 frames, 2 batch, 10 states (5 label, 5 blank).
+    states = [
+        [[0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20],
+         [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]],
+        [[1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
+         [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]],
+        [[11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0],
+         [21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]],
+    ]
+    labels = ops.convert_to_tensor(labels)
+    states = math_ops.log(states)
+    olabel = ctc_ops._state_to_olabel(labels, num_labels, states)
+    olabel = math_ops.exp(olabel)
+    blank = olabel[:, :, 0]
+    self.assertAllClose(blank, [
+        [0.16 + 0.17 + 0.18 + 0.19 + 0.20,
+         0.26 + 0.27 + 0.28 + 0.29 + 0.30],
+        [1.6 + 1.7 + 1.8 + 1.9 + 2.0,
+         2.6 + 2.7 + 2.8 + 2.9 + 3.0],
+        [16.0 + 17.0 + 18.0 + 19.0 + 20.0,
+         26.0 + 27.0 + 28.0 + 29.0 + 30.0]
+    ])
+    self.assertAllClose(olabel[:, :, 1:], [
+        [[0.0, 0.0, 0.12 + 0.14, 0.13 + 0.15, 0.0, 0.0, 0.0],
+         [0.22 + 0.23 + 0.24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 1.2 + 1.4, 1.3 + 1.5, 0.0, 0.0, 0.0],
+         [2.2 + 2.3 + 2.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 12.0 + 14.0, 13.0 + 15.0, 0.0, 0.0, 0.0],
+         [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+    ])
+
+  def testStateToOlabelUnique(self):
+    labels = [
+        [3, 4, 3, 4],
+        [1, 1, 1, 0],
+    ]
+    num_labels = 8
+
+    # 3 frames, 2 batch, 10 states (5 label, 5 blank).
+    states = [
+        [[0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20],
+         [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]],
+        [[1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
+         [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]],
+        [[11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0],
+         [21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]],
+    ]
+    labels = ops.convert_to_tensor(labels)
+    states = math_ops.log(states)
+    olabel = ctc_ops._state_to_olabel_unique(
+        labels, num_labels, states, ctc_ops.ctc_unique_labels(labels))
+    olabel = math_ops.exp(olabel)
+    blank = olabel[:, :, 0]
+    self.assertAllClose(blank, [
+        [0.16 + 0.17 + 0.18 + 0.19 + 0.20,
+         0.26 + 0.27 + 0.28 + 0.29 + 0.30],
+        [1.6 + 1.7 + 1.8 + 1.9 + 2.0,
+         2.6 + 2.7 + 2.8 + 2.9 + 3.0],
+        [16.0 + 17.0 + 18.0 + 19.0 + 20.0,
+         26.0 + 27.0 + 28.0 + 29.0 + 30.0]])
+    self.assertAllClose(olabel[:, :, 1:], [
+        [[0.0, 0.0, 0.12 + 0.14, 0.13 + 0.15, 0.0, 0.0, 0.0],
+         [0.22 + 0.23 + 0.24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 1.2 + 1.4, 1.3 + 1.5, 0.0, 0.0, 0.0],
+         [2.2 + 2.3 + 2.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 12.0 + 14.0, 13.0 + 15.0, 0.0, 0.0, 0.0],
+         [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+    ])
+
+  def testScan(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      out = ctc_ops._scan(
+          lambda accum, elem: accum + elem,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0)
+      self.assertAllEqual([24.0, 26.0, 29.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          inclusive=True)
+      self.assertAllEqual([23.0, 24.0, 26.0, 29.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          reverse=True)
+      self.assertAllEqual([29.0, 28.0, 26.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          reverse=True,
+          inclusive=True)
+      self.assertAllEqual([29.0, 28.0, 26.0, 23.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]),
+          constant_op.constant([23.0, 24.0]))
+      self.assertAllEqual([[23.0, 25.0], [25.0, 28.0], [29.0, 33.0]], out)
+
+  def testScanCapturesVariables(self):
+    with self.cached_session() as sess:
+      x = random_ops.random_uniform([])
+      fn = lambda accum, elem: accum + x * elem
+      out = ctc_ops._scan(fn, constant_op.constant([0.0, 1.0, 2.0]), 23.0)
+      self.assertAllEqual(*sess.run([
+          [23.0 + x * 0.0, 23.0 + x * 1.0, 23.0 + x * 3.0], out
+      ]))
+
+  def testScanMultipleAccumulators(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      def fn(accum, elem):
+        accum_a, accum_b = accum
+        return accum_a + elem, accum_b * elem
+      out = ctc_ops._scan(
+          fn, constant_op.constant([1.0, 2.0, 3.0]),
+          (23.0, constant_op.constant([1.0, 2.0])))
+      a, b = out
+      self.assertAllEqual([24.0, 26.0, 29.0], a)
+      self.assertAllEqual([[1.0, 2.0], [2.0, 4.0], [6.0, 12.0]], b)
+
+  def testScanMultipleElements(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      def fn(accum, elem):
+        elem_a, elem_b = elem
+        return accum + (elem_a * elem_b)
+      elems_a = constant_op.constant([1.0, 2.0, 3.0])
+      elems_b = constant_op.constant([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]])
+      out = ctc_ops._scan(
+          fn, (elems_a, elems_b),
+          initial=constant_op.constant([0.0, 0.0]))
+      self.assertAllEqual(
+          [[1.0, 2.0], [5.0, 8.0], [14.0, 20.0]], out)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
index 8028f93a8c561c4e5d416240469c5da1724dd1ab..df166b610191d2ce2efc431393247b9c60ba3ef0 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
@@ -81,7 +81,7 @@ class BinaryOpTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_cpu = out.eval()
+      tf_cpu = self.evaluate(out)
       # Test that the op takes precedence over numpy operators.
       np_left = tf_func(x, iny).eval()
       np_right = tf_func(inx, y).eval()
@@ -178,7 +178,7 @@ class BinaryOpTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = out.eval()
+      tf_gpu = self.evaluate(out)
     self.assertAllClose(np_ans, tf_gpu)
     self.assertShapeEqual(np_ans, out)
     # TODO(zhifengc/ke): make gradient checker work on GPU.
@@ -748,7 +748,7 @@ class ComparisonOpTest(test.TestCase):
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
-      ret = out.eval()
+      ret = self.evaluate(out)
     return ret[0]
 
   def testScalarCompareScalar(self):
@@ -779,7 +779,7 @@ class ComparisonOpTest(test.TestCase):
     np_ans = np_func(x, y)
     with self.test_session(force_gpu=test_util.is_gpu_available()):
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index c5311ad834a700bf3341b5c25fb8a22f837eae62..87248bf9c89a20fa237fe99e015d855ac5940d69 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -88,7 +88,7 @@ class ComparisonOpTest(test.TestCase):
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
-      ret = out.eval()
+      ret = self.evaluate(out)
     return ret[0]
 
   def testScalarCompareScalar(self):
@@ -119,7 +119,7 @@ class ComparisonOpTest(test.TestCase):
     np_ans = np_func(x, y)
     with self.test_session(force_gpu=test_util.is_gpu_available()):
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
@@ -223,7 +223,7 @@ class LogicalOpTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_val = out.eval()
+      tf_val = self.evaluate(out)
     self.assertEqual(out.dtype, dtypes_lib.bool)
     self.assertAllEqual(np_ans, tf_val)
     self.assertShapeEqual(np_ans, out)
@@ -233,7 +233,7 @@ class LogicalOpTest(test.TestCase):
     with self.test_session(use_gpu=use_gpu,
                            force_gpu=use_gpu and test_util.is_gpu_available()):
       out = math_ops.logical_not(ops.convert_to_tensor(x))
-      tf_val = out.eval()
+      tf_val = self.evaluate(out)
     self.assertEqual(out.dtype, dtypes_lib.bool)
     self.assertAllEqual(np_ans, tf_val)
     self.assertShapeEqual(np_ans, out)
@@ -319,7 +319,7 @@ class SelectOpTest(test.TestCase):
     with self.test_session(use_gpu=use_gpu,
                            force_gpu=use_gpu and test_util.is_gpu_available()):
       out = array_ops.where(c, x, y)
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, out)
 
@@ -463,7 +463,7 @@ class BatchSelectOpTest(test.TestCase):
     with self.test_session(use_gpu=use_gpu,
                            force_gpu=use_gpu and test_util.is_gpu_available()):
       out = array_ops.where(c, x, y)
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, out)
 
@@ -644,13 +644,13 @@ class MathOpsOverloadTest(test.TestCase):
     with self.test_session(use_gpu=False):
       inx = ops.convert_to_tensor(x, dtype=dtype)
       z = func(inx, y)  # Should use __add__, __sub__, etc.
-      return z.eval()
+      return self.evaluate(z)
 
   def _computeLiteralAndTensor(self, x, y, dtype, func):
     with self.test_session(use_gpu=False):
       iny = ops.convert_to_tensor(y, dtype=dtype)
       z = func(x, iny)  # Should use __radd__, __rsub__, etc.
-      return z.eval()
+      return self.evaluate(z)
 
   def _compareBinary(self, x, y, dtype, np_func, tf_func):
     np_ans = np_func(x, y).astype(dtype.as_numpy_dtype)
@@ -777,9 +777,9 @@ class IsFiniteInfNanTest(test.TestCase):
             tf_y = math_ops.sqrt(x)
             tf_nan = math_ops.is_nan(tf_y)
             if value < 0:
-              self.assertAllEqual(np_nan, tf_nan.eval())
+              self.assertAllEqual(np_nan, self.evaluate(tf_nan))
             else:
-              self.assertAllCloseAccordingToType(np_y, tf_y.eval())
+              self.assertAllCloseAccordingToType(np_y, self.evaluate(tf_y))
 
 
 class RoundingTest(test.TestCase):
@@ -788,7 +788,7 @@ class RoundingTest(test.TestCase):
     y = np.rint(x) if y is None else np.asarray(y)
     with self.cached_session() as sess:
       tf_rint = math_ops.rint(x)
-      np_rint = sess.run(tf_rint)
+      np_rint = self.evaluate(tf_rint)
     self.assertAllEqual(y, np_rint)
     self.assertShapeEqual(y, tf_rint)
 
@@ -833,7 +833,7 @@ class ComplexMakeRealImagTest(test.TestCase):
       real = ops.convert_to_tensor(real)
       imag = ops.convert_to_tensor(imag)
       tf_ans = math_ops.complex(real, imag)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -855,10 +855,10 @@ class ComplexMakeRealImagTest(test.TestCase):
       tf_imag = math_ops.imag(inx)
       tf_real_real = math_ops.real(tf_real)
       tf_imag_real = math_ops.imag(tf_real)
-      self.assertAllEqual(np_real, tf_real.eval())
-      self.assertAllEqual(np_imag, tf_imag.eval())
-      self.assertAllEqual(np_real, tf_real_real.eval())
-      self.assertAllEqual(np_zeros, tf_imag_real.eval())
+      self.assertAllEqual(np_real, self.evaluate(tf_real))
+      self.assertAllEqual(np_imag, self.evaluate(tf_imag))
+      self.assertAllEqual(np_real, self.evaluate(tf_real_real))
+      self.assertAllEqual(np_zeros, self.evaluate(tf_imag_real))
 
   def testRealImag64(self):
     real = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(np.float32)
@@ -881,7 +881,7 @@ class ComplexMakeRealImagTest(test.TestCase):
         force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
       inx = ops.convert_to_tensor(cplx)
       tf_angle = math_ops.angle(inx)
-      tf_angle_val = sess.run(tf_angle)
+      tf_angle_val = self.evaluate(tf_angle)
     self.assertAllEqual(np_angle, tf_angle_val)
     self.assertShapeEqual(np_angle, tf_angle)
 
@@ -916,7 +916,7 @@ class ComplexMakeRealImagTest(test.TestCase):
                            force_gpu=use_gpu and test_util.is_gpu_available()):
       inx = ops.convert_to_tensor(cplx)
       tf_conj = math_ops.conj(inx)
-      tf_ans = tf_conj.eval()
+      tf_ans = self.evaluate(tf_conj)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, tf_conj)
 
@@ -1032,13 +1032,13 @@ class AccumulateTest(test.TestCase):
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testZeroArgs(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         tf_val = math_ops.accumulate_n([])
-        tf_val.eval()
+        self.evaluate(tf_val)
 
   def testWrongShape(self):
     with self.cached_session():
@@ -1070,7 +1070,7 @@ class PolyvalTest(test.TestCase):
     np_val = np.polyval(coeffs, x)
     with self.cached_session():
       tf_val = math_ops.polyval(coeffs, x)
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testSimple(self):
     for dtype in [
@@ -1093,7 +1093,7 @@ class PolyvalTest(test.TestCase):
         np_val = np.polyval(coeffs, x)
         with self.cached_session():
           tf_val = math_ops.polyval(coeffs, x)
-          self.assertAllClose(np_val, tf_val.eval())
+          self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testEmpty(self):
     x = np.random.rand(2, 2).astype(np.float32)
@@ -1101,7 +1101,7 @@ class PolyvalTest(test.TestCase):
     np_val = np.polyval(coeffs, x)
     with self.cached_session():
       tf_val = math_ops.polyval(coeffs, x)
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
index 77f182784ebb0a149762e291c4e0bdd937bf8dfa..7096083a1fbe8f61aebb5df514db5b95289137c6 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
@@ -84,7 +84,7 @@ class UnaryOpTest(test.TestCase):
         np_ans *= 1.1
       else:
         y = tf_func(inx)
-      tf_cpu = y.eval()
+      tf_cpu = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       if x.dtype == np.float16:
         self.assertAllClose(np_ans, tf_cpu, rtol=1e-3, atol=1e-3)
@@ -140,7 +140,7 @@ class UnaryOpTest(test.TestCase):
     np_ans = np_func(x)
     with self.test_session(force_gpu=test_util.is_gpu_available()):
       result = tf_func(ops.convert_to_tensor(x))
-      tf_gpu = result.eval()
+      tf_gpu = self.evaluate(result)
     if x.dtype == np.float16:
       self.assertAllClose(np_ans, tf_gpu, rtol=1e-3, atol=1e-3)
     else:
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index eebaffbe13ab1afbc9c6e36c2e5710dcf56e4b15..5e7991382ed14ed401edd38c6ab28af6630e1099 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -61,7 +61,7 @@ class DecodeBmpOpTest(test.TestCase):
     decode = array_ops.squeeze(image_ops.decode_bmp(img_in))
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
   def testGrayscale(self):
@@ -136,7 +136,7 @@ class DecodeBmpOpTest(test.TestCase):
     decode = image_ops.decode_bmp(img_in)
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
 
diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py
index 0975f964b5898d9e100e2fdcd2af98029e28be95..7a8743e11f03b336d70232ab68d7893d19ae029d 100644
--- a/tensorflow/python/kernel_tests/decode_image_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_image_op_test.py
@@ -76,7 +76,7 @@ class DecodeImageOpTest(test.TestCase):
 
         bad_channels = image_ops.decode_image(gif0, channels=1)
         with self.assertRaises(errors_impl.InvalidArgumentError):
-          bad_channels.eval()
+          self.evaluate(bad_channels)
 
   def testJpeg(self):
     # Read a real jpeg and verify shape
@@ -92,7 +92,7 @@ class DecodeImageOpTest(test.TestCase):
 
       bad_channels = image_ops.decode_image(jpeg0, channels=4)
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        bad_channels.eval()
+        self.evaluate(bad_channels)
 
   def testPng(self):
     # Read some real PNGs, converting to different channel numbers
@@ -113,7 +113,7 @@ class DecodeImageOpTest(test.TestCase):
     decode = image_ops.decode_image(image_bytes)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        decode.eval()
+        self.evaluate(decode)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index 66b3e0f22fd2ab07311895da5df5448ee4e6e6f0..8c4ccbd88e222d6dbb91ab9a687df94d9a7b7370 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -80,7 +80,7 @@ class DecodeJpegBenchmark(test.Benchmark):
           initializer=image_ops.encode_jpeg(tiled_image))
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       images = []
       for _ in xrange(parallelism):
         if crop_window is None:
diff --git a/tensorflow/python/kernel_tests/decode_png_op_test.py b/tensorflow/python/kernel_tests/decode_png_op_test.py
index 8f36343667f72b410f14a1934c93a61debaff59e..5a0b742a6a46aa994eb555f09ab3fb75c8a03b15 100644
--- a/tensorflow/python/kernel_tests/decode_png_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_png_op_test.py
@@ -47,7 +47,7 @@ class DecodePngOpTest(test.TestCase):
             img_in, dtype=dtypes.uint16))
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
 
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
index affbaf159d82e15d6c15a83ae509851ae1219c7f..0676664685d6d5e1baeb6227fc86b020e74af445 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
@@ -43,7 +43,7 @@ class AssignOpTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       def run_add(add_op):
-        sess.run(add_op)
+        self.evaluate(add_op)
 
       threads = [
           self.checkedThread(
@@ -54,7 +54,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertTrue((vals >= ones).all())
       self.assertTrue((vals <= ones * 20).all())
@@ -70,7 +70,7 @@ class AssignOpTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       def run_assign(assign_op):
-        sess.run(assign_op)
+        self.evaluate(assign_op)
 
       threads = [
           self.checkedThread(
@@ -81,7 +81,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
 
       # Assert every element is taken from one of the assignments.
       self.assertTrue((vals > 0).all())
@@ -103,7 +103,7 @@ class AssignOpTest(test.TestCase):
       p.initializer.run()
 
       def run_add(add_op):
-        sess.run(add_op)
+        self.evaluate(add_op)
 
       threads = [
           self.checkedThread(
@@ -114,7 +114,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertAllEqual(vals, ones * 20)
 
@@ -131,7 +131,7 @@ class AssignOpTest(test.TestCase):
       p.initializer.run()
 
       def run_assign(assign_op):
-        sess.run(assign_op)
+        self.evaluate(assign_op)
 
       threads = [
           self.checkedThread(
@@ -142,7 +142,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
 
       # Assert every element is the same, and taken from one of the assignments.
       self.assertTrue(vals[0, 0] > 0)
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_test.py b/tensorflow/python/kernel_tests/dense_update_ops_test.py
index 3e0a03d634f13f182dcd142f188c6721f18aa4a5..a4766fed72e62f21181ae64be03f82b16de354ff 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_test.py
@@ -36,8 +36,8 @@ class AssignOpTest(test.TestCase):
       p = variables.Variable(x)
       assign = state_ops.assign(p, y)
       p.initializer.run()
-      new_value = assign.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(assign)
+      return self.evaluate(p), new_value
 
   def _initAssignAddFetch(self, x, y, use_gpu=False):
     """Initialize a param to init, and compute param += y."""
@@ -45,8 +45,8 @@ class AssignOpTest(test.TestCase):
       p = variables.Variable(x)
       add = state_ops.assign_add(p, y)
       p.initializer.run()
-      new_value = add.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(add)
+      return self.evaluate(p), new_value
 
   def _initAssignSubFetch(self, x, y, use_gpu=False):
     """Initialize a param to init, and compute param -= y."""
@@ -54,8 +54,8 @@ class AssignOpTest(test.TestCase):
       p = variables.Variable(x)
       sub = state_ops.assign_sub(p, y)
       p.initializer.run()
-      new_value = sub.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(sub)
+      return self.evaluate(p), new_value
 
   def _testTypes(self, vals):
     for dtype in [np.float32, np.float64, np.int32, np.int64]:
@@ -90,13 +90,13 @@ class AssignOpTest(test.TestCase):
       p = variables.VariableV1([1])
       a = state_ops.assign(p, data, validate_shape=False)
       a.op.run()
-      self.assertAllEqual(p.eval(), data.eval())
+      self.assertAllEqual(p.eval(), self.evaluate(data))
 
       # Assign to yet another shape
       data2 = array_ops.fill([10, 10], 1)
       a2 = state_ops.assign(p, data2, validate_shape=False)
       a2.op.run()
-      self.assertAllEqual(p.eval(), data2.eval())
+      self.assertAllEqual(p.eval(), self.evaluate(data2))
 
   def testInitRequiredAssignAdd(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 13a28caf1fd8f3217490da5e594224d493f60850..c4bed1108037a33ec85859e4e129d2cd670bcbcf 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -106,13 +106,13 @@ class DepthToSpaceTest(test.TestCase):
       # test NHWC (default) on CPU
       x_tf = array_ops.depth_to_space(input_nhwc, block_size)
       self.assertAllEqual(x_tf.shape, x_out.shape)
-      x_tf.eval()
+      self.evaluate(x_tf)
     if test.is_gpu_available():
       with self.cached_session(use_gpu=True):
         # test NHWC (default) on GPU
         x_tf = array_ops.depth_to_space(input_nhwc, block_size)
         self.assertAllEqual(x_tf.shape, x_out.shape)
-        x_tf.eval()
+        self.evaluate(x_tf)
 
   # Tests for different width and height.
   def testNonSquare(self):
@@ -185,7 +185,7 @@ class DepthToSpaceTest(test.TestCase):
     # divisible by 16.
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   # Test when the block size is 0.
   def testBlockSize0(self):
@@ -194,7 +194,7 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 0
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   # Test when the block size is 1. The block size should be > 1.
   def testBlockSizeOne(self):
@@ -205,7 +205,7 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 1
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   def testBlockSizeLargerThanInput(self):
     # The block size is too large for this input.
@@ -214,7 +214,7 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 10
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   def testBlockSizeNotDivisibleDepth(self):
     # The depth is not divisible by the square of the block size.
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 77b27c6c7e01bde8fa005142e7e5c00110bd628f..f6d834c2f85e36e4fdd0f91b9d9a893992096793 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -162,7 +162,7 @@ class DepthwiseConv2DTest(test.TestCase):
         conv_native = array_ops.transpose(conv_native, [0, 2, 3, 1])
 
       try:
-        native_result = sess.run(conv_native)
+        native_result = self.evaluate(conv_native)
       except errors.InvalidArgumentError as e:
         # Grouped convolution kernel is only registered for cuDNN 7. Silently
         # return when we are running on an earlier version or without GPU.
@@ -174,7 +174,7 @@ class DepthwiseConv2DTest(test.TestCase):
 
       conv_interface = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      interface_result = sess.run(conv_interface)
+      interface_result = self.evaluate(conv_interface)
 
     tf_logging.info(
         "data_type: %r, use_gpu: %r, grouped_conv: %r, max diff = %f",
@@ -269,7 +269,7 @@ class DepthwiseConv2DTest(test.TestCase):
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       conv = nn_ops.depthwise_conv2d_native(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = %r", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
@@ -528,7 +528,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_input(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -548,7 +548,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_input(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -580,7 +580,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -600,7 +600,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index da33b2848b738e54ab03297e7754ad0f59deb4a4..602ceb6ebd91d4e69fc271e86b02d022c89b759d 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
@@ -35,7 +36,7 @@ from tensorflow.python.platform import test
 class DeterminantOpTest(test.TestCase):
 
   def _compareDeterminantBase(self, matrix_x, tf_ans):
-    out = tf_ans.eval()
+    out = self.evaluate(tf_ans)
     shape = matrix_x.shape
     if shape[-1] == 0 and shape[-2] == 0:
       np_ans = np.ones(shape[:-2]).astype(matrix_x.dtype)
@@ -54,15 +55,15 @@ class DeterminantOpTest(test.TestCase):
       np_ans = np_ans.astype(matrix_x.dtype)
 
     self.assertShapeEqual(np_ans, abs_log_det_tf)
-    sign_tf_val = sign_tf.eval()
-    abs_log_det_tf_val = abs_log_det_tf.eval()
+    sign_tf_val = self.evaluate(sign_tf)
+    abs_log_det_tf_val = self.evaluate(abs_log_det_tf)
     self.assertAllClose(
         sign_tf_val * np.exp(abs_log_det_tf_val),
         np_sign * np.exp(np_ans),
         atol=5e-5)
 
   def _compareDeterminant(self, matrix_x):
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       self._compareDeterminantBase(matrix_x,
                                    linalg_ops.matrix_determinant(matrix_x))
       self._compareLogDeterminantBase(
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 9e43258fa2d0f82cabed85b32b7fe2a8ee5e11f8..f7a9cd8d6e20f47dc9d93986701b47c4fbd92e55 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -89,7 +89,7 @@ class MatrixSetDiagTest(test.TestCase):
                                [1.0, 1.0, 3.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag, output.eval())
+      self.assertAllEqual(mat_set_diag, self.evaluate(output))
 
   def testRectangular(self):
     with self.session(use_gpu=True):
@@ -98,14 +98,14 @@ class MatrixSetDiagTest(test.TestCase):
       expected = np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((2, 3), output.get_shape())
-      self.assertAllEqual(expected, output.eval())
+      self.assertAllEqual(expected, self.evaluate(output))
 
       v = np.array([3.0, 4.0])
       mat = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
       expected = np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 2), output.get_shape())
-      self.assertAllEqual(expected, output.eval())
+      self.assertAllEqual(expected, self.evaluate(output))
 
   def _testSquareBatch(self, dtype):
     with self.cached_session(use_gpu=True):
@@ -121,7 +121,7 @@ class MatrixSetDiagTest(test.TestCase):
 
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 3, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag_batch, output.eval())
+      self.assertAllEqual(mat_set_diag_batch, self.evaluate(output))
 
   def testSquareBatch(self):
     self._testSquareBatch(np.float32)
@@ -140,7 +140,7 @@ class MatrixSetDiagTest(test.TestCase):
                                      [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 2, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag_batch, output.eval())
+      self.assertAllEqual(mat_set_diag_batch, self.evaluate(output))
 
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 2"):
@@ -273,9 +273,9 @@ class DiagTest(test.TestCase):
   def _diagOp(self, diag, dtype, expected_ans, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.diag(ops.convert_to_tensor(diag.astype(dtype)))
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       tf_ans_inv = array_ops.diag_part(expected_ans)
-      inv_out = tf_ans_inv.eval()
+      inv_out = self.evaluate(tf_ans_inv)
     self.assertAllClose(out, expected_ans)
     self.assertAllClose(inv_out, diag)
     self.assertShapeEqual(expected_ans, tf_ans)
@@ -421,7 +421,7 @@ class DiagPartOpTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       tensor = ops.convert_to_tensor(tensor.astype(dtype))
       tf_ans_inv = array_ops.diag_part(tensor)
-      inv_out = tf_ans_inv.eval()
+      inv_out = self.evaluate(tf_ans_inv)
     self.assertAllClose(inv_out, expected_ans)
     self.assertShapeEqual(expected_ans, tf_ans_inv)
 
@@ -445,7 +445,7 @@ class DiagPartOpTest(test.TestCase):
         t = ops.convert_to_tensor(x.astype(np.float32))
         t.set_shape(shape)
         tf_ans = array_ops.diag_part(t)
-        out = tf_ans.eval()
+        out = self.evaluate(tf_ans)
       self.assertAllClose(out, expected_ans)
       self.assertShapeEqual(expected_ans, tf_ans)
 
diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index c6bb06eab3090a103f4a7da92a7f1f5354d9020a..9c593d2737a998308db5f08010783828a6ca2c0b 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -287,7 +287,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     }
 
     with self.cached_session() as sess:
-      run_result = sess.run(to_run)
+      run_result = self.evaluate(to_run)
 
     self.assertAllEqual(run_result["cat_prob"].shape,
                         run_result["norm_prob"].shape)
@@ -355,7 +355,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       samples = dist.sample(n, seed=123)
       samples.set_shape([n, 1, 2])
       self.assertEqual(samples.dtype, dtypes.int32)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertFalse(np.any(sample_values < 0))
       self.assertFalse(np.any(sample_values > 1))
       self.assertAllClose(
@@ -371,7 +371,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       samples = dist.sample((100, 100), seed=123)
       prob = dist.prob(samples)
-      prob_val = prob.eval()
+      prob_val = self.evaluate(prob)
       self.assertAllClose(
           [0.2**2 + 0.8**2], [prob_val[:, :, :, 0].mean()], atol=1e-2)
       self.assertAllClose(
@@ -393,26 +393,26 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
 
       prob = dist.prob(1)
-      self.assertAllClose([[0.8, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([1])
-      self.assertAllClose([[0.8, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([0, 1])
-      self.assertAllClose([[0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[0, 1]])
-      self.assertAllClose([[0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[[0, 1]]])
-      self.assertAllClose([[[0.2, 0.6]]], prob.eval())
+      self.assertAllClose([[[0.2, 0.6]]], self.evaluate(prob))
 
       prob = dist.prob([[1, 0], [0, 1]])
-      self.assertAllClose([[0.8, 0.4], [0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.4], [0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[[1, 1], [1, 0]], [[1, 0], [0, 1]]])
       self.assertAllClose([[[0.8, 0.6], [0.8, 0.4]], [[0.8, 0.4], [0.2, 0.6]]],
-                          prob.eval())
+                          self.evaluate(prob))
 
   def testLogPMFShape(self):
     with self.cached_session():
@@ -462,7 +462,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           b = categorical.Categorical(logits=b_logits)
 
           kl = kullback_leibler.kl_divergence(a, b)
-          kl_val = sess.run(kl)
+          kl_val = self.evaluate(kl)
           # Make sure KL(a||a) is 0
           kl_same = sess.run(kullback_leibler.kl_divergence(a, a))
 
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index d558ca09cc64b1337d2e5f47fc742282eaf7307f..3662ca1ad1f579a71cb65a6e74e58692f5da7df3 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -110,7 +110,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [1., 0]
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1 / 3., pmf.eval())
+      self.assertAllClose(1 / 3., self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
@@ -122,7 +122,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [3., 2]
       dist = ds.DirichletMultinomial(5., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1 / 7., pmf.eval())
+      self.assertAllClose(1 / 7., self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesMultidimensionalN(self):
@@ -134,7 +134,7 @@ class DirichletMultinomialTest(test.TestCase):
       n = np.full([4, 3], 5., dtype=np.float32)
       dist = ds.DirichletMultinomial(n, alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, pmf.eval())
+      self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, self.evaluate(pmf))
       self.assertEqual((4, 3), pmf.get_shape())
 
   def testPmfAlphaStretchedInBroadcastWhenSameRank(self):
@@ -145,7 +145,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [[1., 0], [0., 1]]
       dist = ds.DirichletMultinomial([1.], alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose([1 / 3., 2 / 3.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 3.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfAlphaStretchedInBroadcastWhenLowerRank(self):
@@ -155,7 +155,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [1., 2]
       counts = [[1., 0], [0., 1]]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 3.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 3.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
@@ -165,7 +165,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       counts = [[1., 0]]
       pmf = ds.DirichletMultinomial([1., 1.], alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 5.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 5.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
@@ -175,7 +175,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       counts = [1., 0]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 5.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 5.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfForOneVoteIsTheMeanWithOneRecordInput(self):
@@ -289,7 +289,7 @@ class DirichletMultinomialTest(test.TestCase):
         expected_covariance = n * (n + alpha_0) / (1 + alpha_0) * shared_matrix
 
         self.assertEqual([2, 2], covariance.get_shape())
-        self.assertAllClose(expected_covariance, covariance.eval())
+        self.assertAllClose(expected_covariance, self.evaluate(covariance))
 
   def testCovarianceNAlphaBroadcast(self):
     alpha_v = [1., 2, 3]
@@ -327,7 +327,7 @@ class DirichletMultinomialTest(test.TestCase):
           ns * (ns + alpha_0) / (1 + alpha_0))[..., array_ops.newaxis]
 
       self.assertEqual([4, 3, 3], covariance.get_shape())
-      self.assertAllClose(expected_covariance, covariance.eval())
+      self.assertAllClose(expected_covariance, self.evaluate(covariance))
 
   def testCovarianceMultidimensional(self):
     alpha = np.random.rand(3, 5, 4).astype(np.float32)
@@ -353,7 +353,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(0., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1.0, pmf.eval())
+      self.assertAllClose(1.0, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testLargeTauGivesPreciseProbabilities(self):
@@ -368,7 +368,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.8, pmf.eval(), atol=1e-4)
+      self.assertAllClose(0.8, self.evaluate(pmf), atol=1e-4)
       self.assertEqual((), pmf.get_shape())
 
     # Two (three sided) coin flips.  Prob[coin 3] = 0.8.
@@ -376,7 +376,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(2., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.8**2, pmf.eval(), atol=1e-2)
+      self.assertAllClose(0.8**2, self.evaluate(pmf), atol=1e-2)
       self.assertEqual((), pmf.get_shape())
 
     # Three (three sided) coin flips.
@@ -384,7 +384,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(3., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(3 * 0.1 * 0.8 * 0.8, pmf.eval(), atol=1e-2)
+      self.assertAllClose(3 * 0.1 * 0.8 * 0.8, self.evaluate(pmf), atol=1e-2)
       self.assertEqual((), pmf.get_shape())
 
   def testSmallTauPrefersCorrelatedResults(self):
@@ -399,7 +399,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(0.5, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
     # If there are two draws, it is much more likely that they are the same.
@@ -409,7 +409,7 @@ class DirichletMultinomialTest(test.TestCase):
       dist = ds.DirichletMultinomial(2., alpha)
       pmf_same = dist.prob(counts_same)
       pmf_different = dist.prob(counts_different)
-      self.assertLess(5 * pmf_different.eval(), pmf_same.eval())
+      self.assertLess(5 * self.evaluate(pmf_different), self.evaluate(pmf_same))
       self.assertEqual((), pmf_same.get_shape())
 
   def testNonStrictTurnsOffAllChecks(self):
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index e77e1117d493511748dea2dc1aff46ea8e7658e6..b8bc2e55cfe6a460ff21e1c191462bce3ec32d4b 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -63,17 +63,17 @@ class KLTest(test.TestCase):
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
-        kl.eval()
+        self.evaluate(kl)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         a.kl_divergence(a).eval()
       a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
       kl_ok = kullback_leibler.kl_divergence(a, a)
-      self.assertAllEqual([float("nan")], kl_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(kl_ok))
       self_kl_ok = a.kl_divergence(a)
-      self.assertAllEqual([float("nan")], self_kl_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(self_kl_ok))
       cross_ok = a.cross_entropy(a)
-      self.assertAllEqual([float("nan")], cross_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(cross_ok))
 
   def testRegistrationFailures(self):
 
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index 3840d7331cacf588218e3c7dfea85662d545a13a..b3f3416a52faf78c269c76839a3f5d7ac533bbab 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -127,7 +127,7 @@ class MultinomialTest(test.TestCase):
       p = [0.5, 0.5]
       counts = [1., 0]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(0.5, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
@@ -138,7 +138,7 @@ class MultinomialTest(test.TestCase):
       dist = multinomial.Multinomial(total_count=5., probs=p)
       pmf = dist.prob(counts)
       # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000.
-      self.assertAllClose(81. / 10000, pmf.eval())
+      self.assertAllClose(81. / 10000, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenSameRank(self):
@@ -146,7 +146,7 @@ class MultinomialTest(test.TestCase):
       p = [[0.1, 0.9]]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenLowerRank(self):
@@ -154,7 +154,7 @@ class MultinomialTest(test.TestCase):
       p = [0.1, 0.9]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
@@ -182,7 +182,7 @@ class MultinomialTest(test.TestCase):
       # [2]
       counts = [2., 1]
       pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
-      pmf.eval()
+      self.evaluate(pmf)
       self.assertEqual(pmf.get_shape(), (2, 2))
 
   def testPmfShapeCountsPStretchedN(self):
@@ -191,7 +191,7 @@ class MultinomialTest(test.TestCase):
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
       pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
-      pmf.eval()
+      self.evaluate(pmf)
       self.assertEqual((4, 3), pmf.get_shape())
 
   def testMultinomialMean(self):
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index cc43e12168697c4f5a0cda48896b3d7d3c108ae4..6b6de8b1393b25175cb701dcaaa4e04321b65d3c 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -362,7 +362,7 @@ class ErfInvTest(test.TestCase):
 
       expected_x = special.erfinv(x)
       x = special_math.erfinv(x)
-      self.assertAllClose(expected_x, x.eval(), atol=0.)
+      self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
   def testErfInvIntegerInput(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index 07da855a0174d7b217ac383758e358922b7e18e4..80da39dfde172f2f82362405aa51d951a41533e6 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -40,7 +40,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([0, 13], partition_vals[0])
@@ -62,7 +62,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([[0, 1, 2], [3, 4, 5]], partition_vals[0])
@@ -87,7 +87,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual(part1, partition_vals[0])
@@ -109,7 +109,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=num_partitions)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(num_partitions, len(partition_vals))
     for i in range(num_partitions):
@@ -125,7 +125,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([3 + 4j, 7 + 8j], partition_vals[0])
@@ -138,7 +138,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = 3
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
@@ -164,7 +164,7 @@ class DynamicPartitionTest(test.TestCase):
             outputs = data_flow_ops.dynamic_partition(
                 data_t, partitions_t, num_partitions=n)
             self.assertEqual(n, len(outputs))
-            outputs_val = sess.run(outputs)
+            outputs_val = self.evaluate(outputs)
             for i, output in enumerate(outputs_val):
               self.assertAllEqual(output, data[partitions == i])
 
@@ -183,7 +183,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
@@ -199,7 +199,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=3)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(3, len(partition_vals))
     self.assertAllEqual([[]], partition_vals[0])
@@ -215,7 +215,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
@@ -236,7 +236,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([6], partition_vals[0])
@@ -257,7 +257,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=5)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(5, len(partition_vals))
     self.assertAllEqual([5], partition_vals[0])
@@ -281,7 +281,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=40)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(40, len(partition_vals))
     for i in range(40):
@@ -335,7 +335,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual(len(inds), x.shape[0])
     partitioned = data_flow_ops.dynamic_partition(x, inds, 16)
     with self.cached_session() as sess:
-      res = sess.run(partitioned)
+      res = self.evaluate(partitioned)
     self.assertEqual(res[-1].shape[0], 192)
 
 
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index c3f67d29aa4cf3fd12d9c4b8c990b065aaa401ab..3d063c4e0ec04da591dcd01d93cd8a05f505c052 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -36,18 +37,18 @@ class DynamicStitchTestBase(object):
     self.stitch_op = stitch_op
 
   def testScalar(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant(0), constant_op.constant(1)]
       data = [constant_op.constant(40), constant_op.constant(60)]
       for step in -1, 1:
         stitched_t = self.stitch_op(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40, 60][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
   def testShapeInferenceForScalarWithNonConstantIndices(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           array_ops.placeholder(dtype=dtypes.int32),
           constant_op.constant(1)
@@ -61,7 +62,7 @@ class DynamicStitchTestBase(object):
         self.assertEqual([None], stitched_t.get_shape().as_list())
 
   def testSimpleOneDimensional(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       # Test various datatypes in the simple case to ensure that the op was
       # registered under those types.
       dtypes_to_test = [
@@ -78,23 +79,23 @@ class DynamicStitchTestBase(object):
                 constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
         ]
         stitched_t = self.stitch_op(indices, data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([8], stitched_t.get_shape().as_list())
 
   def testOneListOneDimensional(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
       data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8], stitched_t.get_shape().as_list())
 
   def testSimpleTwoDimensional(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           constant_op.constant([0, 4, 7]),
           constant_op.constant([1, 6]),
@@ -106,14 +107,14 @@ class DynamicStitchTestBase(object):
           constant_op.constant([[20, 21], [30, 31], [50, 51]])
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
                            [50, 51], [60, 61], [70, 71]], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   def testZeroSizeTensor(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           constant_op.constant([0, 4, 7]),
           constant_op.constant([1, 6]),
@@ -127,7 +128,7 @@ class DynamicStitchTestBase(object):
           array_ops.zeros([0, 2], dtype=dtypes.int32)
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
                            [50, 51], [60, 61], [70, 71]], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
@@ -147,7 +148,7 @@ class DynamicStitchTestBase(object):
                                 [[1., 2.], [31., 32.]]])
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10. * np.arange(7)[:, None] + [1., 2.]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -157,7 +158,7 @@ class DynamicStitchTestBase(object):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7. * datum.eval(), grad)
+        self.assertAllEqual(7. * self.evaluate(datum), grad)
 
   def testErrorIndicesMultiDimensional(self):
     indices = [
@@ -222,12 +223,12 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
     DynamicStitchTestBase.__init__(self, data_flow_ops.parallel_dynamic_stitch)
 
   def testScalar(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant(0), constant_op.constant(1)]
       data = [constant_op.constant(40.0), constant_op.constant(60.0)]
       for step in -1, 1:
         stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40.0, 60.0][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
@@ -246,7 +247,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
               [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -256,7 +257,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * datum.eval(), grad)
+        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
   # GPU version unit tests
   def testScalarGPU(self):
@@ -265,7 +266,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
       data = [constant_op.constant(40.0), constant_op.constant(60.0)]
       for step in -1, 1:
         stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40.0, 60.0][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
@@ -284,7 +285,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
               [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -294,7 +295,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * datum.eval(), grad)
+        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/edit_distance_op_test.py b/tensorflow/python/kernel_tests/edit_distance_op_test.py
index dab5eee7f508bbff3af185299e508536e1f23908..4a06ab770aaa072c8858e0f527f21dcbc10bbbdd 100644
--- a/tensorflow/python/kernel_tests/edit_distance_op_test.py
+++ b/tensorflow/python/kernel_tests/edit_distance_op_test.py
@@ -49,11 +49,11 @@ class EditDistanceTest(test.TestCase):
 
     if expected_err_re is None:
       self.assertEqual(edit_distance.get_shape(), expected_shape)
-      output = edit_distance.eval()
+      output = self.evaluate(edit_distance)
       self.assertAllClose(output, expected_output)
     else:
       with self.assertRaisesOpError(expected_err_re):
-        edit_distance.eval()
+        self.evaluate(edit_distance)
 
   def _testEditDistance(self,
                         hypothesis,
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 008d6fbf577ac86553a5c6e58769c4f60d178334..443f54a9586caddfa9b9b813c92a5123207e962a 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -76,7 +76,7 @@ class ScatterAddSubTest(test.TestCase):
       # p = init
       variables.global_variables_initializer().run()
       # p += vals
-      result = p2.eval()
+      result = self.evaluate(p2)
     # Compute the expected 'p' using numpy operations.
     for i, ind in enumerate(indices):
       if scatter_op == state_ops.scatter_add:
@@ -278,7 +278,7 @@ class EmbeddingLookupTest(test.TestCase):
       norms = math_ops.sqrt(
           math_ops.reduce_sum(embeddings * embeddings, axis=1))
       normalized = embeddings / array_ops.stack([norms, norms], axis=1)
-      self.assertAllEqual(embedding.eval(), 2 * normalized.eval())
+      self.assertAllEqual(embedding.eval(), 2 * self.evaluate(normalized))
 
   def testSimpleShardedPartitionedVariable(self):
     with self.cached_session() as sess:
@@ -319,7 +319,7 @@ class EmbeddingLookupTest(test.TestCase):
       p_var_val = sess.run(list(p_variable))
       # Actual test
       print(ops.get_default_graph().as_graph_def())
-      tf_result = embedding.eval()
+      tf_result = self.evaluate(embedding)
     np_result, _, _ = _EmbeddingResult(params, id_vals, num_shards, vocab_size)
     self.assertAllEqual(params_values, p_var_val)
     self.assertAllEqual(np_result, tf_result)
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 61436f24cfed348712e3ccfba4fe009932133c12..bb3c0ae80694035dd362f5024ecdddeb0e364bb0 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -43,7 +44,7 @@ class ExtractImagePatches(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       out_tensor = array_ops.extract_image_patches(
           constant_op.constant(image),
           ksizes=ksizes,
@@ -51,7 +52,7 @@ class ExtractImagePatches(test.TestCase):
           rates=rates,
           padding=padding,
           name="im2col")
-      self.assertAllClose(patches, out_tensor.eval())
+      self.assertAllClose(patches, self.evaluate(out_tensor))
 
   def testKsize1x1Stride1x1Rate1x1(self):
     """Verifies that for 1x1 kernel the output equals the input."""
diff --git a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
index bbb3fef85b4cc7f4423c6c3414607db10732fa0b..88f7df8fbb64512c9ca362ec7c310a5805c9c728 100644
--- a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -45,14 +46,14 @@ class ExtractVolumePatches(test.TestCase):
     ksizes = [1] + ksizes + [1]
     strides = [1] + strides + [1]
 
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       out_tensor = array_ops.extract_volume_patches(
           constant_op.constant(image),
           ksizes=ksizes,
           strides=strides,
           padding=padding,
           name="im2col_3d")
-      self.assertAllClose(patches, out_tensor.eval())
+      self.assertAllClose(patches, self.evaluate(out_tensor))
 
   # pylint: disable=bad-whitespace
   def testKsize1x1x1Stride1x1x1(self):
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index 8961c4b13c25269671fdc16fc425516d01970892..c184b93c80e4b46d251888d95d3d9cabc4b95795 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -159,7 +159,7 @@ class FIFOQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -211,7 +211,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testDequeueHalf(self):
@@ -225,7 +225,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -240,7 +240,7 @@ class FIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
@@ -269,7 +269,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        x_val, y_val = sess.run(dequeued_t)
+        x_val, y_val = self.evaluate(dequeued_t)
         x, y = elems[i]
         self.assertEqual([x], x_val)
         self.assertEqual([y], y_val)
@@ -288,9 +288,9 @@ class FIFOQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -302,7 +302,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i % 4]], vals)
 
   def testEmptyEnqueueMany(self):
@@ -313,9 +313,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -323,9 +323,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpTo(self):
     with self.cached_session():
@@ -333,9 +333,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
     with self.cached_session():
@@ -356,7 +356,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -369,8 +369,8 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testDequeueUpToNoBlocking(self):
     with self.cached_session():
@@ -381,8 +381,8 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testMultiDequeueMany(self):
     with self.cached_session() as sess:
@@ -399,17 +399,17 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
@@ -429,13 +429,13 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual([None], dequeued_t[0].get_shape().as_list())
       self.assertEqual([None, 2], dequeued_t[1].get_shape().as_list())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
@@ -518,7 +518,7 @@ class FIFOQueueTest(test.TestCase):
                                    r"Expected \[2,3,3\], got \[2,3,4\]"):
         sess.run([enqueue_op],
                  feed_dict={elems_bad: np.array([1] * 24).reshape((2, 3, 4))})
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testParallelEnqueueMany(self):
     with self.cached_session() as sess:
@@ -529,7 +529,7 @@ class FIFOQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -596,11 +596,11 @@ class FIFOQueueTest(test.TestCase):
 
       def enqueue():
         for _ in xrange(100):
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       def dequeue():
         for _ in xrange(100):
-          self.assertTrue(sess.run(dequeued_t) in (10.0, 20.0))
+          self.assertTrue(self.evaluate(dequeued_t) in (10.0, 20.0))
 
       enqueue_threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       dequeue_threads = [self.checkedThread(target=dequeue) for _ in range(10)]
@@ -632,7 +632,7 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         for i in xrange(250):
-          self.assertEqual(i, sess.run(dequeued_t))
+          self.assertEqual(i, self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -663,7 +663,7 @@ class FIFOQueueTest(test.TestCase):
       dequeuemany_t = q.dequeue_many(count_placeholder)
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -672,7 +672,7 @@ class FIFOQueueTest(test.TestCase):
       while elements_dequeued < 250:
         # With equal probability, run Dequeue or dequeue_many.
         if random.random() > 0.5:
-          self.assertEqual(elements_dequeued, dequeued_t.eval())
+          self.assertEqual(elements_dequeued, self.evaluate(dequeued_t))
           elements_dequeued += 1
         else:
           count = random.randint(0, min(20, 250 - elements_dequeued))
@@ -701,7 +701,7 @@ class FIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
         dequeued_elems.extend(sess.run(dequeued_t).tolist())
@@ -728,7 +728,7 @@ class FIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
         dequeued_elems.extend(sess.run(dequeued_t).tolist())
@@ -778,12 +778,12 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
       close_op.run()
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
 
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -797,7 +797,7 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         for elem in elems:
-          self.assertEqual([elem], sess.run(dequeued_t))
+          self.assertEqual([elem], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
@@ -842,7 +842,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems, sess.run(dequeued_t))
+        self.assertAllEqual(elems, self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
@@ -867,7 +867,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
@@ -892,8 +892,8 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
-        self.assertAllEqual(elems[3:], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
+        self.assertAllEqual(elems[3:], self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -913,16 +913,16 @@ class FIFOQueueTest(test.TestCase):
       cleanup_dequeue_t = q.dequeue()
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        self.assertAllEqual(elems[0:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[0:3], self.evaluate(dequeued_t))
         with self.assertRaises(errors_impl.OutOfRangeError):
           sess.run(dequeued_t)
-        self.assertEqual(elems[3], sess.run(cleanup_dequeue_t))
+        self.assertEqual(elems[3], self.evaluate(cleanup_dequeue_t))
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -1051,7 +1051,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1059,8 +1059,8 @@ class FIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
-      self.assertEqual([50.0], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
@@ -1074,7 +1074,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1082,10 +1082,10 @@ class FIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
         time.sleep(0.01)
-      self.assertEqual([50.0], dequeued_t.eval())
-      self.assertEqual([60.0], dequeued_t.eval())
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
+      self.assertEqual([60.0], self.evaluate(dequeued_t))
 
       # Make sure the thread finishes before exiting.
       thread.join()
@@ -1103,7 +1103,7 @@ class FIFOQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # Expect the operation to succeed once the dequeue op runs.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1113,18 +1113,18 @@ class FIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
 
       for elem in [20.0, 30.0, 40.0, 50.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
       self.assertEqual(0, q.size().eval())
 
   def testBlockingEnqueueManyBeforeClose(self):
@@ -1138,7 +1138,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1148,17 +1148,17 @@ class FIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
       for elem in [20.0, 30.0, 50.0, 60.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
 
   def testDoesNotLoseValue(self):
     with self.cached_session():
@@ -1266,19 +1266,19 @@ class FIFOQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1331,14 +1331,14 @@ class FIFOQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1405,7 +1405,7 @@ class FIFOQueueTest(test.TestCase):
       q.enqueue_many(input_tuple).run()
 
       output_tuple_t = q.dequeue_many(32)
-      output_tuple = sess.run(output_tuple_t)
+      output_tuple = self.evaluate(output_tuple_t)
 
       for (input_elem, output_elem) in zip(input_tuple, output_tuple):
         self.assertAllEqual(input_elem, output_elem)
@@ -1507,7 +1507,7 @@ class FIFOQueueDictTest(test.TestCase):
       enqueue_op4 = q.enqueue_many({"f": [40.0, 50.0]})
       dequeue = q.dequeue()
       dequeue_2 = q.dequeue_many(2)
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
       sess.run(enqueue_op2)
       sess.run(enqueue_op3)
       sess.run(enqueue_op4)
@@ -1565,7 +1565,7 @@ class FIFOQueueDictTest(test.TestCase):
       })
       dequeue = q.dequeue()
       dequeue_2 = q.dequeue_many(2)
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
       sess.run(enqueue_op2)
       sess.run(enqueue_op3)
       sess.run(enqueue_op4)
@@ -1613,8 +1613,8 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
                                    "Timed out waiting for notification"):
         sess.run(dequeued_t, options=config_pb2.RunOptions(timeout_in_ms=10))
 
-      sess.run(enqueue_op)
-      self.assertEqual(37, sess.run(dequeued_t))
+      self.evaluate(enqueue_op)
+      self.assertEqual(37, self.evaluate(dequeued_t))
 
 
 class QueueContainerTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
index f89d2062f1e736068a50344234b05aad423a17e7..cb7659a89a914d9607b3ec634892a83706a49d6d 100644
--- a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
@@ -37,7 +37,6 @@ class FractionalAvgTest(test.TestCase):
   # Random number generate with seed.
   _PRNG = np.random.RandomState(341261000)
   _SEED = 341261001
-  _SEED2 = 341261002
 
   def _AvgPoolAlongRows(self, input_matrix, row_seq, overlapping):
     """Perform average pool along row of a 2-D matrix based on row_seq.
@@ -128,14 +127,12 @@ class FractionalAvgTest(test.TestCase):
       None
     """
     with self.cached_session() as sess:
-      p, r, c = nn_ops.fractional_avg_pool(
+      p, r, c = nn_ops.fractional_avg_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       actual, row_seq, col_seq = sess.run([p, r, c])
       expected = self._GetExpectedFractionalAvgPoolResult(input_tensor, row_seq,
                                                           col_seq, overlapping)
@@ -161,14 +158,12 @@ class FractionalAvgTest(test.TestCase):
       rand_mat = self._PRNG.randint(10, size=tensor_shape)
       pooling_ratio = [1, math.sqrt(2), math.sqrt(2), 1]
       with self.cached_session() as sess:
-        p, r, c = nn_ops.fractional_avg_pool(
+        p, r, c = nn_ops.fractional_avg_pool_v2(
             rand_mat.astype(np.float32),
             pooling_ratio,
             pseudo_random,
             overlapping,
-            deterministic=True,
-            seed=self._SEED,
-            seed2=self._SEED2)
+            seed=self._SEED)
         tensor_output, row_seq, col_seq = sess.run([p, r, c])
         expected_result = self._GetExpectedFractionalAvgPoolResult(
             rand_mat.astype(np.float32), row_seq, col_seq, overlapping)
@@ -214,12 +209,6 @@ class FractionalAvgTest(test.TestCase):
 
   def testIntegerTensorInput(self):
     """Test FractionalAvgPool works fine when input tensor is integer type.
-
-    I would have used _ValidateFractionalAvgPoolResult function to automate this
-    process, however, there's rounding issue. It is caused by numpy.mean cast
-    integer input to numpy.float64 for intermediate use. While for
-    fractional_avg_pool, the mean operation is integer division (trucated).  So,
-    for this test case, I will hard code a simple matrix.
     """
     pseudo_random = True
     overlapping = True
@@ -234,29 +223,9 @@ class FractionalAvgTest(test.TestCase):
         [4, 4, 5, 9, 7, 2]
     ])
     # pyformat: enable
-    with self.cached_session() as sess:
-      # Since deterministic = True, seed and seed2 are fixed. Therefore r, and c
-      # are the same each time. We can have an expected result precomputed.
-      # r = [0, 2, 4, 6]
-      # c = [0, 1, 3, 4, 6]
-
-      # pyformat: disable
-      expected = np.array([
-          [6, 5, 3, 5],
-          [5, 5, 4, 5],
-          [5, 4, 7, 5]
-      ]).reshape((1, 3, 4, 1))
-      # pyformat: enable
-      p, unused_r, unused_c = nn_ops.fractional_avg_pool(
-          mat.reshape(tensor_shape), [1, math.sqrt(3), math.sqrt(2), 1],
-          pseudo_random,
-          overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual = sess.run(p)
-      self.assertShapeEqual(expected, p)
-      self.assertAllClose(expected, actual)
+    self._ValidateFractionalAvgPoolResult(mat.reshape(tensor_shape),
+                                          [1, math.sqrt(3), math.sqrt(2), 1],
+                                          pseudo_random, overlapping)
 
   def testDifferentTensorShapes(self):
     """Test different shapes of input tensor.
@@ -320,14 +289,12 @@ class FractionalAvgTest(test.TestCase):
       pooling_ratio = [1, 1.5, 1.5, 1]
       pseudo_random = False
       overlapping = False
-      p, r, c = nn_ops.fractional_avg_pool(
+      p, r, c = nn_ops.fractional_avg_pool_v2(
           input_holder,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # First run.
       input_a = np.zeros([3, 32, 32, 3])
       actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
@@ -372,7 +339,6 @@ class FractionalAvgPoolGradTest(test.TestCase):
   """
   _PRNG = np.random.RandomState(341261004)
   _SEED = 341261005
-  _SEED2 = 341261006
 
   def _GenerateRandomInputTensor(self, shape):
     num_elements = 1
@@ -398,7 +364,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.avg_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               num_elements = 1
               for dim_size in output_data.shape:
                 num_elements *= dim_size
@@ -407,7 +373,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
               fap_input_backprop_tensor = gen_nn_ops.fractional_avg_pool_grad(
@@ -416,7 +382,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=False)
-              fap_input_backprop = fap_input_backprop_tensor.eval()
+              fap_input_backprop = self.evaluate(fap_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fap_input_backprop_tensor)
               self.assertAllClose(input_backprop, fap_input_backprop)
 
@@ -437,7 +403,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.avg_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               num_elements = 1
               for dim_size in output_data.shape:
                 num_elements *= dim_size
@@ -446,7 +412,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
@@ -457,7 +423,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=True)
-              fap_input_backprop = fap_input_backprop_tensor.eval()
+              fap_input_backprop = self.evaluate(fap_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fap_input_backprop_tensor)
               self.assertAllClose(input_backprop, fap_input_backprop)
 
@@ -470,15 +436,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
       for overlapping in True, False:
         with self.cached_session() as _:
           input_tensor = constant_op.constant(input_data, shape=input_shape)
-          output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+          output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
               input_tensor,
               pooling_ratio,
               pseudo_random=pseudo_random,
               overlapping=overlapping,
-              deterministic=True,
-              seed=self._SEED,
-              seed2=self._SEED2)
-          output_data = output_tensor.eval()
+              seed=self._SEED)
+          output_data = self.evaluate(output_tensor)
           output_shape = output_data.shape
           # error_margin and delta setting is similar to avg_pool_grad.
           error_margin = 1e-4
@@ -503,15 +467,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
             input_data = self._GenerateRandomInputTensor(input_shape)
             with self.cached_session() as _:
               input_tensor = constant_op.constant(input_data, shape=input_shape)
-              output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+              output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
                   input_tensor,
                   pooling_ratio,
                   pseudo_random=pseudo_random,
                   overlapping=overlapping,
-                  deterministic=True,
-                  seed=self._SEED,
-                  seed2=self._SEED2)
-              output_data = output_tensor.eval()
+                  seed=self._SEED)
+              output_data = self.evaluate(output_tensor)
               output_shape = output_data.shape
               # error_margin and delta setting is similar to avg_pool_grad.
               error_margin = 1e-4
@@ -534,14 +496,12 @@ class FractionalAvgPoolGradTest(test.TestCase):
 
     with self.cached_session() as _:
       input_tensor = constant_op.constant(input_data, shape=input_shape)
-      output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+      output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random=pseudo_random,
           overlapping=overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # error_margin and delta setting is similar to avg_pool_grad.
       error_margin = 1e-4
       gradient_error = gradient_checker.compute_gradient_error(
diff --git a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
index 9b94ca85547590600306bf8aef2caa0f3c3eac8e..0427e34fc1f91849cc9399c4497eab539c76b0c5 100644
--- a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
@@ -37,7 +37,6 @@ class FractionalMaxPoolTest(test.TestCase):
   # Random number generate with seed.
   _PRNG = np.random.RandomState(341261)
   _SEED = 123456
-  _SEED2 = 654321
 
   def _MaxPoolAlongRows(self, input_matrix, row_seq, overlapping):
     """Perform max pool along row of a 2-D matrix based on row_seq.
@@ -128,14 +127,12 @@ class FractionalMaxPoolTest(test.TestCase):
       None
     """
     with self.cached_session() as sess:
-      p, r, c = nn_ops.fractional_max_pool(
+      p, r, c = nn_ops.fractional_max_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       actual, row_seq, col_seq = sess.run([p, r, c])
       expected = self._GetExpectedFractionalMaxPoolResult(input_tensor, row_seq,
                                                           col_seq, overlapping)
@@ -161,14 +158,12 @@ class FractionalMaxPoolTest(test.TestCase):
       rand_mat = self._PRNG.randint(10, size=tensor_shape)
       pooling_ratio = [1, math.sqrt(2), math.sqrt(2), 1]
       with self.cached_session() as sess:
-        p, r, c = nn_ops.fractional_max_pool(
+        p, r, c = nn_ops.fractional_max_pool_v2(
             rand_mat,
             pooling_ratio,
             pseudo_random,
             overlapping,
-            deterministic=True,
-            seed=self._SEED,
-            seed2=self._SEED2)
+            seed=self._SEED)
         tensor_output, row_seq, col_seq = sess.run([p, r, c])
         expected_result = self._GetExpectedFractionalMaxPoolResult(rand_mat,
                                                                    row_seq,
@@ -291,14 +286,12 @@ class FractionalMaxPoolTest(test.TestCase):
       pooling_ratio = [1, 1.5, 1.5, 1]
       pseudo_random = False
       overlapping = False
-      p, r, c = nn_ops.fractional_max_pool(
+      p, r, c = nn_ops.fractional_max_pool_v2(
           input_holder,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # First run.
       input_a = np.zeros([3, 32, 32, 3])
       actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
@@ -344,7 +337,6 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
   _PRNG = np.random.RandomState(341261)
   _SEED = 123456
-  _SEED2 = 654321
 
   def _GenerateUniqueRandomInputTensor(self, shape):
     """Generate 'unqiue' random input tensor.
@@ -382,12 +374,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.max_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
               input_backprop_tensor = gen_nn_ops.max_pool_grad(
                   input_tensor, output_tensor, output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
               fmp_input_backprop_tensor = gen_nn_ops.fractional_max_pool_grad(
@@ -397,7 +389,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=False)
-              fmp_input_backprop = fmp_input_backprop_tensor.eval()
+              fmp_input_backprop = self.evaluate(fmp_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fmp_input_backprop_tensor)
               self.assertAllClose(input_backprop, fmp_input_backprop)
 
@@ -417,12 +409,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.max_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
               input_backprop_tensor = gen_nn_ops.max_pool_grad(
                   input_tensor, output_tensor, output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
@@ -434,7 +426,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=True)
-              fmp_input_backprop = fmp_input_backprop_tensor.eval()
+              fmp_input_backprop = self.evaluate(fmp_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fmp_input_backprop_tensor)
               self.assertAllClose(input_backprop, fmp_input_backprop)
 
@@ -449,15 +441,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
       for overlapping in True, False:
         with self.cached_session() as _:
           input_tensor = constant_op.constant(input_data, shape=input_shape)
-          output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+          output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
               input_tensor,
               pooling_ratio,
               pseudo_random=pseudo_random,
               overlapping=overlapping,
-              deterministic=True,
-              seed=self._SEED,
-              seed2=self._SEED2)
-          output_data = output_tensor.eval()
+              seed=self._SEED)
+          output_data = self.evaluate(output_tensor)
           output_shape = output_data.shape
           # error_margin and delta setting is similar to max_pool_grad.
           error_margin = 1e-3
@@ -484,15 +474,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
             input_data += self._PRNG.random_sample(input_shape)
             with self.cached_session() as _:
               input_tensor = constant_op.constant(input_data, shape=input_shape)
-              output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+              output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
                   input_tensor,
                   pooling_ratio,
                   pseudo_random=pseudo_random,
                   overlapping=overlapping,
-                  deterministic=True,
-                  seed=self._SEED,
-                  seed2=self._SEED2)
-              output_data = output_tensor.eval()
+                  seed=self._SEED)
+              output_data = self.evaluate(output_tensor)
               output_shape = output_data.shape
               # error_margin and delta setting is similar to max_pool_grad.
               error_margin = 1e-3
@@ -517,14 +505,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
     with self.cached_session() as _:
       input_tensor = constant_op.constant(input_data, shape=input_shape)
-      output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+      output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random=pseudo_random,
           overlapping=overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # error_margin and delta setting is similar to max_pool_grad.
       error_margin = 1e-3
       gradient_error = gradient_checker.compute_gradient_error(
@@ -592,7 +578,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
           row_seq,
           col_seq,
           overlapping=False)
-      input_backprop_not_overlapping = r.eval()
+      input_backprop_not_overlapping = self.evaluate(r)
       self.assertShapeEqual(
           np.reshape(expected_input_backprop_not_overlapping, input_size), r)
       self.assertAllClose(expected_input_backprop_not_overlapping,
@@ -602,7 +588,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
           output_data_overlapping, shape=output_size)
       r = gen_nn_ops.fractional_max_pool_grad(
           input_tensor, output_tensor, grad, row_seq, col_seq, overlapping=True)
-      input_backprop_overlapping = r.eval()
+      input_backprop_overlapping = self.evaluate(r)
       self.assertShapeEqual(
           np.reshape(expected_input_backprop_overlapping, input_size), r)
       self.assertAllClose(expected_input_backprop_overlapping,
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index f35450b6fd67d1f3256b64799372df0368dd5431..0af32b048e31edc6f9610ff25fd45586bbe18b04 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -56,6 +56,7 @@ def simple_scoped_fn(a, x):
     return math_ops.multiply(math_ops.add(a, x), two)
 
 
+@test_util.with_control_flow_v2
 class FunctionalOpsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -481,6 +482,7 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.map_fn(lambda e: e, x)
     self.assertIs(None, y.get_shape().dims)
 
+  @test_util.disable_control_flow_v2("b/119323354")
   @test_util.run_in_graph_and_eager_modes
   def testMapEmptyScalar(self):
     map_return = functional_ops.map_fn(lambda x: 1, constant_op.constant([]))
@@ -489,6 +491,7 @@ class FunctionalOpsTest(test.TestCase):
 
   # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
+  @test_util.disable_control_flow_v2("b/119323354")
   def testMapEmptyTensor(self):
     with self.cached_session():
       map_return = functional_ops.map_fn(lambda x: array_ops.zeros([3, 2]),
@@ -564,8 +567,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:worker/replica:0/task:0/cpu:1")
 
     with session.Session(worker[0].target) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
   def testRemoteFunctionDirectSession(self):
@@ -588,8 +591,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/cpu:1")
 
     with self.test_session(config=worker_config) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
   def testRemoteFunctionSameDeviceDirectSession(self):
@@ -607,8 +610,8 @@ class FunctionalOpsTest(test.TestCase):
           args=[a, b], Tout=[dtypes.int32], f=_remote_fn, target="/cpu:0")
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
   def testRemoteFunctionCPUGPU(self):
@@ -631,8 +634,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/device:GPU:0")[0] + 3.0
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
   def testRemoteFunctionGPUCPU(self):
@@ -655,8 +658,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/cpu:0")[0] + 3.0
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
   def testRemoteFunctionGPUCPUStrings(self):
@@ -674,7 +677,7 @@ class FunctionalOpsTest(test.TestCase):
           args=[a], Tout=[dtypes.string], f=_remote_fn, target="/cpu:0")
 
     with self.cached_session() as sess:
-      ret = sess.run(remote_op)
+      ret = self.evaluate(remote_op)
       self.assertAllEqual(ret, [b"a"])
 
   def testRemoteFunctionCrossProcess(self):
@@ -696,8 +699,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:worker/replica:0/task:1/cpu:0")[0] + 3.0
 
     with session.Session(workers[0].target) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9)
 
   def testIf(self):
@@ -829,6 +832,49 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(5050.,
                               sess.run([result, c], feed_dict={n: 100.})[0])
 
+  # pylint: disable=cell-var-from-loop
+  def testWhileCapturedInputs(self):
+    for use_gpu in (True, False):
+      with ops.Graph().as_default() as g:
+        v = variables.Variable(1.0)
+
+        def TestCond(n, *args):
+          del args
+          return n < 10
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def TestUnary(n, x):
+          return math_ops.add(n, 1), x + n + v
+
+        @function.Defun(*[dtypes.float32] * 3)
+        def TestBinary(n, x, x2):
+          return math_ops.add(n, 1), x + n + v, x2 + v
+
+        with self.session(graph=g, use_gpu=use_gpu) as sess:
+          result_unary = functional_ops.While(
+              [1.0, 0.],
+              function.Defun(*[dtypes.float32] * 2)(TestCond), TestUnary)
+          result_binary = functional_ops.While(
+              [1.0, 0., 0.],
+              function.Defun(*[dtypes.float32] * 3)(TestCond), TestBinary)
+          self.evaluate(variables.global_variables_initializer())
+          assert len(result_unary) == 2
+          self.assertEqual([10.0, 54.0], self.evaluate(result_unary))
+          assert len(result_binary) == 3
+          self.assertEqual([10.0, 54.0, 9.0], self.evaluate(result_binary))
+
+          def TestCondCapture(n, *args):
+            del args
+            return math_ops.to_float(n) + v < 10
+
+          with self.assertRaises(ValueError):
+            _ = functional_ops.While(
+                [1],
+                function.Defun(dtypes.int32)(TestCondCapture),
+                function.Defun(dtypes.int32, dtypes.float32)(TestUnary))
+
+  # pylint: enable=cell-var-from-loop
+
   def _tfSum(self, use_gpu, rewrite_with_while):
     with ops.Graph().as_default() as g:
       with self.session(graph=g, use_gpu=use_gpu) as sess:
@@ -846,7 +892,7 @@ class FunctionalOpsTest(test.TestCase):
                 100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)
             [0],
         ]
-        xvals = sess.run(xs)
+        xvals = self.evaluate(xs)
       self.assertAllEqual(210, xvals[0])
       self.assertAllEqual(5050, xvals[1])
 
@@ -903,16 +949,16 @@ class FunctionalOpsTest(test.TestCase):
         result_binary = functional_ops.For(
             1, 10, 1, [0., 0.], TestBinary,
             rewrite_with_while=rewrite_with_while)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         assert not result_nullary
         # The nullary variant doesn't return anything so we can't easily run it.
         # As a total hack, fetch the operation by name and run it.
         sess.run(ops.get_default_graph().get_operation_by_name(
             "While" if rewrite_with_while else "For"))
         assert len(result_unary) == 1
-        self.assertEqual([54.0], sess.run(result_unary))
+        self.assertEqual([54.0], self.evaluate(result_unary))
         assert len(result_binary) == 2
-        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+        self.assertEqual([54.0, 9.0], self.evaluate(result_binary))
 
   def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
     # On GPU, don't rewrite using a while loop.
@@ -931,7 +977,7 @@ class FunctionalOpsTest(test.TestCase):
           MLP,
           rewrite_with_while=rewrite_with_while)[0]
 
-      return ret.eval()
+      return self.evaluate(ret)
 
   def _npMLP(self, xval, wsval, bsval):
     for i in range(wsval.shape[0]):
@@ -995,8 +1041,8 @@ class FunctionalOpsTest(test.TestCase):
       avals = [Poly(a), Grad(a)]
       b = constant_op.constant(1.)
       bvals = [Poly(b), Grad(b)]
-      self.assertAllEqual(sess.run(avals), [8., 4.])
-      self.assertAllEqual(sess.run(bvals), [17., 16.])
+      self.assertAllEqual(self.evaluate(avals), [8., 4.])
+      self.assertAllEqual(self.evaluate(bvals), [17., 16.])
 
 
 # TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
@@ -1147,7 +1193,7 @@ class PartitionedCallTest(test.TestCase):
             allow_soft_placement=False,
             log_device_placement=True,
             device_count={"CPU": 2})) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       expected = sess.run(sum_gather())
       result = sess.run(
           functional_ops.partitioned_call(
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index ee761435d84677f35c00f1e146316d59e656e872..532d8903ee175a603c1f4d7788cf6580f0727ac2 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -40,7 +40,7 @@ class GatherNdTest(test.TestCase):
       params = constant_op.constant(np.array([8, 1, 2, 3, 7, 5], dtype=dtype))
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertAllEqual(np.array([7, 7, 8], dtype=dtype), gather_nd_val)
     self.assertEqual([3], gather_nd_t.get_shape())
@@ -60,20 +60,20 @@ class GatherNdTest(test.TestCase):
 
       indices_empty = np.empty((0, 2), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
       indices_empty = np.empty((0, 1), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0, 3], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0, 3), dtype=np.float32), gather_nd_ok_val)
 
       params_empty = np.empty((0, 3), dtype=np.float32)
       indices_empty = np.empty((0, 2), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params_empty, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
@@ -82,7 +82,7 @@ class GatherNdTest(test.TestCase):
       gather_nd_break_t = array_ops.gather_nd(params_empty, indices_nonempty)
       with self.assertRaisesOpError(
           r"Requested more than 0 entries, but params is empty."):
-        gather_nd_break_t.eval()
+        self.evaluate(gather_nd_break_t)
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
   def testIndexScalar(self):
@@ -91,7 +91,7 @@ class GatherNdTest(test.TestCase):
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([4, 1])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
       self.assertEqual([], gather_nd_t.get_shape())
       self.assertAllEqual(np.array(7), gather_nd_val)
 
@@ -101,7 +101,7 @@ class GatherNdTest(test.TestCase):
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([4])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
       self.assertEqual([2], gather_nd_t.get_shape())
       self.assertAllEqual(np.array([-7, 7]), gather_nd_val)
 
@@ -111,7 +111,7 @@ class GatherNdTest(test.TestCase):
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([3, 2], gather_nd_t.get_shape())
     self.assertAllEqual(np.array([[-7, 7], [-7, 7], [-8, 8]]), gather_nd_val)
@@ -125,7 +125,7 @@ class GatherNdTest(test.TestCase):
       params_t = constant_op.constant(params)
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([3, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(params[[4, 4, 0]], gather_nd_val)
@@ -140,7 +140,7 @@ class GatherNdTest(test.TestCase):
       indices = constant_op.constant(
           [[], []], dtype=dtypes.int32)  # Size (2, 0)
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([2, 6, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(
@@ -156,7 +156,7 @@ class GatherNdTest(test.TestCase):
       params_t = constant_op.constant(params)
       indices = constant_op.constant([[[3], [2], [1]], [[4], [4], [0]]])
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([2, 3, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(params[[3, 2, 1, 4, 4, 0]].reshape(2, 3, 2, 2),
@@ -168,7 +168,7 @@ class GatherNdTest(test.TestCase):
       params = np.random.rand(*shape)
       indices = np.vstack([np.random.randint(0, s, size=2000) for s in shape]).T
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected, gather_nd_val)
@@ -181,7 +181,7 @@ class GatherNdTest(test.TestCase):
       indices = np.vstack([np.random.randint(0, s, size=2000) for s in shape]).T
       indices_reshaped = indices.reshape([10, 10, 20, 5])
       gather_nd_t = array_ops.gather_nd(params, indices_reshaped)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
@@ -205,7 +205,7 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def _disabledTestBadIndicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
@@ -218,7 +218,7 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def testBadIndicesWithSlicesCPU(self):
     with self.session(use_gpu=False):
@@ -227,7 +227,7 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def _disabledTestBadIndicesWithSlicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
@@ -240,7 +240,7 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def testGradientsRank2Elements(self):
     indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
@@ -251,7 +251,7 @@ class GatherNdTest(test.TestCase):
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array([[1, 0], [0, 2]], dtype=np.float64)
     with self.session(use_gpu=True):
-      assert np.array_equal(expected_grads, grads.eval())
+      assert np.array_equal(expected_grads, self.evaluate(grads))
 
   def testGradientsRank2Slices(self):
     indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
@@ -278,7 +278,7 @@ class GatherNdTest(test.TestCase):
     expected_grads = np.array(
         [[[5, 6], [1, 2]], [[3, 4], [7, 8]]], dtype=np.float64)
     with self.session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
   def testGradientsRank7Elements(self):
     # Shape [1,1,2,1,1,2,2]
@@ -307,7 +307,7 @@ class GatherNdTest(test.TestCase):
             [[[[3, 4], [7, 8]]]]
         ]]], dtype=np.float64)
     with self.session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
   def testGradientsInt64Indices(self):
     indices = constant_op.constant(
@@ -322,7 +322,7 @@ class GatherNdTest(test.TestCase):
     expected_grads = np.array(
         [[[5, 6], [1, 2]], [[3, 4], [7, 8]]], dtype=np.float64)
     with self.session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
   def testGradientsRank2SlicesWithEmptySpace(self):
     indices = constant_op.constant([[2], [0], [5]], dtype=dtypes.int32)
@@ -361,10 +361,10 @@ class GatherNdOpBenchmark(test.Benchmark):
       gather_op = array_ops.gather_nd(t_params, t_indices)
       variables.global_variables_initializer().run()
       for _ in range(10):
-        gather_op.eval()
+        self.evaluate(gather_op)
       t1 = time.time()
       for _ in range(1000):
-        gather_op.eval()
+        self.evaluate(gather_op)
       t2 = time.time()
       self.report_benchmark(iters=1000, wall_time=(t2 - t1) / 1000.0)
 
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index bdafc52ab5ec3bd6157b098712cdd35122bb17af..326e4aacd2079959a6f429f1a87db0758fe8f6e5 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -50,7 +50,7 @@ class GatherTest(test.TestCase):
           params = constant_op.constant(params_np)
           indices_tf = constant_op.constant(indices)
           gather_t = array_ops.gather(params, indices_tf)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           np_val = params_np[indices]
           self.assertAllEqual(np_val, gather_val)
           self.assertEqual(np_val.shape, gather_t.get_shape())
@@ -65,7 +65,7 @@ class GatherTest(test.TestCase):
           params = constant_op.constant(params_np)
           indices = constant_op.constant(2)
           gather_t = array_ops.gather(params, indices, axis=axis)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val)
           expected_shape = data.shape[:axis] + data.shape[axis + 1:]
           self.assertEqual(expected_shape, gather_t.get_shape())
@@ -81,7 +81,7 @@ class GatherTest(test.TestCase):
           # The indices must be in bounds for any axis.
           indices = constant_op.constant([0, 1, 0, 2])
           gather_t = array_ops.gather(params, indices, axis=axis)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis),
                               gather_val)
           expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:]
@@ -142,8 +142,11 @@ class GatherTest(test.TestCase):
               source_slice = ((slice(None),) * outer_dims + (source_index,) +
                               (slice(None),) * inner_dims)
               correct_params_grad[dest_slice] += gather_grad[source_slice]
-            self.assertAllClose(correct_params_grad, params_grad.eval(),
-                                atol=2e-6, rtol=2e-6)
+            self.assertAllClose(
+                correct_params_grad,
+                self.evaluate(params_grad),
+                atol=2e-6,
+                rtol=2e-6)
 
   def testString(self):
     params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
diff --git a/tensorflow/python/kernel_tests/gradient_correctness_test.py b/tensorflow/python/kernel_tests/gradient_correctness_test.py
index 291a69ebac6625ea9b50a54d2e0e28083b463d85..12b8a4c8e3b206c7490aa0f13d19690cbc7d845d 100644
--- a/tensorflow/python/kernel_tests/gradient_correctness_test.py
+++ b/tensorflow/python/kernel_tests/gradient_correctness_test.py
@@ -35,7 +35,7 @@ class GradientCorrectnessTest(test.TestCase):
       yexp = math_ops.exp(x)
       yexplog = math_ops.log(yexp)
       grads = gradients_impl.gradients([yexp, yexplog], [x])
-      grad_vals = sess.run(grads)
+      grad_vals = self.evaluate(grads)
       exp1_plus_one = (1.0 + np.exp(1.0)).astype(np.float32)
       # [dexp(x)/dx + d(log(exp(x)))/dx] @ x=1 == exp(1) + 1
       self.assertAllClose(grad_vals[0], exp1_plus_one)
@@ -44,13 +44,13 @@ class GradientCorrectnessTest(test.TestCase):
     x = constant_op.constant(3.)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
-      self.assertAllClose(1., sess.run(dx_dx))
+      self.assertAllClose(1., self.evaluate(dx_dx))
 
   def testIntegerIdentityGradient(self):
     x = constant_op.constant(3)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
-      self.assertAllClose(1, sess.run(dx_dx))
+      self.assertAllClose(1, self.evaluate(dx_dx))
 
   def testGradientWithIntegerPath(self):
     x = constant_op.constant([3.9, 4.1])
@@ -58,7 +58,7 @@ class GradientCorrectnessTest(test.TestCase):
     y = x * k
     dy_dx, = gradients_impl.gradients(y, x)
     with self.cached_session() as sess:
-      self.assertAllClose([3., 4.], sess.run(dy_dx))
+      self.assertAllClose([3., 4.], self.evaluate(dy_dx))
 
   def testNoIntegerGradient1(self):
     x = constant_op.constant([3.9, 4.1])
diff --git a/tensorflow/python/kernel_tests/huge_slice_op_test.py b/tensorflow/python/kernel_tests/huge_slice_op_test.py
index 8646d74c96f179cde41184eab3af1f72583360fa..4074946350aa5ce753a39fb173346d1d4f7fe3c7 100644
--- a/tensorflow/python/kernel_tests/huge_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/huge_slice_op_test.py
@@ -33,11 +33,11 @@ class SliceTest(test.TestCase):
       a_large = array_ops.tile(
           constant_op.constant(np.array([False, True] * 4)), [2**29 + 3])
       slice_t = array_ops.slice(a_large, np.asarray([3]).astype(np.int64), [3])
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([True, False, True], slice_val)
 
       slice_t = array_ops.slice(
           a_large, constant_op.constant([long(2)**32 + 3], dtype=dtypes.int64),
           [3])
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([True, False, True], slice_val)
diff --git a/tensorflow/python/kernel_tests/in_topk_op_test.py b/tensorflow/python/kernel_tests/in_topk_op_test.py
index 6fdb497bc6f8d15d54b9d35ed8c15ed9caceb1db..507822b3142a77a3782be52a3d19bb9bd664b684 100644
--- a/tensorflow/python/kernel_tests/in_topk_op_test.py
+++ b/tensorflow/python/kernel_tests/in_topk_op_test.py
@@ -32,7 +32,7 @@ class InTopKTest(test.TestCase):
     np_ans = np.array(expected)
     with self.cached_session():
       precision = nn_ops.in_top_k(predictions, target, k)
-      out = precision.eval()
+      out = self.evaluate(precision)
       self.assertAllClose(np_ans, out)
       self.assertShapeEqual(np_ans, precision)
 
@@ -77,7 +77,7 @@ class InTopKTest(test.TestCase):
     np_ans = np.array([False, True])
     with self.cached_session():
       precision = nn_ops.in_top_k(predictions, target, k)
-      out = precision.eval()
+      out = self.evaluate(precision)
       self.assertAllClose(np_ans, out)
       self.assertShapeEqual(np_ans, precision)
 
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 70bfbf8544a8a8689d6f48c730ee90479236b2a9..074985dd931ac8da3de7614e9003f289ce10c869 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -349,7 +349,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
           shape=shape,
           initializer=init_ops.uniform_unit_scaling_initializer())
       variables.global_variables_initializer().run()
-      self.assertAllEqual(shape, x.eval().shape)
+      self.assertAllEqual(shape, self.evaluate(x).shape)
 
   def testDuplicatedInitializer(self):
     init = init_ops.uniform_unit_scaling_initializer()
@@ -435,7 +435,7 @@ class RangeTest(test.TestCase):
       tf_ans = math_ops.range(start, limit, delta, name="range")
       self.assertEqual([len(np.arange(start, limit, delta))],
                        tf_ans.get_shape())
-      return tf_ans.eval()
+      return self.evaluate(tf_ans)
 
   def testBasic(self):
     self.assertTrue(
@@ -524,7 +524,7 @@ class LinSpaceTest(test.TestCase):
       with self.session(graph=graph, force_gpu=self.force_gpu):
         tf_ans = math_ops.linspace(start, stop, num, name="linspace")
         self.assertEqual([num], tf_ans.get_shape())
-        return tf_ans.eval()
+        return self.evaluate(tf_ans)
 
   def testPositive(self):
     for self.force_gpu in self._gpu_modes():
@@ -616,7 +616,7 @@ class OrthogonalInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
   def testShapesValues(self):
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -674,7 +674,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
   def testShapesValues(self):
     gain = 3.14
@@ -706,11 +706,10 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
         with self.session(use_gpu=True) as sess:
           sess.run(my_ops)
           # Check the shape of the outputs
-          t = outputs.eval()
+          t = self.evaluate(outputs)
           self.assertAllEqual(t.shape, outputs_shape)
           # Check isometry of the delta-orthogonal kernel.
-          self.assertAllClose(sess.run(ratio), np.sqrt(gain),
-                              rtol=tol, atol=tol)
+          self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
   def testNonuniformity(self):
     value = 0
@@ -724,7 +723,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
                                         initializer=
                                         init_ops.convolutional_delta_orthogonal)
         x.initializer.run()
-        y = x.eval()[1, 1, :, :]
+        y = self.evaluate(x)[1, 1, :, :]
         determinant = np.linalg.det(y)
         value += determinant
         abs_value += np.abs(determinant)
@@ -774,7 +773,7 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
   def testNonuniformity(self):
     value = 0
@@ -845,10 +844,10 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       with self.session(use_gpu=True) as sess:
         sess.run(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
@@ -888,7 +887,7 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
@@ -940,10 +939,10 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       with self.session(use_gpu=True) as sess:
         sess.run(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
@@ -983,7 +982,7 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
   def testNonuniformity(self):
     value = 0
@@ -1065,10 +1064,10 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       with self.cached_session(use_gpu=True) as sess:
         sess.run(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class IdentityInitializerTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 1b23e747764c65eaf3820a8832df0d657170ca7b..bf6fa9ea71f391287a7c21d042ae67ed57c9fc2b 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -35,7 +35,7 @@ class LargeConcatOpTest(test.TestCase):
     with self.session(use_gpu=False):
       # TODO(dga):  Add more depth to this test to validate correctness,
       # not just non-crashingness, once other large tensor fixes have gone in.
-      _ = onezeros.eval()
+      _ = self.evaluate(onezeros)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index be2e31cb5adec71f7c55633441f7eca23f3ec2b5..ba9e64979a48ccce82a283e74a1a024c4bcceda8 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -40,6 +40,44 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "linear_operator_adjoint_test",
+    size = "medium",
+    srcs = ["linear_operator_adjoint_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out
+    ],
+)
+
+cuda_py_test(
+    name = "linear_operator_algebra_test",
+    size = "small",
+    srcs = ["linear_operator_algebra_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "linear_operator_block_diag_test",
     size = "medium",
@@ -89,7 +127,6 @@ cuda_py_test(
     size = "medium",
     srcs = ["linear_operator_circulant_test.py"],
     additional_deps = [
-        "//tensorflow/python/ops/linalg",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:spectral_ops_test_util",
@@ -99,6 +136,8 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
     ],
     shard_count = 5,
     tags = [
@@ -150,6 +189,28 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "linear_operator_inversion_test",
+    size = "medium",
+    srcs = ["linear_operator_inversion_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out
+    ],
+)
+
 cuda_py_test(
     name = "linear_operator_full_matrix_test",
     size = "medium",
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bed4b5268e8d27a25ab735f7e3e1a6c9e4d5d95
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -0,0 +1,118 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_adjoint
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+LinearOperatorAdjoint = linear_operator_adjoint.LinearOperatorAdjoint  # pylint: disable=invalid-name
+
+
+class LinearOperatorAdjointTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    self._atol[dtypes.complex64] = 1e-5
+    self._rtol[dtypes.complex64] = 1e-5
+
+  def _operator_and_matrix(self,
+                           build_info,
+                           dtype,
+                           use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+
+    if ensure_self_adjoint_and_pd:
+      matrix = linear_operator_test_util.random_positive_definite_matrix(
+          shape, dtype, force_well_conditioned=True)
+    else:
+      matrix = linear_operator_test_util.random_tril_matrix(
+          shape, dtype, force_well_conditioned=True, remove_upper=True)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    if ensure_self_adjoint_and_pd:
+      operator = LinearOperatorAdjoint(
+          linalg.LinearOperatorFullMatrix(
+              lin_op_matrix, is_positive_definite=True, is_self_adjoint=True))
+    else:
+      operator = LinearOperatorAdjoint(
+          linalg.LinearOperatorLowerTriangular(lin_op_matrix))
+
+    return operator, linalg.adjoint(matrix)
+
+  def test_base_operator_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    operator_adjoint = LinearOperatorAdjoint(operator)
+    self.assertTrue(operator_adjoint.is_positive_definite)
+    self.assertTrue(operator_adjoint.is_non_singular)
+    self.assertFalse(operator_adjoint.is_self_adjoint)
+
+  def test_supplied_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    operator_adjoint = LinearOperatorAdjoint(
+        operator,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator_adjoint.is_positive_definite)
+    self.assertTrue(operator_adjoint.is_non_singular)
+    self.assertFalse(operator_adjoint.is_self_adjoint)
+
+  def test_contradicting_hints_raise(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_positive_definite=False)
+    with self.assertRaisesRegexp(ValueError, "positive-definite"):
+      LinearOperatorAdjoint(operator, is_positive_definite=True)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=False)
+    with self.assertRaisesRegexp(ValueError, "self-adjoint"):
+      LinearOperatorAdjoint(operator, is_self_adjoint=True)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, name="my_operator", is_non_singular=True)
+
+    operator = LinearOperatorAdjoint(operator)
+
+    self.assertEqual("my_operator_adjoint", operator.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e296c026c09b36afd39b891befb767a222f5f19
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for registration mechanisms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.linalg import cholesky_registrations  # pylint: disable=unused-import
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+# pylint: disable=protected-access
+_CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
+_MATMUL = linear_operator_algebra._MATMUL
+_registered_cholesky = linear_operator_algebra._registered_cholesky
+_registered_matmul = linear_operator_algebra._registered_matmul
+# pylint: enable=protected-access
+
+
+class CholeskyTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Cholesky to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterCholesky(CustomLinOp)
+    def _cholesky(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    with self.assertRaisesRegexp(ValueError, "positive definite"):
+      CustomLinOp(dtype=None, is_self_adjoint=True).cholesky()
+
+    with self.assertRaisesRegexp(ValueError, "self adjoint"):
+      CustomLinOp(dtype=None, is_positive_definite=True).cholesky()
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.cholesky())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterCholesky(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterCholesky(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterCholesky(CustomLinOp)(lambda a: None)
+
+  def testExactCholeskyRegistrationsAllMatch(self):
+    for (k, v) in _CHOLESKY_DECOMPS.items():
+      self.assertEqual(v, _registered_cholesky(k[0]))
+
+
+class MatmulTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Matmul to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterMatmul(CustomLinOp, CustomLinOp)
+    def _matmul(a, b):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.matmul(custom_linop))
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterMatmul(CustomLinOp, CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterMatmul(
+        CustomLinOp, CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterMatmul(
+          CustomLinOp, CustomLinOp)(lambda a: None)
+
+  def testExactMatmulRegistrationsAllMatch(self):
+    for (k, v) in _MATMUL.items():
+      self.assertEqual(v, _registered_matmul(k[0], k[1]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index 30951b1b0eb27d95c0ce2f7d266ac6aa84da8dd4..f0cc5d709f9bfec2e3dcfadecc8f949bb6ce6e6d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_block_diag as block_diag
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -78,7 +79,9 @@ class SquareLinearOperatorBlockDiagTest(
         build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
     ]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
     expected_blocks = (
         build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
@@ -98,7 +101,11 @@ class SquareLinearOperatorBlockDiagTest(
 
     operator = block_diag.LinearOperatorBlockDiag(
         [linalg.LinearOperatorFullMatrix(
-            l, is_square=True) for l in lin_op_matrices])
+            l,
+            is_square=True,
+            is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+            is_positive_definite=True if ensure_self_adjoint_and_pd else None)
+         for l in lin_op_matrices])
 
     # Should be auto-set.
     self.assertTrue(operator.is_square)
@@ -129,6 +136,40 @@ class SquareLinearOperatorBlockDiagTest(
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
 
+  def test_block_diag_cholesky_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+        ],
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    cholesky_factor = operator.cholesky()
+    self.assertTrue(isinstance(
+        cholesky_factor,
+        block_diag.LinearOperatorBlockDiag))
+    self.assertEqual(2, len(cholesky_factor.operators))
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[0],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[1],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+
   def test_is_non_singular_auto_set(self):
     # Matrix with two positive eigenvalues, 11 and 8.
     # The matrix values do not effect auto-setting of the flags.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
index f1e151ebd862ffdbb0a266060dfc6ae7d5a24ef2..d5580d0e8863985ecbfcca0f64fd9bdeb2485bbc 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.linalg import linear_operator_circulant
 from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(0)
@@ -75,8 +76,8 @@ class LinearOperatorCirculantBaseTest(object):
       x = np.zeros([domain_dimension])
       # x is a basis vector.
       x[m] = 1.0
-      fft_x = math_ops.fft(x.astype(np.complex64))
-      h_convolve_x = math_ops.ifft(spectrum * fft_x)
+      fft_x = fft_ops.fft(x.astype(np.complex64))
+      h_convolve_x = fft_ops.ifft(spectrum * fft_x)
       matrix_rows.append(h_convolve_x)
     matrix = array_ops.stack(matrix_rows, axis=-1)
     return math_ops.cast(matrix, dtype)
@@ -97,7 +98,9 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
     # real, the matrix will not be real.
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating real spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -105,6 +108,8 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
     # spectrum is bounded away from zero.
     spectrum = linear_operator_test_util.random_sign_uniform(
         shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    if ensure_self_adjoint_and_pd:
+      spectrum = math_ops.abs(spectrum)
     # If dtype is complex, cast spectrum to complex.  The imaginary part will be
     # zero, so the operator will still be self-adjoint.
     spectrum = math_ops.cast(spectrum, dtype)
@@ -115,7 +120,10 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant(
-        lin_op_spectrum, is_self_adjoint=True, input_output_dtype=dtype)
+        lin_op_spectrum,
+        is_self_adjoint=True,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        input_output_dtype=dtype)
 
     mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
 
@@ -129,7 +137,8 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
 
 class LinearOperatorCirculantTestHermitianSpectrum(
@@ -146,7 +155,9 @@ class LinearOperatorCirculantTestHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -160,14 +171,14 @@ class LinearOperatorCirculantTestHermitianSpectrum(
     #  = IFFT[EvenPartOf[pre_spectrum]]
     # is the IFFT of something that is also bounded away from zero.
     # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
-    pre_h = math_ops.ifft(pre_spectrum_c)
+    pre_h = fft_ops.ifft(pre_spectrum_c)
 
     # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
     # So we will make spectrum = FFT[h], for real valued h.
     h = math_ops.real(pre_h)
     h_c = _to_complex(h)
 
-    spectrum = math_ops.fft(h_c)
+    spectrum = fft_ops.fft(h_c)
 
     lin_op_spectrum = spectrum
 
@@ -175,7 +186,11 @@ class LinearOperatorCirculantTestHermitianSpectrum(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant(
-        lin_op_spectrum, input_output_dtype=dtype)
+        lin_op_spectrum,
+        input_output_dtype=dtype,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+    )
 
     mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
 
@@ -189,7 +204,8 @@ class LinearOperatorCirculantTestHermitianSpectrum(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
 
 class LinearOperatorCirculantTestNonHermitianSpectrum(
@@ -205,7 +221,16 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  # Skip Cholesky since we are explicitly testing non-hermitian
+  # spectra.
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     shape = build_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
@@ -234,7 +259,8 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
   def test_simple_positive_real_spectrum_gives_self_adjoint_pos_def_oper(self):
     with self.cached_session() as sess:
@@ -251,7 +277,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
   def test_defining_operator_using_real_convolution_kernel(self):
     with self.cached_session():
       convolution_kernel = [1., 2., 1.]
-      spectrum = math_ops.fft(
+      spectrum = fft_ops.fft(
           math_ops.cast(convolution_kernel, dtypes.complex64))
 
       # spectrum is shape [3] ==> operator is shape [3, 3]
@@ -269,14 +295,14 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       # Make spectrum the FFT of a real convolution kernel h.  This ensures that
       # spectrum is Hermitian.
       h = linear_operator_test_util.random_normal(shape=(3, 4))
-      spectrum = math_ops.fft(math_ops.cast(h, dtypes.complex64))
+      spectrum = fft_ops.fft(math_ops.cast(h, dtypes.complex64))
       operator = linalg.LinearOperatorCirculant(
           spectrum, input_output_dtype=dtypes.complex64)
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
       np.testing.assert_allclose(
-          0, imag_matrix.eval(), rtol=0, atol=eps * 3 * 4)
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3 * 4)
 
   def test_convolution_kernel_same_as_first_row_of_to_dense(self):
     spectrum = [[3., 2., 1.], [2., 1.5, 1.]]
@@ -287,7 +313,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
 
       self.assertAllEqual((2, 3), h.get_shape())
       self.assertAllEqual((2, 3, 3), c.get_shape())
-      self.assertAllClose(h.eval(), c.eval()[:, :, 0])
+      self.assertAllClose(h.eval(), self.evaluate(c)[:, :, 0])
 
   def test_assert_non_singular_fails_for_singular_operator(self):
     spectrum = math_ops.cast([0, 4, 2j + 2], dtypes.complex64)
@@ -397,8 +423,8 @@ class LinearOperatorCirculant2DBaseTest(object):
         x = np.zeros(block_shape)
         # x is a basis vector.
         x[n0, n1] = 1.0
-        fft_x = math_ops.fft2d(x.astype(np.complex64))
-        h_convolve_x = math_ops.ifft2d(spectrum * fft_x)
+        fft_x = fft_ops.fft2d(x.astype(np.complex64))
+        h_convolve_x = fft_ops.ifft2d(spectrum * fft_x)
         # We want the flat version of the action of the operator on a basis
         # vector, not the block version.
         h_convolve_x = array_ops.reshape(h_convolve_x, shape[:-1])
@@ -421,7 +447,9 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -435,14 +463,14 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
     #  = IFFT[EvenPartOf[pre_spectrum]]
     # is the IFFT of something that is also bounded away from zero.
     # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
-    pre_h = math_ops.ifft2d(pre_spectrum_c)
+    pre_h = fft_ops.ifft2d(pre_spectrum_c)
 
     # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
     # So we will make spectrum = FFT[h], for real valued h.
     h = math_ops.real(pre_h)
     h_c = _to_complex(h)
 
-    spectrum = math_ops.fft2d(h_c)
+    spectrum = fft_ops.fft2d(h_c)
 
     lin_op_spectrum = spectrum
 
@@ -450,7 +478,10 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant2D(
-        lin_op_spectrum, input_output_dtype=dtype)
+        lin_op_spectrum,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        input_output_dtype=dtype)
 
     mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
 
@@ -470,7 +501,14 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     shape = build_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
@@ -602,7 +640,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       convolution_kernel = linear_operator_test_util.random_normal(
           shape=(2, 2, 3, 5), dtype=dtypes.float32)
       # Convolution kernel is real ==> spectrum is Hermitian.
-      spectrum = math_ops.fft3d(
+      spectrum = fft_ops.fft3d(
           math_ops.cast(convolution_kernel, dtypes.complex64))
 
       # spectrum is Hermitian ==> operator is real.
@@ -634,7 +672,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       #         =      H1  +      H2
       # where H1 is real since it is Hermitian,
       # and H2 is imaginary since it is anti-Hermitian.
-      ifft_s = math_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
+      ifft_s = fft_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
 
       # Throw away H2, keep H1.
       real_ifft_s = math_ops.real(ifft_s)
@@ -642,7 +680,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       # This is the perfect spectrum!
       # spectrum = DFT[H1]
       #          = S1,
-      fft_real_ifft_s = math_ops.fft3d(
+      fft_real_ifft_s = fft_ops.fft3d(
           math_ops.cast(real_ifft_s, dtypes.complex64))
 
       # S1 is Hermitian ==> operator is real.
@@ -665,7 +703,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       # S2 is anti-Hermitian ==> operator is imaginary.
       # S2 is real ==> operator is self-adjoint.
       imag_ifft_s = math_ops.imag(ifft_s)
-      fft_imag_ifft_s = math_ops.fft3d(
+      fft_imag_ifft_s = fft_ops.fft3d(
           1j * math_ops.cast(imag_ifft_s, dtypes.complex64))
       operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s)
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
index 02f56db5962748ce6fc247f7e672044aeb5e4b3e..3f19dc4bffeeac9505ac35c864d0be48da049c8a 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
@@ -42,8 +42,12 @@ class SquareLinearOperatorCompositionTest(
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
+  @property
+  def _tests_to_skip(self):
+    # Cholesky not implemented.
+    return ["cholesky"]
+
   def _operator_and_matrix(self, build_info, dtype, use_placeholder):
-    sess = ops.get_default_session()
     shape = list(build_info.shape)
 
     # Either 1 or 2 matrices, depending.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 0758349531e2da9d29342cbe149933b2fa30bfd9..91f4097438f480a30f4eedb7e0b0f12093df9367 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -32,17 +32,26 @@ class LinearOperatorDiagTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
     diag = linear_operator_test_util.random_sign_uniform(
         shape[:-1], minval=1., maxval=2., dtype=dtype)
 
+    if ensure_self_adjoint_and_pd:
+      # Abs on complex64 will result in a float32, so we cast back up.
+      diag = math_ops.cast(math_ops.abs(diag), dtype=dtype)
+
     lin_op_diag = diag
 
     if use_placeholder:
       lin_op_diag = array_ops.placeholder_with_default(diag, shape=None)
 
-    operator = linalg.LinearOperatorDiag(lin_op_diag)
+    operator = linalg.LinearOperatorDiag(
+        lin_op_diag,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     matrix = array_ops.matrix_diag(diag)
 
@@ -145,6 +154,46 @@ class LinearOperatorDiagTest(
       self.assertAllEqual(operator_solve.get_shape(), mat_solve.get_shape())
       self.assertAllClose(*sess.run([operator_solve, mat_solve]))
 
+  def test_diag_matmul(self):
+    operator1 = linalg_lib.LinearOperatorDiag([2., 3.])
+    operator2 = linalg_lib.LinearOperatorDiag([1., 2.])
+    operator3 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([2., 6.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([2., 6.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator1.matmul(operator3)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator3.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
+
+  def test_diag_cholesky_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(
+        diag,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg.LinearOperatorDiag))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index 8c2d2cf0774b682835f521f2e434d87fbc2aec84..36575ceec364c23127e1423f8ee738182c25b1d4 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -33,7 +33,9 @@ class SquareLinearOperatorFullMatrixTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
@@ -44,7 +46,12 @@ class SquareLinearOperatorFullMatrixTest(
     if use_placeholder:
       lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
 
-    operator = linalg.LinearOperatorFullMatrix(lin_op_matrix, is_square=True)
+    # Set the hints to none to test non-symmetric PD code paths.
+    operator = linalg.LinearOperatorFullMatrix(
+        lin_op_matrix,
+        is_square=True,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     return operator, matrix
 
@@ -123,7 +130,13 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.float64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+
+    # Matrix is always symmetric and positive definite in this class.
+    del ensure_self_adjoint_and_pd
+
     shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
@@ -134,7 +147,11 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
     if use_placeholder:
       lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
 
-    operator = linalg.LinearOperatorFullMatrix(lin_op_matrix, is_square=True)
+    operator = linalg.LinearOperatorFullMatrix(
+        lin_op_matrix,
+        is_square=True,
+        is_self_adjoint=True,
+        is_positive_definite=True)
 
     return operator, matrix
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 465a8194dd98aa9ed704635d14c1315ccf211b0e..522213e26b758fe2631c670c54448a940c4d5b5d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
@@ -41,7 +42,12 @@ class LinearOperatorIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    # Identity matrix is already Hermitian Positive Definite.
+    del ensure_self_adjoint_and_pd
+
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
@@ -77,7 +83,7 @@ class LinearOperatorIdentityTest(
           num_rows=2, dtype=dtypes.float16)
       x = rng.randn(2, 3).astype(np.float16)
       y = operator.matmul(x)
-      self.assertAllClose(x, y.eval())
+      self.assertAllClose(x, self.evaluate(y))
 
   def test_non_scalar_num_rows_raises_static(self):
     with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
@@ -242,6 +248,16 @@ class LinearOperatorIdentityTest(
           is_non_singular=None,
       )
 
+  def test_identity_cholesky_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg_lib.LinearOperatorIdentity))
+
 
 class LinearOperatorScaledIdentityTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
@@ -253,7 +269,10 @@ class LinearOperatorScaledIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
@@ -266,6 +285,9 @@ class LinearOperatorScaledIdentityTest(
     multiplier = linear_operator_test_util.random_sign_uniform(
         shape=batch_shape, minval=1., maxval=2., dtype=dtype)
 
+    if ensure_self_adjoint_and_pd:
+      # Abs on complex64 will result in a float32, so we cast back up.
+      multiplier = math_ops.cast(math_ops.abs(multiplier), dtype=dtype)
 
     # Nothing to feed since LinearOperatorScaledIdentity takes no Tensor args.
     lin_op_multiplier = multiplier
@@ -275,7 +297,10 @@ class LinearOperatorScaledIdentityTest(
           multiplier, shape=None)
 
     operator = linalg_lib.LinearOperatorScaledIdentity(
-        num_rows, lin_op_multiplier)
+        num_rows,
+        lin_op_multiplier,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     multiplier_matrix = array_ops.expand_dims(
         array_ops.expand_dims(multiplier, -1), -1)
@@ -332,7 +357,7 @@ class LinearOperatorScaledIdentityTest(
           num_rows=2, multiplier=multiplier)
       x = rng.randn(2, 3).astype(np.float16)
       y = operator.matmul(x)
-      self.assertAllClose(multiplier[..., None, None] * x, y.eval())
+      self.assertAllClose(multiplier[..., None, None] * x, self.evaluate(y))
 
   def test_non_scalar_num_rows_raises_static(self):
     # Many "test_...num_rows" tests are performed in LinearOperatorIdentity.
@@ -420,6 +445,41 @@ class LinearOperatorScaledIdentityTest(
     self.assertTrue(operator.is_non_singular)
     self.assertTrue(operator.is_self_adjoint is None)
 
+  def test_identity_matmul(self):
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    self.assertTrue(isinstance(
+        operator1.matmul(operator1),
+        linalg_lib.LinearOperatorIdentity))
+
+    self.assertTrue(isinstance(
+        operator1.matmul(operator1),
+        linalg_lib.LinearOperatorIdentity))
+
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorScaledIdentity))
+    self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorScaledIdentity))
+    self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
+
+  def test_scaled_identity_cholesky_type(self):
+    operator = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2,
+        multiplier=3.,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg_lib.LinearOperatorScaledIdentity))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9344c526ee8ce3bd68de6876626a86a9ad6ab0d8
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+LinearOperatorInversion = linear_operator_inversion.LinearOperatorInversion  # pylint: disable=invalid-name
+
+
+class LinearOperatorInversionTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    self._atol[dtypes.complex64] = 1e-5
+    self._rtol[dtypes.complex64] = 1e-5
+
+  def _operator_and_matrix(self,
+                           build_info,
+                           dtype,
+                           use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+
+    if ensure_self_adjoint_and_pd:
+      matrix = linear_operator_test_util.random_positive_definite_matrix(
+          shape, dtype, force_well_conditioned=True)
+    else:
+      matrix = linear_operator_test_util.random_tril_matrix(
+          shape, dtype, force_well_conditioned=True, remove_upper=True)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    if ensure_self_adjoint_and_pd:
+      operator = LinearOperatorInversion(
+          linalg.LinearOperatorFullMatrix(
+              lin_op_matrix, is_positive_definite=True, is_self_adjoint=True))
+    else:
+      operator = LinearOperatorInversion(
+          linalg.LinearOperatorLowerTriangular(lin_op_matrix))
+
+    return operator, linalg.inv(matrix)
+
+  def test_base_operator_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    operator_inv = LinearOperatorInversion(operator)
+    self.assertTrue(operator_inv.is_positive_definite)
+    self.assertTrue(operator_inv.is_non_singular)
+    self.assertFalse(operator_inv.is_self_adjoint)
+
+  def test_supplied_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    operator_inv = LinearOperatorInversion(
+        operator,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator_inv.is_positive_definite)
+    self.assertTrue(operator_inv.is_non_singular)
+    self.assertFalse(operator_inv.is_self_adjoint)
+
+  def test_contradicting_hints_raise(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_positive_definite=False)
+    with self.assertRaisesRegexp(ValueError, "positive-definite"):
+      LinearOperatorInversion(operator, is_positive_definite=True)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=False)
+    with self.assertRaisesRegexp(ValueError, "self-adjoint"):
+      LinearOperatorInversion(operator, is_self_adjoint=True)
+
+  def test_singular_raises(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 1.], [1., 1.]]
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=False)
+    with self.assertRaisesRegexp(ValueError, "is_non_singular"):
+      LinearOperatorInversion(operator)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    with self.assertRaisesRegexp(ValueError, "is_non_singular"):
+      LinearOperatorInversion(operator, is_non_singular=False)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, name="my_operator", is_non_singular=True)
+
+    operator = LinearOperatorInversion(operator)
+
+    self.assertEqual("my_operator_inv", operator.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index f039b60f6480e7921fc776ccc223c43b8573e8f0..2b1ae6e1f5a59a6555f7e32f932f15155dcae9b3 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_kronecker as kronecker
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -69,8 +70,8 @@ class KroneckerDenseTest(test.TestCase):
         [5., 10., -1., -2.]], dtype=dtypes.float32)
 
     with self.cached_session():
-      self.assertAllClose(_kronecker_dense([x, y]).eval(), z.eval())
-      self.assertAllClose(_kronecker_dense([y, x]).eval(), w.eval())
+      self.assertAllClose(_kronecker_dense([x, y]).eval(), self.evaluate(z))
+      self.assertAllClose(_kronecker_dense([y, x]).eval(), self.evaluate(w))
 
 
 class SquareLinearOperatorKroneckerTest(
@@ -99,7 +100,12 @@ class SquareLinearOperatorKroneckerTest(
   def _tests_to_skip(self):
     return ["det", "solve", "solve_with_broadcast"]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    # Kronecker products constructed below will be from symmetric
+    # positive-definite matrices.
+    del ensure_self_adjoint_and_pd
     shape = list(build_info.shape)
     expected_factors = build_info.__dict__["factors"]
     matrices = [
@@ -116,7 +122,11 @@ class SquareLinearOperatorKroneckerTest(
 
     operator = kronecker.LinearOperatorKronecker(
         [linalg.LinearOperatorFullMatrix(
-            l, is_square=True) for l in lin_op_matrices])
+            l,
+            is_square=True,
+            is_self_adjoint=True,
+            is_positive_definite=True)
+         for l in lin_op_matrices])
 
     matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
 
@@ -180,6 +190,40 @@ class SquareLinearOperatorKroneckerTest(
     with self.assertRaisesRegexp(ValueError, ">=1 operators"):
       kronecker.LinearOperatorKronecker([])
 
+  def test_kronecker_cholesky_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+        ],
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    cholesky_factor = operator.cholesky()
+    self.assertTrue(isinstance(
+        cholesky_factor,
+        kronecker.LinearOperatorKronecker))
+    self.assertEqual(2, len(cholesky_factor.operators))
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[0],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[1],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
index 207e5edf818f988f1e87a3c21c320d57841145d1..2920f3ae7ebc549ae960215445fc933bb30913dd 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
@@ -69,7 +69,8 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     return linear_operator_test_util.random_uniform(
         diag_shape, minval=1e-4, maxval=1., dtype=dtype)
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
     # Recall A = L + UDV^H
     shape = list(build_info.shape)
     diag_shape = shape[:-1]
@@ -93,7 +94,7 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     lin_op_v = v
 
     # D
-    if self._is_diag_update_positive:
+    if self._is_diag_update_positive or ensure_self_adjoint_and_pd:
       diag_update = self._gen_positive_diag(dtype, diag_update_shape)
     else:
       diag_update = linear_operator_test_util.random_normal(
@@ -178,6 +179,10 @@ class LinearOperatorLowRankUpdatetestWithDiagCannotUseCholesky(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UDU^H, D !> 0, L > 0 ==> A !> 0 and we cannot use a Cholesky."""
 
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
   _use_diag_update = True
   _is_diag_update_positive = False
   _use_v = False
@@ -217,6 +222,10 @@ class LinearOperatorLowRankUpdatetestNoDiagCannotUseCholesky(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UV^H, L > 0 ==> A is not symmetric and we cannot use a Cholesky."""
 
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
   _use_diag_update = False
   _is_diag_update_positive = None
   _use_v = True
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
index e3c8f5cb688553bad4cbfcfc7fb5e92130ac76a2..bd41f9ed9d335f6f7e77cb7a19c5db1e59482d48 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
@@ -29,6 +30,11 @@ class LinearOperatorLowerTriangularTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
+  @property
+  def _tests_to_skip(self):
+    # Cholesky does not make sense for triangular matrices.
+    return ["cholesky"]
+
   def _operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
     # Upper triangle will be nonzero, but ignored.
@@ -71,6 +77,30 @@ class LinearOperatorLowerTriangularTest(
     with self.assertRaisesRegexp(ValueError, "at least 2 dimensions"):
       linalg.LinearOperatorLowerTriangular([1.])
 
+  def test_triangular_diag_matmul(self):
+    operator1 = linalg_lib.LinearOperatorLowerTriangular(
+        [[1., 0., 0.], [2., 1., 0.], [2., 3., 3.]])
+    operator2 = linalg_lib.LinearOperatorDiag([2., 2., 3.])
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorLowerTriangular))
+    self.assertAllClose(
+        math_ops.matmul(
+            operator1.to_dense(),
+            operator2.to_dense()),
+        self.evaluate(operator_matmul.to_dense()))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorLowerTriangular))
+    self.assertAllClose(
+        math_ops.matmul(
+            operator2.to_dense(),
+            operator1.to_dense()),
+        self.evaluate(operator_matmul.to_dense()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
index 819347343b1d22257e9f3579caced56128596723..2f67df408cb78f2c6537826bf89c7a075f435c85 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
@@ -134,7 +134,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       operator_dense = operator.to_dense()
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
-      self.assertAllClose(matrix, operator_dense.eval())
+      self.assertAllClose(matrix, self.evaluate(operator_dense))
 
   def test_generic_to_dense_method_non_square_matrix_tensor(self):
     matrix = rng.randn(2, 3, 4)
@@ -152,7 +152,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       y = operator.matvec(x)
       self.assertAllEqual((2,), y.get_shape())
-      self.assertAllClose([1., 2.], y.eval())
+      self.assertAllClose([1., 2.], self.evaluate(y))
 
   def test_solvevec(self):
     matrix = [[1., 0], [0., 2.]]
@@ -161,7 +161,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       x = operator.solvevec(y)
       self.assertAllEqual((2,), x.get_shape())
-      self.assertAllClose([1., 1 / 2.], x.eval())
+      self.assertAllClose([1., 1 / 2.], self.evaluate(x))
 
   def test_is_square_set_to_true_for_square_static_shapes(self):
     operator = LinearOperatorShape(shape=(2, 4, 4))
@@ -208,6 +208,77 @@ class LinearOperatorTest(test.TestCase):
     operator = LinearOperatorMatmulSolve(matrix, is_square=True)
     self.assertTrue(operator.is_square)
 
+  def test_linear_operator_matmul_hints_closed(self):
+    matrix = array_ops.placeholder(dtypes.float32)
+    operator1 = LinearOperatorMatmulSolve(matrix)
+
+    operator_matmul = operator1.matmul(operator1)
+
+    self.assertEqual(None, operator_matmul.is_square)
+    self.assertEqual(None, operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+    operator2 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=True,
+        is_self_adjoint=True,
+        is_positive_definite=True,
+        is_square=True,
+    )
+
+    operator_matmul = operator2.matmul(operator2)
+
+    self.assertTrue(operator_matmul.is_square)
+    self.assertTrue(operator_matmul.is_non_singular)
+    self.assertTrue(operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+  def test_linear_operator_matmul_hints_false(self):
+    matrix = array_ops.placeholder(dtypes.float32)
+    operator1 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=False,
+        is_self_adjoint=False,
+        is_positive_definite=False,
+        is_square=True,
+    )
+
+    operator_matmul = operator1.matmul(operator1)
+
+    self.assertTrue(operator_matmul.is_square)
+    self.assertFalse(operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+    operator2 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=False,
+        is_self_adjoint=False,
+        is_positive_definite=False,
+        is_square=False,
+    )
+
+    operator_matmul = operator2.matmul(operator2)
+
+    self.assertEqual(None, operator_matmul.is_square)
+    self.assertEqual(None, operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+  def test_linear_operator_matmul_hint_infer_square(self):
+    matrix1 = array_ops.placeholder(shape=[2, 3], dtype=dtypes.float32)
+    matrix2 = array_ops.placeholder(shape=[3, 2], dtype=dtypes.float32)
+    matrix3 = array_ops.placeholder(shape=[3, 4], dtype=dtypes.float32)
+
+    operator1 = LinearOperatorMatmulSolve(matrix1, is_square=False)
+    operator2 = LinearOperatorMatmulSolve(matrix2, is_square=False)
+    operator3 = LinearOperatorMatmulSolve(matrix3, is_square=False)
+
+    self.assertTrue(operator1.matmul(operator2).is_square)
+    self.assertTrue(operator2.matmul(operator1).is_square)
+    self.assertFalse(operator1.matmul(operator3).is_square)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index 31fb19e4a69b6847e06cc0aca2e86f91f78e3762..5ce26169728bafb4788551f0a47aea5cf0e85eef 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -102,7 +102,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     self.assertTrue(isinstance(tensor, ops.Tensor))
 
     with self.cached_session():
-      self.assertAllClose(arr, tensor.eval())
+      self.assertAllClose(arr, self.evaluate(tensor))
 
   def test_static_dims_broadcast(self):
     # x.batch_shape = [3, 1, 2]
@@ -205,7 +205,7 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
       result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2, 2]
@@ -244,7 +244,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matmul_with_broadcast(x, y)
       self.assertAllEqual((2, 1, 7), result.get_shape())
       expected = math_ops.matmul(x, y_broadcast)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_y_has_extra_dims(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -261,7 +261,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matmul_with_broadcast(x, y)
       self.assertAllEqual((2, 3, 5, 5), result.get_shape())
       expected = math_ops.matmul(x_broadcast, y)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_y_has_extra_dims_transpose_a_and_b(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -280,7 +280,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       self.assertAllEqual((2, 3, 5, 1), result.get_shape())
       expected = math_ops.matmul(
           x_broadcast, y, transpose_a=True, transpose_b=True)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_y_has_extra_dims_transpose_dynamic(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -344,7 +344,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix, rhs_broadcast)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_rhs_has_extra_dims(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -362,7 +362,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_rhs_has_extra_dims_dynamic(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -385,7 +385,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
       self.assertAllEqual(3, result.shape.ndims)
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
       self.assertAllClose(
-          expected.eval(),
+          self.evaluate(expected),
           result.eval(feed_dict={
               matrix_ph: matrix,
               rhs_ph: rhs
@@ -408,7 +408,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
           matrix, rhs, adjoint=True)
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs, adjoint=True)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2, 2]
@@ -447,7 +447,7 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_rhs_has_extra_dims(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -466,7 +466,7 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(matrix_broadcast, rhs)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
@@ -486,7 +486,7 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(
           matrix_broadcast, rhs, adjoint=True)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index ad97d1a93ea68ce3f76b78eddb615fca01d8c74a..e875579a7af6632948b34312a72ca9312be6ae59 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -35,7 +35,7 @@ class LinearOperatorZerosTest(
 
   @property
   def _tests_to_skip(self):
-    return ["log_abs_det", "solve", "solve_with_broadcast"]
+    return ["cholesky", "log_abs_det", "solve", "solve_with_broadcast"]
 
   @property
   def _operator_build_infos(self):
@@ -46,7 +46,10 @@ class LinearOperatorZerosTest(
         build_info((3, 4, 4)),
         build_info((2, 1, 4, 4))]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     del use_placeholder
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
@@ -166,6 +169,17 @@ class LinearOperatorZerosTest(
     self.assertFalse(operator.is_non_singular)
     self.assertTrue(operator.is_self_adjoint)
 
+  def test_zeros_matmul(self):
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorZeros(num_rows=2)
+    self.assertTrue(isinstance(
+        operator1.matmul(operator2),
+        linalg_lib.LinearOperatorZeros))
+
+    self.assertTrue(isinstance(
+        operator2.matmul(operator1),
+        linalg_lib.LinearOperatorZeros))
+
 
 class LinearOperatorZerosNotSquareTest(
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py
index 03b640a85a3ba0bc3617be2d8ae8ec5a438343ff..709ecbfc35f8bdd27c0053b1b572ee3cba8c42e4 100644
--- a/tensorflow/python/kernel_tests/linalg_grad_test.py
+++ b/tensorflow/python/kernel_tests/linalg_grad_test.py
@@ -50,7 +50,7 @@ class ShapeTest(test_lib.TestCase):
       determinants = linalg_ops.matrix_determinant(batch_identity)
       reduced = math_ops.reduce_sum(determinants)
       sum_grad = gradients_impl.gradients(reduced, batch_identity)[0]
-      self.assertAllClose(batch_identity.eval(), sum_grad.eval())
+      self.assertAllClose(batch_identity.eval(), self.evaluate(sum_grad))
 
 
 class MatrixUnaryFunctorGradientTest(test_lib.TestCase):
@@ -69,7 +69,7 @@ def _GetMatrixUnaryFunctorGradientTest(functor_, dtype_, shape_, **kwargs_):
       if functor_.__name__ == 'matrix_square_root':
         # Square the input matrix to ensure that its matrix square root exists
         a = math_ops.matmul(a, a)
-        a_np = a.eval()
+        a_np = self.evaluate(a)
       b = functor_(a, **kwargs_)
 
       # Optimal stepsize for central difference is O(epsilon^{1/3}).
diff --git a/tensorflow/python/kernel_tests/linalg_ops_test.py b/tensorflow/python/kernel_tests/linalg_ops_test.py
index 28391aaa878a81a2d29d2cdba455b631e141d61c..b5eeee099803288edf385d3abe4c96ae2e03bbf1 100644
--- a/tensorflow/python/kernel_tests/linalg_ops_test.py
+++ b/tensorflow/python/kernel_tests/linalg_ops_test.py
@@ -85,7 +85,7 @@ class LogdetTest(test.TestCase):
           #     [_RandomPDMatrix(n, self.rng, np_dtype),
           #      _RandomPDMatrix(n, self.rng, np_dtype)]).astype(np_dtype)
           logdet_tf = linalg.logdet(matrix)
-          self.assertAllClose(logdet_np, logdet_tf.eval(), atol=atol)
+          self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol)
 
   def test_works_with_underflow_case(self):
     for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
@@ -94,7 +94,7 @@ class LogdetTest(test.TestCase):
       _, logdet_np = np.linalg.slogdet(matrix)
       with self.session(use_gpu=True):
         logdet_tf = linalg.logdet(matrix)
-        self.assertAllClose(logdet_np, logdet_tf.eval(), atol=atol)
+        self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol)
 
 
 class SlogdetTest(test.TestCase):
@@ -110,8 +110,9 @@ class SlogdetTest(test.TestCase):
         sign_np, log_abs_det_np = np.linalg.slogdet(matrix)
         with self.session(use_gpu=True):
           sign_tf, log_abs_det_tf = linalg.slogdet(matrix)
-          self.assertAllClose(log_abs_det_np, log_abs_det_tf.eval(), atol=atol)
-          self.assertAllClose(sign_np, sign_tf.eval(), atol=atol)
+          self.assertAllClose(
+              log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol)
+          self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol)
 
   def test_works_with_underflow_case(self):
     for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
@@ -120,8 +121,9 @@ class SlogdetTest(test.TestCase):
       sign_np, log_abs_det_np = np.linalg.slogdet(matrix)
       with self.session(use_gpu=True):
         sign_tf, log_abs_det_tf = linalg.slogdet(matrix)
-        self.assertAllClose(log_abs_det_np, log_abs_det_tf.eval(), atol=atol)
-        self.assertAllClose(sign_np, sign_tf.eval(), atol=atol)
+        self.assertAllClose(
+            log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol)
+        self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol)
 
 
 class AdjointTest(test.TestCase):
@@ -135,7 +137,7 @@ class AdjointTest(test.TestCase):
         matrix = ops.convert_to_tensor(matrix_np)
         transposed = linalg.adjoint(matrix)
         self.assertEqual((3, 2), transposed.get_shape())
-        self.assertAllEqual(expected_transposed, transposed.eval())
+        self.assertAllEqual(expected_transposed, self.evaluate(transposed))
 
 
 class EyeTest(parameterized.TestCase, test.TestCase):
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index d57012dc860d850cb110726a096b87b0b253e0d1..09cb5cf0ba96c4bd8407beb65f3faf8b6c754fb9 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np  # pylint: disable=unused-import
 
 from tensorflow.python.client import session
@@ -28,9 +29,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -38,40 +41,84 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
 
 
-def scalar_shape():
-  return ops.convert_to_tensor([], dtype=dtypes.int32)
+@test_util.run_all_in_graph_and_eager_modes
+class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
-
-class ListOpsTest(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testPushPop(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=scalar_shape())
+  def _testPushPop(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[],
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 1.0)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testPushPopGPU(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testPushPop(self, max_num_elements):
+    self._testPushPop(max_num_elements)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testPushPopGPU(self, max_num_elements):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
-      self.testPushPop()
+      self._testPushPop(max_num_elements)
+
+  def testPushInFullListFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[], max_num_elements=1)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Tried to push item into a full list"):
+      l = list_ops.tensor_list_push_back(l, 2.)
+      self.evaluate(l)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testPopFromEmptyTensorListFails(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[],
+        max_num_elements=max_num_elements)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Trying to pop from an empty list"):
+      l = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.evaluate(l)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStack(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=scalar_shape())
+  def _testStack(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[],
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    if not context.executing_eagerly():
+      self.assertAllEqual(t.shape.as_list(), [None])
     self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStackWithUnknownElementShape(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testStack(self, max_num_elements):
+    self._testStack(max_num_elements)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testStackGPU(self, max_num_elements):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStack(max_num_elements)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  def testStackWithUnknownElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32,
+        element_shape=None,
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
 
@@ -85,10 +132,13 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStackWithPartiallyDefinedElementShape(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  def testStackWithPartiallyDefinedElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=[-1])
+        element_dtype=dtypes.float32,
+        element_shape=[None],
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0]))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0]))
 
@@ -102,11 +152,14 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStackEmptyList(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testStackEmptyList(self, max_num_elements):
     # Should be able to stack empty lists with fully defined element_shape.
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=[1, 2])
+        element_dtype=dtypes.float32,
+        element_shape=[1, 2],
+        max_num_elements=max_num_elements)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(t).shape, (0, 1, 2))
 
@@ -115,7 +168,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.float32, element_shape=[-1, 2])
+          element_dtype=dtypes.float32,
+          element_shape=[None, 2],
+          max_num_elements=max_num_elements)
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -123,15 +178,20 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.float32, element_shape=-1)
+          element_dtype=dtypes.float32,
+          element_shape=None,
+          max_num_elements=max_num_elements)
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testGatherGrad(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testGatherGrad(self, max_num_elements):
     with backprop.GradientTape() as tape:
-      l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                     element_shape=scalar_shape())
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[],
+          max_num_elements=max_num_elements)
       c0 = constant_op.constant(1.0)
       tape.watch(c0)
       l = list_ops.tensor_list_push_back(l, c0)
@@ -142,10 +202,13 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     dt = tape.gradient(s, c0)
     self.assertAllEqual(self.evaluate(dt), 6.0)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testGatherWithUnknownElementShape(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  def testGatherWithUnknownElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32,
+        element_shape=None,
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0]))
@@ -162,10 +225,13 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testGatherWithPartiallyDefinedElementShape(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  def testGatherWithPartiallyDefinedElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=[-1])
+        element_dtype=dtypes.float32,
+        element_shape=[None],
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0]))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([4.0, 5.0]))
@@ -182,12 +248,15 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testGatherEmptyList(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  def testGatherEmptyList(self, max_num_elements):
     # Should be able to gather from empty lists with fully defined
     # element_shape.
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=[1, 2])
+        element_dtype=dtypes.float32,
+        element_shape=[1, 2],
+        max_num_elements=max_num_elements)
     t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
     self.assertAllEqual((0, 1, 2), self.evaluate(t).shape)
 
@@ -196,7 +265,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.float32, element_shape=[-1, 2])
+          element_dtype=dtypes.float32,
+          element_shape=[None, 2],
+          max_num_elements=max_num_elements)
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -205,11 +276,12 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.float32, element_shape=-1)
+          element_dtype=dtypes.float32,
+          element_shape=None,
+          max_num_elements=max_num_elements)
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
 
-  @test_util.run_in_graph_and_eager_modes
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
@@ -224,51 +296,59 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     dt = tape.gradient(loss, c0)
     self.assertAllEqual(self.evaluate(dt), [2., 4.])
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStackGPU(self):
-    if not context.num_gpus():
-      return
-    with context.device("gpu:0"):
-      self.testStack()
-
-  @test_util.run_in_graph_and_eager_modes
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 1.0)
     self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testFromTensorGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
-  @test_util.run_in_graph_and_eager_modes
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(t), [3.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
   def testGetSetGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testGetSetItem()
 
-  @test_util.run_in_graph_and_eager_modes
+  def testSetGetGrad(self):
+    with backprop.GradientTape() as tape:
+      t = constant_op.constant(5.)
+      tape.watch(t)
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+      l = list_ops.tensor_list_set_item(l, 1, 2. * t)
+      e = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(e), 10.0)
+    self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
+
+  def testSetOnEmptyListWithMaxNumElementsFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[], max_num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to modify element 0 in a list with 0 elements."):
+      l = list_ops.tensor_list_set_item(l, 0, 1.)
+      self.evaluate(l)
+
   def testUnknownShape(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32, element_shape=None)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
@@ -276,12 +356,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 1.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testCPUGPUCopy(self):
     if not context.num_gpus():
       return
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
@@ -294,12 +373,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
             list_ops.tensor_list_pop_back(
                 l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testCPUGPUCopyNested(self):
     if not context.num_gpus():
       return
     t = constant_op.constant([1.0, 2.0])
-    child_l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    child_l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     l = list_ops.empty_tensor_list(
         element_shape=constant_op.constant([], dtype=dtypes.int32),
         element_dtype=dtypes.variant)
@@ -331,7 +409,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
               list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)),
           [[1]])
 
-  def testGraphStackInLoop(self):
+  def testSkipEagerStackInLoop(self):
     with self.cached_session():
       t1 = list_ops.empty_tensor_list(
           element_shape=constant_op.constant([], dtype=dtypes.int32),
@@ -348,7 +426,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32)
       self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3])
 
-  def testGraphStackSwitchDtype(self):
+  def testSkipEagerStackSwitchDtype(self):
     with self.cached_session():
       list_ = list_ops.empty_tensor_list(
           element_shape=constant_op.constant([], dtype=dtypes.int32),
@@ -369,7 +447,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
       self.assertAllEqual(self.evaluate(s1), np_s1)
 
-  def testGraphStackInLoopSwitchDtype(self):
+  def testSkipEagerStackInLoopSwitchDtype(self):
     with self.cached_session():
       t1 = list_ops.empty_tensor_list(
           element_shape=constant_op.constant([], dtype=dtypes.int32),
@@ -392,7 +470,6 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)])
       self.assertAllEqual(self.evaluate(s1), np_s1)
 
-  @test_util.run_in_graph_and_eager_modes
   def testSerialize(self):
     worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
     with ops.Graph().as_default(), session.Session(target=worker.target):
@@ -407,15 +484,12 @@ class ListOpsTest(test_util.TensorFlowTestCase):
         worker_e = array_ops.identity(e)
       self.assertAllEqual(self.evaluate(worker_e), [2.0])
 
-  @test_util.run_in_graph_and_eager_modes
   def testSerializeListWithInvalidTensors(self):
     worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
     with ops.Graph().as_default(), session.Session(target=worker.target):
       with ops.device("/job:worker"):
         l = list_ops.tensor_list_reserve(
-            element_dtype=dtypes.float32,
-            element_shape=scalar_shape(),
-            num_elements=2)
+            element_dtype=dtypes.float32, element_shape=[], num_elements=2)
         l = list_ops.tensor_list_set_item(l, 0, 1.)
       with ops.device("/job:ps"):
         l_ps = array_ops.identity(l)
@@ -425,13 +499,12 @@ class ListOpsTest(test_util.TensorFlowTestCase):
         worker_t = array_ops.identity(t)
       self.assertAllEqual(self.evaluate(worker_t), [1.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
   def testSerializeListWithUnknownRank(self):
     worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
     with ops.Graph().as_default(), session.Session(target=worker.target):
       with ops.device("/job:worker"):
         t = constant_op.constant([[1.0], [2.0]])
-        l = list_ops.tensor_list_from_tensor(t, element_shape=-1)
+        l = list_ops.tensor_list_from_tensor(t, element_shape=None)
       with ops.device("/job:ps"):
         l_ps = array_ops.identity(l)
         element_shape = list_ops.tensor_list_element_shape(
@@ -440,11 +513,32 @@ class ListOpsTest(test_util.TensorFlowTestCase):
         element_shape = array_ops.identity(element_shape)
       self.assertEqual(self.evaluate(element_shape), -1)
 
-  @test_util.run_in_graph_and_eager_modes
+  def testSerializeListWithMaxNumElements(self):
+    if context.num_gpus():
+      # TODO(b/119151861): Enable on GPU.
+      return
+    worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
+    with ops.Graph().as_default(), session.Session(target=worker.target):
+      with ops.device("/job:worker"):
+        l = list_ops.empty_tensor_list(
+            element_shape=None,
+            element_dtype=dtypes.float32,
+            max_num_elements=2)
+        l = list_ops.tensor_list_push_back(l, 1.)
+      with ops.device("/job:ps"):
+        l_ps = array_ops.identity(l)
+        l_ps = list_ops.tensor_list_push_back(l_ps, 2.)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Tried to push item into a full list"):
+        with ops.device("/job:worker"):
+          l_worker = array_ops.identity(l_ps)
+          l_worker = list_ops.tensor_list_push_back(l_worker, 3.0)
+          self.evaluate(l_worker)
+
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
-      l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                     element_shape=scalar_shape())
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=[])
       c = constant_op.constant(1.0)
       tape.watch(c)
       l = list_ops.tensor_list_push_back(l, c)
@@ -452,24 +546,22 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       e = 2 * e
     self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testStackFromTensorGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       c2 = list_ops.tensor_list_stack(
           l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
     grad = tape.gradient(result, [c])[0]
     self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
   def testGetSetGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       c2 = constant_op.constant(3.0)
       tape.watch(c2)
       l = list_ops.tensor_list_set_item(l, 0, c2)
@@ -480,18 +572,17 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
     self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(c, element_shape=[])
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
 
-  def testSetItemWithMismatchedShapeFails(self):
+  def testSkipEagerSetItemWithMismatchedShapeFails(self):
     with self.cached_session() as sess:
       ph = array_ops.placeholder(dtypes.float32)
       c = constant_op.constant([1.0, 2.0])
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       # Set a placeholder with unknown shape to satisfy the shape inference
       # at graph building time.
       l = list_ops.tensor_list_set_item(l, 0, ph)
@@ -500,10 +591,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                                    "incompatible shape"):
         sess.run(l_0, {ph: [3.0]})
 
-  @test_util.run_in_graph_and_eager_modes
   def testResourceVariableScatterGather(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(c, element_shape=[])
     v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
     v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
     self.evaluate(v.initializer)
@@ -511,10 +601,8 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     v_r_sparse_stacked = list_ops.tensor_list_stack(
         v.sparse_read(0), dtypes.float32)
     self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
-    l_new_0 = list_ops.tensor_list_from_tensor(
-        [3.0, 4.0], element_shape=scalar_shape())
-    l_new_1 = list_ops.tensor_list_from_tensor(
-        [5.0, 6.0], element_shape=scalar_shape())
+    l_new_0 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l_new_1 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
     updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
     updated_v_elems = array_ops.unstack(updated_v)
     updated_v_stacked = [
@@ -524,11 +612,10 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                 [[1.0, 2.0]] * 4)
     self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
-  @test_util.run_in_graph_and_eager_modes
   def testConcat(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=[])
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=[])
     l_batch_0 = array_ops.stack([l0, l1])
     l_batch_1 = array_ops.stack([l1, l0])
 
@@ -564,7 +651,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       self.evaluate(
           list_ops.tensor_list_concat_lists(
               l_batch_0,
-              list_ops.empty_tensor_list(scalar_shape(), dtypes.float32),
+              list_ops.empty_tensor_list([], dtypes.float32),
               element_dtype=dtypes.float32))
 
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
@@ -578,17 +665,15 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"input_b\[0\].dtype != element_dtype."):
       l_batch_of_int_tls = array_ops.stack(
-          [list_ops.tensor_list_from_tensor([1], element_shape=scalar_shape())]
-          * 2)
+          [list_ops.tensor_list_from_tensor([1], element_shape=[])] * 2)
       self.evaluate(
           list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls,
                                             element_dtype=dtypes.float32))
 
-  @test_util.run_in_graph_and_eager_modes
   def testPushBackBatch(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=[])
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=[])
     l_batch = array_ops.stack([l0, l1])
     l_push = list_ops.tensor_list_push_back_batch(l_batch, [3.0, 4.0])
     l_unstack = array_ops.unstack(l_push)
@@ -626,14 +711,13 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                                  "Invalid data type at index 0"):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
-  @test_util.run_in_graph_and_eager_modes
   def testZerosLike(self):
     for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16,
                   dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128,
                   dtypes.bool):
       l_empty = list_ops.empty_tensor_list(
-          element_dtype=dtype, element_shape=scalar_shape())
+          element_dtype=dtype, element_shape=[])
       l_empty_zeros = array_ops.zeros_like(l_empty)
       t_empty_zeros = list_ops.tensor_list_stack(
           l_empty_zeros, element_dtype=dtype)
@@ -651,17 +735,15 @@ class ListOpsTest(test_util.TensorFlowTestCase):
           self.evaluate(t_full_zeros), np.zeros(
               (2,), dtype=dtype.as_numpy_dtype))
 
-  @test_util.run_in_graph_and_eager_modes
   def testZerosLikeNested(self):
     for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16,
                   dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128,
                   dtypes.bool):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.variant, element_shape=scalar_shape())
+          element_dtype=dtypes.variant, element_shape=[])
 
-      sub_l = list_ops.empty_tensor_list(
-          element_dtype=dtype, element_shape=scalar_shape())
+      sub_l = list_ops.empty_tensor_list(element_dtype=dtype, element_shape=[])
       l = list_ops.tensor_list_push_back(l, sub_l)
       sub_l = list_ops.tensor_list_push_back(sub_l, math_ops.cast(
           1, dtype=dtype))
@@ -692,16 +774,14 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(
           self.evaluate(outputs[0]), np.zeros((2,), dtype=dtype.as_numpy_dtype))
 
-  @test_util.run_in_graph_and_eager_modes
   def testElementShape(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32, element_shape=None)
     shape = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
     self.assertEqual(self.evaluate(shape), -1)
 
   def testZerosLikeUninitialized(self):
-    l0 = list_ops.tensor_list_reserve(
-        scalar_shape(), 3, element_dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_reserve([], 3, element_dtype=dtypes.float32)
     l1 = list_ops.tensor_list_set_item(l0, 0, 1.)  # [1., _, _]
     zeros_1 = array_ops.zeros_like(l1)  # [0., _, _]
     l2 = list_ops.tensor_list_set_item(l1, 2, 2.)  # [1., _, 2.]
@@ -717,6 +797,74 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(self.evaluate(res_1), [0.])
     self.assertAllEqual(self.evaluate(res_2), [0., 0.])
 
+  def testSkipEagerTensorListGetItemGradAggregation(self):
+    l = list_ops.tensor_list_reserve(
+        element_shape=[], num_elements=1, element_dtype=dtypes.float32)
+    x = constant_op.constant(1.0)
+    l = list_ops.tensor_list_set_item(l, 0, x)
+    l_read1 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    l_read2 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients([l_read1, l_read2], [x])
+    with self.cached_session() as sess:
+      self.assertSequenceEqual(sess.run(grad), [2.])
+
+  def testSkipEagerBuildElementShape(self):
+    fn = list_ops._build_element_shape
+    # Unknown shape -> -1.
+    self.assertEqual(fn(None), -1)
+    self.assertEqual(fn(tensor_shape.unknown_shape()), -1)
+    # Scalar shape -> [] with type int32.
+    self.assertEqual(fn([]).dtype, dtypes.int32)
+    self.assertEqual(fn(tensor_shape.scalar()).dtype, dtypes.int32)
+    self.assertAllEqual(self.evaluate(fn([])), np.array([], np.int32))
+    self.assertAllEqual(
+        self.evaluate(fn(tensor_shape.scalar())), np.array([], np.int32))
+    # Tensor -> Tensor
+    shape = constant_op.constant(1)
+    self.assertIs(fn(shape), shape)
+    # Shape with unknown dims -> shape list with -1's.
+    shape = [None, 5]
+    self.assertAllEqual(fn(shape), [-1, 5])
+    self.assertAllEqual(fn(tensor_shape.TensorShape(shape)), [-1, 5])
+    # Shape with unknown dims and tensor dims -> shape list with -1's and tensor
+    # dims.
+    t = array_ops.placeholder(dtypes.int32)
+    shape = [None, 5, t]
+    result = fn(shape)
+    self.assertAllEqual(result[:2], [-1, 5])
+    self.assertIs(result[2], t)
+
+  def testAddN(self):
+    l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
+    l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
+    result = math_ops.add_n((l1, l2, l3))
+    result_t = list_ops.tensor_list_stack(result, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(result_t), [9., 12.])
+
+  def testAddNNestedList(self):
+    l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
+    l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
+    l4 = list_ops.tensor_list_from_tensor([7.0, 8.0], element_shape=[])
+    a = list_ops.empty_tensor_list(
+        element_dtype=dtypes.variant, element_shape=[])
+    a = list_ops.tensor_list_push_back(a, l1)
+    a = list_ops.tensor_list_push_back(a, l2)
+    b = list_ops.empty_tensor_list(
+        element_dtype=dtypes.variant, element_shape=[])
+    b = list_ops.tensor_list_push_back(b, l3)
+    b = list_ops.tensor_list_push_back(b, l4)
+    result = math_ops.add_n((a, b))
+    result_0 = list_ops.tensor_list_stack(
+        list_ops.tensor_list_get_item(result, 0, element_dtype=dtypes.variant),
+        element_dtype=dtypes.float32)
+    result_1 = list_ops.tensor_list_stack(
+        list_ops.tensor_list_get_item(result, 1, element_dtype=dtypes.variant),
+        element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(result_0), [6., 8.])
+    self.assertAllEqual(self.evaluate(result_1), [10., 12.])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 8e9b87f6512e975584af2baf9fc4afb0547625ea..e8fa1cfa28d910843fe41fe1c114700fffdfb342 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -52,7 +52,7 @@ class LoggingOpsTest(test.TestCase):
               math_ops.less(epsilon, y), ["Divide-by-zero"])
       ]):
         out = math_ops.div(z, y)
-      self.assertAllEqual(2.0, out.eval())
+      self.assertAllEqual(2.0, self.evaluate(out))
       # assert(epsilon < x)
       # z / x
       #
@@ -63,7 +63,7 @@ class LoggingOpsTest(test.TestCase):
       ]):
         out = math_ops.div(z, x)
       with self.assertRaisesOpError("less than x"):
-        out.eval()
+        self.evaluate(out)
 
 
 class PrintV2Test(test.TestCase):
@@ -387,8 +387,8 @@ class PrintGradientTest(test.TestCase):
       wx_print = logging_ops.Print(wx, [w, w, w])
       wx_grad = gradients_impl.gradients(wx, w)[0]
       wx_print_grad = gradients_impl.gradients(wx_print, w)[0]
-      wxg = wx_grad.eval()
-      wxpg = wx_print_grad.eval()
+      wxg = self.evaluate(wx_grad)
+      wxpg = self.evaluate(wx_print_grad)
     self.assertAllEqual(wxg, wxpg)
 
 
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index bd93942efbd016d5c456c761f26397cddc9a598c..ab4c9c730bd10eff48bc3d6249c970975d9f6baf 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -52,14 +52,14 @@ class HashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
       self.assertAllEqual([3], output.get_shape())
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
       exported_keys_tensor, exported_values_tensor = table.export()
 
       self.assertItemsEqual([b"brain", b"salad", b"surgery"],
-                            exported_keys_tensor.eval())
-      self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval())
+                            self.evaluate(exported_keys_tensor))
+      self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
 
   def testHashTableFindHighRank(self):
     with self.cached_session():
@@ -76,7 +76,7 @@ class HashTableOpTest(test.TestCase):
           [["brain", "salad"], ["tank", "tarkus"]])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([[0, 1], [-1, -1]], result)
 
   def testHashTableInitWithPythonArrays(self):
@@ -94,7 +94,7 @@ class HashTableOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
   def testHashTableInitWithNumPyArrays(self):
@@ -111,7 +111,7 @@ class HashTableOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
   def testMultipleHashTables(self):
@@ -154,7 +154,7 @@ class HashTableOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
   def testHashTableWithSparseTensorInput(self):
@@ -174,7 +174,7 @@ class HashTableOpTest(test.TestCase):
           constant_op.constant(sp_shape, dtypes.int64))
       output = table.lookup(input_tensor)
 
-      out_indices, out_values, out_shape = sess.run(output)
+      out_indices, out_values, out_shape = self.evaluate(output)
 
       self.assertAllEqual([0, 1, -1], out_values)
       self.assertAllEqual(sp_indices, out_indices)
@@ -221,7 +221,7 @@ class HashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
 
       with self.assertRaisesOpError("Table not initialized"):
-        output.eval()
+        self.evaluate(output)
 
   def testInitializeTwice(self):
     with self.cached_session():
@@ -286,7 +286,7 @@ class HashTableOpTest(test.TestCase):
       input_tensor = constant_op.constant([0, 1, -1])
       output = table.lookup(input_tensor)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
 
 
@@ -306,9 +306,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_string_index_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
@@ -322,9 +322,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
@@ -339,9 +339,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_string_index_table_from_file_tensor_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
@@ -352,9 +352,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
       self.assertEqual(1,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
@@ -367,11 +367,11 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
 
       feed_dict = {vocabulary_placeholder.name: vocabulary_file}
       lookup_ops.tables_initializer().run(feed_dict=feed_dict)
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
       self.assertEqual(0,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
@@ -387,9 +387,9 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_int64_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
@@ -403,9 +403,9 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_index_table_from_file_with_default_value(self):
     default_value = -42
@@ -416,9 +416,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), ids.eval())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
   def test_index_table_from_file_with_oov_buckets(self):
     vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
@@ -429,7 +429,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual(
           (
@@ -437,7 +437,7 @@ class IndexTableFromFile(test.TestCase):
               2,  # From vocabulary file.
               867,  # 3 + fingerprint("tarkus") mod 300.
               860),  # 3 + fingerprint("toccata") mod 300.
-          ids.eval())
+          self.evaluate(ids))
 
   def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self):
     self.assertRaises(
@@ -476,9 +476,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, -1, -1), ids.eval())
+      self.assertAllEqual((1, -1, -1), self.evaluate(ids))
       self.assertEqual(2, table.size().eval())
 
   def test_index_table_from_file_with_vocab_size_too_large(self):
@@ -504,9 +504,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, -1), ids.eval())
+      self.assertAllEqual((1, 2, -1), self.evaluate(ids))
       self.assertEqual(3, table.size().eval())
 
   def test_index_table_from_file_with_invalid_hashers(self):
@@ -614,9 +614,9 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_int64_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
@@ -626,9 +626,9 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_index_table_from_tensor_with_default_value(self):
     default_value = -42
@@ -639,9 +639,9 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), ids.eval())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
   def test_index_table_from_tensor_missing_vocabulary_list(self):
     with self.cached_session():
@@ -656,7 +656,7 @@ class IndexTableFromTensor(test.TestCase):
           vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
         lookup_ops.tables_initializer().run()
@@ -698,10 +698,10 @@ class IndexToStringTableFromFileTest(test.TestCase):
         features = table.lookup(
             constant_op.constant([0, 1, 2, 3], dtypes.int64))
         with self.assertRaises(errors_impl.OpError):
-          features.eval()
+          self.evaluate(features)
         lookup_ops.tables_initializer().run()
         self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                            features.eval())
+                            self.evaluate(features))
 
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
@@ -713,10 +713,10 @@ class IndexToStringTableFromFileTest(test.TestCase):
           value_column_index=0)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
   def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
@@ -729,10 +729,10 @@ class IndexToStringTableFromFileTest(test.TestCase):
           delimiter=" ")
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
   def test_index_to_string_table_with_default_value(self):
     default_value = b"NONE"
@@ -742,10 +742,10 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
-                          features.eval())
+                          self.evaluate(features))
 
   def test_index_to_string_table_with_vocab_size_too_small(self):
     default_value = b"NONE"
@@ -757,10 +757,10 @@ class IndexToStringTableFromFileTest(test.TestCase):
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", default_value, default_value),
-                          features.eval())
+                          self.evaluate(features))
 
   def test_index_to_string_table_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
@@ -770,7 +770,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       init = lookup_ops.tables_initializer()
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                               "Invalid vocab_size", init.run)
@@ -783,9 +783,10 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
+      self.assertAllEqual((b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
 
 
 class IndexToStringTableFromTensorTest(test.TestCase):
@@ -799,11 +800,11 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
   def test_duplicate_entries(self):
     with self.cached_session():
@@ -813,7 +814,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
+      self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
 
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
@@ -824,11 +825,11 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([1, 2, 4], dtypes.int64)
       features = table.lookup(indices)
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
 
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
-                          features.eval())
+                          self.evaluate(features))
 
 
 class InitializeTableFromFileOpTest(test.TestCase):
@@ -870,7 +871,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       output = table.lookup(
           constant_op.constant((42, 1, 11), dtype=dtypes.int64))
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
   def testInitializeIndexTable(self):
@@ -889,7 +890,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       output = table.lookup(input_values)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
 
   def testMultiColumn(self):
@@ -911,7 +912,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([1, 5, 6], result)
 
   def testInvalidDataTypeInMultiColumn(self):
@@ -1078,7 +1079,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
   def testInvalidFilenames(self):
@@ -1119,7 +1120,8 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
       out = table.lookup(input_values)
-      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], out.eval())
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
+                          self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
   def testStringToIdTable(self):
@@ -1135,7 +1137,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, -1], out.eval())
+      self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
   def testInt64ToIdTable(self):
@@ -1152,7 +1154,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
       out = table.lookup(
           constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
-      self.assertAllEqual((0, 1, 2, -1), out.eval())
+      self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
 
@@ -1181,7 +1183,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
   def testInt32IdTableWithHashBuckets(self):
@@ -1203,7 +1205,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
 
       out = table.lookup(values)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
   def testInt64IdTableWithHashBuckets(self):
@@ -1223,7 +1225,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
 
       out = table.lookup(values)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
   def testStringIdTableWithOnlyHashBucket(self):
@@ -1244,7 +1246,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               1,  # fingerprint("salad") mod 5.
               4  # fingerprint("surgery") mod 5
           ],
-          out.eval())
+          self.evaluate(out))
       self.assertEquals(oov_buckets, table.size().eval())
 
   def testInt32IdTableWithOnlyHashBucket(self):
@@ -1266,7 +1268,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               4,  # fingerprint("1") mod 5.
               2  # fingerprint("-1000") mod 5
           ],
-          out.eval())
+          self.evaluate(out))
       self.assertEquals(oov_buckets, table.size().eval())
 
   def testFloat64IdTableWithOnlyHashBucket(self):
@@ -1342,7 +1344,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out1 = table1.lookup(input_string_1)
 
-      self.assertAllEqual([0, 1, 2, 3], out1.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
 
     with self.cached_session():
@@ -1363,7 +1365,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out2 = table2.lookup(input_string_2)
 
-      self.assertAllEqual([3, 1, 3], out2.eval())
+      self.assertAllEqual([3, 1, 3], self.evaluate(out2))
       self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
 
   def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index b04996f78893da4042c364933aab26b09e029cd1..bda63bcaa929fa6f5a3bcc67d6b57f16e64badc0 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -54,55 +54,55 @@ class AbsoluteDifferenceLossTest(test.TestCase):
   def testAllCorrectNoLossWeight(self):
     loss = losses.absolute_difference(self._predictions, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testNonZeroLoss(self):
     loss = losses.absolute_difference(self._labels, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(5.5, loss.eval(), 3)
+      self.assertAlmostEqual(5.5, self.evaluate(loss), 3)
 
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions,
                                       constant_op.constant(weights))
     with self.cached_session():
-      self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     weights = constant_op.constant((1.2, 0.0), shape=(2, 1))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.6, loss.eval(), 3)
+      self.assertAlmostEqual(5.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 0.0], shape=[2, 1])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.6, loss.eval(), 3)
+      self.assertAlmostEqual(5.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeights(self):
     weights = constant_op.constant([3, 6, 5, 0, 4, 2], shape=[2, 3])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(16.6, loss.eval(), 3)
+      self.assertAlmostEqual(16.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZero(self):
     weights = constant_op.constant([0, 0, 0, 0, 0, 2], shape=[2, 3])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(6.0, loss.eval(), 3)
+      self.assertAlmostEqual(6.0, self.evaluate(loss), 3)
 
   def testLossWithSampleSpecificWeightsAllZero(self):
     weights = array_ops.zeros((2, 3))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testEagerNoMemoryLeaked(self):
@@ -149,7 +149,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -159,7 +159,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits,
                                           constant_op.constant(weights))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -168,7 +168,8 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant((1.2, 3.4, 5.6))
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testAllWrongAllWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -177,7 +178,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([0, 0, 0], shape=[3])
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSomeWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -186,7 +187,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 0, 0], shape=[3])
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(12.0, loss.eval(), 3)
+      self.assertAlmostEqual(12.0, self.evaluate(loss), 3)
 
   def testSoftmaxWithMeasurementSpecificWeightsRaisesException(self):
     with self.cached_session():
@@ -302,7 +303,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -312,7 +313,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits,
                                                  constant_op.constant(weights))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWith1DTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -322,7 +323,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(
           labels, logits, constant_op.constant((weights,)))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithPlaceholderForWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0],
@@ -374,7 +375,8 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 3.4, 5.6], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithColumnWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -383,7 +385,8 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([[1.2], [3.4], [5.6]])
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testAllWrongAllWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -392,7 +395,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([0, 0, 0], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSomeWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -401,7 +404,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 0, 0], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(12.0, loss.eval(), 3)
+      self.assertAlmostEqual(12.0, self.evaluate(loss), 3)
 
   def testMeasurementSpecificWeightsRaisesException(self):
     with self.cached_session():
@@ -481,7 +484,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       loss = losses.sigmoid_cross_entropy(labels, logits)
       self.assertEquals(logits.dtype, loss.dtype)
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testLossWithSingleDimPlaceholderForLogitsAndWeights1(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 1))
@@ -536,7 +539,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       loss = losses.sigmoid_cross_entropy(labels, logits, weights)
       self.assertEquals(logits.dtype, loss.dtype)
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
-      self.assertAlmostEqual(1700.0 / 7.0, loss.eval(), 3)
+      self.assertAlmostEqual(1700.0 / 7.0, self.evaluate(loss), 3)
 
   def testMultiCorrectSigmoid(self):
     logits = constant_op.constant([[100.0, -100.0, 100.0],
@@ -548,7 +551,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
 
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSigmoidFloat64(self):
     logits = constant_op.constant((
@@ -563,7 +566,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals(logits.dtype, loss.dtype)
 
     with self.cached_session():
-      self.assertAlmostEqual(44.444, loss.eval(), 3)
+      self.assertAlmostEqual(44.444, self.evaluate(loss), 3)
 
   def testSigmoidNoReduction(self):
     logits = constant_op.constant((
@@ -576,11 +579,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals(logits.dtype, loss.dtype)
 
     with self.cached_session():
-      self.assertAllClose((
-          (0., 0., 0.),
-          (0., 100., 100.),
-          (100., 0., 100.)
-      ), loss.eval(), 3)
+      self.assertAllClose(((0., 0., 0.), (0., 100., 100.), (100., 0., 100.)),
+                          self.evaluate(loss), 3)
 
   def testSigmoidLabelSmoothingCorrect(self):
     with self.cached_session():
@@ -619,7 +619,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       softmax_labels = constant_op.constant([[0, 1], [1, 0], [0, 1]])
       softmax_loss = losses.softmax_cross_entropy(
           softmax_labels, softmax_logits, label_smoothing=label_smoothing)
-      self.assertAlmostEqual(sigmoid_loss.eval(), softmax_loss.eval(), 3)
+      self.assertAlmostEqual(sigmoid_loss.eval(), self.evaluate(softmax_loss),
+                             3)
 
 
 class LogLossTest(test.TestCase):
@@ -648,7 +649,7 @@ class LogLossTest(test.TestCase):
   def testAllCorrectNoLossWeight(self):
     loss = losses.log_loss(self._labels, self._labels)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testAllCorrectNoLossWeightWithPlaceholder(self):
     tf_predictions = array_ops.placeholder(
@@ -662,14 +663,14 @@ class LogLossTest(test.TestCase):
     loss = losses.log_loss(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
@@ -677,7 +678,7 @@ class LogLossTest(test.TestCase):
                            constant_op.constant(weights))
     with self.cached_session():
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarTensorWeightAndPlaceholder(self):
     tf_predictions = array_ops.placeholder(
@@ -707,7 +708,8 @@ class LogLossTest(test.TestCase):
         np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 6.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeightsSomeZero(self):
     weights = constant_op.constant((1.2, 0), shape=(2, 1))
@@ -716,7 +718,8 @@ class LogLossTest(test.TestCase):
                                       (2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeightsSomeZero(self):
     weights = constant_op.constant([1.2, 0], shape=[2, 1])
@@ -725,7 +728,8 @@ class LogLossTest(test.TestCase):
                                       (2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0,
+                             self.evaluate(loss), 3)
 
   def testWeightsWithSameNumDimsButWrongShapeThrowsException(self):
     weights = constant_op.constant(np.random.normal(size=(2, 4)), shape=[2, 4])
@@ -743,7 +747,8 @@ class LogLossTest(test.TestCase):
         constant_op.constant(
             weights, shape=(2, 3)))
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 5.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 5.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithMeasurementSpecificWeightsWithPlaceholder(self):
     weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3))
@@ -770,7 +775,7 @@ class LogLossTest(test.TestCase):
         constant_op.constant(
             weights, shape=(2, 3)))
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses), loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses), self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZeroWithPlaceholder(self):
     weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3))
@@ -788,7 +793,7 @@ class LogLossTest(test.TestCase):
     tf_weights = array_ops.zeros(shape=(2, 3))
     loss = losses.log_loss(self._labels, self._predictions, tf_weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
 class HingeLossTest(test.TestCase):
@@ -870,7 +875,7 @@ class HuberLossTest(test.TestCase):
       labels = constant_op.constant([1.0, -1.0, 0.0, 0.5])
       expected = 0.5 * np.array([0.5**2, 0.4**2, 0.5**2, 0.5**2]).mean()
       loss = losses.huber_loss(labels, predictions, delta=delta)
-      self.assertAllClose(expected, loss.eval(), atol=1e-5)
+      self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
   def testAllLinearDelta(self):
     delta = 0.5
@@ -880,7 +885,7 @@ class HuberLossTest(test.TestCase):
     expected -= 0.5 * delta**2
     loss = losses.huber_loss(labels, predictions, delta=delta)
     with self.cached_session():
-      self.assertAllClose(expected, loss.eval(), atol=1e-5)
+      self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
 
 class MeanSquaredErrorTest(test.TestCase):
@@ -906,55 +911,55 @@ class MeanSquaredErrorTest(test.TestCase):
   def testAllCorrectNoLossWeight(self):
     loss = losses.mean_squared_error(self._predictions, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testNonZeroLoss(self):
     loss = losses.mean_squared_error(self._labels, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(49.5, loss.eval(), 3)
+      self.assertAlmostEqual(49.5, self.evaluate(loss), 3)
 
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions,
                                      constant_op.constant(weights))
     with self.cached_session():
-      self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(767.8 / 6.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 3.4], shape=[2, 1])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(767.8 / 6.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeights(self):
     weights = constant_op.constant([3, 6, 5, 0, 4, 2], shape=[2, 3])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(587 / 5.0, loss.eval(), 3)
+      self.assertAlmostEqual(587 / 5.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZero(self):
     weights = constant_op.constant([0, 0, 0, 0, 0, 2], shape=[2, 3])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(18.0, loss.eval(), 3)
+      self.assertAlmostEqual(18.0, self.evaluate(loss), 3)
 
   def testLossWithSampleSpecificWeightsAllZero(self):
     weights = array_ops.zeros((2, 3))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
 class MeanPairwiseSquaredErrorTest(test.TestCase):
@@ -991,7 +996,8 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     with self.cached_session():
       static_inputs_op = losses.mean_pairwise_squared_error(
           predictions=predictions, labels=labels, weights=weights)
-      self.assertAlmostEqual(expected_loss, static_inputs_op.eval(), places=3)
+      self.assertAlmostEqual(
+          expected_loss, self.evaluate(static_inputs_op), places=3)
 
       predictions_placeholder = array_ops.placeholder(
           dtypes.float32, shape=np.asarray(predictions.shape))
@@ -1040,9 +1046,9 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with self.cached_session() as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         for grad, _ in gradients_to_variables:
-          np_grad = sess.run(grad)
+          np_grad = self.evaluate(grad)
           self.assertFalse(np.isnan(np_grad).any())
 
   def testNonZeroLossWithPythonScalarWeight(self):
@@ -1060,7 +1066,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         weights=constant_op.constant(weights))
     with self.cached_session():
       self.assertAlmostEqual(weights * np.sum(self._expected_losses),
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithScalarZeroWeight(self):
     self._test_valid_weights(
@@ -1215,7 +1221,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2)
     with self.cached_session():
-      self.assertAlmostEqual(0, loss.eval(), 5)
+      self.assertAlmostEqual(0, self.evaluate(loss), 5)
 
   def testPartiallyCorrectWithIntegerValues(self):
     loss = losses.cosine_distance(
@@ -1223,7 +1229,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2)
     with self.cached_session():
-      self.assertAlmostEqual(1, loss.eval(), 5)
+      self.assertAlmostEqual(1, self.evaluate(loss), 5)
 
   def testPartiallyCorrectFloatingPointValues(self):
     predictions = np.matrix(
@@ -1241,7 +1247,7 @@ class CosineDistanceLossTest(test.TestCase):
     loss = losses.cosine_distance(tf_labels, tf_preds, dim=2)
 
     with self.cached_session():
-      self.assertAlmostEqual(1.0, loss.eval(), 5)
+      self.assertAlmostEqual(1.0, self.evaluate(loss), 5)
 
   def testSampleSpecificWeights(self):
     loss = losses.cosine_distance(
@@ -1250,7 +1256,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=np.asarray((1, 0, 0)).reshape((3, 1, 1)))
     with self.cached_session():
-      self.assertEqual(1.0, loss.eval())
+      self.assertEqual(1.0, self.evaluate(loss))
 
   def testMeasurementSpecificWeights(self):
     loss = losses.cosine_distance(
@@ -1260,7 +1266,7 @@ class CosineDistanceLossTest(test.TestCase):
         weights=constant_op.constant(
             [1, 0, 0, 1, 1, 1], shape=(3, 2, 1)))
     with self.cached_session():
-      self.assertEqual(3.0 / 4.0, loss.eval())
+      self.assertEqual(3.0 / 4.0, self.evaluate(loss))
 
   def testMeasurementSpecificWeightsWithPlaceholderWithShape(self):
     tf_predictions = array_ops.placeholder(
@@ -1282,7 +1288,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=array_ops.zeros((3, 1, 1)))
     with self.cached_session():
-      self.assertEqual(0, loss.eval())
+      self.assertEqual(0, self.evaluate(loss))
 
   def testZeroLossWhenAllMeasurementSpecificWeightsAreZero(self):
     loss = losses.cosine_distance(
@@ -1291,7 +1297,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=array_ops.zeros((3, 2, 1)))
     with self.cached_session():
-      self.assertEqual(0, loss.eval())
+      self.assertEqual(0, self.evaluate(loss))
 
 
 class AddLossTest(test.TestCase):
@@ -1351,15 +1357,16 @@ class ComputeWeightedLossTest(test.TestCase):
         with self.session(g):
           for unweighted_loss in unweighted_losses:
             if reduction == losses.Reduction.NONE:
-              self.assertAllClose(self._raw_losses, unweighted_loss.eval())
+              self.assertAllClose(self._raw_losses,
+                                  self.evaluate(unweighted_loss))
             elif reduction == losses.Reduction.SUM:
               self.assertAllClose(
-                  np.sum(self._raw_losses), unweighted_loss.eval())
+                  np.sum(self._raw_losses), self.evaluate(unweighted_loss))
             else:
               # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
               # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
               self.assertAllClose(
-                  np.mean(self._raw_losses), unweighted_loss.eval())
+                  np.mean(self._raw_losses), self.evaluate(unweighted_loss))
 
   def testUnweightedFromPlaceholder(self):
     for reduction in losses.Reduction.all():
@@ -1398,7 +1405,7 @@ class ComputeWeightedLossTest(test.TestCase):
       self.assertEqual(1, len(util.get_losses()))
       with self.cached_session():
         self.assertAllClose(
-            np.mean(weight * self._raw_losses), weighted_loss.eval())
+            np.mean(weight * self._raw_losses), self.evaluate(weighted_loss))
 
   def _test_invalid_weights(self, weights):
     with ops.Graph().as_default():
@@ -1470,24 +1477,22 @@ class ComputeWeightedLossTest(test.TestCase):
           weighted_losses = weights * self._raw_losses
           weighted_sum = np.sum(weighted_losses)
           if reduction == losses.Reduction.NONE:
-            self.assertAllClose(weighted_losses, weighted_loss.eval())
+            self.assertAllClose(weighted_losses, self.evaluate(weighted_loss))
           elif reduction == losses.Reduction.SUM:
-            self.assertAllClose(weighted_sum, weighted_loss.eval())
+            self.assertAllClose(weighted_sum, self.evaluate(weighted_loss))
           else:
             broadcast_weights = weights * np.ones_like(self._raw_losses)
             if reduction == losses.Reduction.MEAN:
-              self.assertAllClose(
-                  weighted_sum / np.sum(broadcast_weights),
-                  weighted_loss.eval())
+              self.assertAllClose(weighted_sum / np.sum(broadcast_weights),
+                                  self.evaluate(weighted_loss))
             elif (reduction == losses.Reduction.SUM_OVER_NONZERO_WEIGHTS or
                   reduction == losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
               self.assertAllClose(
                   weighted_sum / np.count_nonzero(broadcast_weights),
-                  weighted_loss.eval())
+                  self.evaluate(weighted_loss))
             elif reduction == losses.Reduction.SUM_OVER_BATCH_SIZE:
-              self.assertAllClose(
-                  weighted_sum / self._raw_losses.size,
-                  weighted_loss.eval())
+              self.assertAllClose(weighted_sum / self._raw_losses.size,
+                                  self.evaluate(weighted_loss))
 
   def test1x1x1Weight(self):
     self._test_valid_weights((((17.0,),),))
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index d503f3d7c9f5625b89e5e4168b3eaed9ab98612c..4b5bd4059fa758bdd95d44d1d8c48fc7031fc427 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -148,7 +148,7 @@ class MapStageTest(test.TestCase):
       for i in range(n):
         self.assertTrue(sess.run(peek, feed_dict={gi: i})[0] == i)
 
-      self.assertTrue(sess.run(size) == 10)
+      self.assertTrue(self.evaluate(size) == 10)
 
   def testSizeAndClear(self):
     with ops.Graph().as_default() as G:
@@ -170,11 +170,11 @@ class MapStageTest(test.TestCase):
 
     with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 3})
-      self.assertEqual(sess.run(size), 1)
+      self.assertEqual(self.evaluate(size), 1)
       sess.run(stage, feed_dict={x: -1, pi: 1})
-      self.assertEqual(sess.run(size), 2)
+      self.assertEqual(self.evaluate(size), 2)
       sess.run(clear)
-      self.assertEqual(sess.run(size), 0)
+      self.assertEqual(self.evaluate(size), 0)
 
   def testCapacity(self):
     capacity = 3
@@ -231,13 +231,13 @@ class MapStageTest(test.TestCase):
                                              capacity))
 
       # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
+      self.assertTrue(self.evaluate(size) == capacity)
 
       # Clear the staging area completely
       for i in range(n):
         sess.run(get)
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertTrue(self.evaluate(size) == 0)
 
   def testMemoryLimit(self):
     memory_limit = 512 * 1024  # 512K
@@ -295,13 +295,13 @@ class MapStageTest(test.TestCase):
                                              capacity))
 
       # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
+      self.assertTrue(self.evaluate(size) == capacity)
 
       # Clear the staging area completely
       for i in range(n):
         sess.run(get)
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertTrue(self.evaluate(size) == 0)
 
   def testOrdering(self):
     import six
@@ -332,14 +332,14 @@ class MapStageTest(test.TestCase):
       for i in keys:
         sess.run(stage, feed_dict={pi: i, x: i})
 
-      self.assertTrue(sess.run(size) == n)
+      self.assertTrue(self.evaluate(size) == n)
 
       # Check that key, values come out in ascending order
       for i, k in enumerate(reversed(keys)):
-        get_key, values = sess.run(get)
+        get_key, values = self.evaluate(get)
         self.assertTrue(i == k == get_key == values)
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertTrue(self.evaluate(size) == 0)
 
   def testPartialDictInsert(self):
     with ops.Graph().as_default() as G:
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 4760236ca0e1a07212f7112636b0edaa2e95765c..6167e01864d44838afcd729d97576f6f72c94690 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -35,6 +35,19 @@ from tensorflow.python.platform import test as test_lib
 # os.environ["TF_MATMUL_AUTOTUNE_ENABLE"] = "1" to enable it.
 
 
+class MatVecTest(test_lib.TestCase):
+  """Simple test for matvec, which is sugar on top of matmul."""
+
+  def testTwoByTwoCase(self):
+    a = np.array([[1, 2], [3, 4]])
+    b = np.array([5, 6])
+    with self.cached_session():
+      c = math_ops.matvec(a, b)
+      self.assertAllEqual((2,), c.shape)
+      c_ = self.evaluate(c)
+    self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c_)
+
+
 def _AddTest(test, op_name, testcase_name, fn):
   test_name = "_".join(["test", op_name, testcase_name])
   if hasattr(test, test_name):
@@ -77,7 +90,7 @@ def _GetMatMulTest(a_np_, b_np_, use_static_shape_, **kwargs_):
         a = constant_op.constant(effective_a_np)
         b = constant_op.constant(effective_b_np)
         res = math_ops.matmul(a, b, **kwargs_)
-        tf_val = res.eval()
+        tf_val = self.evaluate(res)
       else:
         a = array_ops.placeholder(a_np_.dtype)
         b = array_ops.placeholder(b_np_.dtype)
@@ -207,7 +220,7 @@ class MatMulInfixOperatorTest(test_lib.TestCase):
     c = infix_matmul(a, b)
     d = math_ops.matmul(a, b)
     with self.cached_session():
-      self.assertAllEqual(c.eval(), d.eval())
+      self.assertAllEqual(c.eval(), self.evaluate(d))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 93a668f12598e2029143a241b52888aa6fe52a7c..129ea40dfe67e916dad24bf4824e0f33ce084ff7 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -62,7 +62,7 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
                 batch_mat,
                 constant_op.constant(lower, index_dtype),
                 constant_op.constant(upper, index_dtype))
-            self.assertAllEqual(band_np, band.eval())
+            self.assertAllEqual(band_np, self.evaluate(band))
 
   return Test
 
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index 3abdf50ece5eaf94d01c3f20c2b6a3ec0009b86f..d41b449a1fa895a935f1859d5af478188f5e8e9c 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
@@ -50,7 +51,7 @@ class ExponentialOpTest(test.TestCase):
 
   def _verifyExponential(self, x, np_type):
     inp = x.astype(np_type)
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       tf_ans = linalg_impl.matrix_exponential(inp)
       if x.size == 0:
         np_ans = np.empty(x.shape, dtype=np_type)
@@ -61,7 +62,7 @@ class ExponentialOpTest(test.TestCase):
             np_ans[i] = np_expm(inp[i])
         else:
           np_ans = np_expm(inp)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
 
   def _verifyExponentialReal(self, x):
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 2247f1541e2cc74a171aebd8ee5e40c2dedf32fc..5cef4b79a32b85e3366ce018d1d8634867c20a75 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -46,7 +46,7 @@ class InverseOpTest(test.TestCase):
           tiling = list(y.shape)
           tiling[-2:] = [1, 1]
           np_ans = np.tile(np_ans, tiling)
-        out = tf_ans.eval()
+        out = self.evaluate(tf_ans)
         self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
         self.assertShapeEqual(y, tf_ans)
 
@@ -146,7 +146,7 @@ class InverseOpTest(test.TestCase):
         inv1 = linalg_ops.matrix_inverse(matrix1, adjoint=adjoint_)
         inv2 = linalg_ops.matrix_inverse(matrix2, adjoint=adjoint_)
         all_ops += [inv1, inv2]
-      inv = sess.run(all_ops)
+      inv = self.evaluate(all_ops)
       self.assertAllEqual(inv[0], inv[1])
       self.assertAllEqual(inv[2], inv[3])
 
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 2010a4b2a86c245c83176c800af85b1f43cf405e..81c0b5a7727f33c6ece24127c2c7d9e5e4b17a22 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
@@ -39,11 +40,11 @@ class LogarithmOpTest(test.TestCase):
 
   def _verifyLogarithm(self, x, np_type):
     inp = x.astype(np_type)
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       # Verify that expm(logm(A)) == A.
       tf_ans = linalg_impl.matrix_exponential(
           gen_linalg_ops.matrix_logarithm(inp))
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3)
 
   def _verifyLogarithmComplex(self, x):
diff --git a/tensorflow/python/kernel_tests/matrix_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
index 9e30ae162899c8a82c103feed42d69090704c93a..80badee896250e3000a9bfcfbf2a61b70b94da8d 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
@@ -63,7 +63,7 @@ class MatrixSolveOpTest(test.TestCase):
               out = sess.run(tf_ans, {a_ph: a, b_ph: b})
             else:
               tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint)
-              out = tf_ans.eval()
+              out = self.evaluate(tf_ans)
               self.assertEqual(tf_ans.get_shape(), out.shape)
             self.assertEqual(np_ans.shape, out.shape)
             self.assertAllClose(np_ans, out, atol=tol, rtol=tol)
@@ -126,7 +126,7 @@ class MatrixSolveOpTest(test.TestCase):
         s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_)
         s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_)
         all_ops += [s1, s2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       self.assertAllEqual(val[0], val[1])
       self.assertAllEqual(val[2], val[3])
 
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
index 9212580313cf6552d6b5de5b8a882aaa21056357..1f2144bdee9332be0d2b59f1a91a693cccbaef3b 100644
--- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -108,7 +108,7 @@ class SquareRootOpTest(test.TestCase):
       sqrt1 = gen_linalg_ops.matrix_square_root(matrix1)
       sqrt2 = gen_linalg_ops.matrix_square_root(matrix2)
       all_ops = [sqrt1, sqrt2]
-      sqrt = sess.run(all_ops)
+      sqrt = self.evaluate(all_ops)
       self.assertAllEqual(sqrt[0], sqrt[1])
 
 
diff --git a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
index 445faca3ee2f32d867a6039314d784db1e5f95ae..317b8f8716e95aea48fe23994bd6f328956bb262 100644
--- a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
@@ -87,7 +87,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
           b_tf = constant_op.constant(b)
           tf_ans = linalg_ops.matrix_triangular_solve(
               a_tf, b_tf, lower=lower, adjoint=adjoint)
-          tf_val = tf_ans.eval()
+          tf_val = self.evaluate(tf_ans)
           np_ans = np.linalg.solve(a_np, b)
           self.assertEqual(np_ans.shape, tf_ans.get_shape())
         self.assertEqual(np_ans.shape, tf_val.shape)
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index 5dcdb9e4205e209091bb54474aa5c672f29cd081..b68327105a7cf0b8efd6f46a9ee44cafd4b3ed7e 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -203,10 +203,10 @@ class MeanTest(test.TestCase):
 
       mean, update_op = metrics.mean(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAlmostEqual(1.65, sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAlmostEqual(1.65, self.evaluate(mean), 5)
 
   def testUpdateOpsReturnsCurrentValue(self):
     with self.cached_session() as sess:
@@ -220,14 +220,14 @@ class MeanTest(test.TestCase):
 
       mean, update_op = metrics.mean(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
-      self.assertAlmostEqual(1.475, sess.run(update_op), 5)
-      self.assertAlmostEqual(12.4 / 6.0, sess.run(update_op), 5)
-      self.assertAlmostEqual(1.65, sess.run(update_op), 5)
+      self.assertAlmostEqual(0.5, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(1.475, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(12.4 / 6.0, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(1.65, self.evaluate(update_op), 5)
 
-      self.assertAlmostEqual(1.65, sess.run(mean), 5)
+      self.assertAlmostEqual(1.65, self.evaluate(mean), 5)
 
   def testUnweighted(self):
     values = _test_values((3, 2, 4, 1))
@@ -370,10 +370,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean))
+        self.evaluate(update_op)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(mean))
 
   def testMultiDimensional(self):
     with self.cached_session() as sess:
@@ -391,10 +391,11 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(2):
-        sess.run(update_op)
-      self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]], sess.run(mean))
+        self.evaluate(update_op)
+      self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]],
+                          self.evaluate(mean))
 
   def testUpdateOpsReturnsCurrentValue(self):
     with self.cached_session() as sess:
@@ -408,14 +409,14 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAllClose([[0, 1]], sess.run(update_op), 5)
-      self.assertAllClose([[-2.1, 5.05]], sess.run(update_op), 5)
-      self.assertAllClose([[2.3 / 3., 10.1 / 3.]], sess.run(update_op), 5)
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(update_op), 5)
+      self.assertAllClose([[0, 1]], self.evaluate(update_op), 5)
+      self.assertAllClose([[-2.1, 5.05]], self.evaluate(update_op), 5)
+      self.assertAllClose([[2.3 / 3., 10.1 / 3.]], self.evaluate(update_op), 5)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(update_op), 5)
 
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean), 5)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(mean), 5)
 
   def testBinaryWeighted1d(self):
     with self.cached_session() as sess:
@@ -439,10 +440,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[3.25, 0.5]], self.evaluate(mean), 5)
 
   def testWeighted1d(self):
     with self.cached_session() as sess:
@@ -466,10 +467,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[0.8, 3.52]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[0.8, 3.52]], self.evaluate(mean), 5)
 
   def testWeighted2d_1(self):
     with self.cached_session() as sess:
@@ -493,10 +494,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[-2.1, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[-2.1, 0.5]], self.evaluate(mean), 5)
 
   def testWeighted2d_2(self):
     with self.cached_session() as sess:
@@ -520,10 +521,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[0, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[0, 0.5]], self.evaluate(mean), 5)
 
 
 class AccuracyTest(test.TestCase):
@@ -576,11 +577,11 @@ class AccuracyTest(test.TestCase):
     accuracy, update_op = metrics.accuracy(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_accuracy = accuracy.eval()
@@ -609,10 +610,10 @@ class AccuracyTest(test.TestCase):
 
       accuracy, update_op = metrics.accuracy(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in xrange(3):
-        sess.run(update_op)
-      self.assertEqual(0.5, sess.run(update_op))
+        self.evaluate(update_op)
+      self.assertEqual(0.5, self.evaluate(update_op))
       self.assertEqual(0.5, accuracy.eval())
 
   def testEffectivelyEquivalentSizes(self):
@@ -621,7 +622,7 @@ class AccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval())
       self.assertEqual(1.0, accuracy.eval())
 
@@ -631,7 +632,7 @@ class AccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions, weights=2.0)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval())
       self.assertEqual(1.0, accuracy.eval())
 
@@ -645,7 +646,7 @@ class AccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # if streaming_accuracy does not flatten the weight, accuracy would be
       # 0.33333334 due to an intended broadcast of weight. Due to flattening,
       # it will be higher than .95
@@ -666,7 +667,7 @@ class AccuracyTest(test.TestCase):
       accuracy, update_op = metrics.accuracy(labels, predictions,
                                              weights_placeholder)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # if streaming_accuracy does not flatten the weight, accuracy would be
       # 0.33333334 due to an intended broadcast of weight. Due to flattening,
       # it will be higher than .95
@@ -704,10 +705,10 @@ class AccuracyTest(test.TestCase):
 
       accuracy, update_op = metrics.accuracy(labels, predictions, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in xrange(3):
-        sess.run(update_op)
-      self.assertEqual(1.0, sess.run(update_op))
+        self.evaluate(update_op)
+      self.assertEqual(1.0, self.evaluate(update_op))
       self.assertEqual(1.0, accuracy.eval())
 
 
@@ -747,11 +748,11 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_precision = precision.eval()
@@ -766,8 +767,8 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op))
       self.assertAlmostEqual(1, precision.eval())
 
   def testSomeCorrect_multipleInputDtypes(self):
@@ -779,7 +780,7 @@ class PrecisionTest(test.TestCase):
       precision, update_op = metrics.precision(labels, predictions)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         self.assertAlmostEqual(0.5, update_op.eval())
         self.assertAlmostEqual(0.5, precision.eval())
 
@@ -882,8 +883,8 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertAlmostEqual(0, precision.eval())
 
   def testZeroTrueAndFalsePositivesGivesZeroPrecision(self):
@@ -892,8 +893,8 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0.0, precision.eval())
 
 
@@ -934,11 +935,11 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_recall = recall.eval()
@@ -953,8 +954,8 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(1, recall.eval())
 
   def testSomeCorrect_multipleInputDtypes(self):
@@ -966,7 +967,7 @@ class RecallTest(test.TestCase):
       recall, update_op = metrics.recall(labels, predictions)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         self.assertAlmostEqual(0.5, update_op.eval())
         self.assertAlmostEqual(0.5, recall.eval())
 
@@ -977,7 +978,7 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       weighted_tp = 2.0 + 5.0
       weighted_t = (2.0 + 2.0) + (5.0 + 5.0)
       expected_precision = weighted_tp / weighted_t
@@ -991,7 +992,7 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       weighted_tp = 3.0 + 1.0
       weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
       expected_precision = weighted_tp / weighted_t
@@ -1006,8 +1007,8 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0, recall.eval())
 
   def testZeroTruePositivesAndFalseNegativesGivesZeroRecall(self):
@@ -1016,8 +1017,8 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0, recall.eval())
 
 
@@ -1056,11 +1057,11 @@ class AUCTest(test.TestCase):
     auc, update_op = metrics.auc(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_auc = auc.eval()
@@ -1078,8 +1079,8 @@ class AUCTest(test.TestCase):
       labels = constant_op.constant(inputs)
       auc, update_op = metrics.auc(labels, predictions, curve=curve)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
 
       self.assertEqual(1, auc.eval())
 
@@ -1093,8 +1094,8 @@ class AUCTest(test.TestCase):
             constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=label_dtype)
         auc, update_op = metrics.auc(labels, predictions)
 
-        sess.run(variables.local_variables_initializer())
-        self.assertAlmostEqual(0.5, sess.run(update_op))
+        self.evaluate(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.5, self.evaluate(update_op))
 
         self.assertAlmostEqual(0.5, auc.eval())
 
@@ -1106,8 +1107,8 @@ class AUCTest(test.TestCase):
       weights = constant_op.constant([2], shape=(1, 1))
       auc, update_op = metrics.auc(labels, predictions, weights=weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.5, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(0.5, auc.eval(), 5)
 
@@ -1119,8 +1120,8 @@ class AUCTest(test.TestCase):
       weights = constant_op.constant([1, 2, 3, 4], shape=(1, 4))
       auc, update_op = metrics.auc(labels, predictions, weights=weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.7, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.7, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
@@ -1134,10 +1135,10 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.79726744594
       expected = 1 - math.log(1.5) / 2
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
   def testCorrectAnotherAUCPRSpecialCase(self):
@@ -1150,10 +1151,10 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.61350593198
       expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
   def testThirdCorrectAUCPRSpecialCase(self):
@@ -1166,10 +1167,10 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.90410597584
       expected = 1 - math.log(4./3) / 3
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
   def testIncorrectAUCPRSpecialCase(self):
@@ -1180,8 +1181,8 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.79166, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
@@ -1195,8 +1196,8 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.610317, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
@@ -1210,8 +1211,8 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.90277, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
 
@@ -1223,8 +1224,8 @@ class AUCTest(test.TestCase):
       labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
       auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0, self.evaluate(update_op))
 
       self.assertAlmostEqual(0, auc.eval())
 
@@ -1234,8 +1235,8 @@ class AUCTest(test.TestCase):
       labels = array_ops.zeros([4])
       auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 6)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 6)
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
@@ -1245,8 +1246,8 @@ class AUCTest(test.TestCase):
       labels = array_ops.ones([4])
       auc, update_op = metrics.auc(labels, predictions, curve='PR')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 6)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 6)
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
@@ -1317,9 +1318,9 @@ class AUCTest(test.TestCase):
                                      num_thresholds=500,
                                      weights=tf_weights)
 
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         for i in range(num_batches):
-          sess.run(update_op)
+          self.evaluate(update_op)
 
         # Since this is only approximate, we can't expect a 6 digits match.
         # Although with higher number of samples/thresholds we should see the
@@ -1371,11 +1372,11 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_specificity = specificity.eval()
@@ -1391,8 +1392,8 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
       self.assertEqual(1, specificity.eval())
 
   def testSomeCorrectHighSensitivity(self):
@@ -1406,8 +1407,8 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.8)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1.0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1.0, self.evaluate(update_op))
       self.assertAlmostEqual(1.0, specificity.eval())
 
   def testSomeCorrectLowSensitivity(self):
@@ -1421,9 +1422,9 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.assertAlmostEqual(0.6, self.evaluate(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
   def testWeighted1d_multipleLabelDtypes(self):
@@ -1440,9 +1441,9 @@ class SpecificityAtSensitivityTest(test.TestCase):
           labels, predictions, weights=weights, sensitivity=0.4)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
 
-        self.assertAlmostEqual(0.6, sess.run(update_op))
+        self.assertAlmostEqual(0.6, self.evaluate(update_op))
         self.assertAlmostEqual(0.6, specificity.eval())
 
   def testWeighted2d(self):
@@ -1458,9 +1459,9 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, weights=weights, sensitivity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(8.0 / 15.0, sess.run(update_op))
+      self.assertAlmostEqual(8.0 / 15.0, self.evaluate(update_op))
       self.assertAlmostEqual(8.0 / 15.0, specificity.eval())
 
 
@@ -1508,11 +1509,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_sensitivity = sensitivity.eval()
@@ -1528,8 +1529,8 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
       self.assertEqual(1, specificity.eval())
 
   def testSomeCorrectHighSpecificity(self):
@@ -1543,8 +1544,8 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.8)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, self.evaluate(update_op))
       self.assertAlmostEqual(0.8, specificity.eval())
 
   def testSomeCorrectLowSpecificity(self):
@@ -1558,8 +1559,8 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.6, self.evaluate(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
   def testWeighted_multipleLabelDtypes(self):
@@ -1577,8 +1578,8 @@ class SensitivityAtSpecificityTest(test.TestCase):
           labels, predictions, weights=weights, specificity=0.4)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
-        self.assertAlmostEqual(0.675, sess.run(update_op))
+        self.evaluate(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.675, self.evaluate(update_op))
         self.assertAlmostEqual(0.675, specificity.eval())
 
 
@@ -1639,7 +1640,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     rec, rec_op = metrics.recall_at_thresholds(labels, predictions, thresholds)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates, then verify idempotency.
       sess.run([prec_op, rec_op])
@@ -1663,7 +1664,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
 
       self.assertEqual(1, prec.eval())
@@ -1683,7 +1684,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                    thresholds)
 
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         sess.run([prec_op, rec_op])
 
         self.assertAlmostEqual(0.5, prec.eval())
@@ -1701,7 +1702,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
 
       self.assertAlmostEqual(0, prec.eval())
@@ -1729,7 +1730,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec_low = array_ops.reshape(rec_low, shape=())
       rec_high = array_ops.reshape(rec_high, shape=())
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
 
       self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
@@ -1759,7 +1760,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec_low = array_ops.reshape(rec_low, shape=())
       rec_high = array_ops.reshape(rec_high, shape=())
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
 
       self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
@@ -1783,7 +1784,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       [rec_low, rec_high] = array_ops.split(
           value=rec, num_or_size_splits=2, axis=0)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
 
       self.assertAlmostEqual(0.75, prec_low.eval())
@@ -1801,7 +1802,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
 
       self.assertAlmostEqual(0, prec.eval(), 6)
@@ -1869,7 +1870,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(tf_labels, tf_predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(int(num_samples / batch_size)):
         sess.run([prec_op, rec_op])
       # Since this is only approximate, we can't expect a 6 digits match.
@@ -2802,11 +2803,11 @@ class MeanAbsoluteErrorTest(test.TestCase):
     error, update_op = metrics.mean_absolute_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
@@ -2823,8 +2824,8 @@ class MeanAbsoluteErrorTest(test.TestCase):
     error, update_op = metrics.mean_absolute_error(labels, predictions, weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(3, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(3, self.evaluate(update_op))
       self.assertEqual(3, error.eval())
 
 
@@ -2867,11 +2868,11 @@ class MeanRelativeErrorTest(test.TestCase):
                                                    normalizer)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
@@ -2892,8 +2893,8 @@ class MeanRelativeErrorTest(test.TestCase):
         labels, predictions, normalizer=labels)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(expected_error, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(expected_error, self.evaluate(update_op))
       self.assertEqual(expected_error, error.eval())
 
   def testSingleUpdateNormalizedByZeros(self):
@@ -2908,8 +2909,8 @@ class MeanRelativeErrorTest(test.TestCase):
         labels, predictions, normalizer=array_ops.zeros_like(labels))
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0.0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0.0, self.evaluate(update_op))
       self.assertEqual(0.0, error.eval())
 
 
@@ -2946,11 +2947,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
@@ -2964,8 +2965,8 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
   def testSingleUpdateWithError(self):
@@ -2977,8 +2978,8 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(6, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(6, self.evaluate(update_op))
       self.assertEqual(6, error.eval())
 
   def testSingleUpdateWithErrorAndWeights(self):
@@ -2991,8 +2992,8 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions, weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(13, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(13, self.evaluate(update_op))
       self.assertEqual(13, error.eval())
 
   def testMultipleBatchesOfSizeOne(self):
@@ -3013,9 +3014,9 @@ class MeanSquaredErrorTest(test.TestCase):
 
       error, update_op = metrics.mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
-      self.assertAlmostEqual(208.0 / 6, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
+      self.assertAlmostEqual(208.0 / 6, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(208.0 / 6, error.eval(), 5)
 
@@ -3054,7 +3055,7 @@ class MeanSquaredErrorTest(test.TestCase):
       mse1, update_op1 = metrics.mean_squared_error(
           labels1, predictions1, name='msd1')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([update_op0, update_op1])
       sess.run([update_op0, update_op1])
 
@@ -3081,7 +3082,7 @@ class MeanSquaredErrorTest(test.TestCase):
       mae, ma_update_op = metrics.mean_absolute_error(labels, predictions)
       mse, ms_update_op = metrics.mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([ma_update_op, ms_update_op])
       sess.run([ma_update_op, ms_update_op])
 
@@ -3123,11 +3124,11 @@ class RootMeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.root_mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
@@ -3142,8 +3143,8 @@ class RootMeanSquaredErrorTest(test.TestCase):
 
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
 
       self.assertEqual(0, rmse.eval())
 
@@ -3156,7 +3157,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
 
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5)
       self.assertAlmostEqual(math.sqrt(6), rmse.eval(), 5)
 
@@ -3171,8 +3172,8 @@ class RootMeanSquaredErrorTest(test.TestCase):
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions,
                                                         weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(math.sqrt(13), sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(math.sqrt(13), self.evaluate(update_op))
 
       self.assertAlmostEqual(math.sqrt(13), rmse.eval(), 5)
 
@@ -3221,11 +3222,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=1)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
@@ -3243,8 +3244,8 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
   def testSingleUpdateWithError1(self):
@@ -3259,8 +3260,8 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 5)
       self.assertAlmostEqual(1, error.eval(), 5)
 
   def testSingleUpdateWithError2(self):
@@ -3280,8 +3281,8 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1.0, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1.0, self.evaluate(update_op), 5)
       self.assertAlmostEqual(1.0, error.eval(), 5)
 
   def testSingleUpdateWithErrorAndWeights1(self):
@@ -3299,8 +3300,8 @@ class MeanCosineDistanceTest(test.TestCase):
         labels, predictions, dim=2, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
   def testSingleUpdateWithErrorAndWeights2(self):
@@ -3318,7 +3319,7 @@ class MeanCosineDistanceTest(test.TestCase):
         labels, predictions, dim=2, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.5, update_op.eval())
       self.assertEqual(1.5, error.eval())
 
@@ -3360,7 +3361,7 @@ class PcntBelowThreshTest(test.TestCase):
       pcnt1, update_op1 = metrics.percentage_below(values, 7, name='medium')
       pcnt2, update_op2 = metrics.percentage_below(values, 1, name='low')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       sess.run([update_op0, update_op1, update_op2])
 
       pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2])
@@ -3382,7 +3383,7 @@ class PcntBelowThreshTest(test.TestCase):
       pcnt2, update_op2 = metrics.percentage_below(
           values, 1, weights=weights, name='low')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertListEqual([1.0, 0.5, 0.0],
                            sess.run([update_op0, update_op1, update_op2]))
 
@@ -3446,11 +3447,11 @@ class MeanIOUTest(test.TestCase):
         labels, predictions, num_classes=num_classes)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_mean_iou = mean_iou.eval()
@@ -3482,9 +3483,9 @@ class MeanIOUTest(test.TestCase):
 
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 2.0, 1.0 / 4.0, 0.])
       self.assertEqual(desired_output, miou.eval())
 
@@ -3529,7 +3530,7 @@ class MeanIOUTest(test.TestCase):
 
       variables.local_variables_initializer().run()
       for _ in range(6):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([2.0 / 3.0, 1.0 / 2.0])
       self.assertAlmostEqual(desired_output, mean_iou.eval())
 
@@ -3563,9 +3564,9 @@ class MeanIOUTest(test.TestCase):
 
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 3.0, 2.0 / 4.0])
       self.assertAlmostEqual(desired_output, miou.eval())
 
@@ -3587,7 +3588,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       confusion_matrix = update_op.eval()
       self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix)
       desired_miou = np.mean([3. / 5., 5. / 7.])
@@ -3599,7 +3600,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 1
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(40, update_op.eval()[0])
       self.assertEqual(1.0, miou.eval())
 
@@ -3609,7 +3610,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[0, 0], [40, 0]], update_op.eval())
       self.assertEqual(0., miou.eval())
 
@@ -3640,7 +3641,7 @@ class MeanIOUTest(test.TestCase):
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(
           labels, predictions, num_classes, weights=weights)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[2, 0], [2, 4]], update_op.eval())
       desired_miou = np.mean([2. / 4., 4. / 6.])
       self.assertAlmostEqual(desired_miou, miou.eval())
@@ -3659,7 +3660,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 3
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
           1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
@@ -3671,7 +3672,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[1, 0], [0, 0]], update_op.eval())
       self.assertAlmostEqual(1, miou.eval())
 
@@ -3689,7 +3690,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 3
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
           1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
@@ -3752,11 +3753,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
         labels, predictions, num_classes=num_classes)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_mean_accuracy = mean_accuracy.eval()
@@ -3788,9 +3789,9 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0, 1.0 / 3.0, 0.0])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
@@ -3835,7 +3836,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
 
       variables.local_variables_initializer().run()
       for _ in range(6):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([2.0 / 2.0, 0.5 / 1.5])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
@@ -3870,9 +3871,9 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 2.0, 2.0 / 3.0, 0.])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
@@ -3883,7 +3884,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval()[0])
       self.assertEqual(1.0, mean_accuracy.eval())
 
@@ -3894,7 +3895,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([0.0, 0.0], update_op.eval())
       self.assertEqual(0., mean_accuracy.eval())
 
@@ -3913,7 +3914,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes, weights=weights)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       desired_accuracy = np.array([2. / 2., 4. / 6.], dtype=np.float32)
       self.assertAllEqual(desired_accuracy, update_op.eval())
       desired_mean_accuracy = np.mean(desired_accuracy)
@@ -3945,7 +3946,7 @@ class FalseNegativesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(3., tn_update_op.eval())
       self.assertAllClose(3., tn.eval())
@@ -3964,7 +3965,7 @@ class FalseNegativesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(5., tn_update_op.eval())
       self.assertAllClose(5., tn.eval())
@@ -3994,7 +3995,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), fn.eval())
       self.assertAllEqual((0, 2, 3), fn_update_op.eval())
       self.assertAllEqual((0, 2, 3), fn.eval())
@@ -4013,7 +4014,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), fn.eval())
       self.assertAllEqual((0.0, 8.0, 11.0), fn_update_op.eval())
       self.assertAllEqual((0.0, 8.0, 11.0), fn.eval())
@@ -4044,7 +4045,7 @@ class FalsePositivesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(7., tn_update_op.eval())
       self.assertAllClose(7., tn.eval())
@@ -4063,7 +4064,7 @@ class FalsePositivesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(14., tn_update_op.eval())
       self.assertAllClose(14., tn.eval())
@@ -4093,7 +4094,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), fp.eval())
       self.assertAllEqual((7, 4, 2), fp_update_op.eval())
       self.assertAllEqual((7, 4, 2), fp.eval())
@@ -4114,7 +4115,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), fp.eval())
       self.assertAllEqual((125.0, 42.0, 12.0), fp_update_op.eval())
       self.assertAllEqual((125.0, 42.0, 12.0), fp.eval())
@@ -4145,7 +4146,7 @@ class TrueNegativesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(3., tn_update_op.eval())
       self.assertAllClose(3., tn.eval())
@@ -4164,7 +4165,7 @@ class TrueNegativesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(4., tn_update_op.eval())
       self.assertAllClose(4., tn.eval())
@@ -4194,7 +4195,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), tn.eval())
       self.assertAllEqual((2, 5, 7), tn_update_op.eval())
       self.assertAllEqual((2, 5, 7), tn.eval())
@@ -4213,7 +4214,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), tn.eval())
       self.assertAllEqual((5.0, 15.0, 23.0), tn_update_op.eval())
       self.assertAllEqual((5.0, 15.0, 23.0), tn.eval())
@@ -4244,7 +4245,7 @@ class TruePositivesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(7., tn_update_op.eval())
       self.assertAllClose(7., tn.eval())
@@ -4263,7 +4264,7 @@ class TruePositivesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(12., tn_update_op.eval())
       self.assertAllClose(12., tn.eval())
@@ -4293,7 +4294,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), tp.eval())
       self.assertAllEqual((3, 1, 0), tp_update_op.eval())
       self.assertAllEqual((3, 1, 0), tp.eval())
@@ -4310,7 +4311,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), tp.eval())
       self.assertAllEqual((111.0, 37.0, 0.0), tp_update_op.eval())
       self.assertAllEqual((111.0, 37.0, 0.0), tp.eval())
diff --git a/tensorflow/python/kernel_tests/morphological_ops_test.py b/tensorflow/python/kernel_tests/morphological_ops_test.py
index 6d601554b80408ff6f419b164cd12bcb493a2f61..4ee04209ccb42f81127e29b0b6e99ae61b26a949 100644
--- a/tensorflow/python/kernel_tests/morphological_ops_test.py
+++ b/tensorflow/python/kernel_tests/morphological_ops_test.py
@@ -52,7 +52,7 @@ class DilationTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="dilation2d")
-      self.assertAllClose(out, out_tensor.eval())
+      self.assertAllClose(out, self.evaluate(out_tensor))
 
   def _testDilationValidPadding(self, use_gpu):
     # [1, 2, 2, 1]
@@ -216,7 +216,7 @@ class DilationTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="dilation2d")
-      out_shape = out_tensor.eval().shape
+      out_shape = self.evaluate(out_tensor).shape
 
       # Small delta is necessary for argmax to remain the same.
       err = gradient_checker.compute_gradient_error(
@@ -327,7 +327,7 @@ class ErosionTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="erosion2d")
-      self.assertAllClose(out, out_tensor.eval())
+      self.assertAllClose(out, self.evaluate(out_tensor))
 
   def _testErosionValidPadding(self, use_gpu):
     # [1, 2, 2, 1]
@@ -491,7 +491,7 @@ class ErosionTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="erosion2d")
-      out_shape = out_tensor.eval().shape
+      out_shape = self.evaluate(out_tensor).shape
 
       # Small delta is necessary for argmax to remain the same.
       err = gradient_checker.compute_gradient_error(
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index 15e38265421a30277fa46d362afe57249a11a4e7..87f1991aa78e2906d9787067ec57f6f7682c2159 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -142,8 +142,8 @@ class DepthwiseConv2DTest(test.TestCase):
       conv_interface = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
 
-      native_result = sess.run(conv_native)
-      interface_result = sess.run(conv_interface)
+      native_result = self.evaluate(conv_native)
+      interface_result = self.evaluate(conv_interface)
 
     print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
           ", stride:", stride, ", padding: ", padding, ", max diff: ",
@@ -211,7 +211,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=filter_in_sizes)
         conv = nn_ops.depthwise_conv2d_native(
             t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        value = sess.run(conv)
+        value = self.evaluate(conv)
     print("value = ", value)
     self.assertAllClose(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index e202b6e8a43b27cc6896a3a5d7e6d2f47b3bed5b..5ff0c58bf1bee6909d68420f89bcecf5afa490e6 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -70,7 +70,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
         tf_matrix = constant_op.constant(matrix)
         tf_norm = linalg_ops.norm(
             tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
-        tf_norm_val = sess.run(tf_norm)
+        tf_norm_val = self.evaluate(tf_norm)
       else:
         tf_matrix = array_ops.placeholder(dtype_)
         tf_norm = linalg_ops.norm(
diff --git a/tensorflow/python/kernel_tests/nth_element_op_test.py b/tensorflow/python/kernel_tests/nth_element_op_test.py
index 338b6cec0102c7149c0af4f8295c8c7263a5f2f6..6cd497467106997a1e68f2f46cf02884699981b4 100644
--- a/tensorflow/python/kernel_tests/nth_element_op_test.py
+++ b/tensorflow/python/kernel_tests/nth_element_op_test.py
@@ -35,7 +35,7 @@ class NthElementTest(test.TestCase):
     with self.cached_session(use_gpu=False) as sess:
       inputs_op = ops.convert_to_tensor(inputs, dtype=dtype)
       values_op = nn_ops.nth_element(inputs_op, n, reverse=reverse)
-      values = sess.run(values_op)
+      values = self.evaluate(values_op)
 
       self.assertShapeEqual(np_expected_values, values_op)
       self.assertAllClose(np_expected_values, values)
diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py
index 5db591ed304a60011a24347e12da45b523c6c305..e3210dcddc40048cd27413f7f12b2a8c568b628b 100644
--- a/tensorflow/python/kernel_tests/numerics_test.py
+++ b/tensorflow/python/kernel_tests/numerics_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -35,11 +36,11 @@ class VerifyTensorAllFiniteTest(test.TestCase):
   def testVerifyTensorAllFiniteSucceeds(self):
     x_shape = [5, 4]
     x = np.random.random_sample(x_shape).astype(np.float32)
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
       t_verified = numerics.verify_tensor_all_finite(t,
                                                      "Input is not a number.")
-      self.assertAllClose(x, t_verified.eval())
+      self.assertAllClose(x, self.evaluate(t_verified))
 
   def testVerifyTensorAllFiniteFails(self):
     x_shape = [5, 4]
@@ -48,19 +49,19 @@ class VerifyTensorAllFiniteTest(test.TestCase):
 
     # Test NaN.
     x[0] = np.nan
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       with self.assertRaisesOpError(my_msg):
         t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
         t_verified = numerics.verify_tensor_all_finite(t, my_msg)
-        t_verified.eval()
+        self.evaluate(t_verified)
 
     # Test Inf.
     x[0] = np.inf
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       with self.assertRaisesOpError(my_msg):
         t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
         t_verified = numerics.verify_tensor_all_finite(t, my_msg)
-        t_verified.eval()
+        self.evaluate(t_verified)
 
 
 class NumericsTest(test.TestCase):
@@ -73,7 +74,7 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("Inf"):
-        a.eval()
+        self.evaluate(a)
 
   def testNaN(self):
     with self.session(graph=ops.Graph()):
@@ -83,7 +84,7 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("NaN"):
-        a.eval()
+        self.evaluate(a)
 
   def testBoth(self):
     with self.session(graph=ops.Graph()):
@@ -93,13 +94,13 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("Inf and NaN"):
-        a.eval()
+        self.evaluate(a)
 
   def testPassThrough(self):
     with self.session(graph=ops.Graph()):
       t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       checked = array_ops.check_numerics(t1, message="pass through test")
-      value = checked.eval()
+      value = self.evaluate(checked)
       self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value)
       self.assertEqual([2, 3], checked.get_shape())
 
diff --git a/tensorflow/python/kernel_tests/one_hot_op_test.py b/tensorflow/python/kernel_tests/one_hot_op_test.py
index 377d545c9cdd30fefd1d66d2138716bbb0b153f4..856ba7bb7f3c5fb340a80c88b7c4ff2c33277568 100644
--- a/tensorflow/python/kernel_tests/one_hot_op_test.py
+++ b/tensorflow/python/kernel_tests/one_hot_op_test.py
@@ -41,12 +41,12 @@ class OneHotTest(test.TestCase):
       else:
         ans = array_ops.one_hot(**inputs)
         if expected_err_re is None:
-          tf_ans = ans.eval()
+          tf_ans = self.evaluate(ans)
           self.assertAllEqual(tf_ans, truth)
           self.assertEqual(tf_ans.shape, ans.get_shape())
         else:
           with self.assertRaisesOpError(expected_err_re):
-            ans.eval()
+            self.evaluate(ans)
 
   def _testBothOneHot(self, truth, expected_err_re=None, raises=None, **inputs):
     self._testOneHot(truth, True, expected_err_re, raises, **inputs)
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index fc302c4141af776c25d1d4883765d4bc4989e482..6fe98d2559a544f6dd73589dfe049e4756991e1b 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -88,7 +88,7 @@ class PadOpTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       tf_val = array_ops.pad(np_inputs, paddings, mode=mode,
                              constant_values=constant_values)
-      out = tf_val.eval()
+      out = self.evaluate(tf_val)
     self.assertAllEqual(np_val, out)
     self.assertShapeEqual(np_val, tf_val)
 
@@ -208,7 +208,7 @@ class PadOpTest(test.TestCase):
                                  constant_op.constant(paddings, padding_dtype),
                                  mode=mode,
                                  constant_values=0)
-          out = tf_val.eval()
+          out = self.evaluate(tf_val)
         self.assertAllEqual(np_val, out)
         self.assertShapeEqual(np_val, tf_val)
 
@@ -250,16 +250,16 @@ class PadOpTest(test.TestCase):
     symmetric = array_ops.pad(x, [[1, 0], [0, 1]], mode="SYMMETRIC",
                               constant_values="PAD")
     with self.session(use_gpu=True):
-      self.assertAllEqual([[b"PAD", b"PAD", b"PAD"],
-                           [b"Hello", b"World", b"PAD"],
-                           [b"Goodnight", b"Moon", b"PAD"]], constant.eval())
+      self.assertAllEqual(
+          [[b"PAD", b"PAD", b"PAD"], [b"Hello", b"World", b"PAD"],
+           [b"Goodnight", b"Moon", b"PAD"]], self.evaluate(constant))
       self.assertAllEqual([[b"Goodnight", b"Moon", b"Goodnight"],
                            [b"Hello", b"World", b"Hello"],
                            [b"Goodnight", b"Moon", b"Goodnight"]],
-                          reflect.eval())
-      self.assertAllEqual([[b"Hello", b"World", b"World"],
-                           [b"Hello", b"World", b"World"],
-                           [b"Goodnight", b"Moon", b"Moon"]], symmetric.eval())
+                          self.evaluate(reflect))
+      self.assertAllEqual(
+          [[b"Hello", b"World", b"World"], [b"Hello", b"World", b"World"],
+           [b"Goodnight", b"Moon", b"Moon"]], self.evaluate(symmetric))
 
   def testShapeFunctionEdgeCases(self):
     # Unknown paddings shape.
@@ -327,7 +327,7 @@ class PadOpTest(test.TestCase):
     inp = np.asarray(7)
     with self.session(use_gpu=True):
       tf_val = array_ops.pad(inp, paddings)
-      out = tf_val.eval()
+      out = self.evaluate(tf_val)
     self.assertAllEqual(inp, out)
     self.assertShapeEqual(inp, tf_val)
 
@@ -337,7 +337,7 @@ class PadOpTest(test.TestCase):
       inp = np.asarray(7)
       with self.cached_session(use_gpu=True):
         tf_val = array_ops.pad(inp, constant_op.constant(paddings, dtype=dtype))
-        out = tf_val.eval()
+        out = self.evaluate(tf_val)
       self.assertAllEqual(inp, out)
       self.assertShapeEqual(inp, tf_val)
 
@@ -361,11 +361,12 @@ class PadOpTest(test.TestCase):
             [paddings_value[i][0] + inp.shape.dims[i].value for i in range(4)],
             [-1, -1, -1, -1])
         with self.cached_session(use_gpu=True):
-          self.assertAllEqual(inp.eval(), middle.eval())
+          self.assertAllEqual(inp.eval(), self.evaluate(middle))
           self.assertAllEqual(
-              np.zeros([row[0] for row in paddings_value]), left.eval())
+              np.zeros([row[0] for row in paddings_value]), self.evaluate(left))
           self.assertAllEqual(
-              np.zeros([row[1] for row in paddings_value]), right.eval())
+              np.zeros([row[1] for row in paddings_value]),
+              self.evaluate(right))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
index 95f3dcceeaa14909b706b1f1c0676c5df28b8427..3696298132ac0beb63bba7ebf9635bfee624d3e9 100644
--- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
@@ -126,7 +126,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -178,7 +178,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -193,7 +193,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
@@ -224,7 +224,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        x_val, y_val = sess.run(dequeued_t)
+        x_val, y_val = self.evaluate(dequeued_t)
         x, y = elems[i]
         self.assertEqual([x], x_val)
         self.assertEqual([y], y_val)
@@ -243,9 +243,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -257,7 +257,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i % 4]], vals)
 
   def testEmptyEnqueueMany(self):
@@ -269,9 +269,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -279,9 +279,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithDynamicShape(self):
     with self.cached_session():
@@ -290,9 +290,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue(([10.0],))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpToWithDynamicShape(self):
     with self.cached_session():
@@ -301,9 +301,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue(([10.0],))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testConstructPaddingFIFOQueueWithNoShape(self):
     with self.cached_session():
@@ -327,7 +327,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -344,7 +344,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -357,8 +357,8 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testDequeueUpToNoBlocking(self):
     with self.cached_session():
@@ -369,8 +369,8 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testMultiDequeueMany(self):
     with self.cached_session() as sess:
@@ -387,17 +387,17 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
@@ -418,7 +418,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertTrue(
@@ -428,11 +428,11 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertTrue(
@@ -459,7 +459,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       for enqueue_op in enqueue_ops:
         enqueue_op.run()
-      string_val, int_val = sess.run(dequeued_t)
+      string_val, int_val = self.evaluate(dequeued_t)
 
       self.assertAllEqual([[b"a", b"", b""], [b"ab", b"", b""],
                            [b"abc", b"", b""], [b"abc", b"d", b""],
@@ -473,7 +473,7 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      string_val, int_val = sess.run(dequeued_single_t)
+      string_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual([b"abc", b"d", b"e", b"f"], string_val)
       self.assertAllEqual([[1, 2, 3, 4]], int_val)
       self.assertTrue(
@@ -500,7 +500,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       for enqueue_op in enqueue_ops:
         enqueue_op.run()
-      string_val, int_val = sess.run(dequeued_t)
+      string_val, int_val = self.evaluate(dequeued_t)
 
       self.assertAllEqual([[b"a", b"", b""], [b"ab", b"", b""],
                            [b"abc", b"", b""], [b"abc", b"d", b""],
@@ -514,7 +514,7 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      string_val, int_val = sess.run(dequeued_single_t)
+      string_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual([b"abc", b"d", b"e", b"f"], string_val)
       self.assertAllEqual([[1, 2, 3, 4]], int_val)
       self.assertTrue(
@@ -622,7 +622,7 @@ class PaddingFIFOQueueTest(test.TestCase):
                                    r"Expected \[2,\?,3\], got \[2,3,4\]"):
         sess.run([enqueue_op],
                  feed_dict={elems_bad: np.array([1] * 24).reshape((2, 3, 4))})
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testParallelEnqueueMany(self):
     with self.cached_session() as sess:
@@ -633,7 +633,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -700,11 +700,11 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def enqueue():
         for _ in xrange(100):
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       def dequeue():
         for _ in xrange(100):
-          self.assertTrue(sess.run(dequeued_t) in (10.0, 20.0))
+          self.assertTrue(self.evaluate(dequeued_t) in (10.0, 20.0))
 
       enqueue_threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       dequeue_threads = [self.checkedThread(target=dequeue) for _ in range(10)]
@@ -736,7 +736,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         for i in xrange(250):
-          self.assertEqual(i, sess.run(dequeued_t))
+          self.assertEqual(i, self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -767,7 +767,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeuemany_t = q.dequeue_many(count_placeholder)
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -776,7 +776,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       while elements_dequeued < 250:
         # With equal probability, run Dequeue or dequeue_many.
         if random.random() > 0.5:
-          self.assertEqual(elements_dequeued, dequeued_t.eval())
+          self.assertEqual(elements_dequeued, self.evaluate(dequeued_t))
           elements_dequeued += 1
         else:
           count = random.randint(0, min(20, 250 - elements_dequeued))
@@ -805,7 +805,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
         dequeued_elems.extend(sess.run(dequeued_t).tolist())
@@ -832,7 +832,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
         dequeued_elems.extend(sess.run(dequeued_t).tolist())
@@ -882,12 +882,12 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
       close_op.run()
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
 
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -901,7 +901,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         for elem in elems:
-          self.assertEqual([elem], sess.run(dequeued_t))
+          self.assertEqual([elem], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
@@ -926,8 +926,8 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
-        self.assertAllEqual(elems[3:], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
+        self.assertAllEqual(elems[3:], self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -968,7 +968,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems, sess.run(dequeued_t))
+        self.assertAllEqual(elems, self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
@@ -993,7 +993,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
@@ -1017,16 +1017,16 @@ class PaddingFIFOQueueTest(test.TestCase):
       cleanup_dequeue_t = q.dequeue()
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        self.assertAllEqual(elems[0:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[0:3], self.evaluate(dequeued_t))
         with self.assertRaises(errors_impl.OutOfRangeError):
           sess.run(dequeued_t)
-        self.assertEqual(elems[3], sess.run(cleanup_dequeue_t))
+        self.assertEqual(elems[3], self.evaluate(cleanup_dequeue_t))
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -1155,7 +1155,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1163,8 +1163,8 @@ class PaddingFIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
-      self.assertEqual([50.0], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
@@ -1178,7 +1178,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1186,10 +1186,10 @@ class PaddingFIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
         time.sleep(0.01)
-      self.assertEqual([50.0], dequeued_t.eval())
-      self.assertEqual([60.0], dequeued_t.eval())
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
+      self.assertEqual([60.0], self.evaluate(dequeued_t))
 
       # Make sure the thread finishes before exiting.
       thread.join()
@@ -1207,7 +1207,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # Expect the operation to succeed once the dequeue op runs.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1217,18 +1217,18 @@ class PaddingFIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
 
       for elem in [20.0, 30.0, 40.0, 50.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
       self.assertEqual(0, q.size().eval())
 
   def testBlockingEnqueueManyBeforeClose(self):
@@ -1242,7 +1242,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1252,17 +1252,17 @@ class PaddingFIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
       for elem in [20.0, 30.0, 50.0, 60.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
 
   def testDoesNotLoseValue(self):
     with self.cached_session():
@@ -1379,19 +1379,19 @@ class PaddingFIFOQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1444,14 +1444,14 @@ class PaddingFIFOQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1517,7 +1517,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       q.enqueue_many(input_tuple).run()
 
       output_tuple_t = q.dequeue_many(32)
-      output_tuple = sess.run(output_tuple_t)
+      output_tuple = self.evaluate(output_tuple_t)
 
       for (input_elem, output_elem) in zip(input_tuple, output_tuple):
         self.assertAllEqual(input_elem, output_elem)
diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
index a84895a287eeb0d67cce563254e2383e390c9e2c..3f500872827dbb187af674409a2dfc408356e6fe 100644
--- a/tensorflow/python/kernel_tests/parse_single_example_op_test.py
+++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
@@ -107,7 +107,7 @@ class ParseExampleTest(test.TestCase):
         for result_dict in [out, out_with_example_name]:
           result = flatten_values_tensors_or_sparse(result_dict.values())
           # Check values.
-          tf_result = sess.run(result)
+          tf_result = self.evaluate(result)
           _compare_output_to_expected(self, result_dict, expected_values,
                                       tf_result)
 
diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index 71d8b60d3ccf9fafaa16fa705c3261e008d8409c..d87adbfc2e514663d528a2c4ef09e7f5ff389d13 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -101,15 +101,15 @@ class ParseExampleTest(test.TestCase):
         out = parsing_ops.parse_example(**kwargs)
         result = flatten_values_tensors_or_sparse(out.values())
         # Check values.
-        tf_result = sess.run(result)
+        tf_result = self.evaluate(result)
         _compare_output_to_expected(self, out, expected_values, tf_result)
 
       # Check shapes; if serialized is a Tensor we need its size to
       # properly check.
       serialized = kwargs["serialized"]
       batch_size = (
-          serialized.eval().size if isinstance(serialized, ops.Tensor) else
-          np.asarray(serialized).size)
+          self.evaluate(serialized).size if isinstance(serialized, ops.Tensor)
+          else np.asarray(serialized).size)
       for k, f in kwargs["features"].items():
         if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
           self.assertEqual(
@@ -1614,7 +1614,7 @@ class DecodeJSONExampleTest(test.TestCase):
           shape=examples.shape,
           dtype=dtypes.string)
       binary_tensor = parsing_ops.decode_json_example(json_tensor)
-      binary_val = sess.run(binary_tensor)
+      binary_val = self.evaluate(binary_tensor)
 
       if examples.shape:
         self.assertShapeEqual(binary_val, json_tensor)
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index d1f0c6c2a056dc85e8ac038ddb0cf14ef00ccf0d..0c0465619694996d9f05c96266f45d8ce8f55a9f 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -328,7 +328,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       vs = partitioned_variables.create_partitioned_variables([4], [4], rnd_par)
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd_par.eval()
+      rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 4, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["4 0,1", "4 1,1", "4 2,1", "4 3,1"])
@@ -340,7 +340,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                               rnd_par)
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 1).eval()
-      rnd = rnd_par.eval()
+      rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 2, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["2 4 0,2:0,2", "2 4 0,2:2,2"])
@@ -414,7 +414,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [1, 10], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 1).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.float32] * 10, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, [
@@ -434,7 +434,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           for i in xrange(1, 10)
       ]
       variables.global_variables_initializer().run()
-      rnd_val = rnd.eval()
+      rnd_val = self.evaluate(rnd)
       # Only check the slice save specs for the first 5 tf.
       save_specs = [
           # One slice
@@ -469,7 +469,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [1, 1], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, ["10 43 0,10:0,43"])
 
@@ -480,7 +480,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [10, 1], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, [
           "10 43 0,1:0,43", "10 43 1,1:0,43", "10 43 2,1:0,43",
@@ -510,7 +510,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer())
       variables.global_variables_initializer().run()
-      val0, val1 = var0.eval().flatten(), var1.eval().flatten()
+      val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)
     # Negative test that proves that slices have the same values if
     # the random initializer uses a seed.
@@ -518,7 +518,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer(seed=201))
       variables.global_variables_initializer().run()
-      val0, val1 = var0.eval().flatten(), var1.eval().flatten()
+      val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertAllClose(val0, val1)
 
   def testSomeErrors(self):
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 372861297fb9243254fcba6f7064ce5cc63a6086..92016a49a278a8e79c10711fae1ae3a6e8a1eef0 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -151,7 +151,7 @@ class PoolingTest(test.TestCase):
         np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1
     y1 = pool_direct(input=x, **kwargs)
     y2 = nn_ops.pool(input=x, **kwargs)
-    self.assertAllClose(y1, y2.eval(), rtol=1e-2, atol=1e-2)
+    self.assertAllClose(y1, self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
   def testPoolSimple(self):
     with self.session(use_gpu=test.is_gpu_available()):
diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
index e393c7a0229a907fbccc09b77a788e96069cf825..a8e962bc3a6119adc29e24ef6b87353d535f1b30 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
@@ -81,7 +81,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format)
       if data_format == "NCDHW":
         t = test_util.NCHWToNHWC(t)
-      vals = sess.run(t)
+      vals = self.evaluate(t)
     # Verifies values.
     actual = vals.flatten()
     self.assertAllClose(expected, actual)
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 53003a7f284d684bf51f3757043ff330c3066eb8..61628c4756fedf0364470cc54aa299419adbe292 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -166,7 +166,7 @@ class PoolingTest(test.TestCase):
             strides_placeholder: strides
         })
       else:
-        actual = t.eval()
+        actual = self.evaluate(t)
         self.assertShapeEqual(actual, t)
       self.assertAllCloseAccordingToType(expected, actual.flatten())
 
@@ -750,11 +750,11 @@ class PoolingTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op, _ = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
       with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
       self.assertAllCloseAccordingToType(cpu_val, gpu_val)
 
   def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
@@ -767,20 +767,20 @@ class PoolingTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        argmax = argmax_op.eval()
+        argmax = self.evaluate(argmax_op)
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
         out_op = gen_nn_ops.max_pool_grad_with_argmax(t, grad_in, argmax, ksize,
                                                       strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
         self.assertShapeEqual(gpu_val, out_op)
       with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        orig_out = out_op.eval()
+        orig_out = self.evaluate(out_op)
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
         out_op = gen_nn_ops.max_pool_grad(t, orig_out, grad_in, ksize, strides,
                                           padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
@@ -796,20 +796,20 @@ class PoolingTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        argmax = argmax_op.eval()
+        argmax = self.evaluate(argmax_op)
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
         out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
             t, grad_in, argmax, ksize, strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
         self.assertShapeEqual(gpu_val, out_op)
       with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        orig_out = out_op.eval()
+        orig_out = self.evaluate(out_op)
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
         out_op = gen_nn_ops.max_pool_grad_grad(t, orig_out, grad_in, ksize,
                                                strides, padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
@@ -848,7 +848,7 @@ class PoolingTest(test.TestCase):
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 1, 1],
           padding="VALID")
-      out = out_op.eval().flatten()
+      out = self.evaluate(out_op).flatten()
       self.assertAllClose(out,
                           [11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0])
 
@@ -871,7 +871,7 @@ class PoolingTest(test.TestCase):
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 1, 1],
           padding="VALID")
-      out = out_op.eval().flatten()
+      out = self.evaluate(out_op).flatten()
       self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0])
 
   def _ConstructAndTestGradient(self,
@@ -1221,12 +1221,12 @@ class PoolingTest(test.TestCase):
           input_tensor, output_tensor, output_backprop_tensor, window_rows,
           window_cols, row_stride, col_stride, padding, v2)
 
-      actual_input_backprop = input_backprop_tensor.eval()
+      actual_input_backprop = self.evaluate(input_backprop_tensor)
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
       actual_input_backprop = actual_input_backprop.flatten()
       actual_input_backprop = self._GetNdArray(actual_input_backprop)
 
-      actual_output = output_tensor.eval().flatten()
+      actual_output = self.evaluate(output_tensor).flatten()
       actual_output = self._GetNdArray(actual_output)
 
       self.assertAllClose(
diff --git a/tensorflow/python/kernel_tests/priority_queue_test.py b/tensorflow/python/kernel_tests/priority_queue_test.py
index 73a9c81638259486f28a37755db86e4fe055f738..a510fccaaa5b2301106d5682a347e6ce23e11c3f 100644
--- a/tensorflow/python/kernel_tests/priority_queue_test.py
+++ b/tensorflow/python/kernel_tests/priority_queue_test.py
@@ -50,7 +50,7 @@ class PriorityQueueTest(test.TestCase):
         enq.run()
 
       deq = q.dequeue_many(100)
-      deq_elem, deq_value_0, deq_value_1 = sess.run(deq)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(deq)
 
       allowed = {}
       missed = set()
@@ -81,7 +81,7 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeue_op = q.dequeue_many(100)
 
@@ -93,7 +93,7 @@ class PriorityQueueTest(test.TestCase):
       for t in enqueue_threads:
         t.start()
 
-      deq_elem, deq_value_0, deq_value_1 = sess.run(dequeue_op)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(dequeue_op)
 
       for t in enqueue_threads:
         t.join()
@@ -132,12 +132,12 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeued = []
 
       def dequeue(dequeue_op):
-        (dequeue_indices, dequeue_values) = sess.run(dequeue_op)
+        (dequeue_indices, dequeue_values) = self.evaluate(dequeue_op)
         self.assertAllEqual(dequeue_indices, dequeue_values)
         dequeued.extend(dequeue_indices)
 
@@ -184,10 +184,10 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue(dequeue_op, dequeued):
-        (dequeue_indices, dequeue_values) = sess.run(dequeue_op)
+        (dequeue_indices, dequeue_values) = self.evaluate(dequeue_op)
         self.assertAllEqual(dequeue_indices, dequeue_values)
         dequeue_wait.acquire()
         dequeued.extend(dequeue_indices)
@@ -236,7 +236,7 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeue_op = q.dequeue_many(100)
 
@@ -248,7 +248,7 @@ class PriorityQueueTest(test.TestCase):
       for t in enqueue_threads:
         t.start()
 
-      deq_elem, deq_value_0, deq_value_1 = sess.run(dequeue_op)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(dequeue_op)
 
       for t in enqueue_threads:
         t.join()
@@ -276,7 +276,7 @@ class PriorityQueueTest(test.TestCase):
       side_value_1 = np.random.rand(1000).astype(bytes)
       q.enqueue_many((elem, side_value_0, side_value_1)).run()
       deq = q.dequeue_many(1000)
-      deq_elem, deq_value_0, deq_value_1 = sess.run(deq)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(deq)
 
       allowed = {}
       for e, v0, v1 in zip(elem, side_value_0, side_value_1):
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 837f1ec054ff4980cf4868c26dbdbe43cc1d1726..c9cbe44a7f3b1b5f103a11999acb2ba76f5ba99a 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -272,7 +272,7 @@ class PyFuncTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported numpy type"):
-        y.eval()
+        self.evaluate(y)
 
   def testBadReturnType(self):
     with self.cached_session():
@@ -285,7 +285,7 @@ class PyFuncTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported object type"):
-        z.eval()
+        self.evaluate(z)
 
   def testReturnInput(self):
     with self.cached_session():
@@ -307,9 +307,9 @@ class PyFuncTest(test.TestCase):
     with session_lib.Session() as sess:
       producer = iter(range(3))
       x, = script_ops.py_func(lambda: next(producer), [], [dtypes.int64])
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 1)
-      self.assertEqual(sess.run(x), 2)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 1)
+      self.assertEqual(self.evaluate(x), 2)
 
   def testStateless(self):
     # Not using self.cached_session(), which disables optimization.
@@ -317,9 +317,9 @@ class PyFuncTest(test.TestCase):
       producer = iter(range(3))
       x, = script_ops.py_func(
           lambda: next(producer), [], [dtypes.int64], stateful=False)
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
 
   def testGradientFunction(self):
     # Input to tf.py_func is necessary, otherwise get_gradient_function()
@@ -335,7 +335,7 @@ class PyFuncTest(test.TestCase):
       val = [[1, 2], [3, 4]]
       x, = script_ops.py_func(lambda: np.array(val, order="F"), [],
                               [dtypes.int64])
-      self.assertAllEqual(val, x.eval())
+      self.assertAllEqual(val, self.evaluate(x))
 
   def testParallel(self):
     # Tests that tf.py_func's can run in parallel if they release the GIL.
@@ -390,7 +390,7 @@ class PyFuncTest(test.TestCase):
     f = script_ops.py_func(
         do_nothing, [constant_op.constant(3, dtypes.int64)], [], stateful=False)
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(f), [])
+      self.assertEqual(self.evaluate(f), [])
 
   def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index a60237fb25a0ca5c2a26797452f0ce08e530f830..114481ed6a03611a0cb6b7aaaa1cc7d3bff40c20 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -60,7 +60,7 @@ class QrOpTest(test.TestCase):
             q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_)
             q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_)
             all_ops += [q1, r1, q2, r2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       for i in range(8):
         q = 4 * i
         self.assertAllEqual(val[q], val[q + 2])  # q1 == q2
@@ -110,7 +110,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-5
     else:
       tol = 1e-14
-    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
+    self.assertAllClose(identity.eval(), self.evaluate(xx), atol=tol)
 
   def Test(self):
     np.random.seed(1)
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
index 0023506b77aeb561da2f65183ce7efb60402ba4c..cab841741e751e42864f3d284ef0d03529f35b69 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
@@ -39,7 +39,7 @@ class MultinomialTest(test.TestCase):
           num_samples=1000000,
           seed=15)
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
@@ -57,7 +57,7 @@ class MultinomialTest(test.TestCase):
           num_samples=1000000,
           seed=15)
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
@@ -79,7 +79,7 @@ class MultinomialTest(test.TestCase):
       # we'll run out of memory if we try to draw 1e9 samples directly
       # really should fit in 12GB of memory...
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index bd64d61af8e793e71a319b6ac1af95bd7dd16a3d..8d2718c6d54675a00113f44fb8238093310e84e7 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -70,8 +70,8 @@ class MultinomialTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       sample_op1, _ = self._make_ops(10)
       # Consecutive runs shouldn't yield identical output.
-      sample1a = sess.run(sample_op1)
-      sample1b = sess.run(sample_op1)
+      sample1a = self.evaluate(sample_op1)
+      sample1b = self.evaluate(sample_op1)
       self.assertFalse(np.equal(sample1a, sample1b).all())
 
   def testEagerOneOpMultipleStepsIndependent(self):
@@ -160,7 +160,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       random_seed.set_random_seed(1618)
       op = sampler(constant_op.constant(logits), num_samples)
-      d = sess.run(op)
+      d = self.evaluate(op)
 
     batch_size, num_classes = logits.shape
     freqs_mat = []
@@ -197,7 +197,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session(use_gpu=True):
       x = random_ops.multinomial(array_ops.zeros([5, 0]), 7)
       with self.assertRaisesOpError("num_classes should be positive"):
-        x.eval()
+        self.evaluate(x)
 
   def testNegativeMinLogits(self):
     random_seed.set_random_seed(78844)
@@ -225,8 +225,10 @@ def native_op_vs_composed_ops(batch_size, num_classes, num_samples, num_iters):
     native_op = control_flow_ops.group(native_sampler(logits, num_samples))
     composed_op = control_flow_ops.group(composed_sampler(logits, num_samples))
 
-    native_dt = timeit.timeit(lambda: sess.run(native_op), number=num_iters)
-    composed_dt = timeit.timeit(lambda: sess.run(composed_op), number=num_iters)
+    native_dt = timeit.timeit(
+        lambda: sess.run(native_op), number=num_iters)
+    composed_dt = timeit.timeit(
+        lambda: sess.run(composed_op), number=num_iters)
     return native_dt, composed_dt
 
 
diff --git a/tensorflow/python/kernel_tests/random/random_crop_test.py b/tensorflow/python/kernel_tests/random/random_crop_test.py
index 8ded522320b730955e08b43cbf6da537f437b095..491d19d6a00629c93fde7699258ec8e424d0dbb2 100644
--- a/tensorflow/python/kernel_tests/random/random_crop_test.py
+++ b/tensorflow/python/kernel_tests/random/random_crop_test.py
@@ -44,7 +44,7 @@ class RandomCropTest(test.TestCase):
           for i in range(2) for j in range(3) for k in range(4))
       crop = random_ops.random_crop(value, size=target)
       for _ in range(20):
-        y = crop.eval()
+        y = self.evaluate(crop)
         self.assertAllEqual(y.shape, target)
         self.assertTrue(tuple(y.ravel()) in value_set)
 
@@ -61,7 +61,7 @@ class RandomCropTest(test.TestCase):
       crop = random_ops.random_crop(value, single, seed=7)
       counts = np.zeros(size, dtype=np.int32)
       for _ in range(num_samples):
-        y = crop.eval()
+        y = self.evaluate(crop)
         self.assertAllEqual(y.shape, single)
         counts[y] += 1
 
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index 606e8862c47af0683d7c2695a1e8d4088c6e7afe..d18e3feb04563c788bd110e111b0481266362b5c 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -48,7 +48,7 @@ class RandomGammaTest(test.TestCase):
             [num], alpha, beta=beta, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 6de894846bcac7ccee43fd5e6b843d45a773a9ef..76618316b24bba608680d3d5fd2d83ba9c0c7a41 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -49,9 +49,9 @@ class RandomOpTestCommon(test.TestCase):
         random_seed.set_random_seed(graph_seed)
       x = rng_func([num], min_or_mean, max_or_stddev, dtype=dtype, seed=op_seed)
 
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # the same output, all three outputs will be bitwise identical.
@@ -69,7 +69,7 @@ class RandomNormalTest(RandomOpTestCommon):
             [num], mean=mu, stddev=sigma, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -160,7 +160,7 @@ class TruncatedNormalTest(test.TestCase):
             [num], mean=mu, stddev=sigma, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -256,7 +256,7 @@ class RandomUniformTest(RandomOpTestCommon):
             [num], minval=minv, maxval=maxv, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index 417588f8a391ee73d9f944fe785db3c591a5b450..47c0858db74e1400432ffd0164c29988cbecd296 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -43,7 +43,7 @@ class RandomPoissonTest(test.TestCase):
         rng = random_ops.random_poisson(lam, [num], dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -140,7 +140,7 @@ class RandomPoissonTest(test.TestCase):
     with self.cached_session():
       rnd = random_ops.random_poisson([], [], seed=12345)
       self.assertEqual([0], rnd.get_shape().as_list())
-      self.assertAllClose(np.array([], dtype=np.float32), rnd.eval())
+      self.assertAllClose(np.array([], dtype=np.float32), self.evaluate(rnd))
 
   def testShape(self):
     # Fully known shape
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index 0d85a072d4a2ff168f5e1c3233c7f7faf5c69a32..5601b9864bddfae24c314536e1d752d3c5ec0c49 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -84,7 +84,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeue_t = q.dequeue()
       results = []
       for _ in range(2):
-        a, b = sess.run(dequeue_t)
+        a, b = self.evaluate(dequeue_t)
         results.append((a, b))
       a, b = sess.run(q.dequeue_many(3))
       for i in range(3):
@@ -101,7 +101,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -167,7 +167,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
@@ -197,7 +197,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       results = []
       for _ in xrange(len(elems)):
-        x, y = sess.run(dequeued_t)
+        x, y = self.evaluate(dequeued_t)
         results.append((x, y))
       self.assertItemsEqual(elems, results)
 
@@ -215,9 +215,9 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual([1], size.eval())
+      self.assertEqual([1], self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual([0], size.eval())
+      self.assertEqual([0], self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -241,9 +241,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual(0, size_t.eval())
+      self.assertEqual(0, self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual(0, size_t.eval())
+      self.assertEqual(0, self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -251,9 +251,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpTo(self):
     with self.cached_session():
@@ -261,9 +261,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
     with self.cached_session():
@@ -275,7 +275,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the shape not being constrained.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
       enqueue_op.run()
 
@@ -284,7 +284,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # elements enqueued.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testEmptyDequeueUpToWithNoShape(self):
     with self.cached_session():
@@ -296,7 +296,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the shape not being constrained.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
       enqueue_op.run()
 
@@ -305,7 +305,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # elements enqueued.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testMultiEnqueueMany(self):
     with self.cached_session() as sess:
@@ -321,7 +321,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       results = []
       for _ in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         results.append((float_val, [int_val[0], int_val[1]]))
       expected = list(zip(float_elems, int_elems)) * 2
       self.assertItemsEqual(expected, results)
@@ -335,7 +335,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      results = dequeued_t.eval().tolist()
+      results = self.evaluate(dequeued_t).tolist()
       results.extend(dequeued_t.eval())
       self.assertItemsEqual(elems, results)
 
@@ -348,7 +348,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      results = dequeued_t.eval().tolist()
+      results = self.evaluate(dequeued_t).tolist()
       results.extend(dequeued_t.eval())
       self.assertItemsEqual(elems, results)
 
@@ -368,20 +368,20 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       results = []
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_single_t[1].get_shape())
       results.append((float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       results.append((float_val, int_val.tolist()))
 
       self.assertItemsEqual(zip(float_elems, int_elems), results)
@@ -402,21 +402,21 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       results = []
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       # dequeue_up_to has undefined shape.
       self.assertEqual([None], dequeued_t[0].get_shape().as_list())
       self.assertEqual([None, 2], dequeued_t[1].get_shape().as_list())
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_single_t[1].get_shape())
       results.append((float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       results.append((float_val, int_val.tolist()))
 
       self.assertItemsEqual(zip(float_elems, int_elems), results)
@@ -442,7 +442,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -515,7 +515,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue(dequeue_op):
-        dequeued_elems.extend(sess.run(dequeue_op))
+        dequeued_elems.extend(self.evaluate(dequeue_op))
 
       threads = []
       for dequeue_op in dequeue_ops:
@@ -539,7 +539,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
         dequeued_elems.extend(sess.run(dequeued_t).tolist())
@@ -566,7 +566,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
         dequeued_elems.extend(sess.run(dequeued_t).tolist())
@@ -649,7 +649,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -727,7 +727,7 @@ class RandomShuffleQueueTest(test.TestCase):
       progress = []  # Must be mutable
 
       def dequeue():
-        self.assertItemsEqual(elems, sess.run(dequeued_t))
+        self.assertItemsEqual(elems, self.evaluate(dequeued_t))
         progress.append(1)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
@@ -922,7 +922,7 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -950,7 +950,7 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -987,11 +987,11 @@ class RandomShuffleQueueTest(test.TestCase):
       def blocking_enqueue():
         # Expect the operation to succeed since it will complete
         # before the queue is closed.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.CancelledError, "closed"):
-          sess.run(blocking_enqueue_op)
+          self.evaluate(blocking_enqueue_op)
 
       thread1 = self.checkedThread(target=blocking_enqueue)
       thread1.start()
@@ -1001,7 +1001,7 @@ class RandomShuffleQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def blocking_close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       thread2 = self.checkedThread(target=blocking_close)
       thread2.start()
@@ -1032,7 +1032,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # This will block until the dequeue after the close.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread1 = self.checkedThread(target=blocking_enqueue)
       thread1.start()
@@ -1040,7 +1040,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # First blocking_enqueue_op of blocking_enqueue has enqueued 1 of 2
       # elements, and is blocked waiting for one more element to be dequeue.
       for i in range(50):
-        queue_size = size_t.eval()
+        queue_size = self.evaluate(size_t)
         if queue_size == 4:
           break
         elif i == 49:
@@ -1050,7 +1050,7 @@ class RandomShuffleQueueTest(test.TestCase):
         time.sleep(0.1)
 
       def blocking_close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       thread2 = self.checkedThread(target=blocking_close)
       thread2.start()
@@ -1064,7 +1064,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # At this point the close operation will complete, so the next enqueue
       # will fail.
       with self.assertRaisesRegexp(errors_impl.CancelledError, "closed"):
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
   def testSharedQueueSameSession(self):
     with self.cached_session():
@@ -1216,23 +1216,23 @@ class RandomShuffleQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingDequeueUpTo(self, sess, dequeue_up_to_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_up_to_op)
+      self.evaluate(dequeue_up_to_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1393,14 +1393,14 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
diff --git a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
index d57db3c5126059c27cf23b493c4cb09d4987459d..13f97a9367bc2809f87ee218817884b77ffd2e0c 100644
--- a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
@@ -62,7 +62,7 @@ class StatelessOpsTest(test.TestCase):
         for stateless_op, stateful_op in cases:
           stateful = stateful_op(seed=seed[1])
           pure = stateless_op(seed=preseed)
-          self.assertAllEqual(stateful.eval(), pure.eval())
+          self.assertAllEqual(stateful.eval(), self.evaluate(pure))
 
   def _test_determinism(self, cases):
     # Stateless values should be equal iff the seeds are equal (roughly)
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index ac9be56d63fce302928b8de84ac9c1bf7ea6e55e..4d9b26f4ebd5f6ce22ce9af23a163b7e6c3a293a 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -154,30 +154,30 @@ class IdentityReaderTest(test.TestCase):
       queued_length = queue.size()
       key, value = reader.read(queue)
 
-      self.assertAllEqual(0, work_completed.eval())
-      self.assertAllEqual(0, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+      self.assertAllEqual(0, self.evaluate(work_completed))
+      self.assertAllEqual(0, self.evaluate(produced))
+      self.assertAllEqual(0, self.evaluate(queued_length))
 
       queue.enqueue_many([["A", "B", "C"]]).run()
       queue.close().run()
-      self.assertAllEqual(3, queued_length.eval())
+      self.assertAllEqual(3, self.evaluate(queued_length))
 
       self._ExpectRead(sess, key, value, b"A")
-      self.assertAllEqual(1, produced.eval())
+      self.assertAllEqual(1, self.evaluate(produced))
 
       self._ExpectRead(sess, key, value, b"B")
 
       self._ExpectRead(sess, key, value, b"C")
-      self.assertAllEqual(3, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+      self.assertAllEqual(3, self.evaluate(produced))
+      self.assertAllEqual(0, self.evaluate(queued_length))
 
       with self.assertRaisesOpError("is closed and has insufficient elements "
                                     "\\(requested 1, current size 0\\)"):
         sess.run([key, value])
 
-      self.assertAllEqual(3, work_completed.eval())
-      self.assertAllEqual(3, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+      self.assertAllEqual(3, self.evaluate(work_completed))
+      self.assertAllEqual(3, self.evaluate(produced))
+      self.assertAllEqual(0, self.evaluate(queued_length))
 
   def testMultipleEpochs(self):
     with self.cached_session() as sess:
@@ -209,23 +209,23 @@ class IdentityReaderTest(test.TestCase):
       key, value = reader.read(queue)
 
       self._ExpectRead(sess, key, value, b"X")
-      self.assertAllEqual(1, produced.eval())
+      self.assertAllEqual(1, self.evaluate(produced))
       state = reader.serialize_state().eval()
 
       self._ExpectRead(sess, key, value, b"Y")
       self._ExpectRead(sess, key, value, b"Z")
-      self.assertAllEqual(3, produced.eval())
+      self.assertAllEqual(3, self.evaluate(produced))
 
       queue.enqueue_many([["Y", "Z"]]).run()
       queue.close().run()
       reader.restore_state(state).run()
-      self.assertAllEqual(1, produced.eval())
+      self.assertAllEqual(1, self.evaluate(produced))
       self._ExpectRead(sess, key, value, b"Y")
       self._ExpectRead(sess, key, value, b"Z")
       with self.assertRaisesOpError("is closed and has insufficient elements "
                                     "\\(requested 1, current size 0\\)"):
         sess.run([key, value])
-      self.assertAllEqual(3, produced.eval())
+      self.assertAllEqual(3, self.evaluate(produced))
 
       self.assertEqual(bytes, type(state))
 
@@ -266,17 +266,17 @@ class IdentityReaderTest(test.TestCase):
 
       queue.enqueue_many([["X", "Y", "Z"]]).run()
       self._ExpectRead(sess, key, value, b"X")
-      self.assertLess(0, queued_length.eval())
-      self.assertAllEqual(1, produced.eval())
+      self.assertLess(0, self.evaluate(queued_length))
+      self.assertAllEqual(1, self.evaluate(produced))
 
       self._ExpectRead(sess, key, value, b"Y")
-      self.assertLess(0, work_completed.eval())
-      self.assertAllEqual(2, produced.eval())
+      self.assertLess(0, self.evaluate(work_completed))
+      self.assertAllEqual(2, self.evaluate(produced))
 
       reader.reset().run()
-      self.assertAllEqual(0, work_completed.eval())
-      self.assertAllEqual(0, produced.eval())
-      self.assertAllEqual(1, queued_length.eval())
+      self.assertAllEqual(0, self.evaluate(work_completed))
+      self.assertAllEqual(0, self.evaluate(produced))
+      self.assertAllEqual(1, self.evaluate(queued_length))
       self._ExpectRead(sess, key, value, b"Z")
 
       queue.enqueue_many([["K", "L"]]).run()
@@ -724,7 +724,7 @@ class AsyncReaderTest(test.TestCase):
         thread_data.append(thread_data_t(t, queue, output))
 
       # Start all readers. They are all blocked waiting for queue entries.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for d in thread_data:
         d.thread.start()
 
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index ebb9872f226f35c4642f99c8aa161845657e4a73..74020667d9340c4775685a97d6853486147ca8e0 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -54,7 +54,7 @@ class RecordInputOpTest(test.TestCase):
           batch_size=1,
           name="record_input").get_yield_op()
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputSimpleGzip(self):
     with self.cached_session() as sess:
@@ -73,7 +73,7 @@ class RecordInputOpTest(test.TestCase):
           compression_type=tf_record.TFRecordCompressionType.GZIP).get_yield_op(
           )
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputSimpleZlib(self):
     with self.cached_session() as sess:
@@ -92,7 +92,7 @@ class RecordInputOpTest(test.TestCase):
           compression_type=tf_record.TFRecordCompressionType.ZLIB).get_yield_op(
           )
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputEpochs(self):
     files = 100
@@ -117,7 +117,7 @@ class RecordInputOpTest(test.TestCase):
       for _ in range(3):
         epoch_set = set()
         for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
+          op_list = self.evaluate(yield_op)
           self.assertTrue(len(op_list) is batches)
           for r in op_list:
             self.assertTrue(r[0] not in epoch_set)
@@ -138,15 +138,15 @@ class RecordInputOpTest(test.TestCase):
 
         yield_op = records.get_yield_op()
         for _ in range(50):
-          sess.run(yield_op)
+          self.evaluate(yield_op)
 
   def testEmptyGlob(self):
     with self.cached_session() as sess:
       record_input = data_flow_ops.RecordInput(file_pattern="foo")
       yield_op = record_input.get_yield_op()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       with self.assertRaises(NotFoundError):
-        sess.run(yield_op)
+        self.evaluate(yield_op)
 
   def testBufferTooSmall(self):
     files = 10
@@ -171,7 +171,7 @@ class RecordInputOpTest(test.TestCase):
       for _ in range(3):
         epoch_set = set()
         for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
+          op_list = self.evaluate(yield_op)
           self.assertTrue(len(op_list) is batches)
           for r in op_list:
             self.assertTrue(r[0] not in epoch_set)
diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index 3bb4986313db74ba439991566ab2947722ab890d..c26e62738c10ff00c5b92efcd518ef0857367ea3 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -119,7 +119,7 @@ class ReduceJoinTest(UnicodeTestCase):
           axis=axis,
           keep_dims=keep_dims,
           separator=separator)
-      output_array = output.eval()
+      output_array = self.evaluate(output)
 
     self.assertAllEqualUnicode(truth, output_array)
     self.assertAllEqual(truth_shape, output.get_shape())
@@ -149,10 +149,10 @@ class ReduceJoinTest(UnicodeTestCase):
       if not axis:
         truth = constant_op.constant(truth)
       truth_squeezed = array_ops.squeeze(truth, axis=axis)
-      output_array = output.eval()
-      output_keep_dims_array = output_keep_dims.eval()
-      truth_array = truth.eval()
-      truth_squeezed_array = truth_squeezed.eval()
+      output_array = self.evaluate(output)
+      output_keep_dims_array = self.evaluate(output_keep_dims)
+      truth_array = self.evaluate(truth)
+      truth_squeezed_array = self.evaluate(truth_squeezed)
     self.assertAllEqualUnicode(truth_array, output_keep_dims_array)
     self.assertAllEqualUnicode(truth_squeezed_array, output_array)
     self.assertAllEqual(truth.get_shape(), output_keep_dims.get_shape())
@@ -318,11 +318,11 @@ class ReduceJoinTest(UnicodeTestCase):
 
       # Reduction that drops the dim of size 0.
       output = string_ops.reduce_join(inputs=inputs, axis=0)
-      self.assertAllEqualUnicode([""], output.eval())
+      self.assertAllEqualUnicode([""], self.evaluate(output))
 
       # Reduction that keeps the dim of size 0.
       output = string_ops.reduce_join(inputs=inputs, axis=1)
-      output_shape = output.eval().shape
+      output_shape = self.evaluate(output).shape
       self.assertAllEqual([0], output_shape)
 
   def testInvalidArgsUnknownShape(self):
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 2ac3996e25b1754e829c2744357396c8ddccc07a..612b2c56a55f48eb31c17ca9c0c1cba7c4d42ece 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -185,7 +185,7 @@ class SumReductionTest(BaseReductionTest):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_sum([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
   def testInfinity(self):
@@ -216,7 +216,7 @@ class SumReductionTest(BaseReductionTest):
       tf_arr = variables.Variable(arr)
       variables.global_variables_initializer().run()
       tf_mean = math_ops.reduce_mean(tf_arr, 0, False)
-      tf_out_mean = sess.run(tf_mean)
+      tf_out_mean = self.evaluate(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
 
   def testFloat32(self):
@@ -400,7 +400,7 @@ class MeanReductionTest(BaseReductionTest):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
   def testInfinity(self):
@@ -473,7 +473,7 @@ class ProdReductionTest(BaseReductionTest):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_prod([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
   def testInfinity(self):
@@ -562,7 +562,7 @@ class MinReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_min(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -576,7 +576,7 @@ class MinReductionTest(test.TestCase):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_min([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
   def testInfinity(self):
@@ -675,7 +675,7 @@ class MaxReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_max(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -689,7 +689,7 @@ class MaxReductionTest(test.TestCase):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_max([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
   def testInfinity(self):
@@ -802,7 +802,7 @@ class AllReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_all(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -817,7 +817,7 @@ class AllReductionTest(test.TestCase):
       with self.session(use_gpu=True) as sess:
         v = math_ops.reduce_all([True, True],
                                 constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, True)
 
   def testAll3D(self):
@@ -851,7 +851,7 @@ class AnyReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_any(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -866,7 +866,7 @@ class AnyReductionTest(test.TestCase):
       with self.session(use_gpu=True) as sess:
         v = math_ops.reduce_any([True, True],
                                 constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, True)
 
   def testAll3D(self):
@@ -962,7 +962,7 @@ class CountNonzeroReductionTest(test.TestCase):
     # Test case for GitHub issue 18712
     with self.cached_session() as sess:
       v = math_ops.count_nonzero(constant_op.constant(["test"]))
-      self.assertAllClose(sess.run(v), 1)
+      self.assertAllClose(self.evaluate(v), 1)
 
   def testStringReduce1D(self):
     # Create a 1D array of strings
diff --git a/tensorflow/python/kernel_tests/regex_full_match_op_test.py b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
index 98746e7d9b19e5ba52a73b7ca3d9967cc813c133..4edd3e98d9b3d4be3fb1a7179de981aa939b4593 100644
--- a/tensorflow/python/kernel_tests/regex_full_match_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
@@ -61,7 +61,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       invalid_pattern = "A["
       matched = op(input_tensor, invalid_pattern)
       with self.assertRaisesOpError("Invalid pattern"):
-        matched.eval()
+        self.evaluate(matched)
 
 
 class RegexFullMatchOpTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/regex_replace_op_test.py b/tensorflow/python/kernel_tests/regex_replace_op_test.py
index d9b7ed28d21652e964977c1938cd5d2cefb17825..ce9a1b5279f66f8fa4fea53bc7eccab0c736668b 100644
--- a/tensorflow/python/kernel_tests/regex_replace_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_replace_op_test.py
@@ -74,7 +74,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       invalid_pattern = "A["
       replace = op(input_vector, invalid_pattern, "x")
       with self.assertRaisesOpError("Invalid pattern"):
-        replace.eval()
+        self.evaluate(replace)
 
   def testGlobal(self, op):
     values = ["ababababab", "abcabcabc", ""]
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index b0f2796ede176b9d3ea7e69fa5da6394d74c258e..68243f27c0590bc073e8065bc53128a760753451 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -55,52 +56,52 @@ class ReluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testRelu(self, np_features, use_gpu=False):
+  def _testRelu(self, np_features):
     np_relu = self._npRelu(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      relu = nn_ops.relu(np_features)
-      tf_relu = relu.eval()
+    tf_relu = nn_ops.relu(np_features)
     self.assertAllClose(np_relu, tf_relu)
-    self.assertShapeEqual(np_relu, relu)
+    self.assertShapeEqual(np_relu, tf_relu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testRelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testRelu(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
-  def _testReluInt8x4(self, np_inputs):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-    np_relu = self._npRelu(np_inputs)
-    with self.cached_session(use_gpu=True):
-      relu = nn_ops.relu(constant_op.constant(np_inputs, dtypes.qint8))
-      if np_inputs.size % 4 == 0:
-        tf_relu = relu.eval()
-        self.assertAllClose(np_relu, tf_relu)
-        self.assertShapeEqual(np_relu, relu)
-      else:
-        with self.assertRaisesRegexp(
-            errors.InvalidArgumentError,
-            "Tensor size must be a multiple of 4 for Relu<qint8>. Got %d" %
-            np_inputs.size):
-          tf_relu = relu.eval()
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testReluInt8x4GoodShape(self):
-    self._testReluInt8x4(np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]]))
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest("No GPU available")
+    inputs = np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]])
+    np_relu = self._npRelu(inputs)
+    tf_relu = nn_ops.relu(constant_op.constant(inputs, dtypes.qint8))
+    self.assertAllClose(np_relu, tf_relu)
+    self.assertShapeEqual(np_relu, tf_relu)
 
   def testReluInt8x4BadShape(self):
-    np_inputs = np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]])
-    self.assertEqual(np_inputs.size, 9)
-    self._testReluInt8x4(np_inputs)
-    np_inputs = np.array(
-        [1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1])
-    self.assertEqual(np_inputs.size, 17)
-    self._testReluInt8x4(np_inputs)
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest("No GPU available")
+    inputs = constant_op.constant(
+        np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]]), dtypes.qint8)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tensor size must be a multiple of 4 for Relu<qint8>. Got 9"):
+      self.evaluate(nn_ops.relu(inputs))
+
+    inputs = constant_op.constant(
+        np.array([1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1]),
+        dtypes.qint8)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tensor size must be a multiple of 4 for Relu<qint8>. Got 17"):
+      self.evaluate(nn_ops.relu(inputs))
 
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
@@ -202,15 +203,15 @@ class ReluTest(test.TestCase):
     self.assertLess(err, 1e-10)
 
   def testGradientScalar(self):
-    with self.cached_session() as sess:
-      x = variables.Variable(100.)
-      y = nn_ops.relu(x)
-      loss = y**2
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.25)
-      train_op = optimizer.minimize(loss)
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(x.eval(), 50.0)
+    x = variables.Variable(100.)
+
+    def loss():
+      return nn_ops.relu(x)**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.25)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(optimizer.minimize(loss))
+    self.assertAllClose(x.read_value(), 50.0)
 
 
 class Relu6Test(test.TestCase):
@@ -228,23 +229,25 @@ class Relu6Test(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7,
                                                     0.9]])))
 
-  def _testRelu6(self, np_features, use_gpu=False):
+  def _testRelu6(self, np_features):
     np_relu6 = self._npRelu6(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      relu6 = nn_ops.relu6(np_features)
-      tf_relu6 = relu6.eval()
+    tf_relu6 = nn_ops.relu6(np_features)
     self.assertAllClose(np_relu6, tf_relu6)
-    self.assertShapeEqual(np_relu6, relu6)
+    self.assertShapeEqual(np_relu6, tf_relu6)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testRelu6(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float, np.double]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testRelu6(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float, np.double]:
+      self._testRelu6(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
@@ -297,25 +300,27 @@ class LeakyReluTest(test.TestCase):
                                                      0.9]]),
             alpha=0.1))
 
-  def _testLeakyRelu(self, np_features, alpha, use_gpu=False):
+  def _testLeakyRelu(self, np_features, alpha):
     np_leaky_relu = self._npLeakyRelu(np_features, alpha)
-    with self.test_session(use_gpu=use_gpu):
-      leaky_relu = nn_ops.leaky_relu(np_features, alpha)
-      tf_leaky_relu = leaky_relu.eval()
+    tf_leaky_relu = nn_ops.leaky_relu(np_features, alpha)
     self.assertAllClose(np_leaky_relu, tf_leaky_relu)
-    self.assertShapeEqual(np_leaky_relu, leaky_relu)
+    self.assertShapeEqual(np_leaky_relu, tf_leaky_relu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testLeakyRelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          alpha=0.2,
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testLeakyRelu(
             np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            alpha=0.1,
-            use_gpu=True)
+            alpha=0.2)
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+          alpha=0.1)
 
   # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
   # well defined at around zero and we want to avoid that in terms of input
@@ -391,15 +396,15 @@ class LeakyReluTest(test.TestCase):
       self.assertLess(err, 1e-10)
 
   def testGradientScalar(self):
-    with self.test_session() as sess:
-      x = variables.Variable(-100.)
-      y = nn_ops.leaky_relu(x, 0.05)
-      loss = y**2
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.2)
-      train_op = optimizer.minimize(loss)
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(x.eval(), -99.9)
+    x = variables.Variable(-100.)
+
+    def loss():
+      return nn_ops.leaky_relu(x, 0.05)**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.2)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(optimizer.minimize(loss))
+    self.assertAllClose(x.read_value(), -99.9)
 
 
 class EluTest(test.TestCase):
@@ -415,22 +420,24 @@ class EluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testElu(self, np_features, use_gpu=False):
+  def _testElu(self, np_features):
     np_elu = self._npElu(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      elu = nn_ops.elu(np_features)
-      tf_elu = elu.eval()
+    tf_elu = nn_ops.elu(np_features)
     self.assertAllClose(np_elu, tf_elu)
-    self.assertShapeEqual(np_elu, elu)
+    self.assertShapeEqual(np_elu, tf_elu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.float16, np.float32, np.float64]:
-      self._testElu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      self._testElu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=True)
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
+        self._testElu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradientFloat32(self):
     with self.cached_session():
@@ -517,22 +524,20 @@ class SeluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testSelu(self, np_features, use_gpu=False):
+  def _testSelu(self, np_features):
     np_selu = self._npSelu(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      selu = nn_ops.selu(np_features)
-      tf_selu = selu.eval()
+    tf_selu = nn_ops.selu(np_features)
     self.assertAllClose(np_selu, tf_selu)
-    self.assertShapeEqual(np_selu, selu)
+    self.assertShapeEqual(np_selu, tf_selu)
 
   def testNumbers(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testSelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      self._testSelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=True)
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+      # Force executed on CPU in case GPU kernels are avaiable.
+      with ops.device("/device:CPU:0"):
+        self._testSelu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradientFloat32(self):
     with self.cached_session():
@@ -599,46 +604,44 @@ class CreluTest(test.TestCase):
     t = nn_ops.crelu(f)
     self.assertEqual([50, 5, 7, 20], t.get_shape())
 
-  def _testCrelu(self, np_features, use_gpu=False):
+  def _testCrelu(self, np_features):
     np_relu = np.maximum(np_features, np.zeros_like(np_features))
     np_neg_relu = np.maximum(-np_features, np.zeros_like(np_features))
     np_crelu = np.concatenate((np_relu, np_neg_relu),
                               len(np_features.shape) - 1)
 
-    with self.cached_session(use_gpu=use_gpu):
-      crelu = nn_ops.crelu(np_features)
-      tf_relu = crelu.eval()
+    tf_crelu = nn_ops.crelu(np_features)
 
-    self.assertAllClose(np_crelu, tf_relu)
-    self.assertShapeEqual(np_crelu, crelu)
+    self.assertAllClose(np_crelu, tf_crelu)
+    self.assertShapeEqual(np_crelu, tf_crelu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testCrelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testCrelu(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testCrelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testNumbersWithAxis0(self):
-    with self.cached_session():
-      crelu = nn_ops.crelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
-      tf_relu = crelu.eval()
-      np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
-                           [0, 3, 0, 7, 0]])
-      self.assertAllEqual(np_crelu, tf_relu)
+    tf_crelu = nn_ops.crelu(
+        np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
+    np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
+                         [0, 3, 0, 7, 0]])
+    self.assertAllEqual(np_crelu, tf_crelu)
 
   def testNumbersWithAxis1(self):
-    with self.cached_session():
-      crelu = nn_ops.crelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
-      tf_relu = crelu.eval()
-      np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
-                           [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
-      self.assertAllEqual(np_crelu, tf_relu)
+    tf_crelu = nn_ops.crelu(
+        np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
+    np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
+                         [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
+    self.assertAllEqual(np_crelu, tf_crelu)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/reshape_op_test.py b/tensorflow/python/kernel_tests/reshape_op_test.py
index 14cdae18370fb047d68eb31f62e92d20ad263146..84539c2b02a9000b540ae504e3036fc23a9c30f1 100644
--- a/tensorflow/python/kernel_tests/reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/reshape_op_test.py
@@ -33,14 +33,14 @@ class ReshapeTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       np_ans = x.reshape(y)
       tf_ans = array_ops.reshape(x, y)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertEqual(tf_ans.get_shape(), out.shape)
       self.assertShapeEqual(np_ans, tf_ans)
 
       # Repeat with an int64 shape tensor.
       y64 = constant_op.constant(y, dtype=dtypes.int64)
       tf_ans = array_ops.reshape(x, y64)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertEqual(tf_ans.get_shape(), out.shape)
       self.assertShapeEqual(np_ans, tf_ans)
 
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index c8227dc117f316c1fa1cb7780c764fc7766f1a2d..c351a18c8f861396ccd318c7b499fed558af9da2 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -153,7 +153,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
   def testCachedValueReadBeforeWrite(self):
     with self.cached_session() as sess:
       v = resource_variable_ops.ResourceVariable(0.0, caching_device="cpu:0")
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
       value, _ = sess.run([v, v.assign_add(1.0)])
       self.assertAllEqual(value, 0.0)
 
@@ -568,6 +568,20 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.load(2.0)
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+  def testToFromProtoCachedValue(self):
+    with ops.Graph().as_default():
+      v_def = resource_variable_ops.ResourceVariable(
+          initial_value=constant_op.constant(3.0)).to_proto()
+      v_prime = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      self.assertTrue(getattr(v_prime, "_cached_value", None) is None)
+
+      other_v_def = resource_variable_ops.ResourceVariable(
+          caching_device="cpu:0",
+          initial_value=constant_op.constant(3.0)).to_proto()
+      other_v_prime = resource_variable_ops.ResourceVariable(
+          variable_def=other_v_def)
+      self.assertTrue(other_v_prime._cached_value is not None)
+
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -736,7 +750,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           # Needed in Eager since we get a unique container name by default.
           container=ops.get_default_graph()._container)
       w_read = resource_variable_ops.read_variable_op(w, v.dtype.base_dtype)
-      self.assertEqual(300.0, w_read.eval())
+      self.assertEqual(300.0, self.evaluate(w_read))
 
       x = resource_variable_ops.var_handle_op(
           dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var5",
diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
index 56609bd0a5ea8a2ce161f317b4a6977987b5821d..91d054ad9ad94e3e2c7889804c0b11fe51a5c909 100644
--- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
+++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
@@ -42,12 +42,12 @@ class ReverseSequenceTest(test.TestCase):
       ans = array_ops.reverse_sequence(
           x, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=seq_lengths)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertAllClose(tf_ans, truth, atol=1e-10)
         self.assertShapeEqual(truth, ans)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBothReverseSequence(self,
                                x,
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 2fb49638ef5da2eaff218517459a9508cfb7bcef..0090b7332f9184e1d513108bb68a41ccb016a5f4 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -54,6 +54,7 @@ import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
 from tensorflow.python.training import training
+from tensorflow.python.util import nest
 
 
 class Plus1RNNCell(rnn_cell_impl.RNNCell):
@@ -471,6 +472,8 @@ class RNNTest(test.TestCase):
       outputs, state = rnn.dynamic_rnn(
           cell, inputs, dtype=dtypes.float32)
       self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(len(state), 2)
+      state = nest.flatten(state)
       self.assertEqual(len(state), 4)
       self.assertEqual(state[0].shape.as_list(), [None, 2 * output_shape])
       self.assertEqual(state[1].shape.as_list(), [None, 2 * output_shape])
@@ -664,24 +667,25 @@ class RNNTest(test.TestCase):
       kn1 = KerasNetworkTFRNNs(name="kn1")
       kn2 = KerasNetworkKerasRNNs(name="kn2")
 
-      z = array_ops.zeros((2, 3))
+    z = array_ops.zeros((2, 3))
 
-      kn1(z)
-      kn2(z)
+    kn1(z)
+    kn2(z)
 
-      # pylint: disable=protected-access
-      self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables))
-      self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables))
+    # pylint: disable=protected-access
+    self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables))
+    self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables))
 
+    with base_layers.keras_style_scope():
       kn1_new = KerasNetworkTFRNNs(name="kn1_new")
       kn2_new = KerasNetworkKerasRNNs(name="kn2_new")
 
-      kn2_new(z)
-      # Most importantly, this doesn't fail due to variable scope reuse issues.
-      kn1_new(z)
+    kn2_new(z)
+    # Most importantly, this doesn't fail due to variable scope reuse issues.
+    kn1_new(z)
 
-      self.assertTrue(all("kn1_new" in v.name for v in kn1_new._cell.variables))
-      self.assertTrue(all("kn2_new" in v.name for v in kn2_new._cell.variables))
+    self.assertTrue(all("kn1_new" in v.name for v in kn1_new._cell.variables))
+    self.assertTrue(all("kn2_new" in v.name for v in kn2_new._cell.variables))
 
 
 ######### Benchmarking RNN code
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index b36922256525f55e4958fa9b0dabfe9215580212..c48e0e2e674ed6f7632d23692468bf33cae53c45 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -126,6 +126,11 @@ class CumsumTest(test.TestCase):
       for axis in range(-6, 6, 3):
         self._compareAll(x, axis)
 
+  def testLarge(self):
+    for dtype in self.valid_dtypes:
+      x = np.ones([1000000], dtype=dtype) / 1024
+      self._compareAll(x, 0)
+
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
     input_tensor = ops.convert_to_tensor(x)
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 0ed508b9fe2e4c575a2053af3da099ad64fcaa3e..1f1249727c4335ecdbafd3344281cc71dd723488 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -144,7 +144,7 @@ class StatefulScatterNdTest(test.TestCase):
         tf_scatter(ref_var, indices, updates).eval()
 
         # Compare
-        self.assertAllClose(new, ref_var.eval())
+        self.assertAllClose(new, self.evaluate(ref_var))
 
   def _VariableRankTests(self, np_scatter, tf_scatter):
     for vtype in (np.int32, np.float16, np.float32, np.float64, np.complex64,
@@ -162,7 +162,7 @@ class StatefulScatterNdTest(test.TestCase):
 
     with self.session(use_gpu=True) as sess:
       sess.run(init)
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
   def testSimpleResource(self):
@@ -190,7 +190,7 @@ class StatefulScatterNdTest(test.TestCase):
 
     with self.session(use_gpu=True) as sess:
       sess.run(init)
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
   def testSimple3(self):
@@ -204,7 +204,7 @@ class StatefulScatterNdTest(test.TestCase):
 
     with self.session(use_gpu=True) as sess:
       sess.run(init)
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
   def testVariableRankUpdate(self):
@@ -249,7 +249,7 @@ class StatefulScatterNdTest(test.TestCase):
   #             [[0]], dtype=tf.int64), [False])
   #     var.initializer.run()
   #     session.run([update0, update1])
-  #     self.assertAllEqual([False, True], var.eval())
+  #     self.assertAllEqual([False, True], self.evaluate(var))
 
   def testScatterOutOfRangeCpu(self):
     # TODO(simister): Re-enable once binary size increase due to
@@ -307,7 +307,7 @@ class StatefulScatterNdTest(test.TestCase):
     expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.cached_session():
       ref.initializer.run()
-      self.assertAllEqual(expected_result, scatter_update.eval())
+      self.assertAllEqual(expected_result, self.evaluate(scatter_update))
 
   def testRank3InvalidShape1(self):
     indices = array_ops.zeros([3, 2, 2], dtypes.int32)
@@ -342,7 +342,7 @@ class StatefulScatterNdTest(test.TestCase):
 
     with session.Session() as sess:
       sess.run(init)
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       assert np.allclose(result, expected_result)
 
   # TODO(fpmc): Re-enable this test when gpu_pip test actually runs on a GPU.
@@ -421,7 +421,7 @@ class ScatterNdTest(test.TestCase):
                          b"", b"", b"seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by same value.
@@ -432,7 +432,7 @@ class ScatterNdTest(test.TestCase):
     expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by different value.
@@ -444,7 +444,7 @@ class ScatterNdTest(test.TestCase):
                 np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertTrue(np.array_equal(result, expected[0]) or
                       np.array_equal(result, expected[1]))
 
@@ -463,7 +463,7 @@ class ScatterNdTest(test.TestCase):
     self.assertAllEqual(scatter.get_shape().as_list(), shape)
     expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.cached_session():
-      self.assertAllEqual(expected_result, scatter.eval())
+      self.assertAllEqual(expected_result, self.evaluate(scatter))
 
   def testUndefinedIndicesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
@@ -545,9 +545,9 @@ class ScatterNdTest(test.TestCase):
       expected_input_grad = np.array([[1, 2], [3, 4]],
                                      dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
   def testGradientsRank2SliceUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
@@ -565,9 +565,9 @@ class ScatterNdTest(test.TestCase):
       expected_input_grad = np.array([[3, 4], [1, 2]],
                                      dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
   def testGradientsRank3SliceUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
@@ -588,9 +588,9 @@ class ScatterNdTest(test.TestCase):
       expected_input_grad = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
                                      dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
   def testGradientsRank7SliceUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
@@ -615,9 +615,9 @@ class ScatterNdTest(test.TestCase):
           [[[[[[[1, 2], [3, 4]]]], [[[[5, 6], [7, 8]]]]]]],
           dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 87c345245c1d982b21afd52cf3e3da89fdff20ad..a4daad7adcceed1f789293c521f71cc45a6d7e32 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -286,7 +287,7 @@ class ScatterTest(test.TestCase):
 
         session.run([update0, update1])
 
-        self.assertAllEqual([False, True], var.eval())
+        self.assertAllEqual([False, True], self.evaluate(var))
 
   def testScatterOutOfRangeCpu(self):
     for op, _ in _TF_OPS_TO_NUMPY.items():
@@ -320,19 +321,19 @@ class ScatterTest(test.TestCase):
       updates = np.array([-3, -4, -5]).astype(np.float32)
       # With GPU, the code ignores indices that are out of range.
       # We don't test the implementation; just test there's no failures.
-      with self.cached_session(force_gpu=True):
+      with test_util.force_gpu():
         ref = variables.Variable(params)
         ref.initializer.run()
 
         # Indices all in range, no problem.
         indices = np.array([2, 0, 5])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
 
         # Indicies out of range should not fail.
         indices = np.array([-1, 0, 5])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
         indices = np.array([2, 0, 6])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 3f7e43b5335f37651914d95091094ddd4000e1b5..5ab889895ec0c9394c2de770bfcb7c6c5a6f4c77 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -118,7 +118,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
           for np_op1, np_op2, tf_op in curr_ops_list:
             np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2)
             s = tf_op(data=tf_x, segment_ids=indices)
-            tf_ans = s.eval()
+            tf_ans = self.evaluate(s)
             self.assertAllClose(np_ans, tf_ans)
             # NOTE(mrry): The static shape inference that computes
             # `tf_ans.shape` can only infer that sizes from dimension 1
@@ -141,7 +141,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 1]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment_ids should be the same size"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentIdsValid(self):
     # This is a baseline for the following SegmentIdsInvalid* tests.
@@ -161,7 +161,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [1, 1, 2, 2]
         np_ans = self._segmentReduce(indices, np_x, np.add)
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testSegmentIdsHole(self):
@@ -172,7 +172,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 0, 3, 3]
         np_ans = self._segmentReduce(indices, np_x, np.add)
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testSegmentIdsInvalid1(self):
@@ -184,7 +184,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       with self.assertRaisesOpError(
           r"Segment id -1 out of range \[0, 1\), possibly because "
           "'segment_ids' input is not sorted."):
-        s.eval()
+        self.evaluate(s)
 
   def testSegmentIdsInvalid2(self):
     shape = [4, 4]
@@ -193,7 +193,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       indices = [0, 1, 0, 1]
       s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
       with self.assertRaisesOpError("segment ids are not increasing"):
-        s.eval()
+        self.evaluate(s)
 
   def testSegmentIdsInvalid3(self):
     shape = [4, 4]
@@ -204,7 +204,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       with self.assertRaisesOpError(
           r"Segment id 1 out of range \[0, 1\), possibly "
           "because 'segment_ids' input is not sorted."):
-        s.eval()
+        self.evaluate(s)
 
   def testSegmentIdsInvalid4(self):
     shape = [4, 4]
@@ -214,7 +214,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 0, 0, -1]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentIdsInvalid5(self):
     shape = [4, 4]
@@ -224,7 +224,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 0, 0, -2]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
   def testGradient(self):
     shape = [4, 4]
@@ -297,7 +297,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                   indices, np_x, np_op1, np_op2, num_segments=num_segments,
                   initial_value=init_op(dtype))
               s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
-              tf_ans = s.eval()
+              tf_ans = self.evaluate(s)
               if dtype is dtypes_lib.bfloat16:
                 tf_ans = tf_ans.astype(np.float32)
               self.assertAllCloseAccordingToType(np_ans, tf_ans)
@@ -320,7 +320,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
               data=tf_x,
               segment_ids=indices,
               num_segments=num_segments_constant)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
@@ -412,7 +412,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
         unsorted = math_ops.unsorted_segment_sum([[17]], bad, num_segments=2)
         with self.assertRaisesOpError(
             r"segment_ids\[0,0\] = %d is out of range \[0, 2\)" % bad[0][0]):
-          unsorted.eval()
+          self.evaluate(unsorted)
 
   def testEmptySecondDimension(self):
     dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
@@ -443,7 +443,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
           np.place(indices, indices == 8, [-1])
           s = math_ops.unsorted_segment_sum(
               data=tf_x, segment_ids=indices, num_segments=num_segments)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
@@ -499,7 +499,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
           np_ans = self._sparseSegmentReduce(np_x, np_indices, segment_indices,
                                              np_op1, np_op2)
           s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
           self.assertAllClose(np_ans, tf_ans)
           # NOTE(mrry): The static shape inference that computes
           # `tf_ans.shape` can only infer that sizes from dimension 1
@@ -518,7 +518,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         np_ans = self._sparseSegmentReduce(np_x, tf_indices, segment_indices,
                                            np_op1, np_op2)
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testWithNumSegments(self):
@@ -543,7 +543,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testWithEmptySegments(self):
@@ -562,7 +562,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np.zeros([5, 4]), tf_ans)
 
   def testSegmentIdsGreaterThanZero(self):
@@ -576,7 +576,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         np_ans = self._sparseSegmentReduce(np_x, tf_indices, segment_indices,
                                            np_op1, np_op2)
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testValid(self):
@@ -588,7 +588,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        s.eval()
+        self.evaluate(s)
 
   def testIndicesInvalid1(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -600,7 +600,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"indices\[1\] == -1 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
   def testIndicesInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -612,7 +612,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"indices\[3\] == 10 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentsInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -623,7 +623,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids are not increasing"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentsInvalid3(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -636,7 +636,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError(
             r"Segment id 1 out of range \[0, 1\), possibly because "
             "'segment_ids' input is not sorted"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentsInvalid4(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -649,7 +649,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError(
             r"Segment id -1 out of range \[0, 2\), possibly because "
             "'segment_ids' input is not sorted"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentsInvalid6(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -660,7 +660,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentsInvalid7(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -671,7 +671,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentWithNumSegmentsValid(self):
     # Baseline for the test*WithNumSegmentsInvalid* methods below.
@@ -690,7 +690,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        s.eval()
+        self.evaluate(s)
 
   def testSegmentWithNumSegmentsInvalid1(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -709,7 +709,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             segment_ids=segment_indices,
             num_segments=num_segments)
         with self.assertRaisesOpError("segment ids must be < num_segments"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentWithNumSegmentsInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
@@ -785,7 +785,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
-        s.eval()
+        self.evaluate(s)
 
   def testGradientIndicesInvalid1(self):
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
@@ -798,7 +798,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Index 10 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
   def testGradientIndicesInvalid2(self):
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
@@ -811,7 +811,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Index -1 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
   def testGradientSegmentsInvalid1(self):
     tf_x, _ = self._input(
@@ -825,7 +825,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError("Invalid number of segments"):
-          s.eval()
+          self.evaluate(s)
 
   def testGradientSegmentsInvalid2(self):
     tf_x, _ = self._input([1, 4], dtype=dtypes_lib.float32)
@@ -838,7 +838,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id 1 out of range \[0, 1\)"):
-          s.eval()
+          self.evaluate(s)
 
   def testGradientSegmentsInvalid3(self):
     tf_x, _ = self._input([2, 4], dtype=dtypes_lib.float32)
@@ -851,7 +851,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id -1 out of range \[0, 2\)"):
-          s.eval()
+          self.evaluate(s)
 
   def testGradientSegmentsInvalid4(self):
     tf_x, _ = self._input([0, 4], dtype=dtypes_lib.float32)
@@ -864,7 +864,8 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"):
-          s.eval()
+          self.evaluate(s)
+
 
 class SegmentReductionOpBenchmark(test.Benchmark):
   outer_dim_options = [2**x for x in range(9, 14, 2)]
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 1b4aff8c9cae8c387a517c448bb1c7aee1ed6094..8ca8e9dddf5163a77c2a61cfa4fe2c5a45bc438c 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -63,7 +63,7 @@ class SelfAdjointEigTest(test.TestCase):
           e1 = linalg_ops.self_adjoint_eigvals(matrix1)
           e2 = linalg_ops.self_adjoint_eigvals(matrix2)
           all_ops += [e1, e2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       self.assertAllEqual(val[0], val[2])
       # The algorithm is slightly different for compute_v being True and False,
       # so require approximate equality only here.
@@ -164,8 +164,8 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
         self.assertAllClose(a_ev.eval(), a, atol=atol)
 
         # Compare to numpy.linalg.eigh.
-        CompareEigenDecompositions(self, np_e, np_v,
-                                   tf_e.eval(), tf_v.eval(), atol)
+        CompareEigenDecompositions(self, np_e, np_v, self.evaluate(tf_e),
+                                   self.evaluate(tf_v), atol)
       else:
         tf_e = linalg_ops.self_adjoint_eigvals(constant_op.constant(a))
         self.assertAllClose(
diff --git a/tensorflow/python/kernel_tests/session_ops_test.py b/tensorflow/python/kernel_tests/session_ops_test.py
index 03e1ae852fc5b4ce4297b70b37964310f02306e5..73d85ddc078b4c9dd241ab3d3bb242503a519f0d 100644
--- a/tensorflow/python/kernel_tests/session_ops_test.py
+++ b/tensorflow/python/kernel_tests/session_ops_test.py
@@ -37,7 +37,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Feed a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -51,7 +51,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Get the tensor from its handle.
       self.assertEqual(50, h.eval())
@@ -94,7 +94,7 @@ class SessionOpsTest(test.TestCase):
       # Initialize a handle.
       a = constant_op.constant(0)
       h = session_ops.get_session_handle(a)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Do some computation.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -111,7 +111,7 @@ class SessionOpsTest(test.TestCase):
       # Initialize a handle.
       a = constant_op.constant(0)
       h = session_ops.get_session_handle(a)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Do some computation.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -133,7 +133,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Feed a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -144,7 +144,7 @@ class SessionOpsTest(test.TestCase):
       with ops.device(test.gpu_device_name()):
         a = constant_op.constant(10)
         h = session_ops.get_session_handle(a)
-        h = sess.run(h)
+        h = self.evaluate(h)
         self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
 
   def testHandleDelete(self):
@@ -163,7 +163,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Delete using a raw tensor handle.
       raw_h = h.get_raw_handle()
@@ -219,8 +219,8 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(2.0)
       b_handle_op = session_ops.get_session_handle(b)
 
-      a_handle = sess.run(a_handle_op)
-      b_handle = sess.run(b_handle_op)
+      a_handle = self.evaluate(a_handle_op)
+      b_handle = self.evaluate(b_handle_op)
 
       a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32)
       b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32)
@@ -288,10 +288,10 @@ class SessionOpsTest(test.TestCase):
       a = variables.Variable(12.0)
       inc_a = state_ops.assign_add(a, 2.0)
       b = math_ops.add(a, 5.0)
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
 
       h_a_read = sess.run(session_ops.get_session_handle(a.read_value()))
-      self.assertAllClose(12.0, sess.run(a))
+      self.assertAllClose(12.0, self.evaluate(a))
 
       self.assertAllClose(17.0, sess.run(b, feed_dict={a: h_a_read}))
       sess.run(inc_a)
diff --git a/tensorflow/python/kernel_tests/sets_test.py b/tensorflow/python/kernel_tests/sets_test.py
index 8335e9c139a581a22e06bd2fbfc5c027956d1714..e037f51e0fc9850a5b712afd8f3ab9196ec6e806 100644
--- a/tensorflow/python/kernel_tests/sets_test.py
+++ b/tensorflow/python/kernel_tests/sets_test.py
@@ -159,7 +159,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual(None, op.get_shape().dims)
       self.assertEqual(dtypes.int32, op.dtype)
     with self.cached_session() as sess:
-      results = sess.run(ops)
+      results = self.evaluate(ops)
     self.assertAllEqual(results[0], results[1])
     return results[0]
 
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index ee813e5ffd91d6a83e665dfb013c8a082ed2ad32..a0506fbfc5728e8d3192cb95a5e73915c1cd1921 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -53,8 +53,8 @@ class ShapeOpsTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.shape(x)
       tf_ans_64 = array_ops.shape(x, out_type=dtypes.int64)
-      result = tf_ans.eval()
-      result_64 = tf_ans_64.eval()
+      result = self.evaluate(tf_ans)
+      result_64 = self.evaluate(tf_ans_64)
     self.assertAllEqual(np_ans, result)
     self.assertAllEqual(np_ans, result_64)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -64,7 +64,7 @@ class ShapeOpsTest(test.TestCase):
     x_tf, unused_nnz = _sparsify(x_np)
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.shape(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -73,8 +73,8 @@ class ShapeOpsTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       tf_ans = array_ops.shape_n([x, x, x])
       tf_ans_64 = array_ops.shape_n([x, x, x], out_type=dtypes.int64)
-      result = sess.run(tf_ans)
-      result_64 = sess.run(tf_ans_64)
+      result = self.evaluate(tf_ans)
+      result_64 = self.evaluate(tf_ans_64)
     for i in range(3):
       self.assertAllEqual(np_ans, result[i])
       self.assertAllEqual(np_ans, result_64[i])
@@ -84,7 +84,7 @@ class ShapeOpsTest(test.TestCase):
     np_ans = np.asarray(np.ndim(x))
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.rank(x)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -93,7 +93,7 @@ class ShapeOpsTest(test.TestCase):
     x_tf, unused_nnz = _sparsify(x_np)
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.rank(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -101,9 +101,9 @@ class ShapeOpsTest(test.TestCase):
     np_ans = np.asarray(np.size(x))
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.size(x)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
       tf_ans_64 = array_ops.size(x, out_type=dtypes.int64)
-      result_64 = tf_ans_64.eval()
+      result_64 = self.evaluate(tf_ans_64)
     self.assertAllEqual(np_ans, result)
     self.assertAllEqual(np_ans, result_64)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -113,7 +113,7 @@ class ShapeOpsTest(test.TestCase):
     x_tf, unused_nnz = _sparsify(x_np)
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.size(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -162,7 +162,7 @@ class ShapeOpsTest(test.TestCase):
       inp = array_ops.zeros([2**31])
       num_elements = array_ops.size_internal(
           inp, optimize=False, out_type=dtypes.int64)
-      self.assertEqual(2**31, num_elements.eval())
+      self.assertEqual(2**31, self.evaluate(num_elements))
 
     # Too large for tf.int32 output.
     with self.assertRaises(errors_impl.InvalidArgumentError):
@@ -170,13 +170,13 @@ class ShapeOpsTest(test.TestCase):
         inp = array_ops.zeros([2**31])
         num_elements = array_ops.size_internal(
             inp, optimize=False, out_type=dtypes.int32)
-        self.assertEqual(2**31, num_elements.eval())
+        self.assertEqual(2**31, self.evaluate(num_elements))
 
   def _compareExpandDims(self, x, dim, use_gpu):
     np_ans = np.expand_dims(x, axis=dim)
     with self.cached_session(use_gpu=use_gpu):
       tensor = array_ops.expand_dims(x, dim)
-      tf_ans = tensor.eval()
+      tf_ans = self.evaluate(tensor)
     self.assertShapeEqual(np_ans, tensor)
     self.assertAllEqual(np_ans, tf_ans)
 
@@ -264,7 +264,7 @@ class ShapeOpsTest(test.TestCase):
       np_ans = np.expand_dims(x, axis=0)
       with self.cached_session(use_gpu=True):
         tensor = array_ops.expand_dims(x, constant_op.constant(0, dtype))
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
       self.assertShapeEqual(np_ans, tensor)
       self.assertAllEqual(np_ans, tf_ans)
 
@@ -273,11 +273,11 @@ class ShapeOpsTest(test.TestCase):
       if squeeze_dims:
         np_ans = np.squeeze(x, axis=tuple(squeeze_dims))
         tensor = array_ops.squeeze(x, squeeze_dims)
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
       else:
         np_ans = np.squeeze(x)
         tensor = array_ops.squeeze(x)
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
     self.assertShapeEqual(np_ans, tensor)
     self.assertAllEqual(np_ans, tf_ans)
 
@@ -340,7 +340,7 @@ class ShapeOpsTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         tensor = array_ops.squeeze(np.zeros([1, 1, 1]), [])
         self.assertEqual(np.shape(1), tensor.get_shape())
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
         self.assertEqual(np.shape(1), tf_ans.shape)
 
   def testSqueezeAllOnesBool(self):
@@ -350,7 +350,7 @@ class ShapeOpsTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         tensor = array_ops.squeeze([[[False]]], [])
         self.assertEqual(np.shape(1), tensor.get_shape())
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
         self.assertEqual(np.shape(1), tf_ans.shape)
 
   def testSqueezeOnlyOnes(self):
@@ -415,7 +415,7 @@ class TileTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         a = constant_op.constant(7, shape=[], dtype=dtypes.float32)
         tiled = array_ops.tile(a, [])
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, ())
       self.assertEqual([], tiled.get_shape())
       self.assertEqual(7, result)
@@ -427,7 +427,7 @@ class TileTest(test.TestCase):
         inp = np.random.rand(4, 1).astype(np.float32)
         a = constant_op.constant(inp)
         tiled = array_ops.tile(a, constant_op.constant([1, 4], dtype=dtype))
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, (4, 4))
       self.assertEqual([4, 4], tiled.get_shape())
       self.assertTrue((result == np.tile(inp, (1, 4))).all())
@@ -437,7 +437,7 @@ class TileTest(test.TestCase):
       inp = np.random.rand(4, 1).astype(np.float32)
       a = constant_op.constant(inp)
       tiled = array_ops.tile(a, [1, 1])
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertEqual(result.shape, (4, 1))
     self.assertEqual([4, 1], tiled.get_shape())
     self.assertTrue((result == np.tile(inp, (1, 1))).all())
@@ -447,7 +447,7 @@ class TileTest(test.TestCase):
       inp = np.random.rand(2, 3).astype(np.float32)
       a = constant_op.constant(inp)
       tiled = array_ops.tile(a, [5, 0])
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertEqual(result.shape, (10, 0))
     self.assertEqual([10, 0], tiled.get_shape())
 
@@ -497,7 +497,7 @@ class TileTest(test.TestCase):
             shape=[4, 1],
             dtype=dtype_tf)
         tiled = array_ops.tile(a, [1, 4])
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, (4, 4))
       self.assertEqual([4, 4], tiled.get_shape())
       self.assertAllEqual(result, np.tile(inp, (1, 4)))
@@ -527,7 +527,7 @@ class TileTest(test.TestCase):
           dtype=dtypes.float32)
       multiples = np.random.randint(1, 4, size=rank).astype(np.int32)
       tiled = array_ops.tile(a, multiples)
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertTrue((np.array(multiples) * np.array(inp.shape) == np.array(
         result.shape)).all())
     self.assertAllEqual(result, np.tile(inp, tuple(multiples)))
@@ -557,7 +557,7 @@ class TileTest(test.TestCase):
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
       self.assertShapeEqual(inp, grad)
-      result = grad.eval()
+      result = self.evaluate(grad)
     self.assertAllClose(np.sum(grad_inp, axis=1).reshape(4, 1), result, 1e-3)
 
   def testGradientStridedReduction(self):
@@ -572,7 +572,7 @@ class TileTest(test.TestCase):
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
       self.assertShapeEqual(inp, grad)
-      result = grad.eval()
+      result = self.evaluate(grad)
     expected_shape = [4, 2]
     expected = np.zeros(expected_shape)
     expected[:, 0] = grad_inp[:, 0] + grad_inp[:, 2]
@@ -590,7 +590,7 @@ class TileTest(test.TestCase):
       grad_tensor = constant_op.constant(
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     self.assertAllClose(np.sum(grad_inp, axis=1).reshape(4, 1), result, 1e-3)
 
   def testGradientStridedReductionOnGPU(self):
@@ -604,7 +604,7 @@ class TileTest(test.TestCase):
       grad_tensor = constant_op.constant(
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     expected_shape = [4, 2]
     expected = np.zeros(expected_shape)
     expected[:, 0] = grad_inp[:, 0] + grad_inp[:, 2]
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8f4e31abe3c90af01029be719ee83c7c7dc42f0c
--- /dev/null
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -0,0 +1,143 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+load("//tensorflow:tensorflow.bzl", "py_test")  # @unused
+
+py_library(
+    name = "test_util",
+    srcs = ["test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:tf_optimizer",
+        "//tensorflow/python:training",
+    ],
+)
+
+cuda_py_tests(
+    name = "dct_ops_test",
+    srcs = ["dct_ops_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+)
+
+cuda_py_tests(
+    name = "fft_ops_test",
+    size = "medium",
+    srcs = ["fft_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+    shard_count = 4,
+    tags = ["optonly"],
+)
+
+cuda_py_tests(
+    name = "mel_ops_test",
+    srcs = ["mel_ops_test.py"],
+    additional_deps = [
+        ":test_util",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/ops/signal",
+    ],
+)
+
+cuda_py_tests(
+    name = "mfcc_ops_test",
+    srcs = ["mfcc_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:spectral_ops_test_util",
+    ],
+)
+
+cuda_py_tests(
+    name = "reconstruction_ops_test",
+    srcs = ["reconstruction_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_tests(
+    name = "shape_ops_test",
+    srcs = ["shape_ops_test.py"],
+    additional_deps = [
+        ":test_util",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_tests(
+    name = "spectral_ops_test",
+    size = "large",
+    srcs = ["spectral_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+    tags = ["nomac"],
+)
+
+cuda_py_tests(
+    name = "window_ops_test",
+    srcs = ["window_ops_test.py"],
+    additional_deps = [
+        ":test_util",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:platform_test",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/dct_ops_test.py b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
similarity index 68%
rename from tensorflow/python/kernel_tests/dct_ops_test.py
rename to tensorflow/python/kernel_tests/signal/dct_ops_test.py
index c9d0167608ed0447d1a0fcdcc8e054fd8c1fe863..af4939332fdcfac30c05f84ae42838ec24d970b3 100644
--- a/tensorflow/python/kernel_tests/dct_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
@@ -20,10 +20,11 @@ from __future__ import print_function
 
 import importlib
 
+from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import dct_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -40,6 +41,20 @@ def try_import(name):  # pylint: disable=invalid-name
 fftpack = try_import("scipy.fftpack")
 
 
+def _np_dct1(signals, norm=None):
+  """Computes the DCT-I manually with NumPy."""
+  # X_k = (x_0 + (-1)**k * x_{N-1} +
+  #       2 * sum_{n=0}^{N-2} x_n * cos(\frac{pi}{N-1} * n * k)  k=0,...,N-1
+  del norm
+  dct_size = signals.shape[-1]
+  dct = np.zeros_like(signals)
+  for k in range(dct_size):
+    phi = np.cos(np.pi * np.arange(1, dct_size - 1) * k / (dct_size - 1))
+    dct[..., k] = 2 * np.sum(signals[..., 1:-1] * phi, axis=-1) + (
+        signals[..., 0] + (-1) ** k * signals[..., -1])
+  return dct
+
+
 def _np_dct2(signals, norm=None):
   """Computes the DCT-II manually with NumPy."""
   # X_k = sum_{n=0}^{N-1} x_n * cos(\frac{pi}{N} * (n + 0.5) * k)  k=0,...,N-1
@@ -81,19 +96,19 @@ def _np_dct3(signals, norm=None):
   return dct
 
 
-NP_DCT = {2: _np_dct2, 3: _np_dct3}
-NP_IDCT = {2: _np_dct3, 3: _np_dct2}
+NP_DCT = {1: _np_dct1, 2: _np_dct2, 3: _np_dct3}
+NP_IDCT = {1: _np_dct1, 2: _np_dct3, 3: _np_dct2}
 
 
-class DCTOpsTest(test.TestCase):
+class DCTOpsTest(parameterized.TestCase, test.TestCase):
 
   def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
     """Compares (I)DCT to SciPy (if available) and a NumPy implementation."""
     np_dct = NP_DCT[dct_type](signals, norm)
-    tf_dct = spectral_ops.dct(signals, type=dct_type, norm=norm).eval()
+    tf_dct = dct_ops.dct(signals, type=dct_type, norm=norm).eval()
     self.assertAllClose(np_dct, tf_dct, atol=atol, rtol=rtol)
     np_idct = NP_IDCT[dct_type](signals, norm)
-    tf_idct = spectral_ops.idct(signals, type=dct_type, norm=norm).eval()
+    tf_idct = dct_ops.idct(signals, type=dct_type, norm=norm).eval()
     self.assertAllClose(np_idct, tf_idct, atol=atol, rtol=rtol)
     if fftpack:
       scipy_dct = fftpack.dct(signals, type=dct_type, norm=norm)
@@ -101,38 +116,51 @@ class DCTOpsTest(test.TestCase):
       scipy_idct = fftpack.idct(signals, type=dct_type, norm=norm)
       self.assertAllClose(scipy_idct, tf_idct, atol=atol, rtol=rtol)
     # Verify inverse(forward(s)) == s, up to a normalization factor.
-    tf_idct_dct = spectral_ops.idct(
+    tf_idct_dct = dct_ops.idct(
         tf_dct, type=dct_type, norm=norm).eval()
-    tf_dct_idct = spectral_ops.dct(
+    tf_dct_idct = dct_ops.dct(
         tf_idct, type=dct_type, norm=norm).eval()
     if norm is None:
-      tf_idct_dct *= 0.5 / signals.shape[-1]
-      tf_dct_idct *= 0.5 / signals.shape[-1]
+      if dct_type == 1:
+        tf_idct_dct *= 0.5 / (signals.shape[-1] - 1)
+        tf_dct_idct *= 0.5 / (signals.shape[-1] - 1)
+      else:
+        tf_idct_dct *= 0.5 / signals.shape[-1]
+        tf_dct_idct *= 0.5 / signals.shape[-1]
     self.assertAllClose(signals, tf_idct_dct, atol=atol, rtol=rtol)
     self.assertAllClose(signals, tf_dct_idct, atol=atol, rtol=rtol)
 
-  def test_random(self):
+  @parameterized.parameters([
+      [[2]], [[3]], [[10]], [[2, 20]], [[2, 3, 25]]])
+  def test_random(self, shape):
     """Test randomly generated batches of data."""
     with spectral_ops_test_util.fft_kernel_label_map():
       with self.session(use_gpu=True):
-        for shape in ([1], [2], [3], [10], [2, 20], [2, 3, 25]):
-          signals = np.random.rand(*shape).astype(np.float32)
-          for norm in (None, "ortho"):
-            self._compare(signals, norm, 2)
-            self._compare(signals, norm, 3)
+        signals = np.random.rand(*shape).astype(np.float32)
+        # Normalization not implemented for orthonormal.
+        self._compare(signals, norm=None, dct_type=1)
+        for norm in (None, "ortho"):
+          self._compare(signals, norm, 2)
+          self._compare(signals, norm, 3)
 
   def test_error(self):
     signals = np.random.rand(10)
     # Unsupported type.
     with self.assertRaises(ValueError):
-      spectral_ops.dct(signals, type=1)
+      dct_ops.dct(signals, type=5)
+    # DCT-I normalization not implemented.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(signals, type=1, norm="ortho")
+    # DCT-I requires at least two inputs.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(np.random.rand(1), type=1)
     # Unknown normalization.
     with self.assertRaises(ValueError):
-      spectral_ops.dct(signals, norm="bad")
+      dct_ops.dct(signals, norm="bad")
     with self.assertRaises(NotImplementedError):
-      spectral_ops.dct(signals, n=10)
+      dct_ops.dct(signals, n=10)
     with self.assertRaises(NotImplementedError):
-      spectral_ops.dct(signals, axis=0)
+      dct_ops.dct(signals, axis=0)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
similarity index 98%
rename from tensorflow/python/kernel_tests/fft_ops_test.py
rename to tensorflow/python/kernel_tests/signal/fft_ops_test.py
index 8592550f99a8da997de5d8abd4dee0ca541259db..3eeecc12a887d778206e8efeac58a0f7c60d4203 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -29,8 +29,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.platform import test
 
 VALID_FFT_RANKS = (1, 2, 3)
@@ -139,21 +139,21 @@ class FFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.fft
+      return fft_ops.fft
     elif rank == 2:
-      return spectral_ops.fft2d
+      return fft_ops.fft2d
     elif rank == 3:
-      return spectral_ops.fft3d
+      return fft_ops.fft3d
     else:
       raise ValueError("invalid rank")
 
   def _tfIFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.ifft
+      return fft_ops.ifft
     elif rank == 2:
-      return spectral_ops.ifft2d
+      return fft_ops.ifft2d
     elif rank == 3:
-      return spectral_ops.ifft3d
+      return fft_ops.ifft3d
     else:
       raise ValueError("invalid rank")
 
@@ -312,21 +312,21 @@ class RFFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.rfft
+      return fft_ops.rfft
     elif rank == 2:
-      return spectral_ops.rfft2d
+      return fft_ops.rfft2d
     elif rank == 3:
-      return spectral_ops.rfft3d
+      return fft_ops.rfft3d
     else:
       raise ValueError("invalid rank")
 
   def _tfIFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.irfft
+      return fft_ops.irfft
     elif rank == 2:
-      return spectral_ops.irfft2d
+      return fft_ops.irfft2d
     elif rank == 3:
-      return spectral_ops.irfft3d
+      return fft_ops.irfft3d
     else:
       raise ValueError("invalid rank")
 
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/python/kernel_tests/signal/mel_ops_test.py
similarity index 97%
rename from tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
rename to tensorflow/python/kernel_tests/signal/mel_ops_test.py
index 13ee8764b7c48bccb45064f43a8a143c08b8d10e..2b3dde30f3975401be6b1217e645bf015ed7b9b0 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/mel_ops_test.py
@@ -20,11 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.kernel_tests import test_util
-from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.signal import mel_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -141,7 +141,7 @@ class LinearToMelTest(test.TestCase):
       for config in configs:
         mel_matrix_np = spectrogram_to_mel_matrix(*config)
         mel_matrix = mel_ops.linear_to_mel_weight_matrix(*config)
-        self.assertAllClose(mel_matrix_np, mel_matrix.eval(), atol=3e-6)
+        self.assertAllClose(mel_matrix_np, self.evaluate(mel_matrix), atol=3e-6)
 
   def test_dtypes(self):
     # LinSpace is not supported for tf.float16.
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py b/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
similarity index 97%
rename from tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
rename to tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
index 9de1e2c2f440912a4fc856f2cc236712b603f0d9..79d23d77d1e3112822bc5968d957b095ae378188 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops import mfcc_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import mfcc_ops
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
similarity index 96%
rename from tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py
rename to tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
index c476cd4e00d621bbf694b27236d82f18e8699c8c..de3351e543c19d879598aebd264fb4a9f63c1c37 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.ops import reconstruction_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import reconstruction_ops
 from tensorflow.python.platform import test
 
 
@@ -56,7 +56,7 @@ class ReconstructionOpsTest(test.TestCase):
     reconstruction = reconstruction_ops.overlap_and_add(signal, 2)
 
     with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+      output = self.evaluate(reconstruction)
 
       expected_output = np.array([1, 1, 2, 2, 3, 2, 2, 1, 1])
 
@@ -99,7 +99,7 @@ class ReconstructionOpsTest(test.TestCase):
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
     with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+      output = self.evaluate(reconstruction)
       string_output = [np.base_repr(x, self.bases[0]) for x in output]
 
       self.assertEqual(string_output, self.expected_string)
@@ -109,7 +109,7 @@ class ReconstructionOpsTest(test.TestCase):
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
     with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+      output = self.evaluate(reconstruction)
 
       accumulator = True
       for i in range(self.batch_size):
@@ -125,7 +125,7 @@ class ReconstructionOpsTest(test.TestCase):
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
     with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+      output = self.evaluate(reconstruction)
 
       string_output = [np.base_repr(int(x), self.bases[0]) for x in
                        np.squeeze(output)]
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/signal/shape_ops_test.py
similarity index 97%
rename from tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
rename to tensorflow/python/kernel_tests/signal/shape_ops_test.py
index 838025a0406b609e017e49a25be8bb10a6ab99f0..21a6b23b304efd2a0d61cb8b7e9a009cdea182fb 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/shape_ops_test.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.kernel_tests import test_util
-from tensorflow.contrib.signal.python.ops import shape_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import shape_ops
 from tensorflow.python.platform import test
 
 
@@ -150,7 +150,7 @@ class FrameTest(test.TestCase):
           op = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=pad_end, pad_value=99)
           with self.cached_session(use_gpu=True):
-            result = op.eval()
+            result = self.evaluate(op)
           self.assertEqual(op.shape.as_list(), list(result.shape))
 
   def test_basic_mono(self):
@@ -248,7 +248,7 @@ class FrameTest(test.TestCase):
       result = shape_ops.frame(signal, frame_length=2, frame_step=2,
                                pad_end=True, axis=1)
       expected = np.reshape(np.arange(16), (2, 2, 2, 2))
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
       result = shape_ops.frame(signal, frame_length=2, frame_step=1,
                                pad_end=True, axis=1)
@@ -260,7 +260,7 @@ class FrameTest(test.TestCase):
                    [[10, 11], [12, 13]],
                    [[12, 13], [14, 15]],
                    [[14, 15], [0, 0]]]]
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
       result = shape_ops.frame(signal, frame_length=3, frame_step=1,
                                pad_end=True, axis=1)
@@ -272,7 +272,7 @@ class FrameTest(test.TestCase):
                    [[10, 11], [12, 13], [14, 15]],
                    [[12, 13], [14, 15], [0, 0]],
                    [[14, 15], [0, 0], [0, 0]]]]
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
   def test_window_larger_than_signal(self):
     signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
similarity index 96%
rename from tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
rename to tensorflow/python/kernel_tests/signal/spectral_ops_test.py
index 5106a22f8881d6a654540a204cf359161a1d64c2..7583c4d8fc554812eca9de8b29971d1f00e79776 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
@@ -20,14 +20,14 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.ops import spectral_ops
-from tensorflow.contrib.signal.python.ops import window_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import spectral_ops
+from tensorflow.python.ops.signal import window_ops
 from tensorflow.python.platform import test
 
 
@@ -125,22 +125,22 @@ class SpectralOpsTest(test.TestCase):
       stft = spectral_ops.stft(signal, frame_length=7, frame_step=8,
                                pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
                                pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
                                fft_length=16, pad_end=True)
       self.assertAllEqual([64, 9], stft.shape.as_list())
-      self.assertAllEqual([64, 9], stft.eval().shape)
+      self.assertAllEqual([64, 9], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=16, frame_step=8,
                                fft_length=8, pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = np.zeros((32, 9)).astype(np.complex64)
 
@@ -148,7 +148,7 @@ class SpectralOpsTest(test.TestCase):
                                                fft_length=16, frame_step=8)
       expected_length = (stft.shape[0] - 1) * 8 + 8
       self.assertAllEqual([256], inverse_stft.shape.as_list())
-      self.assertAllEqual([expected_length], inverse_stft.eval().shape)
+      self.assertAllEqual([expected_length], self.evaluate(inverse_stft).shape)
 
   def test_stft_and_inverse_stft(self):
     """Test that spectral_ops.stft/inverse_stft match a NumPy implementation."""
diff --git a/tensorflow/contrib/signal/python/kernel_tests/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py
similarity index 77%
rename from tensorflow/contrib/signal/python/kernel_tests/test_util.py
rename to tensorflow/python/kernel_tests/signal/test_util.py
index b4422a49887378187a2be46275d4dabf1fbd40a1..0a8a621c3eeee1b943a55aced138a6abad233059 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/test_util.py
+++ b/tensorflow/python/kernel_tests/signal/test_util.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Test utilities for tf.contrib.signal."""
+"""Test utilities for tf.signal."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.training import saver
 
 
-def grappler_optimize(graph, fetches=None, rewriter_config=None):
+def grappler_optimize(graph, fetches=None, config_proto=None):
   """Tries to optimize the provided graph using grappler.
 
   Args:
@@ -31,17 +31,17 @@ def grappler_optimize(graph, fetches=None, rewriter_config=None):
     fetches: An optional list of `Tensor`s to fetch (i.e. not optimize away).
       Grappler uses the 'train_op' collection to look for fetches, so if not
       provided this collection should be non-empty.
-    rewriter_config: An optional `tf.RewriterConfig` to use when rewriting the
+    config_proto: An optional `tf.ConfigProto` to use when rewriting the
       graph.
 
   Returns:
     A `tf.GraphDef` containing the rewritten graph.
   """
-  if rewriter_config is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
-    rewriter_config.min_graph_nodes = -1
+  if config_proto is None:
+    config_proto = config_pb2.ConfigProto()
+    config_proto.graph_options.rewrite_options.min_graph_nodes = -1
   if fetches is not None:
     for fetch in fetches:
       graph.add_to_collection('train_op', fetch)
   metagraph = saver.export_meta_graph(graph_def=graph.as_graph_def())
-  return tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+  return tf_optimizer.OptimizeGraph(config_proto, metagraph)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py b/tensorflow/python/kernel_tests/signal/window_ops_test.py
similarity index 96%
rename from tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py
rename to tensorflow/python/kernel_tests/signal/window_ops_test.py
index 6a46a226938a5dce39b3b93771f10be1e9e709ae..2f19134f5a8c1b43852c14a857a709c61fef30a9 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/window_ops_test.py
@@ -22,10 +22,10 @@ import functools
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.kernel_tests import test_util
-from tensorflow.contrib.signal.python.ops import window_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.kernel_tests.signal import test_util
+from tensorflow.python.ops.signal import window_ops
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 41f040ab739451cf5a42287702ee4cffe1d8d3fa..5bb34a632d28fcd00ce9d3a1fef5f359e08fbef8 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -38,7 +38,7 @@ class SliceTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.float32)
         slice_t = a[2, k:k]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[2, k:k])
 
   def testInt32(self):
@@ -47,7 +47,7 @@ class SliceTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.int32)
         slice_t = a[2, k:k]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[2, k:k])
 
   def testSlicingWithInt64Index(self):
@@ -57,33 +57,33 @@ class SliceTest(test.TestCase):
       # Slice using int64 Tensor.
       i = constant_op.constant(1, dtype=dtypes.int64)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i+1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       # Slice using int64 integer.
       i = np.asarray(1).astype(np.int64)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i+1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       a_int32 = constant_op.constant([0, 1, 2], dtype=dtypes.int32)
       slice_t = array_ops.slice(a_int32,
                                 np.asarray([1]).astype(np.int64),
                                 np.asarray([2]).astype(np.int64))
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1, 2], slice_val)
 
       a_float32 = constant_op.constant([0, 1, 2], dtype=dtypes.float32)
       slice_t = array_ops.slice(a_float32,
                                 np.asarray([1]).astype(np.int64),
                                 np.asarray([2]).astype(np.int64))
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1, 2], slice_val)
 
   def testSlicingInt64Tensor(self):
@@ -93,23 +93,23 @@ class SliceTest(test.TestCase):
       # Slice using int32 Tensor.
       i = constant_op.constant(1, dtype=dtypes.int32)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i + 1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       # Slice using int32 integer.
       i = np.asarray(1).astype(np.int32)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i + 1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       slice_t = array_ops.slice(a, [1], [2])
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1, 2], slice_val)
 
   def testSelectAll(self):
@@ -121,8 +121,8 @@ class SliceTest(test.TestCase):
         slice_explicit_t = array_ops.slice(a, [0, 0, 0, 0], [-1, -1, -1, -1])
         slice_implicit_t = a[:, :, :, :]
 
-        self.assertAllEqual(inp, slice_explicit_t.eval())
-        self.assertAllEqual(inp, slice_implicit_t.eval())
+        self.assertAllEqual(inp, self.evaluate(slice_explicit_t))
+        self.assertAllEqual(inp, self.evaluate(slice_implicit_t))
         self.assertEqual(inp.shape, slice_explicit_t.get_shape())
         self.assertEqual(inp.shape, slice_implicit_t.get_shape())
 
@@ -134,7 +134,7 @@ class SliceTest(test.TestCase):
 
         hi = np.random.randint(0, 9)
         scalar_t = a[hi]
-        scalar_val = scalar_t.eval()
+        scalar_val = self.evaluate(scalar_t)
         self.assertAllEqual(scalar_val, inp[hi])
 
         if hi > 0:
@@ -142,7 +142,7 @@ class SliceTest(test.TestCase):
         else:
           lo = 0
         slice_t = a[lo:hi]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
         self.assertAllEqual(slice_val, inp[lo:hi])
 
   def testScalarInput(self):
@@ -195,7 +195,7 @@ class SliceTest(test.TestCase):
 
         x, y = np.random.randint(0, 3, size=2).tolist()
         slice_t = a[x, 0:y]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[x, 0:y])
 
   def testSimple(self):
@@ -282,7 +282,7 @@ class SliceTest(test.TestCase):
       grads = np.random.rand(num_grads).astype("f").reshape(slice_size)
       grad_tensor = constant_op.constant(grads)
       grad = gradients_impl.gradients(slice_t, [a], grad_tensor)[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
 
     # Create a zero tensor of the input shape ane place
     # the grads into the right location to compare against TensorFlow.
@@ -368,7 +368,7 @@ class SliceTest(test.TestCase):
       c = b[:-1, :]
       d = c[1, :]
       res = 2 * d - c[1, :] + a[2, :] - 2 * b[-2, :]
-      self.assertAllEqual([0, 0, 0], res.eval())
+      self.assertAllEqual([0, 0, 0], self.evaluate(res))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 3218d00c668c1223e604e0c5effc8e7e52477474..8b1a2e4c4e3895dd1cdcfc0a24825e1073daa19d 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -64,7 +64,7 @@ class SoftmaxTest(test.TestCase):
         tf_softmax = nn_ops.log_softmax(np_features, axis=dim, name=name)
       else:
         tf_softmax = nn_ops.softmax(np_features, axis=dim, name=name)
-      out = tf_softmax.eval()
+      out = self.evaluate(tf_softmax)
     self.assertAllCloseAccordingToType(np_softmax, out)
     self.assertShapeEqual(np_softmax, tf_softmax)
     if not log:
@@ -113,7 +113,7 @@ class SoftmaxTest(test.TestCase):
     features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
     with self.cached_session(use_gpu=use_gpu):
       tf_log_softmax = nn_ops.log_softmax(features)
-      out = tf_log_softmax.eval()
+      out = self.evaluate(tf_log_softmax)
     self.assertAllClose(
         np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                   [0, -max, -max, -max]]),
@@ -222,6 +222,13 @@ class SoftmaxTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval()
 
+  def testInvalidAxis(self):
+    # Test case for GitHub issue 22793.
+    with self.cached_session():
+      ones = array_ops.ones(shape=[2, 3])
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        nn_ops.softmax(ones, axis=2).eval()
+
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
     # https://github.com/tensorflow/tensorflow/issues/4425 for details
diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py
index 50a8291ea88f0046d6c94eebf89e7bd79bd97659..48445a73808a2ba17182d963f7de9823f000bb26 100644
--- a/tensorflow/python/kernel_tests/softplus_op_test.py
+++ b/tensorflow/python/kernel_tests/softplus_op_test.py
@@ -39,7 +39,7 @@ class SoftplusTest(test.TestCase):
     np_softplus = self._npSoftplus(np_features)
     with self.cached_session(use_gpu=use_gpu):
       softplus = nn_ops.softplus(np_features)
-      tf_softplus = softplus.eval()
+      tf_softplus = self.evaluate(softplus)
     self.assertAllCloseAccordingToType(np_softplus, tf_softplus)
     self.assertTrue(np.all(tf_softplus > 0))
     self.assertShapeEqual(np_softplus, softplus)
diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py
index ee2e2e03032aca6afc9aaa73e56598920932bb55..71aac7e48e1bab9d1f6242d1699ed3c65c9ca952 100644
--- a/tensorflow/python/kernel_tests/softsign_op_test.py
+++ b/tensorflow/python/kernel_tests/softsign_op_test.py
@@ -36,7 +36,7 @@ class SoftsignTest(test.TestCase):
     np_softsign = self._npSoftsign(np_features)
     with self.cached_session(use_gpu=use_gpu):
       softsign = nn_ops.softsign(np_features)
-      tf_softsign = softsign.eval()
+      tf_softsign = self.evaluate(softsign)
     self.assertAllClose(np_softsign, tf_softsign)
     self.assertShapeEqual(np_softsign, softsign)
 
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index b05f14f7381bca60fdd0fae51b20f5968a44973c..8ac98a198c661d55e31b64bbe8acdd6465411721 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -36,21 +36,22 @@ class SpaceToDepthTest(test.TestCase):
 
   def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     input_nhwc = math_ops.cast(inputs, dtype)
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
-      self.assertAllEqual(x_tf.eval(), outputs)
-    if test.is_gpu_available():
-      with self.session(force_gpu=True):
+      self.assertAllEqual(self.evaluate(x_tf), outputs)
+
+    if test_util.is_gpu_available():
+      with test_util.force_gpu():
         # test NHWC (default) on GPU
         x_tf = array_ops.space_to_depth(input_nhwc, block_size)
-        self.assertAllEqual(x_tf.eval(), outputs)
+        self.assertAllEqual(self.evaluate(x_tf), outputs)
         # test NCHW on GPU
         input_nchw = test_util.NHWCToNCHW(input_nhwc)
         output_nchw = array_ops.space_to_depth(
             input_nchw, block_size, data_format="NCHW")
         output_nhwc = test_util.NCHWToNHWC(output_nchw)
-        self.assertAllEqual(output_nhwc.eval(), outputs)
+        self.assertAllEqual(self.evaluate(output_nhwc), outputs)
 
   def testBasic(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -134,17 +135,18 @@ class SpaceToDepthTest(test.TestCase):
     input_nhwc = array_ops.ones([batch_size, 4, 6, 3])
     x_out = array_ops.ones([batch_size, 2, 3, 12])
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
       self.assertAllEqual(x_tf.shape, x_out.shape)
-      x_tf.eval()
+      self.evaluate(x_tf)
+
     if test.is_gpu_available():
-      with self.session(use_gpu=True):
+      with test_util.use_gpu():
         # test NHWC (default) on GPU
         x_tf = array_ops.space_to_depth(input_nhwc, block_size)
         self.assertAllEqual(x_tf.shape, x_out.shape)
-        x_tf.eval()
+        self.evaluate(x_tf)
 
   # Tests for different width and height.
   def testNonSquare(self):
@@ -163,7 +165,7 @@ class SpaceToDepthTest(test.TestCase):
     block_size = 2
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
@@ -178,7 +180,7 @@ class SpaceToDepthTest(test.TestCase):
     block_size = 0
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
@@ -186,7 +188,7 @@ class SpaceToDepthTest(test.TestCase):
     block_size = 1
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
@@ -194,7 +196,7 @@ class SpaceToDepthTest(test.TestCase):
     block_size = 10
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   def testBlockSizeNotDivisibleWidth(self):
     # The block size divides width but not height.
diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py
index a746830afb377c5708640abd6a7590381d213b3f..845950bca7630453dc35be43a86a926d0873b30c 100644
--- a/tensorflow/python/kernel_tests/sparse_add_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py
@@ -91,7 +91,7 @@ class SparseAddTest(test.TestCase):
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
           self.assertAllEqual((3, 3), sp_sum.get_shape())
 
-          sum_out = sess.run(sp_sum)
+          sum_out = self.evaluate(sp_sum)
 
           self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
           self.assertAllEqual(sum_out.indices, [[0, 1], [1, 0], [2, 0], [2, 1]])
@@ -104,7 +104,7 @@ class SparseAddTest(test.TestCase):
       sp_b = self._SparseTensor_3x3(negate=True)
 
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, 0.1)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, np.empty([0, 2]))
@@ -123,7 +123,7 @@ class SparseAddTest(test.TestCase):
 
       # two values should vanish: |.1| < .21, and |-.2| < .21
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, thresh=0.21)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0]])
@@ -132,7 +132,7 @@ class SparseAddTest(test.TestCase):
 
       # only .1 vanishes
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, thresh=0.11)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0], [2, 1]])
diff --git a/tensorflow/python/kernel_tests/sparse_concat_op_test.py b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
index 402c5eb4ea3c52752ed8ff8494014aab6cf15d33..a3d136c8d51204a6bc688a035dddf58fe956ae69 100644
--- a/tensorflow/python/kernel_tests/sparse_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
@@ -147,7 +147,7 @@ class SparseConcatTest(test.TestCase):
           self.assertEqual(sp_concat.values.get_shape(), [4])
           self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-          concat_out = sess.run(sp_concat)
+          concat_out = self.evaluate(sp_concat)
 
           self.assertAllEqual(concat_out.indices,
                               [[0, 2], [1, 0], [2, 0], [2, 2]])
@@ -169,7 +169,7 @@ class SparseConcatTest(test.TestCase):
             self.assertEqual(sp_concat.values.get_shape(), [8])
             self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-            concat_out = sess.run(sp_concat)
+            concat_out = self.evaluate(sp_concat)
 
             self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4],
                                                      [2, 0], [2, 2], [2, 3],
@@ -195,7 +195,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [7])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(
             concat_out.indices,
@@ -220,7 +220,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [10])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4], [1, 8],
                                                  [2, 0], [2, 2], [2, 3], [2, 6],
@@ -244,7 +244,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [8])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(
             concat_out.indices,
@@ -302,8 +302,8 @@ class SparseConcatTest(test.TestCase):
           sp_concat_dim1 = sparse_ops.sparse_concat(
               concat_dim1, [sp_a, sp_b, sp_c, sp_d], expand_nonconcat_dim=True)
 
-          sp_concat_dim0_out = sess.run(sp_concat_dim0)
-          sp_concat_dim1_out = sess.run(sp_concat_dim1)
+          sp_concat_dim0_out = self.evaluate(sp_concat_dim0)
+          sp_concat_dim1_out = self.evaluate(sp_concat_dim1)
 
           self.assertAllEqual(sp_concat_dim0_out.indices,
                               [[0, 2], [1, 0], [2, 0], [2, 2], [4, 1], [5, 0],
diff --git a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
index a824d5c826305a04bdc8c96d67837a39ae2dd5de..267275e771edc8d7c320e11e5d7825216ff56999 100644
--- a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
@@ -189,7 +189,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual([0, 1, 2], val.indices)
       self.assertAllEqual([[0.5, 0.5], [0, 2], [3, 0]], val.values)
       self.assertAllEqual([-1, 2], val.dense_shape)
@@ -209,7 +209,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual([0, 1, 2], val.indices)
       self.assertAllEqual([[1, 1], [0, 2], [3, 0]], val.values)
       self.assertAllEqual([-1, 2], val.dense_shape)
@@ -235,7 +235,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual(val.indices, [0, 1, 2])
       self.assertAllEqual(val.values, [[0.5, 0.5], [0, 2], [3, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
@@ -252,7 +252,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual(val.indices, [0, 1, 2])
       self.assertAllEqual(val.values, [[5, 5], [0, 20], [30, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
@@ -269,7 +269,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_indexed_slices_grad(1)
 
       def apply_indexed_slices_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(
@@ -281,7 +281,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
 
       expected_val = sum(elems) / len(elems)
       self._assertEqual_nparray(
@@ -303,7 +303,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_indexed_slices_grad(1)
 
       def apply_indexed_slices_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(target=apply_indexed_slices_grad, args=(o,))
@@ -315,7 +315,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
 
       expected_val = 550.0
       self._assertEqual_nparray(
@@ -338,13 +338,13 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       def apply_indexed_slices_grad():
         for accum_op in accum_ops:
           time.sleep(1.0)
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       apply_indexed_slices_grad_thread = self.checkedThread(
           target=apply_indexed_slices_grad)
 
       def take_grad():
-        t = sess.run(takeg_t)
+        t = self.evaluate(takeg_t)
         results.append(t)
 
       threads = [self.checkedThread(target=take_grad) for _ in range(10)]
@@ -378,7 +378,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
       def apply_indexed_slices_grad():
         for accum_op in accum_ops:
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       def take_grad():
         results.append(sess.run(takeg_t))
@@ -394,7 +394,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
   def _blocking_takeg(self, sess, takeg_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(takeg_op)
+      self.evaluate(takeg_op)
 
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
index 6e0714da702a09735ca10f7bb8658ecb25cbe8fb..8451b96c564e39056e98bc4805c9dd3b38cebdea 100644
--- a/tensorflow/python/kernel_tests/sparse_cross_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
@@ -43,7 +43,7 @@ class SparseCrossOpTest(test.TestCase):
         'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_dense(self):
     """Tests only dense inputs."""
@@ -63,7 +63,7 @@ class SparseCrossOpTest(test.TestCase):
         'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_integer_mixed_string_sparse(self):
     """Tests mixed type."""
@@ -77,7 +77,7 @@ class SparseCrossOpTest(test.TestCase):
         '55555_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_integer_mixed_string_dense(self):
     """Tests mixed dense inputs."""
@@ -95,7 +95,7 @@ class SparseCrossOpTest(test.TestCase):
         '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_sparse_cross_dense(self):
     """Tests sparse and dense inputs."""
@@ -112,7 +112,7 @@ class SparseCrossOpTest(test.TestCase):
             'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
         ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_integer_sparse_input(self):
     """Tests mixed type sparse and dense inputs."""
@@ -128,7 +128,7 @@ class SparseCrossOpTest(test.TestCase):
             '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
         ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_permutation_3x3x3(self):
     """Tests 3x3x3 permutation."""
@@ -170,7 +170,7 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F3'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_permutation_3x1x2(self):
     """Tests 3x1x2 permutation."""
@@ -189,7 +189,7 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_large_batch(self):
     """Tests with large batch size to force multithreading."""
@@ -222,7 +222,7 @@ class SparseCrossOpTest(test.TestCase):
 
     expected_out = self._sparse_tensor(col_out)
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_one_column_empty(self):
     """Tests when one column is empty.
@@ -235,7 +235,7 @@ class SparseCrossOpTest(test.TestCase):
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
     ])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_empty(sess.run(op))
+      self._assert_sparse_tensor_empty(self.evaluate(op))
 
   def test_some_columns_empty(self):
     """Tests when more than one columns are empty.
@@ -254,7 +254,7 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]], 2)
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_all_columns_empty(self):
     """Tests when all columns are empty.
@@ -267,7 +267,7 @@ class SparseCrossOpTest(test.TestCase):
         self._sparse_tensor([])
     ])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_empty(sess.run(op))
+      self._assert_sparse_tensor_empty(self.evaluate(op))
 
   def test_hashed_zero_bucket_no_hash_key(self):
     op = sparse_ops.sparse_cross_hashed([
@@ -278,7 +278,7 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[1971693436396284976]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_hashed_zero_bucket(self):
     op = sparse_ops.sparse_cross_hashed(
@@ -291,7 +291,7 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[4847552627144134031]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   # TODO(sibyl-Aix6ihai): Add benchmark to compare Hashed vs Non-hashed.
   def test_hashed_no_hash_key(self):
@@ -305,7 +305,7 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[83]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_hashed_output(self):
     op = sparse_ops.sparse_cross_hashed(
@@ -319,7 +319,7 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[31]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   def test_hashed__has_no_collision(self):
     """Tests that fingerprint concatenation has no collisions."""
@@ -331,7 +331,7 @@ class SparseCrossOpTest(test.TestCase):
         [t2, t1], num_buckets=1024, hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
     cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
     with session.Session():
-      values = cross_dense.eval()
+      values = self.evaluate(cross_dense)
       self.assertTrue(numpy.not_equal(values[0], values[1]).all())
 
   def test_hashed_3x1x2(self):
@@ -345,7 +345,7 @@ class SparseCrossOpTest(test.TestCase):
         ],
         num_buckets=1000)
     with self.cached_session() as sess:
-      out = sess.run(op)
+      out = self.evaluate(op)
       self.assertEqual(6, len(out.values))
       self.assertAllEqual([[0, i] for i in range(6)], out.indices)
       self.assertTrue(all(x < 1000 and x >= 0 for x in out.values))
diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
index 541463e76bbc7b5569bb4deabd86872dd75c9533..4de69a26e3272c76220f53151419abc442098e5d 100644
--- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
@@ -58,7 +58,7 @@ class SparseMatMulTest(test.TestCase):
           transpose_b=tr_b,
           a_is_sparse=sp_a,
           b_is_sparse=sp_b)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       np_x = math_ops.cast(tf_x, dtypes.float32).eval()
       np_y = math_ops.cast(tf_y, dtypes.float32).eval()
 
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index a45ce2e13b471994b7fceece3536ed43ce9add86..ad253595d257632b32d0f4aff055041d0dfa3bf9 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -154,7 +154,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
                        sparse_tensor.SparseTensor.from_value(values_v)):
           sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-          output = sess.run(sp_output)
+          output = self.evaluate(sp_output)
           self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat32(self):
@@ -163,7 +163,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
@@ -172,7 +172,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt32AndFloat32NonCanonicalOrder(self):
@@ -182,7 +182,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testInt64AndFloat32NonCanonicalOrder(self):
@@ -192,7 +192,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testInt64AndFloat64NonCanonicalOrder(self):
@@ -203,7 +203,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size_tensor, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testShouldSetLastDimensionInDynamicShape(self):
@@ -261,7 +261,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
@@ -270,7 +270,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64Shape(self):
@@ -279,7 +279,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
 
@@ -302,7 +302,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
         to_retain = np.array([1, 0, 0, 1, 1, 0], dtype=np.bool)
         sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
 
-        output = sess.run(sp_output)
+        output = self.evaluate(sp_output)
 
         self.assertAllEqual(output.indices, [[0, 0], [1, 4], [3, 2]])
         self.assertAllEqual(output.values, [0, 14, 32])
@@ -314,7 +314,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
       to_retain = np.zeros((6,), dtype=np.bool)
       sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, np.array([]).reshape((0, 2)))
       self.assertAllEqual(output.values, [])
@@ -365,7 +365,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
@@ -378,7 +378,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
@@ -404,7 +404,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       sp_input = self._SparseTensor_2x5x6()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
@@ -416,7 +416,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       sp_input = self._SparseTensor_2x5x6_Empty()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices.shape, [0, 3])
       self.assertAllEqual(output.values.shape, [0])
@@ -591,8 +591,8 @@ class SparseAddTest(test_util.TensorFlowTestCase):
     sp_output = sparse_ops.sparse_add(sp_input, sp_input)
 
     with self.session(use_gpu=False) as sess:
-      sess.run(variables.global_variables_initializer())
-      output = sess.run(sp_output)
+      self.evaluate(variables.global_variables_initializer())
+      output = self.evaluate(sp_output)
       self.assertAllEqual(output.values, [2])
 
 
@@ -635,7 +635,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
       else:
         tf_dense_ans = sparse_ops.sparse_reduce_max(sp_t, reduction_axes,
                                                     keep_dims)
-      out_dense = tf_dense_ans.eval()
+      out_dense = self.evaluate(tf_dense_ans)
 
       if do_sum:
         tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t,
@@ -710,16 +710,16 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
           axes = np.random.choice(len(dims), size=d, replace=False).tolist()
           reduced = sparse_ops.sparse_reduce_sum(sp_t, axes)
 
-          err = gradient_checker.compute_gradient_error(sp_t.values, (nnz,),
-                                                        reduced,
-                                                        reduced.eval().shape)
+          err = gradient_checker.compute_gradient_error(
+              sp_t.values, (nnz,), reduced,
+              self.evaluate(reduced).shape)
           self.assertLess(err, 1e-3)
 
         # Tests for negative axes.
         reduced = sparse_ops.sparse_reduce_sum(sp_t, -1)
-        err = gradient_checker.compute_gradient_error(sp_t.values, (nnz,),
-                                                      reduced,
-                                                      reduced.eval().shape)
+        err = gradient_checker.compute_gradient_error(
+            sp_t.values, (nnz,), reduced,
+            self.evaluate(reduced).shape)
         self.assertLess(err, 1e-3)
 
 
diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
index 7b83ae51779143377ef3ca6b9c909731f7829ca9..bbf2f392026a4a91a8972f8d4bdb8b68adaefa75 100644
--- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
@@ -60,7 +60,7 @@ class SparseReorderTest(test.TestCase):
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       sp_output = sparse_ops.sparse_reorder(input_val)
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices, input_val.indices)
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
@@ -83,7 +83,7 @@ class SparseReorderTest(test.TestCase):
         input_val = self._SparseTensorValue_5x6(np.random.permutation(6))
         sp_output = sparse_ops.sparse_reorder(input_val)
 
-        output_val = sess.run(sp_output)
+        output_val = self.evaluate(sp_output)
         self.assertAllEqual(output_val.indices, expected_output_val.indices)
         self.assertAllEqual(output_val.values, expected_output_val.values)
         self.assertAllEqual(output_val.dense_shape,
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index f7be397c333e3012e994a942b3428b92b0f7c54d..918af27091bee68ad54f95fc8d4318f2ef5aacb3 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -81,7 +81,7 @@ class SparseReshapeTest(test.TestCase):
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(input_val, [5, 6])
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices, input_val.indices)
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
@@ -151,7 +151,7 @@ class SparseReshapeTest(test.TestCase):
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(input_val, [2, 3, 5])
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices,
                           np.array([[0, 0, 0], [0, 1, 1], [0, 1, 4], [0, 2, 0],
                                     [1, 1, 0], [1, 1, 1]]))
diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
index b24a08696991dda7051f1348c4afc1675362b6d8..39a9ab9b491c2048b53b28c5ce53ee19415807b1 100644
--- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
@@ -73,7 +73,7 @@ class SerializeSparseTest(test.TestCase):
       serialized = serialize_fn(sp_input, out_type=out_type)
       sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
-      indices, values, shape = sess.run(sp_deserialized)
+      indices, values, shape = self.evaluate(sp_deserialized)
 
       self.assertAllEqual(indices, sp_input[0])
       self.assertAllEqual(values, sp_input[1])
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index fe334045afe5ff4d034528c815e3485e1e98f8f9..e605cb1c3588f628966b627814df1adc019f653f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -80,7 +80,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       self.assertEqual(tf_value_ans.get_shape()[1], np_ans.shape[1])
       self.assertEqual(tf_tensor_ans.get_shape()[1], np_ans.shape[1])
 
-      for out in (tf_value_ans.eval(), tf_tensor_ans.eval()):
+      for out in (tf_value_ans.eval(), self.evaluate(tf_tensor_ans)):
         if x.dtype == np.float32:
           self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
         elif x.dtype == np.float64:
diff --git a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
index e08464a701c98f24d4ca977b824cdf1e7c329763..e63ba8f697053cc05968f32c867d143f6acc88c2 100644
--- a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
@@ -88,7 +88,7 @@ class SparseTensorsMapTest(test.TestCase):
       sp_out = take_many_sparse_from_tensors_map(
           sparse_map_op=handle0.op, sparse_handles=handles_concat)
 
-      combined_indices, combined_values, combined_shape = sess.run(sp_out)
+      combined_indices, combined_values, combined_shape = self.evaluate(sp_out)
 
       self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
       self.assertAllEqual(combined_indices[:6, 1:], sp_input0[0])
@@ -114,7 +114,8 @@ class SparseTensorsMapTest(test.TestCase):
       sp_roundtrip = take_many_sparse_from_tensors_map(
           sparse_map_op=handle.op, sparse_handles=sparse_handles)
 
-      combined_indices, combined_values, combined_shape = sess.run(sp_roundtrip)
+      combined_indices, combined_values, combined_shape = self.evaluate(
+          sp_roundtrip)
 
       self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
       self.assertAllEqual(combined_indices[:6, 1:], input0_val[0])
@@ -171,7 +172,7 @@ class SparseTensorsMapTest(test.TestCase):
     with self.session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       handle = add_sparse_to_tensors_map(input_val)
-      handle_value = sess.run(handle)
+      handle_value = self.evaluate(handle)
       bad_handle = handle_value + 10
       sp_roundtrip = take_many_sparse_from_tensors_map(
           sparse_map_op=handle.op, sparse_handles=[handle_value, bad_handle])
@@ -212,8 +213,8 @@ class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
 
         variables.global_variables_initializer().run()
 
-        st_roundtrip_values = sess.run(st_roundtrip)
-        st_deserialized_values = sess.run(st_deserialized)
+        st_roundtrip_values = self.evaluate(st_roundtrip)
+        st_deserialized_values = self.evaluate(st_deserialized)
         np.testing.assert_equal(st_roundtrip_values.values,
                                 st_deserialized_values.values)
         np.testing.assert_equal(st_roundtrip_values.indices,
diff --git a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
index 7f63532e10de88b65f6c7fecb2eaf4f42d6519e4..fa6cb134327f3a259492609b8b651e52bdac20fe 100644
--- a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
+++ b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
@@ -104,20 +104,20 @@ class SparseToDenseTest(test.TestCase):
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[2,1\], "
           r"should be \[\] or \[2\]"):
-        dense.eval()
+        self.evaluate(dense)
 
   def testBadNumValues(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [1, 2, 3], -1)
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[3\], should be \[\] or \[2\]"):
-        dense.eval()
+        self.evaluate(dense)
 
   def testBadDefault(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [1, 2], [0])
       with self.assertRaisesOpError("default_value should be a scalar"):
-        dense.eval()
+        self.evaluate(dense)
 
   def testOutOfBoundsIndicesWithWithoutValidation(self):
     with self.cached_session():
@@ -128,7 +128,7 @@ class SparseToDenseTest(test.TestCase):
           default_value=0.0)
       with self.assertRaisesOpError(
           r"indices\[1\] = \[10\] is out of bounds: need 0 <= index < \[5\]"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks, the allocation should still fail.
       with self.assertRaisesOpError("out of bounds"):
         dense_without_validation = _SparseToDense(
@@ -137,7 +137,7 @@ class SparseToDenseTest(test.TestCase):
             sparse_values=[-1.0, 1.0],
             default_value=0.0,
             validate_indices=False)
-        dense_without_validation.eval()
+        self.evaluate(dense_without_validation)
 
   def testRepeatingIndicesWithWithoutValidation(self):
     with self.cached_session():
@@ -147,7 +147,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0)
       with self.assertRaisesOpError(r"indices\[1\] = \[1\] is repeated"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks
       dense_without_validation = _SparseToDense(
           sparse_indices=[[1], [1]],
@@ -155,7 +155,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0,
           validate_indices=False)
-      dense_without_validation.eval()
+      self.evaluate(dense_without_validation)
 
   def testUnsortedIndicesWithWithoutValidation(self):
     with self.cached_session():
@@ -165,7 +165,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0)
       with self.assertRaisesOpError(r"indices\[1\] = \[1\] is out of order"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks
       dense_without_validation = _SparseToDense(
           sparse_indices=[[2], [1]],
@@ -173,7 +173,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0,
           validate_indices=False)
-      dense_without_validation.eval()
+      self.evaluate(dense_without_validation)
 
   def testShapeInferenceKnownShape(self):
     with self.session(use_gpu=False):
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
index 0510bc5321445e3db0dcfef169100fbc4dd013da..3f91131dab7a463cf2631cba825c618276c74b3b 100644
--- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
@@ -164,7 +164,7 @@ class SparseXentTest(test.TestCase):
     with self.session(use_gpu=True):
       loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
           labels=constant_op.constant(0), logits=constant_op.constant([1.0]))
-      self.assertAllClose(0.0, loss.eval())
+      self.assertAllClose(0.0, self.evaluate(loss))
 
   def testFloat(self):
     for label_dtype in np.int32, np.int64:
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 944b0e59b12a1382d64941ebda8018c9f30acdfe..af90e03966ee5e71f8a655a6befaa5f60583ae77 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -318,7 +318,7 @@ class SplitOpTest(test.TestCase):
       inp_grads = [self._makeData((4, 1), dtype)for _ in range(4)]
       grad_tensors = [constant_op.constant(x) for x in inp_grads]
       grad = gradients_impl.gradients(s, [inp_tensor], grad_tensors)[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     for i in range(4):
       self.assertAllEqual(result[:, i:i + 1], inp_grads[i])
 
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index 4b355620bf93bc100b6ce399a183b485d3ccd32f..0f1fa97c3844d4b9b17bf98f6e505503a823159e 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -204,11 +204,11 @@ class StackOpTest(test.TestCase):
         with self.cached_session(use_gpu=True):
           actual_pack = array_ops.stack(test_arrays, axis=j)
           self.assertEqual(expected.shape, actual_pack.get_shape())
-          actual_pack = actual_pack.eval()
+          actual_pack = self.evaluate(actual_pack)
 
           actual_stack = array_ops.stack(test_arrays, axis=j)
           self.assertEqual(expected.shape, actual_stack.get_shape())
-          actual_stack = actual_stack.eval()
+          actual_stack = self.evaluate(actual_stack)
 
         self.assertNDArrayNear(expected, actual_stack, 1e-6)
 
@@ -253,17 +253,19 @@ class AutomaticStackingTest(test.TestCase):
                                           [[2., 2.], [3., 3.]],
                                           dtype=np.float32)])
       self.assertAllEqual([[[0., 0.], [1., 1.]], [[2., 2.], [3., 3.]]],
-                          result.eval())
+                          self.evaluate(result))
 
   def testVariable(self):
     with self.session(use_gpu=True):
       v = variables.Variable(17)
       result = ops.convert_to_tensor([[0, 0, 0], [0, v, 0], [0, 0, 0]])
       v.initializer.run()
-      self.assertAllEqual([[0, 0, 0], [0, 17, 0], [0, 0, 0]], result.eval())
+      self.assertAllEqual([[0, 0, 0], [0, 17, 0], [0, 0, 0]],
+                          self.evaluate(result))
 
       v.assign(38).op.run()
-      self.assertAllEqual([[0, 0, 0], [0, 38, 0], [0, 0, 0]], result.eval())
+      self.assertAllEqual([[0, 0, 0], [0, 38, 0], [0, 0, 0]],
+                          self.evaluate(result))
 
   def testDtype(self):
     t_0 = ops.convert_to_tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py
index 1aa12009ea5e1aa1bad3d1b4a3696178831d6a03..6c6fe8aba47fbc6e5baa71d00121a2ae110e8062 100644
--- a/tensorflow/python/kernel_tests/stack_ops_test.py
+++ b/tensorflow/python/kernel_tests/stack_ops_test.py
@@ -39,7 +39,7 @@ class StackOpTest(test.TestCase):
       c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
-      self.assertAllClose([[4.0, 5.0]], c1.eval())
+      self.assertAllClose([[4.0, 5.0]], self.evaluate(c1))
 
   def testStackPushPop(self):
     self._testStackPushPop(use_gpu=False)
@@ -54,7 +54,7 @@ class StackOpTest(test.TestCase):
       c = gen_data_flow_ops.stack_push_v2(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
-      self.assertAllClose(a, c1.eval())
+      self.assertAllClose(a, self.evaluate(c1))
 
   def testStackPushPopSwap(self):
     self._testStackPushPopSwap(use_gpu=False)
@@ -91,7 +91,7 @@ class StackOpTest(test.TestCase):
 
       _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
-      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+      self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
@@ -110,7 +110,7 @@ class StackOpTest(test.TestCase):
       with ops.control_dependencies([c2]):
         c2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
       r = c1 + c2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def testMultiStack(self):
     self._testMultiStack(use_gpu=False)
@@ -173,7 +173,7 @@ class StackOpRefTest(test.TestCase):
       c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
-      self.assertAllClose([[4.0, 5.0]], c1.eval())
+      self.assertAllClose([[4.0, 5.0]], self.evaluate(c1))
 
   def testStackPushPop(self):
     self._testStackPushPop(use_gpu=False)
@@ -187,7 +187,7 @@ class StackOpRefTest(test.TestCase):
       c = gen_data_flow_ops.stack_push(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
-      self.assertAllClose(a, c1.eval())
+      self.assertAllClose(a, self.evaluate(c1))
 
   def testStackPushPopSwap(self):
     self._testStackPushPopSwap(use_gpu=False)
@@ -204,7 +204,7 @@ class StackOpRefTest(test.TestCase):
       with ops.control_dependencies([c2]):
         c2 = gen_data_flow_ops.stack_pop(h2, dtypes.float32)
       r = c1 + c2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def _testStackWhileSwap(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -236,7 +236,7 @@ class StackOpRefTest(test.TestCase):
 
       _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
-      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+      self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
@@ -253,7 +253,7 @@ class StackOpRefTest(test.TestCase):
       h2 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c2 = gen_data_flow_ops.stack_push(h2, 5.0)
       _ = c1 + c2
-      self.assertNotEqual(h1.eval()[1], h2.eval()[1])
+      self.assertNotEqual(h1.eval()[1], self.evaluate(h2)[1])
 
   def testSameNameStacks(self):
     self._testSameNameStacks(use_gpu=False)
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index b814843b86c7321ffdb98072d96909d569b32a62..b1e7ce5d621e685217143e70baab637ef02e5c6d 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -152,11 +152,11 @@ class StageTest(test.TestCase):
 
     with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1})
-      self.assertEqual(sess.run(size), 1)
+      self.assertEqual(self.evaluate(size), 1)
       sess.run(stage, feed_dict={x: -1})
-      self.assertEqual(sess.run(size), 2)
+      self.assertEqual(self.evaluate(size), 2)
       sess.run(clear)
-      self.assertEqual(sess.run(size), 0)
+      self.assertEqual(self.evaluate(size), 0)
 
   def testCapacity(self):
     capacity = 3
@@ -210,14 +210,14 @@ class StageTest(test.TestCase):
                                              capacity))
 
       # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
+      self.assertTrue(self.evaluate(size) == capacity)
 
       # Clear the staging area completely
       for i in range(n):
-        self.assertTrue(sess.run(ret) == [i])
+        self.assertTrue(self.evaluate(ret) == [i])
 
       # It should now be empty
-      self.assertTrue(sess.run(size) == 0)
+      self.assertTrue(self.evaluate(size) == 0)
 
   def testMemoryLimit(self):
     memory_limit = 512 * 1024  # 512K
@@ -274,13 +274,13 @@ class StageTest(test.TestCase):
                                              capacity))
 
       # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
+      self.assertTrue(self.evaluate(size) == capacity)
 
       # Clear the staging area completely
       for i in range(n):
-        self.assertTrue(np.all(sess.run(ret)[0] == i))
+        self.assertTrue(np.all(self.evaluate(ret)[0] == i))
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertTrue(self.evaluate(size) == 0)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/string_length_op_test.py b/tensorflow/python/kernel_tests/string_length_op_test.py
index 57db7302b155991c966e66aa77152d297ce0cf2b..0c68f0cadd0ef7817a1dcb0a5441c8a49f69ead1 100644
--- a/tensorflow/python/kernel_tests/string_length_op_test.py
+++ b/tensorflow/python/kernel_tests/string_length_op_test.py
@@ -29,7 +29,7 @@ class StringLengthOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       lengths = string_ops.string_length(strings)
-      values = sess.run(lengths)
+      values = self.evaluate(lengths)
       self.assertAllEqual(values, [[[1, 2], [3, 4], [5, 6]]])
 
   def testUnit(self):
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index b968e885edafcbdebd3b32e11c6bdf35e65e7616..92e13db0f7327d815833443b3a0637ac8b0b0945 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -34,7 +34,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
       self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
       self.assertAllEqual(shape, [2, 4])
@@ -44,7 +44,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, delimiter="")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [2, 0],
                                     [2, 1], [2, 2], [2, 3]])
@@ -62,7 +62,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices,
           [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
@@ -74,7 +74,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, delimiter=" .")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices,
           [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
@@ -92,13 +92,13 @@ class StringSplitOpTest(test.TestCase):
           ValueError, string_ops.string_split, strings, delimiter=["a"])
 
       tokens = string_ops.string_split(strings, delimiter="|")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
       self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
       self.assertAllEqual(shape, [2, 2])
 
       tokens = string_ops.string_split(strings, delimiter="| ")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"hello", b"world", b"hello", b"world"])
       self.assertAllEqual(shape, [2, 2])
@@ -145,7 +145,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, "#", skip_empty=False)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1],
                                     [2, 0], [2, 1], [2, 2]])
@@ -154,7 +154,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, "#")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(values, [b"a", b"b", b"c"])
       self.assertAllEqual(indices, [[0, 0], [1, 0], [2, 0]])
       self.assertAllEqual(shape, [3, 1])
@@ -167,7 +167,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
       self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
       self.assertAllEqual(shape, [2, 4])
@@ -182,7 +182,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep="<>")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices, [[0, 0], [0, 1], [0, 2],
                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
@@ -200,7 +200,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep=',')
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
       self.assertAllEqual(values, [b"1", b"2", b"3",
@@ -217,7 +217,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2]])
       self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
@@ -233,7 +233,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
@@ -249,7 +249,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, maxsplit=1)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py
index a96b71490e460ba7e9f28f03b1fca7a0c9984571..edff3862ff6984393c497f76943dc460d6f2541c 100644
--- a/tensorflow/python/kernel_tests/string_strip_op_test.py
+++ b/tensorflow/python/kernel_tests/string_strip_op_test.py
@@ -23,14 +23,14 @@ from tensorflow.python.platform import test
 
 
 class StringStripOpTest(test.TestCase):
-  """ Test cases for tf.string_strip."""
+  """ Test cases for tf.strings.strip."""
 
   def test_string_strip(self):
     strings = ["pigs on the wing", "animals"]
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
 
   def test_string_strip_2d(self):
@@ -39,7 +39,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
                                    [b"hello", b"world"]])
 
@@ -48,7 +48,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [b"hello", b"", b"world", b""])
 
 
diff --git a/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py b/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
index 9cb0c9d18f32803aff5b5c7d1d5643d0742fee05..2cc87008da0bf97e128c8c3ac7006469e747d266 100644
--- a/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
+++ b/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
@@ -70,7 +70,7 @@ class StringToHashBucketOpTest(test.TestCase):
       input_string = constant_op.constant(['a', 'b', 'c'])
       output = string_ops.string_to_hash_bucket_strong(
           input_string, 1, key=[123, 345])
-      self.assertAllEqual([0, 0, 0], output.eval())
+      self.assertAllEqual([0, 0, 0], self.evaluate(output))
 
   def testStringToHashBucketsStrong(self):
     with self.cached_session():
@@ -81,7 +81,7 @@ class StringToHashBucketOpTest(test.TestCase):
       # StrongKeyedHash(key, 'a') -> 7157389809176466784 -> mod 10 -> 4
       # StrongKeyedHash(key, 'b') -> 15805638358933211562 -> mod 10 -> 2
       # StrongKeyedHash(key, 'c') -> 18100027895074076528 -> mod 10 -> 8
-      self.assertAllEqual([4, 2, 8], output.eval())
+      self.assertAllEqual([4, 2, 8], self.evaluate(output))
 
   def testStringToHashBucketsStrongInvalidKey(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/substr_op_test.py b/tensorflow/python/kernel_tests/substr_op_test.py
index 37aa624b07e86c68a48d3859bf88d8ef0ce93253..bb2d4a79131dc12a67d214752c44989d29e2cf56 100644
--- a/tensorflow/python/kernel_tests/substr_op_test.py
+++ b/tensorflow/python/kernel_tests/substr_op_test.py
@@ -51,7 +51,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -71,7 +71,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Full string
@@ -83,7 +83,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, test_string)
 
     # Full string (Negative)
@@ -95,7 +95,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, test_string)
 
     # Length is larger in magnitude than a negative position
@@ -111,7 +111,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_string)
 
   @parameterized.parameters(
@@ -138,7 +138,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -173,7 +173,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     position = np.array(-3, dtype)
@@ -188,7 +188,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -229,7 +229,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -271,7 +271,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Broadcast input string onto pos/len
@@ -294,7 +294,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Test 1D broadcast
@@ -310,7 +310,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -349,7 +349,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, 4, "BYTE"),
@@ -373,7 +373,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -398,7 +398,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
     # Matrix/Matrix (with negative)
     position = np.array([[1, 2, -3], [1, 2, -4], [1, 2, -3]], dtype)
@@ -406,7 +406,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -428,7 +428,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
     # Broadcast (with negative)
     position = np.array([-1, -2, -4], dtype)
@@ -436,7 +436,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
diff --git a/tensorflow/python/kernel_tests/summary_audio_op_test.py b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
similarity index 95%
rename from tensorflow/python/kernel_tests/summary_audio_op_test.py
rename to tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
index e59a2ceef7e4c8e8099da0b7aa4d8f3bd8b0b124..1547c55f8b0b112325c6049f2052091228c171bf 100644
--- a/tensorflow/python/kernel_tests/summary_audio_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for summary sound op."""
+"""Tests for summary V1 audio op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -27,7 +27,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 
 
-class SummaryAudioOpTest(test.TestCase):
+class SummaryV1AudioOpTest(test.TestCase):
 
   def _AsSummary(self, s):
     summ = summary_pb2.Summary()
@@ -60,7 +60,7 @@ class SummaryAudioOpTest(test.TestCase):
         sample_rate = 8000
         summ = summary.audio(
             "snd", const, max_outputs=3, sample_rate=sample_rate)
-        value = sess.run(summ)
+        value = self.evaluate(summ)
         self.assertEqual([], summ.get_shape())
         audio_summ = self._AsSummary(value)
 
diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
similarity index 96%
rename from tensorflow/python/kernel_tests/summary_image_op_test.py
rename to tensorflow/python/kernel_tests/summary_v1_image_op_test.py
index b650e1040424818e06181c0019139127414b41d7..e1b24756f3f6303ce9fe250333d10a22e46499a3 100644
--- a/tensorflow/python/kernel_tests/summary_image_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for summary image op."""
+"""Tests for summary V1 image op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -30,7 +30,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 
 
-class SummaryImageOpTest(test.TestCase):
+class SummaryV1ImageOpTest(test.TestCase):
 
   def _AsSummary(self, s):
     summ = summary_pb2.Summary()
@@ -70,7 +70,7 @@ class SummaryImageOpTest(test.TestCase):
 
           # Summarize
           summ = summary.image("img", const)
-          value = sess.run(summ)
+          value = self.evaluate(summ)
           self.assertEqual([], summ.get_shape())
           image_summ = self._AsSummary(value)
 
@@ -97,7 +97,7 @@ class SummaryImageOpTest(test.TestCase):
 
         # Summarize
         summ = summary.image("img", tf_images)
-        value = sess.run(summ)
+        value = self.evaluate(summ)
         self.assertEqual([], summ.get_shape())
         image_summ = self._AsSummary(value)
 
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_v1_ops_test.py
similarity index 87%
rename from tensorflow/python/kernel_tests/summary_ops_test.py
rename to tensorflow/python/kernel_tests/summary_v1_ops_test.py
index 0c500120b0b81907e1c6d2a4a70405b4c7b42687..1206cb7013fa785573a4c7706aef2f52b78a00e3 100644
--- a/tensorflow/python/kernel_tests/summary_ops_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_ops_test.py
@@ -12,21 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for summary ops."""
+"""Tests for the actual serialized proto output of the V1 tf.summary ops.
+
+The tensor, audio, and image ops have dedicated tests in adjacent files. The
+overall tf.summary API surface also has its own tests in summary_test.py that
+check calling the API methods but not the exact serialized proto output.
+"""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 
 
-class SummaryOpsTest(test.TestCase):
+class SummaryV1OpsTest(test.TestCase):
 
   def _AsSummary(self, s):
     summ = summary_pb2.Summary()
@@ -37,7 +42,7 @@ class SummaryOpsTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant([10.0, 20.0])
       summ = logging_ops.scalar_summary(["c1", "c2"], const, name="mysumm")
-      value = sess.run(summ)
+      value = self.evaluate(summ)
     self.assertEqual([], summ.get_shape())
     self.assertProtoEquals("""
       value { tag: "c1" simple_value: 10.0 }
@@ -48,7 +53,7 @@ class SummaryOpsTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant([10.0, 20.0])
       summ = logging_ops.scalar_summary(["c1", "c2"], const)
-      value = sess.run(summ)
+      value = self.evaluate(summ)
     self.assertEqual([], summ.get_shape())
     self.assertProtoEquals("""
       value { tag: "c1" simple_value: 10.0 }
@@ -61,7 +66,7 @@ class SummaryOpsTest(test.TestCase):
       summ1 = summary.histogram("h", const)
       summ2 = logging_ops.scalar_summary("c", const)
       merge = summary.merge([summ1, summ2])
-      value = sess.run(merge)
+      value = self.evaluate(merge)
     self.assertEqual([], merge.get_shape())
     self.assertProtoEquals("""
       value {
@@ -100,13 +105,6 @@ class SummaryOpsTest(test.TestCase):
       self.assertEqual(summ2, merge.op.inputs[0])
       self.assertTrue(summary.merge_all("bar_key") is None)
 
-  def testHistogramSummaryTypes(self):
-    with ops.Graph().as_default():
-      for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
-                    dtypes.float32, dtypes.float64):
-        const = constant_op.constant(10, dtype=dtype)
-        summary.histogram("h", const)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/summary_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
similarity index 82%
rename from tensorflow/python/kernel_tests/summary_tensor_op_test.py
rename to tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
index 0f4643393a12a2b1d72faaf22683698be3ee6f3b..71251f5602a80e1a6b3b260376c959a164a3043b 100644
--- a/tensorflow/python/kernel_tests/summary_tensor_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for summary ops."""
+"""Tests for summary V1 tensor op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,11 +26,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import summary_ops
 from tensorflow.python.platform import test
+from tensorflow.python.summary import summary as summary_lib
 
 
-class SummaryOpsTest(test.TestCase):
+class SummaryV1TensorOpTest(test.TestCase):
 
   def _SummarySingleValue(self, s):
     summ = summary_pb2.Summary()
@@ -44,12 +44,12 @@ class SummaryOpsTest(test.TestCase):
   def testTags(self):
     with self.cached_session() as sess:
       c = constant_op.constant(1)
-      s1 = summary_ops.tensor_summary("s1", c)
+      s1 = summary_lib.tensor_summary("s1", c)
       with ops.name_scope("foo"):
-        s2 = summary_ops.tensor_summary("s2", c)
+        s2 = summary_lib.tensor_summary("s2", c)
         with ops.name_scope("zod"):
-          s3 = summary_ops.tensor_summary("s3", c)
-          s4 = summary_ops.tensor_summary("TensorSummary", c)
+          s3 = summary_lib.tensor_summary("s3", c)
+          s4 = summary_lib.tensor_summary("TensorSummary", c)
       summ1, summ2, summ3, summ4 = sess.run([s1, s2, s3, s4])
 
     v1 = self._SummarySingleValue(summ1)
@@ -67,8 +67,8 @@ class SummaryOpsTest(test.TestCase):
   def testScalarSummary(self):
     with self.cached_session() as sess:
       const = constant_op.constant(10.0)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
 
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
@@ -78,8 +78,8 @@ class SummaryOpsTest(test.TestCase):
     s = six.b("foobar")
     with self.cached_session() as sess:
       const = constant_op.constant(s)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
 
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
@@ -88,8 +88,8 @@ class SummaryOpsTest(test.TestCase):
   def testManyScalarSummary(self):
     with self.cached_session() as sess:
       const = array_ops.ones([5, 5, 5])
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
     self._AssertNumpyEq(n, np.ones([5, 5, 5]))
@@ -98,8 +98,8 @@ class SummaryOpsTest(test.TestCase):
     strings = [[six.b("foo bar"), six.b("baz")], [six.b("zoink"), six.b("zod")]]
     with self.cached_session() as sess:
       const = constant_op.constant(strings)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
     self._AssertNumpyEq(n, strings)
@@ -108,8 +108,8 @@ class SummaryOpsTest(test.TestCase):
     bools = [True, True, True, False, False, False]
     with self.cached_session() as sess:
       const = constant_op.constant(bools)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
 
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
@@ -119,21 +119,21 @@ class SummaryOpsTest(test.TestCase):
     with self.cached_session() as sess:
 
       def get_description(summary_op):
-        summ_str = sess.run(summary_op)
+        summ_str = self.evaluate(summary_op)
         summ = summary_pb2.Summary()
         summ.ParseFromString(summ_str)
         return summ.value[0].metadata
 
       const = constant_op.constant(1)
       # Default case; no description or display name
-      simple_summary = summary_ops.tensor_summary("simple", const)
+      simple_summary = summary_lib.tensor_summary("simple", const)
 
       descr = get_description(simple_summary)
       self.assertEqual(descr.display_name, "")
       self.assertEqual(descr.summary_description, "")
 
       # Values are provided via function args
-      with_values = summary_ops.tensor_summary(
+      with_values = summary_lib.tensor_summary(
           "simple",
           const,
           display_name="my name",
@@ -148,14 +148,14 @@ class SummaryOpsTest(test.TestCase):
       metadata.display_name = "my name"
       metadata.summary_description = "my description"
 
-      with_metadata = summary_ops.tensor_summary(
+      with_metadata = summary_lib.tensor_summary(
           "simple", const, summary_metadata=metadata)
       descr = get_description(with_metadata)
       self.assertEqual(descr.display_name, "my name")
       self.assertEqual(descr.summary_description, "my description")
 
       # If both SummaryMetadata and explicit args are provided, the args win
-      overwrite = summary_ops.tensor_summary(
+      overwrite = summary_lib.tensor_summary(
           "simple",
           const,
           summary_metadata=metadata,
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index 57298c0fecca859ffdc581560cda4e3a0423d762..589172e4b72a4cb1f0c5f10ebcc88352bc17e896 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -68,7 +68,7 @@ class SvdOpTest(test.TestCase):
             s2 = linalg_ops.svd(
                 matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
             all_ops += [s1, s2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       for i in range(2):
         s = 6 * i
         self.assertAllEqual(val[s], val[s + 3])  # s1 == s2
@@ -123,7 +123,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
+    self.assertAllClose(identity.eval(), self.evaluate(xx), atol=tol)
 
   def Test(self):
     is_complex = dtype_ in (np.complex64, np.complex128)
@@ -158,7 +158,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         s_tf = linalg_ops.svd(
             x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
         if use_static_shape_:
-          s_tf_val = sess.run(s_tf)
+          s_tf_val = self.evaluate(s_tf)
         else:
           s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np})
 
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 9dcdaa61ed2c0c12940817ccb311e27d1a19fa0c..a187fa115ce135832611e3819bd3bd546f083f63 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -104,10 +104,10 @@ class TemplateTest(test.TestCase):
     train_op = optimizer.minimize(train_loss)
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
-      initial_test_loss = sess.run(test_loss)
-      sess.run(train_op)
-      final_test_loss = sess.run(test_loss)
+      self.evaluate(variables.global_variables_initializer())
+      initial_test_loss = self.evaluate(test_loss)
+      self.evaluate(train_op)
+      final_test_loss = self.evaluate(test_loss)
 
     # Parameters are tied, so the loss should have gone down when we trained it.
     self.assertLess(final_test_loss, initial_test_loss)
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 91bd93712a9bc318e24801519526178e9abced44..4ee1c27a87fc8da1394a68102152f5f7d3bb976c 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -63,6 +63,8 @@ def _make_ta(size, name, dtype=dtypes.float32, infer_shape=False):
       dtype=dtype, tensor_array_name=name, size=size, infer_shape=infer_shape)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.with_control_flow_v2
 class TensorArrayTest(test.TestCase):
 
   @classmethod
@@ -121,13 +123,14 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWritePack(dtypes.int64)
     self._testTensorArrayWritePack(dtypes.complex64)
     self._testTensorArrayWritePack(dtypes.complex128)
-    self._testTensorArrayWritePack(dtypes.string)
+    if not (test.is_gpu_available() and
+            tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
+      # TODO(b/119684648): Enable this.
+      self._testTensorArrayWritePack(dtypes.string)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayWritePack(self):
     self._testTensorArrayWritePackMaybeLegacy()
 
-  @test_util.run_in_graph_and_eager_modes
   def testEmptyTensorArrayPack(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -161,7 +164,7 @@ class TensorArrayTest(test.TestCase):
           convert([[4.0, 5.0], [104.0, 105.0], [204.0, 205.0], [6.0, 7.0],
                    [106.0, 107.0], [8.0, 9.0]]), c0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
   def testTensorArrayWriteConcat(self):
     self._testTensorArrayWriteConcat(dtypes.float32)
     self._testTensorArrayWriteConcat(dtypes.float64)
@@ -184,7 +187,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -200,7 +203,7 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
@@ -249,9 +252,11 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayUnpackRead(dtypes.int64)
     self._testTensorArrayUnpackRead(dtypes.complex64)
     self._testTensorArrayUnpackRead(dtypes.complex128)
-    self._testTensorArrayUnpackRead(dtypes.string)
+    if not (test.is_gpu_available() and
+            tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
+      # TODO(b/119684648): Enable this.
+      self._testTensorArrayUnpackRead(dtypes.string)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayUnpackRead(self):
     self._testTensorArrayUnpackReadMaybeLegacy()
 
@@ -297,7 +302,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(convert([]).reshape(0, 2), d1)
       self.assertAllEqual(convert([[3.0, 301.0]]), d2)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   def testTensorArraySplitRead(self):
     self._testTensorArraySplitRead(dtypes.float32)
     self._testTensorArraySplitRead(dtypes.float64)
@@ -307,7 +312,8 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArraySplitRead(dtypes.complex128)
     self._testTensorArraySplitRead(dtypes.string)
 
-  def testTensorGradArrayWriteRead(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradArrayWriteRead(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -340,7 +346,27 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[2.0]], g_d1)
       self.assertAllEqual(-2.0, g_d2)
 
-  def testTensorGradArrayDynamicWriteRead(self):
+  def testSkipEagerTensorArrayGradGrad(self):
+    if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+      self.skipTest("Legacy TensorArray does not support double derivatives.")
+    with self.test_session(use_gpu=True) as session:
+      x = constant_op.constant(4.0)
+
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=1,
+          infer_shape=False)
+      w0 = ta.write(0, x)
+      r0 = w0.read(0)
+      y = r0 * r0
+
+      g1 = gradients_impl.gradients(ys=[y], xs=[x])
+      g2 = gradients_impl.gradients(ys=[g1], xs=[x])
+      self.assertAllEqual([2.0], session.run(g2))
+
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradArrayDynamicWriteRead(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -381,7 +407,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(3, vs)
       self.assertAllEqual(3, g_vs)
 
-  def testTensorGradAccessTwiceReceiveSameObject(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradAccessTwiceReceiveSameObject(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
@@ -397,26 +424,39 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(t_g_ta_0, t_g_ta_1)
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
       # Test writing the wrong datatype
-      with self.assertRaisesOpError(
-          "TensorArray dtype is (float|float32) but Op is trying to write "
-          "dtype string"):
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = ("Invalid data types; op elements string but list elements "
+                     "float")
+      else:
+        error_msg = (
+            "TensorArray dtype is (float|float32) but Op is trying to write "
+            "dtype string")
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(0, "wrong_type_scalar").flow)
 
-      with self.assertRaisesOpError("index -1"):
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to modify element -1 in a list with 3 elements."
+      else:
+        error_msg = "index -1"
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(-1, 3.0).flow)
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to modify element 3 in a list with 3 elements"
+      else:
+        error_msg = ("Tried to write to index 3 but array is not "
+                     "resizeable and size is: 3")
       # Test reading from too large an index
-      with self.assertRaisesOpError(
-          "Tried to write to index 3 but array is not "
-          "resizeable and size is: 3"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(3, 3.0).flow)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
@@ -424,23 +464,34 @@ class TensorArrayTest(test.TestCase):
       w0 = ta.write(0, [[4.0, 5.0]])
 
       # Test reading wrong datatype (only possible when constructing graphs).
-      if not context.executing_eagerly():
+      if (not context.executing_eagerly() and
+          not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
         with self.assertRaisesOpError(
             "TensorArray dtype is float but Op requested dtype double."):
-          r0_bad.eval()
+          self.evaluate(r0_bad)
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to access element -1 in a list with 3 elements."
+      else:
+        error_msg = "index -1"
       # Test reading from a negative index, which is not allowed
-      with self.assertRaisesOpError("index -1"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(-1))
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to access element 3 in a list with 3 elements."
+      else:
+        error_msg = "Tried to read from index 3 but array size is: 3"
       # Test reading from too large an index
-      with self.assertRaisesOpError(
-          "Tried to read from index 3 but array size is: 3"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(3))
 
-  def testTensorArrayWriteMultipleFails(self):
+  @test_util.disable_control_flow_v2("v2 allows multiple writes.")
+  def testSkipEagerTensorArrayWriteMultipleFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
@@ -450,7 +501,7 @@ class TensorArrayTest(test.TestCase):
           "it has already been written to."):
         self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
   def testTensorArrayConcatIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -482,7 +533,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError("shape"):
         self.evaluate(w3.concat())
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       in_eager_mode = context.executing_eagerly()
@@ -546,12 +597,14 @@ class TensorArrayTest(test.TestCase):
           r"existing shape is \[\] but the new input shape is \[1\]"):
         wb1_grad.flow.eval()
 
-  def testTensorArrayWriteGradientAddMultipleAdds(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorArrayWriteGradientAddMultipleAdds(self):
     for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64,
                   dtypes.complex64, dtypes.complex128):
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
 
-  def testTensorArrayGradWithShapeKnownElementShape(self):
+  @test_util.disable_control_flow_v2("Low level legacy TA op test.")
+  def testSkipEagerTensorArrayGradWithShapeKnownElementShape(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           size=3,
@@ -580,7 +633,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(fed_value,
                           sess.run(read_value, feed_dict={value: fed_value}))
 
-  def testTensorArrayGradWithShapeUnknownElementShape(self):
+  @test_util.disable_control_flow_v2("Low level legacy TA op test.")
+  def testSkipEagerTensorArrayGradWithShapeUnknownElementShape(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           size=3, dtype=dtypes.float32,
@@ -603,7 +657,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(fed_value,
                           sess.run(read_value, feed_dict={value: fed_value}))
 
-  @test_util.run_in_graph_and_eager_modes
   def testMultiTensorArray(self):
     with self.session(use_gpu=True):
       h1 = tensor_array_ops.TensorArray(
@@ -667,7 +720,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(c([[3.0, 2.0]]), grad_vals[0])
       self.assertAllEqual(c(-2.0), grad_vals[1])
 
-  def testTensorArrayGradientWriteRead(self):
+  def testSkipEagerTensorArrayGradientWriteRead(self):
     for dtype in (np.float32, np.float64, np.complex64, np.complex128):
       self._testTensorArrayGradientWriteReadType(dtype)
 
@@ -698,15 +751,16 @@ class TensorArrayTest(test.TestCase):
                 [-0.5, 1.5],  # read(0) gradient
                 [20.0, 30.0, 40.0, 50.0]
             ])  # concat gradient
-      grad_vals = sess.run(grad_r)  # 2 + 2 entries
+      grad_vals = self.evaluate(grad_r)  # 2 + 2 entries
 
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
 
-  def testTensorArrayGradientWritePackConcatAndRead(self):
+  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
+  def testSkipEagerTensorArrayGradientWritePackConcatAndRead(self):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("v2 does not support clear_after_read.")
   def testTensorArrayReadTwice(self):
     with self.session(use_gpu=True):
       value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
@@ -760,10 +814,11 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0 - 1.5, 3.0 + 1.5], [4.0, 5.0]], grad_vals[0])
 
-  def testTensorArrayGradientUnpackRead(self):
+  def testSkipEagerTensorArrayGradientUnpackRead(self):
     self._testTensorArrayGradientUnpackRead()
 
-  def testTensorArrayGradientSplitConcat(self):
+  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
+  def testSkipEagerTensorArrayGradientSplitConcat(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=2,
@@ -808,17 +863,15 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  def testTensorArrayGradientDynamicUnpackRead(self):
+  def testSkipEagerTensorArrayGradientDynamicUnpackRead(self):
     self._testTensorArrayGradientDynamicUnpackRead()
 
-  @test_util.run_in_graph_and_eager_modes
   def testCloseTensorArray(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       self.evaluate(ta.close())
 
-  @test_util.run_in_graph_and_eager_modes
   def testSizeTensorArray(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -826,7 +879,6 @@ class TensorArrayTest(test.TestCase):
       s = ta.size()
       self.assertAllEqual(3, self.evaluate(s))
 
-  @test_util.run_in_graph_and_eager_modes
   def testWriteCloseTensorArray(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -924,7 +976,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(grad_val.sum(axis=0), var_grad_t)
       self.assertAllClose(grad_val.sum(axis=0), state0_grad_t)
 
-  @test_util.run_in_graph_and_eager_modes
   def testWhileLoopWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=False, dtype=dtypes.float32)
@@ -932,11 +983,27 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  def testWhileLoopDynamicWritePackGradients(self):
+  @test_util.disable_control_flow_v2("Testing v1 while_loop with v2 TA")
+  @test_util.enable_tensor_array_v2
+  def testWhileLoopV1WithTensorArrayV2(self):
+    size = 3
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.int32, size=size, element_shape=tensor_shape.scalar())
+
+    def Body(counter, ta):
+      return counter + 1, ta.write(counter, counter)
+
+    _, ta = control_flow_ops.while_loop(lambda i, _: i < size, Body, [0, ta])
+
+    for i in range(size):
+      self.assertEqual(self.evaluate(ta.read(i)), i)
+
+  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
+  def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/119323158")
   def testGradSerialTwoLoops(self):
     with self.session(use_gpu=True):
       def loop(x):
@@ -976,7 +1043,7 @@ class TensorArrayTest(test.TestCase):
         grad = gradients_impl.gradients(loop(x), [x])[0]
       self.assertAllClose(31.0, self.evaluate(grad))
 
-  def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
+  def testSkipEagerSumOfTwoReadVariablesWithoutRepeatGrad(self):
     with self.session(use_gpu=True) as session:
       a = array_ops.identity(
           np.arange(
@@ -1011,7 +1078,7 @@ class TensorArrayTest(test.TestCase):
   def _grad_source_for_name(self, name):
     return tensor_array_grad._GetGradSource(constant_op.constant(0, name=name))
 
-  def testGetGradSource_Invalid(self):
+  def testSkipEagerGetGradSource_Invalid(self):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("")
     with self.assertRaises(ValueError):
@@ -1019,7 +1086,7 @@ class TensorArrayTest(test.TestCase):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("foo/bar")
 
-  def testGetGradSource_NoEnclosingScope(self):
+  def testSkipEagerGetGradSource_NoEnclosingScope(self):
     self.assertEqual("gradients:0", self._grad_source_for_name("gradients"))
     self.assertEqual("gradients_0:0", self._grad_source_for_name("gradients_0"))
     self.assertEqual("gradients", self._grad_source_for_name("gradients/foo"))
@@ -1030,7 +1097,7 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("gradients_0",
                      self._grad_source_for_name("gradients_0/foo/bar"))
 
-  def testGetGradSource_EnclosingScope(self):
+  def testSkipEagerGetGradSource_EnclosingScope(self):
     self.assertEqual("foo/gradients:0",
                      self._grad_source_for_name("foo/gradients"))
     self.assertEqual("foo/gradients_0:0",
@@ -1044,12 +1111,12 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("foo/bar/gradients_0",
                      self._grad_source_for_name("foo/bar/gradients_0/baz"))
 
-  def testGetGradSource_NestedUsesInnermost(self):
+  def testSkipEagerGetGradSource_NestedUsesInnermost(self):
     self.assertEqual(
         "foo/gradients/bar/gradients_0",
         self._grad_source_for_name("foo/gradients/bar/gradients_0/baz"))
 
-  def testWriteShape(self):
+  def testSkipEagerWriteShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
@@ -1073,7 +1140,8 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w0.write(0, c2)
 
-  def testPartlyUnknownShape(self):
+  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
+  def testSkipEagerPartlyUnknownShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=6)
@@ -1113,7 +1181,6 @@ class TensorArrayTest(test.TestCase):
       r5 = w5.read(0)
       self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list())
 
-  @test_util.run_in_graph_and_eager_modes
   def _testUnpackShape(self):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1144,10 +1211,11 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w1.write(4, c2)
 
+  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   def testUnpackShape(self):
     self._testUnpackShape()
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   def testSplitShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1178,7 +1246,7 @@ class TensorArrayTest(test.TestCase):
             tensor_shape.TensorShape(
                 ta1.handle.op.get_attr("element_shape")).ndims, None)
 
-  def testWriteUnknownShape(self):
+  def testSkipEagerWriteUnknownShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -1201,7 +1269,11 @@ class TensorArrayTest(test.TestCase):
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
 
-  def testGradientWhenNotAllComponentsRead(self):
+  # TODO(srbs): Figure out how to enable this. This is probably failing
+  # because we are trying to stack a TensorList with invalid tensors.
+  # That is because we do not receive gradients for all list indices.
+  # Figure out how TensorArray handles this.
+  def disabletestGradientWhenNotAllComponentsRead(self):
     self._testGradientWhenNotAllComponentsRead()
 
   def _testTensorArrayUnpackDynamic(self):
@@ -1212,14 +1284,16 @@ class TensorArrayTest(test.TestCase):
       w0 = ta.unstack(x)
       w1 = w0.write(3, 4.0)
       r = w1.stack()
-      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval())
+      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), self.evaluate(r))
       grad = gradients_impl.gradients(ys=[r], xs=[x])
-      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), sess.run(grad)[0])
+      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
-  def testTensorArrayUnpackDynamic(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  def testTensorArraySplitDynamic(self):
+  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
+  def testSkipEagerTensorArraySplitDynamic(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=3, dynamic_size=True)
@@ -1227,21 +1301,25 @@ class TensorArrayTest(test.TestCase):
       w0 = ta.split(x, [1, 1, 1])
       w1 = w0.write(3, [4.0])
       r = w1.concat()
-      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval())
+      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), self.evaluate(r))
       grad = gradients_impl.gradients(ys=[r], xs=[x])
-      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), sess.run(grad)[0])
+      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
   def _testTensorArrayEvalEmpty(self):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      with self.assertRaisesOpError(
-          "TensorArray has size zero, but element shape <unknown> is not fully "
-          "defined. Currently only static shapes are supported when packing "
-          "zero-size TensorArrays."):
+      v2_msg = ("Tried to stack elements of a empty list with "
+                "non-fully-defined shape")
+      v1_msg = (
+          "TensorArray has size zero, but element shape <unknown> is not "
+          "fully defined. Currently only static shapes are supported when "
+          "packing zero-size TensorArrays.")
+      with self.assertRaisesOpError(v2_msg if tensor_array_ops
+                                    .ENABLE_TENSOR_ARRAY_V2 else v1_msg):
         ta.stack().eval()
 
-  def testTensorArrayEvalEmpty(self):
+  def testSkipEagerTensorArrayEvalEmpty(self):
     self._testTensorArrayEvalEmpty()
 
   # this test is ill-defined for Eager mode --- unpacking an empty tensor
@@ -1255,15 +1333,17 @@ class TensorArrayTest(test.TestCase):
       ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       concatenated = ta.concat()
-      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
-      self.assertAllEqual([0, 5], concatenated.eval().shape)
+      self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
-  def testTensorArrayEvalEmptyWithDefault(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
-  def testTensorArrayScatterReadAndGradients(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  def testSkipEagerTensorArrayScatterReadAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -1289,7 +1369,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/117943286")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -1326,7 +1406,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[1.0, -1.0], [8.0, -8.0]], g_vals[0])
       self.assertAllEqual(expected_grad, grad_vals[0])
 
-  def testTensorArrayGetsDeviceFromFirstWrite(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  def testSkipEagerTensorArrayGetsDeviceFromFirstWrite(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       # this initial device will be ignored.
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
@@ -1374,7 +1455,8 @@ class TensorArrayTest(test.TestCase):
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
-  def testTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  def testSkipEagerTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
 
@@ -1398,12 +1480,13 @@ class TensorArrayTest(test.TestCase):
     for d in dev_stats:
       if "/task:1/" in d:
         self.assertTrue(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
       else:
         self.assertFalse(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
-  def testTensorArrayDisabledColocateWithFirstWriteCall(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  def testSkipEagerTensorArrayDisabledColocateWithFirstWriteCall(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=2, colocate_with_first_write_call=False)
@@ -1428,12 +1511,11 @@ class TensorArrayTest(test.TestCase):
     for d in dev_stats:
       if "/task:0/" in d and "CPU" in d:  # Skip any GPU node stats
         self.assertTrue(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
       else:
         self.assertFalse(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayIdentity(self):
     with self.session(use_gpu=True):
       ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
@@ -1486,7 +1568,7 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(size0_v, 2)
       self.assertEqual(size1_v, 4)
 
-  def testTensorArrayGradYsInCorrectScope(self):
+  def testSkipEagerTensorArrayGradYsInCorrectScope(self):
     n_time = 1
     n_dim = 1
     x = constant_op.constant([[1.42]])
@@ -1504,7 +1586,7 @@ class TensorArrayTest(test.TestCase):
         vdx, vdy = sess.run([dx, dy])
       self.assertAllClose(vdx, vdy)
 
-  def testTensorArrayInt64GPU(self):
+  def testSkipEagerTensorArrayInt64GPU(self):
     if not test.is_gpu_available():
       return
     with self.session(use_gpu=True, force_gpu=True) as sess:
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index 8c11c2070973cbd4780871e2d716bb9bd2cbb3f9..76e1002ee1b97cea9fa29763b39f39a486a0ec16 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -50,7 +50,7 @@ class TransposeTest(test.TestCase):
     with self.cached_session(use_gpu=False):
       inx = ops.convert_to_tensor(x)
       y = array_ops.transpose(inx, p, conjugate=conjugate)
-      tf_ans = y.eval()
+      tf_ans = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       self.assertAllEqual(np_ans, tf_ans)
 
@@ -81,7 +81,7 @@ class TransposeTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       y = array_ops.transpose(inx, p, conjugate=conjugate)
-      tf_ans = y.eval()
+      tf_ans = self.evaluate(y)
 
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
@@ -168,7 +168,7 @@ class TransposeTest(test.TestCase):
         with self.cached_session(use_gpu=True):
           inx = ops.convert_to_tensor(inp)
           y = array_ops.transpose(inx, perm)
-          tf_ans = y.eval()
+          tf_ans = self.evaluate(y)
         self.assertAllEqual(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, y)
 
@@ -189,7 +189,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -224,7 +224,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -246,7 +246,7 @@ class TransposeTest(test.TestCase):
         with self.cached_session(use_gpu=True):
           inx = ops.convert_to_tensor(inp)
           y = array_ops.transpose(inx, perm)
-          tf_ans = y.eval()
+          tf_ans = self.evaluate(y)
         self.assertAllEqual(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, y)
 
@@ -267,7 +267,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -319,7 +319,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
       self._ClearCachedSession()
@@ -341,7 +341,7 @@ class TransposeTest(test.TestCase):
         inx = ops.convert_to_tensor(x)
         inp = constant_op.constant(p)
         y = array_ops.transpose(inx, inp)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
         self.assertShapeEqual(np_ans, y)
         self.assertAllEqual(np_ans, tf_ans)
 
diff --git a/tensorflow/python/kernel_tests/unicode_decode_op_test.py b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c34145bff17c8aa62a0f3df6f2cf34d7b763f7da
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for unicode_decode and unicode_decode_with_splits."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.platform import test
+
+
+# Account for python2 and python3 execution of the test.
+def codepoint(s):
+  if isinstance(s, bytes):
+    return ord(s.decode("utf-8"))
+  elif isinstance(s, str):
+    return ord(s)
+
+
+class UnicodeDecodeTest(test.TestCase):
+
+  def testBatchDecode(self):
+    text = constant_op.constant(
+        ["仅今年前", "分享介面終於迎來更新"])
+    row_splits, utf8_text, offsets = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8")
+
+    with self.test_session():
+      self.assertAllEqual([
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+          codepoint("分"),
+          codepoint("享"),
+          codepoint("介"),
+          codepoint("面"),
+          codepoint("終"),
+          codepoint("於"),
+          codepoint("迎"),
+          codepoint("來"),
+          codepoint("更"),
+          codepoint("新")
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual([0, 4, 14], self.evaluate(row_splits).tolist())
+      self.assertAllEqual([0, 3, 6, 9, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27],
+                          self.evaluate(offsets).tolist())
+
+  def testBasicDecodeWithOffset(self):
+    text = constant_op.constant(["仅今年前"])
+    row_splits, utf8_text, starts = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8")
+
+    with self.test_session():
+      self.assertAllEqual([
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual(self.evaluate(row_splits).tolist(), [0, 4])
+      self.assertAllEqual(self.evaluate(starts).tolist(), [0, 3, 6, 9])
+
+  def testStrictError(self):
+    text = constant_op.constant([b"\xFEED"])
+    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="strict")
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      with self.test_session():
+        self.evaluate(error)
+
+  def testReplaceOnError(self):
+    text = constant_op.constant([b"\xFE"])
+
+    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="replace")
+
+    with self.test_session():
+      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [65533])
+
+  def testBadReplacementChar(self):
+    text = constant_op.constant([b"\xFE"])
+    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="replace", replacement_char=11141111)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      with self.test_session():
+        self.evaluate(error)
+
+  def testIgnoreOnError(self):
+    text = constant_op.constant([b"\xFEhello"])
+
+    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="ignore")
+
+    with self.test_session():
+      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [
+          codepoint("h"),
+          codepoint("e"),
+          codepoint("l"),
+          codepoint("l"),
+          codepoint("o")
+      ])
+
+  def testBadErrorPolicy(self):
+    text = constant_op.constant(["hippopotamus"])
+
+    with self.assertRaises(ValueError):
+      _, _, _ = gen_string_ops.unicode_decode_with_offsets(
+          text, "utf-8", errors="oranguatan")
+
+  def testReplaceControlChars(self):
+    text = constant_op.constant(["\x02仅今年前"])
+    row_splits, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", replace_control_characters=True)
+
+    with self.test_session():
+      self.assertAllEqual([
+          65533,
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual([0, 5], self.evaluate(row_splits).tolist())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unicode_transcode_op_test.py b/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
index 2908e2bfc56981d15e54594ed155f30bc21b1aab..d1c7b41c7b1dd008c1eb9a9e2d4fca83f03aa53d 100644
--- a/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for string_length_op."""
+"""Tests for unicode_transcode op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -42,7 +42,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
       outputs = string_ops.unicode_transcode(
@@ -52,7 +52,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
       outputs = string_ops.unicode_transcode(
@@ -62,7 +62,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
   def test_transcode_utf16_to_utf8(self):
@@ -77,7 +77,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, expected)
 
   def test_transcode_bad_utf8(self):
@@ -90,7 +90,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=True)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"  ")
 
       outputs = string_ops.unicode_transcode(
@@ -100,7 +100,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00 ")
 
   def test_transcode_bad_utf8_with_some_good(self):
@@ -113,7 +113,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"abc abcdefg")
 
   def test_transcode_bad_utf8_with_defaults(self):
@@ -121,7 +121,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00\xef\xbf\xbd")
 
   def test_transcode_bad_utf8_with_space_replacement(self):
@@ -130,7 +130,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
       outputs = string_ops.unicode_transcode(
           bad_string, input_encoding="UTF-8", output_encoding="UTF-8",
           replacement_char=ord(" "))
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00 ")
 
   def test_transcode_bad_utf8_with_strict_errors(self):
@@ -165,7 +165,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           input_encoding="UTF-8",
           output_encoding="UTF-8",
           errors="ignore")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00")
 
   def test_transcode_bad_utf8_with_elision_including_control_chars(self):
@@ -177,7 +177,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-8",
           errors="ignore",
           replace_control_characters=True)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"")
 
   def test_transcode_bad_utf8_termination_with_defaults(self):
@@ -185,7 +185,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"a\xef\xbf\xbd")   # 0xFFFD
 
   def test_transcode_utf8_with_replacement_char(self):
@@ -194,13 +194,13 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
       outputs = string_ops.unicode_transcode(
           strings, input_encoding="UTF-8", output_encoding="UTF-8",
           errors="strict")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
 
       outputs = string_ops.unicode_transcode(
           strings, input_encoding="UTF-8", output_encoding="UTF-8",
           errors="replace", replacement_char=ord("?"))
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
 
   def test_transcode_utf8_to_utf16(self):
@@ -214,7 +214,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-16-BE",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       print("values=", values)
       self.assertAllEqual(values, expected)
 
@@ -230,7 +230,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-8",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, expected)
 
   def test_transcode_utf8_to_utf32(self):
@@ -243,7 +243,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-32-BE",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, expected)
 
   # Documentation in ICU suggests that getNextUChar may produce a different
@@ -258,7 +258,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-8",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
   def test_transcode_utf8_with_bom(self):
@@ -266,12 +266,12 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-8", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\xef\xbb\xbfabcdefg")  # BOM preserved
 
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-8", output_encoding="UTF-16-BE")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       utf16expected = bom_string.decode("UTF-8").encode("UTF-16-BE")
       self.assertAllEqual(values, utf16expected)
 
@@ -280,20 +280,20 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-16-BE", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       # BOM is preserved in output
       self.assertAllEqual(values, b"\xef\xbb\xbfa")
 
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       # mangled BOM and value from (incorrect) LE encoding
       self.assertAllEqual(values, b"\xef\xbf\xbe\xe6\x84\x80")
 
       bom_string = b"\xff\xfe\x61\x00"  # Little-endian BOM with 'a' encoded
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\xef\xbb\xbfa")
 
   @parameterized.parameters(
@@ -378,6 +378,60 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
 
       self.assertAllEqual([b"AbCdE", b"HiJkL"], transcoded)
 
+  def test_cjk_encodings(self):
+    strings_ja = [
+        b"\x5c\x5c",  # Yen sign
+        b"\x8f\x70",  # kanji character "waza"
+        b"\x83\x4f"
+    ]  # katakana character "gu"
+    strings_zh_cn = [b"\xca\xf5"]  # simplified "shu4"
+    strings_zh_tw = [b"\xb3\x4e"]  # traditional "shu4"
+    strings_ko = [b"\xc7\xd1\xb9\xce"]  # hangul "hanmin"
+
+    expected_ja = [s.decode("shift_jis").encode("UTF-8") for s in strings_ja]
+    expected_zh_cn = [
+        s.decode("gb18030").encode("UTF-8") for s in strings_zh_cn
+    ]
+    expected_zh_tw = [s.decode("big5").encode("UTF-8") for s in strings_zh_tw]
+    expected_ko = [s.decode("euc_kr").encode("UTF-8") for s in strings_ko]
+
+    with self.cached_session() as sess:
+      outputs_ja = string_ops.unicode_transcode(
+          strings_ja,
+          input_encoding="shift_jis",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_zh_cn = string_ops.unicode_transcode(
+          strings_zh_cn,
+          input_encoding="gb18030",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_zh_tw = string_ops.unicode_transcode(
+          strings_zh_tw,
+          input_encoding="big5",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_ko = string_ops.unicode_transcode(
+          strings_ko,
+          input_encoding="euc_kr",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      result_ja, result_zh_cn, result_zh_tw, result_ko = sess.run(
+          [outputs_ja, outputs_zh_cn, outputs_zh_tw, outputs_ko])
+
+      self.assertAllEqual(result_ja, expected_ja)
+      self.assertAllEqual(result_zh_cn, expected_zh_cn)
+      self.assertAllEqual(result_zh_tw, expected_zh_tw)
+      self.assertAllEqual(result_ko, expected_ko)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index 6aea42990acf3541fa888f580ac8d82ea378096a..d314e1eaf938b7c409dd81f650db366cc5167474 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -41,7 +41,7 @@ class UnstackOpTest(test.TestCase):
 
   def testSimple(self):
     np.random.seed(7)
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [
             np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
@@ -53,14 +53,15 @@ class UnstackOpTest(test.TestCase):
           cs = array_ops.unstack(x, num=shape[0])
           self.assertEqual(type(cs), list)
           self.assertEqual(len(cs), shape[0])
-          cs = [c.eval() for c in cs]
+          cs = [self.evaluate(c) for c in cs]
           self.assertAllEqual(cs, data)
 
   def testSimpleGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest('No GPU available')
+
     np.random.seed(7)
-    with self.session(use_gpu=True, force_gpu=True):
+    with test_util.force_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
@@ -70,7 +71,7 @@ class UnstackOpTest(test.TestCase):
           cs = array_ops.unstack(x, num=shape[0])
           self.assertEqual(type(cs), list)
           self.assertEqual(len(cs), shape[0])
-          cs = [c.eval() for c in cs]
+          cs = [self.evaluate(c) for c in cs]
           self.assertAllEqual(cs, data)
 
   def testGradientsAxis0(self):
@@ -131,15 +132,13 @@ class UnstackOpTest(test.TestCase):
       for j in range(-i, i):
         expected = np_split_squeeze(a, j)
 
-        with self.cached_session() as sess:
-          actual_unstack = sess.run(array_ops.unstack(a, axis=j))
+        actual_unstack = self.evaluate(array_ops.unstack(a, axis=j))
 
         self.assertAllEqual(expected, actual_unstack)
 
   def testAxis0Default(self):
-    with self.cached_session() as sess:
-      a = constant_op.constant([[1, 2, 3], [4, 5, 6]], name='a')
-      unstacked = sess.run(array_ops.unstack(a))
+    a = constant_op.constant([[1, 2, 3], [4, 5, 6]], name='a')
+    unstacked = self.evaluate(array_ops.unstack(a))
 
     self.assertEqual(len(unstacked), 2)
     self.assertAllEqual(unstacked[0], [1, 2, 3])
@@ -156,10 +155,9 @@ class UnstackOpTest(test.TestCase):
       array_ops.unstack(a, axis=-3)
 
   def testZeroLengthDim(self):
-    with self.cached_session():
-      x = array_ops.zeros(shape=(0, 1, 2))
-      y = array_ops.unstack(x, axis=1)[0].eval()
-      self.assertEqual(y.shape, (0, 2))
+    x = array_ops.zeros(shape=(0, 1, 2))
+    y = self.evaluate(array_ops.unstack(x, axis=1)[0])
+    self.assertEqual(y.shape, (0, 2))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/variable_ops_test.py b/tensorflow/python/kernel_tests/variable_ops_test.py
index 3d2f8b61555f277cd67d65b27c43b81c2a45538e..769bbba47ba7b7f2815a0933377281b9cbaa86e1 100644
--- a/tensorflow/python/kernel_tests/variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/variable_ops_test.py
@@ -46,7 +46,7 @@ class VariableOpTest(test.TestCase):
       p = state_ops.variable_op(x.shape, tftype)
       op = state_ops.assign(p, x)
       op.op.run()
-      return p.eval()
+      return self.evaluate(p)
 
   def _testTypes(self, vals):
     for dtype in [np.float32, np.float64, np.int32, np.int64]:
@@ -170,14 +170,14 @@ class VariableOpTest(test.TestCase):
       var = state_ops.assign(var, [[4.0, 5.0]])
       var = state_ops.assign_add(var, [[6.0, 7.0]])
       final = gen_state_ops.destroy_temporary_variable(var, var_name="foo")
-      self.assertAllClose([[10.0, 12.0]], final.eval())
+      self.assertAllClose([[10.0, 12.0]], self.evaluate(final))
 
   def testDestroyNonexistentTemporaryVariable(self):
     with self.test_session(use_gpu=True):
       var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
       final = gen_state_ops.destroy_temporary_variable(var, var_name="bad")
       with self.assertRaises(errors.NotFoundError):
-        final.eval()
+        self.evaluate(final)
 
   def testDuplicateTemporaryVariable(self):
     with self.test_session(use_gpu=True):
@@ -189,7 +189,7 @@ class VariableOpTest(test.TestCase):
       var2 = state_ops.assign(var2, [[3.0, 4.0]])
       final = var1 + var2
       with self.assertRaises(errors.AlreadyExistsError):
-        final.eval()
+        self.evaluate(final)
 
   def testDestroyTemporaryVariableTwice(self):
     with self.test_session(use_gpu=True):
@@ -198,14 +198,14 @@ class VariableOpTest(test.TestCase):
       val2 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
       final = val1 + val2
       with self.assertRaises(errors.NotFoundError):
-        final.eval()
+        self.evaluate(final)
 
   def testTemporaryVariableNoLeak(self):
     with self.test_session(use_gpu=True):
       var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="bar")
       final = array_ops.identity(var)
-      final.eval()
+      self.evaluate(final)
 
   def testTwoTemporaryVariablesNoLeaks(self):
     with self.test_session(use_gpu=True):
@@ -214,7 +214,7 @@ class VariableOpTest(test.TestCase):
       var2 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var2")
       final = var1 + var2
-      final.eval()
+      self.evaluate(final)
 
   def testAssignDependencyAcrossDevices(self):
     with self.test_session(use_gpu=True):
@@ -229,7 +229,7 @@ class VariableOpTest(test.TestCase):
           # honored, i.e., the Send and Recv from GPU to CPU should take place
           # only after the increment.
           result = math_ops.multiply(var, var)
-      self.assertAllClose([4.0], result.eval())
+      self.assertAllClose([4.0], self.evaluate(result))
 
   def testIsVariableInitialized(self):
     for use_gpu in [True, False]:
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 2ba064f8a502fa203f156895985169ce6b50a135..838838e0ac6a9c50597c8b7372631d9041fd5eb4 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -438,15 +438,15 @@ class VariableScopeTest(test.TestCase):
         sess.run(v0)
       # We should be able to initialize and run v1 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual(1, sess.run(v1))
+      self.evaluate(v1.initializer)
+      self.assertEqual(1, self.evaluate(v1))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
         sess.run(v0)
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
         sess.run(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
+      self.evaluate(v0.initializer)
       sess.run(add)
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
@@ -490,10 +490,10 @@ class VariableScopeTest(test.TestCase):
       v2 = var_dict["v2"]
       # We should be able to initialize and run v1 and v2 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual([1], sess.run(v1))
-      sess.run(v2.initializer)
-      self.assertEqual([2], sess.run(v2))
+      self.evaluate(v1.initializer)
+      self.assertEqual([1], self.evaluate(v1))
+      self.evaluate(v2.initializer)
+      self.assertEqual([2], self.evaluate(v2))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
         sess.run(v0)
@@ -501,7 +501,7 @@ class VariableScopeTest(test.TestCase):
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
         sess.run(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
+      self.evaluate(v0.initializer)
       sess.run(add)
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
@@ -649,7 +649,7 @@ class VariableScopeTest(test.TestCase):
             "testVarScopeGetOrCreateReuse_bar",
             reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
-        self.assertEqual(value, x.eval())
+        self.assertEqual(value, self.evaluate(x))
 
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index b3eebf83168fecaf21fb3e6be7329f97dd207b52..2bb75109b1bd22c9ef1ec33e1ea37b75c8013da1 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -58,15 +58,15 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([], var1.shape)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        var0.eval()
+        self.evaluate(var0)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        var1.eval()
+        self.evaluate(var1)
 
       variables.global_variables_initializer().run()
 
-      self.assertAllClose(0.0, var0.eval())
-      self.assertAllClose(1.1, var1.eval())
+      self.assertAllClose(0.0, self.evaluate(var0))
+      self.assertAllClose(1.1, self.evaluate(var1))
 
   def testInitializationOrder(self):
     with self.cached_session():
@@ -94,8 +94,9 @@ class VariablesTestCase(test.TestCase):
 
       variables.global_variables_initializer().run()
 
-      self.assertAllClose(rnd.eval(), dep.eval())
-      self.assertAllClose(rnd.eval() + dep.eval() + 2.0, depdep.eval())
+      self.assertAllClose(rnd.eval(), self.evaluate(dep))
+      self.assertAllClose(rnd.eval() + self.evaluate(dep) + 2.0,
+                          self.evaluate(depdep))
 
   def testIterable(self):
     with self.assertRaisesRegexp(TypeError, "not iterable"):
@@ -112,16 +113,16 @@ class VariablesTestCase(test.TestCase):
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(0.0, var.eval())
+      self.assertAllClose(0.0, self.evaluate(var))
 
-      self.assertAllClose(1.0, plus_one.eval())
-      self.assertAllClose(1.0, var.eval())
+      self.assertAllClose(1.0, self.evaluate(plus_one))
+      self.assertAllClose(1.0, self.evaluate(var))
 
-      self.assertAllClose(-1.0, minus_one.eval())
-      self.assertAllClose(-1.0, var.eval())
+      self.assertAllClose(-1.0, self.evaluate(minus_one))
+      self.assertAllClose(-1.0, self.evaluate(var))
 
-      self.assertAllClose(4.0, four.eval())
-      self.assertAllClose(4.0, var.eval())
+      self.assertAllClose(4.0, self.evaluate(four))
+      self.assertAllClose(4.0, self.evaluate(var))
 
   def testResourceAssignments(self):
     with self.session(use_gpu=True):
@@ -130,16 +131,16 @@ class VariablesTestCase(test.TestCase):
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(0.0, var.eval())
+      self.assertAllClose(0.0, self.evaluate(var))
 
-      plus_one.eval()
-      self.assertAllClose(1.0, var.eval())
+      self.evaluate(plus_one)
+      self.assertAllClose(1.0, self.evaluate(var))
 
-      minus_one.eval()
-      self.assertAllClose(-1.0, var.eval())
+      self.evaluate(minus_one)
+      self.assertAllClose(-1.0, self.evaluate(var))
 
-      four.eval()
-      self.assertAllClose(4.0, var.eval())
+      self.evaluate(four)
+      self.assertAllClose(4.0, self.evaluate(var))
 
   def testZeroSizeStringAssign(self):
     with self.cached_session() as sess:
@@ -148,10 +149,10 @@ class VariablesTestCase(test.TestCase):
           name="foo",
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       old_value = array.value()
       copy_op = array.assign(old_value)
-      self.assertEqual([], list(sess.run(copy_op)))
+      self.assertEqual([], list(self.evaluate(copy_op)))
 
   def _countUpToTest(self, dtype):
     with self.cached_session():
@@ -160,24 +161,24 @@ class VariablesTestCase(test.TestCase):
       count_up_to = var.count_up_to(3)
 
       variables.global_variables_initializer().run()
-      self.assertEqual(0, var.eval())
+      self.assertEqual(0, self.evaluate(var))
 
-      self.assertEqual(0, count_up_to.eval())
-      self.assertEqual(1, var.eval())
+      self.assertEqual(0, self.evaluate(count_up_to))
+      self.assertEqual(1, self.evaluate(var))
 
-      self.assertEqual(1, count_up_to.eval())
-      self.assertEqual(2, var.eval())
+      self.assertEqual(1, self.evaluate(count_up_to))
+      self.assertEqual(2, self.evaluate(var))
 
-      self.assertEqual(2, count_up_to.eval())
-      self.assertEqual(3, var.eval())
+      self.assertEqual(2, self.evaluate(count_up_to))
+      self.assertEqual(3, self.evaluate(var))
 
       with self.assertRaisesOpError("Reached limit of 3"):
-        count_up_to.eval()
-      self.assertEqual(3, var.eval())
+        self.evaluate(count_up_to)
+      self.assertEqual(3, self.evaluate(var))
 
       with self.assertRaisesOpError("Reached limit of 3"):
-        count_up_to.eval()
-      self.assertEqual(3, var.eval())
+        self.evaluate(count_up_to)
+      self.assertEqual(3, self.evaluate(var))
 
   def testCountUpToInt32(self):
     self._countUpToTest(dtypes.int32)
@@ -220,10 +221,10 @@ class VariablesTestCase(test.TestCase):
       v2 = var_dict["v2"]
       # We should be able to initialize and run v1 and v2 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual([1], sess.run(v1))
-      sess.run(v2.initializer)
-      self.assertEqual([2], sess.run(v2))
+      self.evaluate(v1.initializer)
+      self.assertEqual([1], self.evaluate(v1))
+      self.evaluate(v2.initializer)
+      self.assertEqual([2], self.evaluate(v2))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors_impl.OpError, "uninitialized"):
         sess.run(v0)
@@ -231,7 +232,7 @@ class VariablesTestCase(test.TestCase):
       with self.assertRaisesRegexp(errors_impl.OpError, "uninitialized"):
         sess.run(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
+      self.evaluate(v0.initializer)
       sess.run(add)
 
   def testControlFlowInitialization(self):
@@ -252,8 +253,8 @@ class VariablesTestCase(test.TestCase):
       var_x = variables.Variable(2.0)
       var_y = variables.Variable(3.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(2.0, var_x.eval())
-      self.assertAllClose(3.0, var_y.eval())
+      self.assertAllClose(2.0, self.evaluate(var_x))
+      self.assertAllClose(3.0, self.evaluate(var_y))
       self.assertAllClose(5.0, math_ops.add(var_x, var_y).eval())
 
   def testZeroSizeVarSameAsConst(self):
@@ -264,7 +265,7 @@ class VariablesTestCase(test.TestCase):
       const_mul = math_ops.matmul(
           zero_size_const, zero_size_const, transpose_b=True)
       variables.global_variables_initializer().run()
-      variable_output = variable_mul.eval()
+      variable_output = self.evaluate(variable_mul)
       self.assertAllClose(const_mul.eval(), variable_output)
       self.assertAllClose([[0., 0.], [0., 0.]], variable_output)
 
@@ -349,53 +350,43 @@ class VariablesTestCase(test.TestCase):
       rmatmul = var_m.__rmatmul__([[10.0], [20.0]])
 
       variables.global_variables_initializer().run()
-      self.assertAllClose([2.0], add.eval())
-      self.assertAllClose([3.0], radd.eval())
-      self.assertAllClose([1.0], sub.eval())
-      self.assertAllClose([-1.0], rsub.eval())
-      self.assertAllClose([20.0], mul.eval())
-      self.assertAllClose([20.0], rmul.eval())
-      self.assertAllClose([0.2], div.eval())
-      self.assertAllClose([5.0], rdiv.eval())
-      self.assertAllClose([-2.0], neg.eval())
-      self.assertAllClose([2.0], abs_v.eval())
-      self.assertAllClose([True], lt.eval())
-      self.assertAllClose([False], rlt.eval())
-      self.assertAllClose([True], le.eval())
-      self.assertAllClose([True], rle.eval())
-      self.assertAllClose([False], gt.eval())
-      self.assertAllClose([True], rgt.eval())
-      self.assertAllClose([True], ge.eval())
-      self.assertAllClose([True], rge.eval())
-
-      self.assertAllClose([6], mod.eval())
-      self.assertAllClose([3], rmod.eval())
-
-      self.assertAllClose([True, False], and_v.eval())
-      self.assertAllClose([True, True], or_v.eval())
-      self.assertAllClose([True, False], xor_v.eval())
-      self.assertAllClose([False, True], invert_v.eval())
-
-      self.assertAllClose(rnd[2, 0:0], slice_v.eval())
-
-      self.assertAllClose([[80.0]], matmul.eval())
-      self.assertAllClose([[20.0, 30.0], [40.0, 60.0]], rmatmul.eval())
+      self.assertAllClose([2.0], self.evaluate(add))
+      self.assertAllClose([3.0], self.evaluate(radd))
+      self.assertAllClose([1.0], self.evaluate(sub))
+      self.assertAllClose([-1.0], self.evaluate(rsub))
+      self.assertAllClose([20.0], self.evaluate(mul))
+      self.assertAllClose([20.0], self.evaluate(rmul))
+      self.assertAllClose([0.2], self.evaluate(div))
+      self.assertAllClose([5.0], self.evaluate(rdiv))
+      self.assertAllClose([-2.0], self.evaluate(neg))
+      self.assertAllClose([2.0], self.evaluate(abs_v))
+      self.assertAllClose([True], self.evaluate(lt))
+      self.assertAllClose([False], self.evaluate(rlt))
+      self.assertAllClose([True], self.evaluate(le))
+      self.assertAllClose([True], self.evaluate(rle))
+      self.assertAllClose([False], self.evaluate(gt))
+      self.assertAllClose([True], self.evaluate(rgt))
+      self.assertAllClose([True], self.evaluate(ge))
+      self.assertAllClose([True], self.evaluate(rge))
+
+      self.assertAllClose([6], self.evaluate(mod))
+      self.assertAllClose([3], self.evaluate(rmod))
+
+      self.assertAllClose([True, False], self.evaluate(and_v))
+      self.assertAllClose([True, True], self.evaluate(or_v))
+      self.assertAllClose([True, False], self.evaluate(xor_v))
+      self.assertAllClose([False, True], self.evaluate(invert_v))
+
+      self.assertAllClose(rnd[2, 0:0], self.evaluate(slice_v))
+
+      self.assertAllClose([[80.0]], self.evaluate(matmul))
+      self.assertAllClose([[20.0, 30.0], [40.0, 60.0]], self.evaluate(rmatmul))
 
   def testSession(self):
     with self.cached_session() as sess:
       var = variables.Variable([1, 12])
       variables.global_variables_initializer().run()
-      self.assertAllClose([1, 12], sess.run(var))
-
-  def testDevicePlacement(self):
-    with self.cached_session() as sess:
-      with ops.device("/cpu:0"):
-        var = variables.Variable([1, 12])
-      init_value = var.initialized_value()
-      init_op = variables.global_variables_initializer()
-      self.assertEqual(var.op.device, init_value.device)
-      self.assertEqual(var.op.device, init_op.device)
-      sess.run(init_op)
+      self.assertAllClose([1, 12], self.evaluate(var))
 
   def testColocation(self):
     with ops.device("/job:ps"):
@@ -416,7 +407,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual(shape, v1.shape)
       self.assertAllClose(value, v1.initial_value.eval())
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        v1.eval()
+        self.evaluate(v1)
 
       v2 = variables.Variable(
           math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
@@ -425,9 +416,9 @@ class VariablesTestCase(test.TestCase):
       self.assertAllClose(np.negative(value), v2.initial_value.eval())
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        v2.eval()
+        self.evaluate(v2)
       variables.global_variables_initializer().run()
-      self.assertAllClose(np.negative(value), v2.eval())
+      self.assertAllClose(np.negative(value), self.evaluate(v2))
 
   def testConstraintArg(self):
     constraint = lambda x: x
@@ -519,7 +510,7 @@ class VariablesTestCase(test.TestCase):
       variables.global_variables_initializer().run()
       var.load(np.ones((5, 5), np.float32))
 
-      self.assertAllClose(np.ones((5, 5), np.float32), var.eval())
+      self.assertAllClose(np.ones((5, 5), np.float32), self.evaluate(var))
 
   def testRepr(self):
     var = variables.VariableV1(np.zeros((5, 5), np.float32), name="noop")
@@ -542,7 +533,7 @@ class IsInitializedTest(test.TestCase):
   def testNoVars(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       uninited = variables.report_uninitialized_variables()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testAssertVariablesInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -550,27 +541,27 @@ class IsInitializedTest(test.TestCase):
       w = variables.Variable([3, 4], name="w")
       _ = v, w
       uninited = variables.report_uninitialized_variables()
-      self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
+      self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
       variables.global_variables_initializer().run()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2], name="v")
       w = variables.VariableV1([3, 4], name="w")
       uninited = variables.report_uninitialized_variables()
-      self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
-      sess.run(w.initializer)
-      self.assertAllEqual(np.array([b"v"]), sess.run(uninited))
+      self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
+      self.evaluate(w.initializer)
+      self.assertAllEqual(np.array([b"v"]), self.evaluate(uninited))
       v.initializer.run()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testZeroSizeVarInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.Variable(array_ops.zeros([0, 2]), name="v")
       uninited = variables.report_uninitialized_variables()
       v.initializer.run()  # not strictly necessary
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testTrainingWithZeroSizeVar(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -582,7 +573,7 @@ class IsInitializedTest(test.TestCase):
       do_opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
           objective)
       sess.run([do_opt])
-      self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], b.eval())
+      self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], self.evaluate(b))
 
 
 class ObsoleteIsInitializedTest(test.TestCase):
@@ -609,7 +600,7 @@ class ObsoleteIsInitializedTest(test.TestCase):
       inited = variables.assert_variables_initialized([v])
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
-      sess.run(w.initializer)
+      self.evaluate(w.initializer)
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
       v.initializer.run()
@@ -744,34 +735,34 @@ class PartitionedVariableTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       self.assertEqual([1.0], plus_delta[0].eval())
-      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], self.evaluate(v0))
       self.assertEqual([3.0], plus_delta[1].eval())
-      self.assertEqual([3.0], v1.eval())
+      self.assertEqual([3.0], self.evaluate(v1))
 
       self.assertEqual([-2.0], minus_delta[0].eval())
-      self.assertEqual([-2.0], v0.eval())
+      self.assertEqual([-2.0], self.evaluate(v0))
       self.assertEqual([-1.0], minus_delta[1].eval())
-      self.assertEqual([-1.0], v1.eval())
+      self.assertEqual([-1.0], self.evaluate(v1))
 
       self.assertEqual([1.0], assign_ones[0].eval())
-      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], self.evaluate(v0))
       self.assertEqual([1.0], assign_ones[1].eval())
-      self.assertEqual([1.0], v1.eval())
+      self.assertEqual([1.0], self.evaluate(v1))
 
       self.assertEqual([2.0], assign_list[0].eval())
-      self.assertEqual([2.0], v2.eval())
+      self.assertEqual([2.0], self.evaluate(v2))
       self.assertEqual([3.0], assign_list[1].eval())
-      self.assertEqual([3.0], v3.eval())
+      self.assertEqual([3.0], self.evaluate(v3))
 
       self.assertEqual([3.0], assign_part_value[0].eval())
-      self.assertEqual([3.0], v2.eval())
+      self.assertEqual([3.0], self.evaluate(v2))
       self.assertEqual([4.0], assign_part_value[1].eval())
-      self.assertEqual([4.0], v3.eval())
+      self.assertEqual([4.0], self.evaluate(v3))
 
       self.assertEqual([2.0], assign_part_var[0].eval())
-      self.assertEqual([2.0], v2.eval())
+      self.assertEqual([2.0], self.evaluate(v2))
       self.assertEqual([3.0], assign_part_var[1].eval())
-      self.assertEqual([3.0], v3.eval())
+      self.assertEqual([3.0], self.evaluate(v3))
 
 
 class VariableContainerTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/weights_broadcast_test.py b/tensorflow/python/kernel_tests/weights_broadcast_test.py
index 85f9abc69f78b048c78d4d0ab908371e7a8650d3..c476004b8935dc85d90a07f15c2dee648cfc04ff 100644
--- a/tensorflow/python/kernel_tests/weights_broadcast_test.py
+++ b/tensorflow/python/kernel_tests/weights_broadcast_test.py
@@ -158,7 +158,7 @@ class BroadcastWeightsTest(test.TestCase):
     dynamic_op = weights_broadcast_ops.broadcast_weights(
         weights=weights_placeholder, values=values_placeholder)
     with self.cached_session():
-      self.assertAllEqual(expected, static_op.eval())
+      self.assertAllEqual(expected, self.evaluate(static_op))
       self.assertAllEqual(expected, dynamic_op.eval(feed_dict={
           weights_placeholder: weights,
           values_placeholder: values,
diff --git a/tensorflow/python/kernel_tests/where_op_test.py b/tensorflow/python/kernel_tests/where_op_test.py
index fca45c3ece41a50d48583ef16baca823d4607602..9e074b2304382cbc268c83c4d3fadc542977beaf 100644
--- a/tensorflow/python/kernel_tests/where_op_test.py
+++ b/tensorflow/python/kernel_tests/where_op_test.py
@@ -41,11 +41,11 @@ class WhereOpTest(test.TestCase):
       ans = array_ops.where(x)
       self.assertEqual([None, x.ndim], ans.get_shape().as_list())
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertAllClose(tf_ans, truth, atol=1e-10)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def testWrongNumbers(self):
     with self.session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index dc1bcb78b8066c83cb0d9693d7e23ce68b0463d6..48b32f06aa1b0d8e48b81bebd1665497307acb39 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -20,14 +20,15 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
@@ -47,8 +48,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     ret = while_loop_v2(lambda v: v < 8., lambda v: v * v, [x])
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
   def testMultipleLoopVarsBasic(self):
     x = constant_op.constant(5.)
@@ -64,8 +65,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # Note: This is simply d_ret[0]/d_x since d_ret[1]/d_x is 0.
     grad = gradients_impl.gradients(ret, [x])  # [2*x*y]
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(ret), [45., 3.])
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertSequenceEqual(self.evaluate(ret), [45., 3.])
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
   def testMultipleLoopVars(self):
     x = constant_op.constant(5.)
@@ -87,13 +88,13 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grady_1 = gradients_impl.gradients(ret[1], [y])  # [x + 1]
     grady_2 = gradients_impl.gradients(ret, [y])  # [2*x*y + x**2 + x + 1]
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(ret), [120., 23.])
-      self.assertSequenceEqual(sess.run(gradx_0), [39.])
-      self.assertSequenceEqual(sess.run(gradx_1), [4.])
-      self.assertSequenceEqual(sess.run(gradx_2), [43.])
-      self.assertSequenceEqual(sess.run(grady_0), [55.])
-      self.assertSequenceEqual(sess.run(grady_1), [6.])
-      self.assertSequenceEqual(sess.run(grady_2), [61.])
+      self.assertSequenceEqual(self.evaluate(ret), [120., 23.])
+      self.assertSequenceEqual(self.evaluate(gradx_0), [39.])
+      self.assertSequenceEqual(self.evaluate(gradx_1), [4.])
+      self.assertSequenceEqual(self.evaluate(gradx_2), [43.])
+      self.assertSequenceEqual(self.evaluate(grady_0), [55.])
+      self.assertSequenceEqual(self.evaluate(grady_1), [6.])
+      self.assertSequenceEqual(self.evaluate(grady_2), [61.])
 
   def testMultipleWhileLoops(self):
     x = constant_op.constant(2.)
@@ -102,8 +103,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grad = gradients_impl.gradients(ret2, [x])  # 4x**3
     grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(grad), [32.])
-      self.assertSequenceEqual(sess.run(grad_grad), [48.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
   def testDoubleDerivative(self):
     x = constant_op.constant(2.)
@@ -111,9 +112,9 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grad = gradients_impl.gradients(ret, [x])  # 4x**3
     grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
-      self.assertSequenceEqual(sess.run(grad_grad), [48.])
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
   def testPruning(self):
     x = constant_op.constant(1)
@@ -135,10 +136,12 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
 
     def GetOptimizedGraph():
       mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
-      rewriter_config = rewriter_config_pb2.RewriterConfig(
-          constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-          memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
-      return tf_optimizer.OptimizeGraph(rewriter_config, mg)
+      config = config_pb2.ConfigProto()
+      config.graph_options.rewrite_options.CopyFrom(
+          rewriter_config_pb2.RewriterConfig(
+              constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+              memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+      return tf_optimizer.OptimizeGraph(config, mg)
 
     g = GetOptimizedGraph()
     self.assertEqual(len([n for n in g.node if n.op == "Enter"]), 1)
@@ -154,8 +157,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     ret = while_loop_v2(lambda v: v + y < 9., lambda v: v * 3., [x])
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 18.)
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertEqual(self.evaluate(ret), 18.)
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
   def testCaptureExternalTensorInBody(self):
     x = constant_op.constant(2.)
@@ -163,8 +166,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     ret = while_loop_v2(lambda v: v < 8., lambda v: v * y, [x])
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 18.)
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertEqual(self.evaluate(ret), 18.)
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
   def testLoopWithTensorListPushBack(self):
     x = constant_op.constant(2.)
@@ -185,7 +188,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grad = gradients_impl.gradients(ret[0], x)
     with self.cached_session() as sess:
       self.assertEqual(sess.run(ret[0]), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
   def testDuplicateAccumulator(self):
     x = constant_op.constant(2.)
@@ -219,7 +222,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grad = gradients_impl.gradients(ret[0], x)
     with self.cached_session() as sess:
       self.assertEqual(sess.run(ret[0]), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
   @parameterized.named_parameters(
       ("UnknownShape", None),
@@ -262,7 +265,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
 
     # Gradient pass.
     grad = gradients_impl.gradients(ret[1], y)
-    grad_while_op = grad[0].op
+    grad_while_op = grad[0].op.inputs[0].op
     # Get the TensorList output of gradient While op containing the accumulated
     # values of grad_y.
     # grad_while_op.inputs:
@@ -305,18 +308,16 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         self.assertRegexpMatches(
             while2_op.get_attr("body").name, r"foo_while_1_body_\d*")
 
+  @test_util.enable_control_flow_v2
   def testWhileAndTensorArray(self):
-    old_enable_while_v2 = control_flow_ops.ENABLE_WHILE_V2
-    control_flow_ops.ENABLE_WHILE_V2 = True
     with self.cached_session() as sess:
       param = constant_op.constant(2.0)
       y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
       # map_fn uses TensorArray internally.
       r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
-      self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], sess.run(r))
+      self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
       r = gradients_impl.gradients(r, param)[0]
-      self.assertAllClose(21.0, sess.run(r))
-    control_flow_ops.ENABLE_WHILE_V2 = old_enable_while_v2
+      self.assertAllClose(21.0, self.evaluate(r))
 
   def testNestedWhile(self):
     # Compute sum of geometric progression: n^0 + n^1 + ... + n^m
@@ -333,8 +334,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     result = while_loop_v2(lambda i, _: i >= 0, Body, [m, sum_of_powers])[1]
     grad = gradients_impl.gradients(result, [n])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(result), 364.)
-      self.assertSequenceEqual(sess.run(grad), [547.])
+      self.assertEqual(self.evaluate(result), 364.)
+      self.assertSequenceEqual(self.evaluate(grad), [547.])
 
   def testIdentityNodeInBody(self):
 
@@ -347,8 +348,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     ret = while_loop_v2(lambda v: v < 8., Body, [x])
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
   def testNestedWhileAndTensorArray(self):
     n = constant_op.constant(3.0)
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index c3c7f867a1e34efd98ca8a84e8f2d2a002b75ac9..bd3142132c7545e89fe023cb6141398a0c7bc326 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -65,7 +65,7 @@ class XentTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       loss = nn_ops.softmax_cross_entropy_with_logits(
           labels=np_labels, logits=np_features, dim=dim)
-      tf_loss = sess.run(loss)
+      tf_loss = self.evaluate(loss)
     print("np_loss:", np_loss)
     print("tf_loss:", tf_loss)
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
@@ -280,7 +280,7 @@ class XentTest(test.TestCase):
     with self.session(use_gpu=True) as sess:
       loss = nn_ops.softmax_cross_entropy_with_logits(
           labels=labels, logits=features)
-      tf_loss = sess.run(loss)
+      tf_loss = self.evaluate(loss)
     self.assertAllEqual(np_loss, tf_loss)
 
 
diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py
index e68b96e670f914b0f243aa2617d378f2430fbdc2..7c82f9320a1630a73754387028e5fb2888391152 100644
--- a/tensorflow/python/kernel_tests/zero_division_test.py
+++ b/tensorflow/python/kernel_tests/zero_division_test.py
@@ -21,13 +21,14 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class ZeroDivisionTest(test.TestCase):
 
   def testZeros(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       for dtype in dtypes.uint8, dtypes.int16, dtypes.int32, dtypes.int64:
         zero = constant_op.constant(0, dtype=dtype)
         one = constant_op.constant(1, dtype=dtype)
@@ -36,7 +37,7 @@ class ZeroDivisionTest(test.TestCase):
           bads.append(one % zero)
         for bad in bads:
           try:
-            result = bad.eval()
+            result = self.evaluate(bad)
           except errors_impl.OpError as e:
             # Ideally, we'd get a nice exception.  In theory, this should only
             # happen on CPU, but 32 bit integer GPU division is actually on
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 24f6a098b3571dc1022958aa005d35faaef3aac5..42086e4c3e9f085c50cf503e309461da5e6ffcca 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -30,10 +30,10 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-
+# Avoid breaking users who directly import this symbol from this file.
+# TODO(fchollet): remove this.
 InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
-
 _KERAS_STYLE_SCOPE = False
 
 
@@ -208,6 +208,9 @@ class Layer(base_layer.Layer):
         raise ValueError(
             'reuse argument not allowed when keras style layers are enabled, '
             'but saw: {}'.format(self._reuse))
+      self._keras_style = True
+    else:
+      self._keras_style = False
 
     self._graph = None
     self._call_has_scope_arg = 'scope' in self._call_fn_args
@@ -275,7 +278,7 @@ class Layer(base_layer.Layer):
 
   def _name_scope(self):
     """Determines op naming for the Layer."""
-    if _is_in_keras_style_scope():
+    if self._keras_style:
       return super(Layer, self)._name_scope()
     return self._current_scope.original_name_scope
 
@@ -349,7 +352,7 @@ class Layer(base_layer.Layer):
       ValueError: When trainable has been set to True with synchronization
         set as `ON_READ`.
     """
-    if _is_in_keras_style_scope():
+    if self._keras_style:
       return super(Layer, self).add_weight(
           name=name,
           shape=shape,
@@ -477,7 +480,7 @@ class Layer(base_layer.Layer):
     """
     scope = kwargs.pop('scope', None)
 
-    if _is_in_keras_style_scope():
+    if self._keras_style:
       if scope is not None:
         raise ValueError(
             'scope argument not allowed when keras style layers are enabled, '
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 7a0ed63a491f4987df81ec7e1f69d692b4b08b0b..45099677e0f417f5f7e8e88b8203b0d6534a73b2 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import base_layer as keras_base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layers
 from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
@@ -76,11 +77,11 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(variable.name, 'my_layer/my_var:0')
 
     with base_layers.keras_style_scope():
-      with ops.name_scope('bar'):
-        layer = base_layers.Layer(name='my_layer')
-        # Test basic variable creation.
-        variable = layer.add_variable(
-            'my_var', [2, 2], initializer=init_ops.zeros_initializer())
+      layer = base_layers.Layer(name='my_layer')
+    # Test basic variable creation.
+    with ops.name_scope('bar'):
+      variable = layer.add_variable(
+          'my_var', [2, 2], initializer=init_ops.zeros_initializer())
     self.assertEqual(variable.name, 'bar/my_var:0')
 
   @test_util.run_in_graph_and_eager_modes
@@ -251,7 +252,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(ndim=2)
+        self.input_spec = input_spec.InputSpec(ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -278,7 +279,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(min_ndim=2)
+        self.input_spec = input_spec.InputSpec(min_ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -306,7 +307,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(max_ndim=2)
+        self.input_spec = input_spec.InputSpec(max_ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -334,7 +335,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(dtype='float32')
+        self.input_spec = input_spec.InputSpec(dtype='float32')
 
       def call(self, inputs):
         return inputs
@@ -354,7 +355,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(axes={-1: 2})
+        self.input_spec = input_spec.InputSpec(axes={-1: 2})
 
       def call(self, inputs):
         return inputs
@@ -376,7 +377,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(shape=(None, 3))
+        self.input_spec = input_spec.InputSpec(shape=(None, 3))
 
       def call(self, inputs):
         return inputs
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index ce8ecae66d3910868677921a575497d5f30e4d9b..5d4805e245e17376e8719466868326b34d7cf12d 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -114,6 +115,9 @@ class Conv1D(keras_layers.Conv1D, base.Layer):
         name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv1d instead.')
 @tf_export(v1=['layers.conv1d'])
 def conv1d(inputs,
            filters,
@@ -310,6 +314,9 @@ class Conv2D(keras_layers.Conv2D, base.Layer):
         name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv2d instead.')
 @tf_export(v1=['layers.conv2d'])
 def conv2d(inputs,
            filters,
@@ -514,6 +521,9 @@ class Conv3D(keras_layers.Conv3D, base.Layer):
         name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv3d instead.')
 @tf_export(v1=['layers.conv3d'])
 def conv3d(inputs,
            filters,
@@ -841,6 +851,9 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
         **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.separable_conv1d instead.')
 @tf_export(v1=['layers.separable_conv1d'])
 def separable_conv1d(inputs,
                      filters,
@@ -958,6 +971,9 @@ def separable_conv1d(inputs,
   return layer.apply(inputs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.separable_conv2d instead.')
 @tf_export(v1=['layers.separable_conv2d'])
 def separable_conv2d(inputs,
                      filters,
@@ -1165,6 +1181,9 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
         **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv2d_transpose instead.')
 @tf_export(v1=['layers.conv2d_transpose'])
 def conv2d_transpose(inputs,
                      filters,
@@ -1342,6 +1361,9 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
         **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv3d_transpose instead.')
 @tf_export(v1=['layers.conv3d_transpose'])
 def conv3d_transpose(inputs,
                      filters,
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 257fa27156749713bd35f22f82b7cc6c81c23a70..d3200fa5b57e699be5d102345e6d1df4675f8ab8 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -276,8 +276,8 @@ class ConvTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 32)))
         # Check that the bias still got initialized to zeros.
@@ -663,8 +663,8 @@ class SeparableConv2DTest(test.TestCase):
         self.assertTrue('depthwise_kernel' in weights[0].name)
         self.assertTrue('pointwise_kernel' in weights[1].name)
         self.assertTrue('bias' in weights[2].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 1)))
         self.assertAllClose(weights[1], np.ones((1, 1, 3, 32)))
@@ -902,8 +902,8 @@ class Conv2DTransposeTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 32, 3)))
         # Check that the bias still got initialized to zeros.
@@ -1084,8 +1084,8 @@ class Conv3DTransposeTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 4, 32)))
         # Check that the bias still got initialized to zeros.
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 4eb243ab5d56b26c0846b8fb3b1d1e6bcd88a9a9..b2d54a98272be53b69872e900901d9552177a172 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -25,6 +25,7 @@ from __future__ import print_function
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -109,6 +110,9 @@ class Dense(keras_layers.Dense, base.Layer):
                                 **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.dense instead.')
 @tf_export(v1=['layers.dense'])
 def dense(
     inputs, units,
@@ -223,6 +227,9 @@ class Dropout(keras_layers.Dropout, base.Layer):
     return super(Dropout, self).call(inputs, training=training)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.dropout instead.')
 @tf_export(v1=['layers.dropout'])
 def dropout(inputs,
             rate=0.5,
@@ -291,6 +298,9 @@ class Flatten(keras_layers.Flatten, base.Layer):
   pass
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.flatten instead.')
 @tf_export(v1=['layers.flatten'])
 def flatten(inputs, name=None, data_format='channels_last'):
   """Flattens an input tensor while preserving the batch axis (axis 0).
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 0343bfa8bd2d0fdfd80bd49709fa734d8df8f7ec..a61639b2db8253aa37c4ef67c8d60cd5ab8803ce 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -443,7 +443,7 @@ class DropoutTest(test.TestCase):
       dp = core_layers.Dropout(rate, name='dropout')
       inputs = array_ops.ones((5, 5))
       dropped = dp.apply(inputs, training=True)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_output = sess.run(dropped, feed_dict={rate: 0.5})
       self.assertAlmostEqual(0., np_output.min())
       np_output = sess.run(dropped, feed_dict={rate: 0.0})
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 11a2ebc040f0177e38d5b0f38cf609071f91fd07..93eec38a08c476a746fa5ee1604076ce1e4e904f 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -24,7 +24,7 @@ from __future__ import print_function
 
 # Base objects.
 from tensorflow.python.layers.base import Layer
-from tensorflow.python.layers.base import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 
 # Core layers.
 from tensorflow.python.layers.core import Dense
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index ec6615ea86657b7568fb4263a1f0eb520d4c808f..7eefb294cd6f1f8c7194d68f5a76bfba220e0493 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -154,6 +155,9 @@ class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
     return super(BatchNormalization, self).call(inputs, training=training)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.batch_normalization instead.')
 @tf_export(v1=['layers.batch_normalization'])
 def batch_normalization(inputs,
                         axis=-1,
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index ba2bf10cf3aa558bde2253f6fe6d44f37f9efb4c..febc3587fe9b01725c69fb711aecdc4d4edb6e62 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -78,7 +78,7 @@ class BNTest(test.TestCase):
       if restore:
         saver.restore(sess, checkpoint_path)
       else:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
       np.random.seed(0)
       for _ in range(2):
         image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
@@ -321,7 +321,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 4, 1))
@@ -337,7 +337,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 2))
       std = np.std(np_inputs, axis=(0, 2))
       variance = np.square(std)
@@ -363,7 +363,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 3))
       np_beta = np.reshape(np_beta, (1, 1, 3))
@@ -377,7 +377,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1))
       std = np.std(np_inputs, axis=(0, 1))
       variance = np.square(std)
@@ -404,7 +404,7 @@ class BNTest(test.TestCase):
 
       with self.session(use_gpu=True) as sess:
         # Test training with placeholder learning phase.
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
         np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
         np_beta = np.reshape(np_beta, (1, 4, 1, 1))
@@ -418,7 +418,7 @@ class BNTest(test.TestCase):
 
         # Verify that the statistics are updated during training.
         moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-        np_inputs = sess.run(inputs)
+        np_inputs = self.evaluate(inputs)
         mean = np.mean(np_inputs, axis=(0, 2, 3))
         std = np.std(np_inputs, axis=(0, 2, 3))
         variance = np.square(std)
@@ -444,7 +444,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 3, 1))
       np_beta = np.reshape(np_beta, (1, 1, 3, 1))
@@ -458,7 +458,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 3))
       std = np.std(np_inputs, axis=(0, 1, 3))
       variance = np.square(std)
@@ -484,7 +484,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
@@ -498,7 +498,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -524,7 +524,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
@@ -538,7 +538,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -565,7 +565,7 @@ class BNTest(test.TestCase):
 
       with self.cached_session() as sess:
         # Test training with placeholder learning phase.
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
         np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
         np_beta = np.reshape(np_beta, (1, 4, 1, 1))
@@ -579,7 +579,7 @@ class BNTest(test.TestCase):
 
         # Verify that the statistics are updated during training.
         moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-        np_inputs = sess.run(inputs)
+        np_inputs = self.evaluate(inputs)
         mean = np.mean(np_inputs, axis=(0, 2, 3))
         std = np.std(np_inputs, axis=(0, 2, 3))
         variance = np.square(std)
@@ -605,7 +605,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
@@ -620,7 +620,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -646,7 +646,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
@@ -659,7 +659,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -667,7 +667,7 @@ class BNTest(test.TestCase):
       self.assertAllClose(variance, moving_var, atol=1e-2)
 
       # Test inference with placeholder learning phase.
-      np_output = sess.run(outputs_infer)
+      np_output = self.evaluate(outputs_infer)
 
       # Verify that the axis is normalized during inference.
       normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
@@ -696,7 +696,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([gamma, beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
@@ -710,7 +710,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       np_mean = np.mean(np_inputs, axis=(0, 1, 2))
       np_std = np.std(np_inputs, axis=(0, 1, 2))
       np_variance = np.square(np_std)
@@ -758,14 +758,14 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(100):
         np_output, _, _ = sess.run([outputs2] + updates,
                                    feed_dict={training: True})
 
       # Verify that the statistics are updated during training.
       np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance])
-      np_inputs = sess.run(inputs2)
+      np_inputs = self.evaluate(inputs2)
       np_mean = np.mean(np_inputs, axis=(0, 1, 2))
       np_std = np.std(np_inputs, axis=(0, 1, 2))
       np_variance = np.square(np_std)
@@ -885,7 +885,7 @@ class BNTest(test.TestCase):
     renorm_mean = renorm_stddev = 0.
     renorm_weight = 0.
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -937,7 +937,7 @@ class BNTest(test.TestCase):
     moving_mean = 0.
     moving_variance = 1.
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
         yt_val_train, adj_scale_val, adj_bias_val = sess.run(
@@ -990,7 +990,7 @@ class BNTest(test.TestCase):
     renorm_mean = renorm_stddev = 0.
     renorm_weight = 0.
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
         yt_val_train, adj_scale_val, adj_bias_val = sess.run(
@@ -1040,7 +1040,7 @@ class BNTest(test.TestCase):
         out1.shape.as_list(), out2.shape.as_list())
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       x = np.random.random(shape)
       y1, y2 = sess.run([out1, out2], feed_dict={inp: x})
@@ -1062,7 +1062,7 @@ class BNTest(test.TestCase):
         inp, virtual_batch_size=2)
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       x = np.random.random(np_shape)
       y = sess.run(out, feed_dict={inp: x})
@@ -1093,7 +1093,7 @@ class BNTest(test.TestCase):
                     shape[1]])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1146,7 +1146,7 @@ class BNTest(test.TestCase):
                    shape[1:])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1200,7 +1200,7 @@ class BNTest(test.TestCase):
                    shape[1:])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1256,7 +1256,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
 
@@ -1270,7 +1270,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=0, keepdims=True)
       std = np.std(np_inputs, axis=0, keepdims=True)
       variance = np.square(std)
@@ -1296,7 +1296,7 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
 
@@ -1310,7 +1310,7 @@ class BNTest(test.TestCase):
 
       # Verify that the statistics are updated during training.
       moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 4), keepdims=True)
       std = np.std(np_inputs, axis=(0, 4), keepdims=True)
       variance = np.square(std)
@@ -1350,7 +1350,7 @@ class BNTest(test.TestCase):
                    shape[1:])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 81e9ebd54772b0656acaaea7a98cd69ff2e24845..d123afc6231fb7d49ac4d610c5ca30c324a55de3 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -57,6 +58,9 @@ class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
         **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.average_pooling1d instead.')
 @tf_export(v1=['layers.average_pooling1d'])
 def average_pooling1d(inputs, pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -125,6 +129,9 @@ class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
         **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.max_pooling1d instead.')
 @tf_export(v1=['layers.max_pooling1d'])
 def max_pooling1d(inputs, pool_size, strides,
                   padding='valid', data_format='channels_last',
@@ -193,6 +200,9 @@ class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.average_pooling2d instead.')
 @tf_export(v1=['layers.average_pooling2d'])
 def average_pooling2d(inputs,
                       pool_size, strides,
@@ -264,6 +274,9 @@ class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.max_pooling2d instead.')
 @tf_export(v1=['layers.max_pooling2d'])
 def max_pooling2d(inputs,
                   pool_size, strides,
@@ -337,6 +350,9 @@ class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.average_pooling3d instead.')
 @tf_export(v1=['layers.average_pooling3d'])
 def average_pooling3d(inputs,
                       pool_size, strides,
@@ -412,6 +428,9 @@ class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.max_pooling3d instead.')
 @tf_export(v1=['layers.max_pooling3d'])
 def max_pooling3d(inputs,
                   pool_size, strides,
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 6189503d8f5416e45a022abfa4f8bcad2da64c66..9364aec373df9575282ae9254bce50a307bf61a0 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -177,8 +177,7 @@ tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
                                                 const Device* expected_device,
                                                 const Tensor** output_tensor) {
   auto handle = EagerTensor_Handle(eager_tensor)->handle;
-  Device* actual_device = nullptr;
-  TF_RETURN_IF_ERROR(handle->Device(&actual_device));
+  Device* actual_device = handle->device();
   TF_RETURN_IF_ERROR(handle->Tensor(output_tensor));
   // actual_device may be nullptr, which implies local CPU.
   if (expected_device == actual_device) return Status::OK();
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index f22fb253e4d59813226f0e9741cabcfbf0cdcd1a..c8aa5311d934e349c70c185ec09abf911886d245 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -241,7 +241,7 @@ class FileIO(object):
     self._writable_file = None
 
 
-@tf_export("gfile.Exists")
+@tf_export(v1=["gfile.Exists"])
 def file_exists(filename):
   """Determines whether a path exists or not.
 
@@ -252,12 +252,29 @@ def file_exists(filename):
     True if the path exists, whether its a file or a directory.
     False if the path does not exist and there are no filesystem errors.
 
+  Raises:
+    errors.OpError: Propagates any errors reported by the FileSystem API.
+  """
+  return file_exists_v2(filename)
+
+
+@tf_export("io.gfile.exists", v1=[])
+def file_exists_v2(path):
+  """Determines whether a path exists or not.
+
+  Args:
+    path: string, a path
+
+  Returns:
+    True if the path exists, whether its a file or a directory.
+    False if the path does not exist and there are no filesystem errors.
+
   Raises:
     errors.OpError: Propagates any errors reported by the FileSystem API.
   """
   try:
     with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.FileExists(compat.as_bytes(filename), status)
+      pywrap_tensorflow.FileExists(compat.as_bytes(path), status)
   except errors.NotFoundError:
     return False
   return True
diff --git a/tensorflow/python/lib/io/python_io.py b/tensorflow/python/lib/io/python_io.py
index 404423ce07b3bbee89266a7154405c72da067a02..8223d3092fc0853d02ebea5f3a117d34472077c1 100644
--- a/tensorflow/python/lib/io/python_io.py
+++ b/tensorflow/python/lib/io/python_io.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Python functions for directly manipulating TFRecord-formatted files.
-
-See the [Python IO](https://tensorflow.org/api_guides/python/python_io) guide.
-"""
+"""Python functions for directly manipulating TFRecord-formatted files."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index b7fae8529559efd1369db1364e730fbbc5d1df5a..43086ab18d7774f54be2b393deccec6be180801f 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -150,10 +150,11 @@ class TFRecordOptions(object):
     return options
 
 
-@tf_export(
-    "io.tf_record_iterator",
-    v1=["io.tf_record_iterator", "python_io.tf_record_iterator"])
-@deprecation.deprecated_endpoints("python_io.tf_record_iterator")
+@tf_export(v1=["io.tf_record_iterator", "python_io.tf_record_iterator"])
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use eager execution and: \n"
+                  "`tf.data.TFRecordDataset(path)`"))
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 68c392bf28d19fda8e39905560f04e4810c203f7..6edc1933619d321dd1c1ff33c8e24cefab64c244 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -489,10 +489,12 @@ def _GatherNdGrad(op, grad):
 
 
 @ops.RegisterGradient("CheckNumerics")
-def _CheckNumericsGrad(_, grad):
+def _CheckNumericsGrad(op, grad):
   """Gradient for check_numerics op."""
   return array_ops.check_numerics(
-      grad, "Not a number (NaN) or infinity (Inf) values detected in gradient.")
+      grad,
+      "Not a number (NaN) or infinity (Inf) values detected in gradient. %s" %
+      op.get_attr("message"))
 
 
 @ops.RegisterGradient("PlaceholderWithDefault")
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ee328df208e667fa9fbb79b96cf584ed9c966e94..ed050d740e15fd9abf5f2c9b1115676e01e2f984 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # Tests for this file live in python/kernel_tests/array_ops_test.py
-"""Support for manipulating tensors.
-
-See the [Array Ops](https://tensorflow.org/api_guides/python/array_ops) guide.
-"""
+"""Support for manipulating tensors."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,6 +22,7 @@ from __future__ import print_function
 import sys
 
 import numpy as np
+import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
@@ -82,7 +80,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin,protected-access
-@tf_export("expand_dims")
+@tf_export(v1=["expand_dims"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
@@ -123,7 +121,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
     axis: 0-D (scalar). Specifies the dimension index at which to
       expand the shape of `input`. Must be in the range
       `[-rank(input) - 1, rank(input)]`.
-    name: The name of the output `Tensor`.
+    name: The name of the output `Tensor` (optional).
     dim: 0-D (scalar). Equivalent to `axis`, to be deprecated.
 
   Returns:
@@ -131,9 +129,60 @@ def expand_dims(input, axis=None, name=None, dim=None):
     dimension of size 1 added.
 
   Raises:
-    ValueError: if both `dim` and `axis` are specified.
+    ValueError: if either both or neither of `dim` and `axis` are specified.
   """
   axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
+    raise ValueError("Must specify an axis argument to tf.expand_dims()")
+  return expand_dims_v2(input, axis, name)
+
+
+@tf_export("expand_dims", v1=[])
+def expand_dims_v2(input, axis, name=None):
+  """Inserts a dimension of 1 into a tensor's shape.
+
+  Given a tensor `input`, this operation inserts a dimension of 1 at the
+  dimension index `axis` of `input`'s shape. The dimension index `axis` starts
+  at zero; if you specify a negative number for `axis` it is counted backward
+  from the end.
+
+  This operation is useful if you want to add a batch dimension to a single
+  element. For example, if you have a single image of shape `[height, width,
+  channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+  which will make the shape `[1, height, width, channels]`.
+
+  Other examples:
+
+  ```python
+  # 't' is a tensor of shape [2]
+  tf.shape(tf.expand_dims(t, 0))  # [1, 2]
+  tf.shape(tf.expand_dims(t, 1))  # [2, 1]
+  tf.shape(tf.expand_dims(t, -1))  # [2, 1]
+
+  # 't2' is a tensor of shape [2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 0))  # [1, 2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 2))  # [2, 3, 1, 5]
+  tf.shape(tf.expand_dims(t2, 3))  # [2, 3, 5, 1]
+  ```
+
+  This operation requires that:
+
+  `-1-input.dims() <= dim <= input.dims()`
+
+  This operation is related to `squeeze()`, which removes dimensions of
+  size 1.
+
+  Args:
+    input: A `Tensor`.
+    axis: 0-D (scalar). Specifies the dimension index at which to
+      expand the shape of `input`. Must be in the range
+      `[-rank(input) - 1, rank(input)]`.
+    name: The name of the output `Tensor` (optional).
+
+  Returns:
+    A `Tensor` with the same data as `input`, but its shape has an additional
+    dimension of size 1 added.
+  """
   return gen_array_ops.expand_dims(input, axis, name)
 
 
@@ -156,7 +205,11 @@ listdiff.__doc__ = gen_array_ops.list_diff.__doc__ + "\n" + listdiff.__doc__
 
 
 # pylint: disable=undefined-variable
-@tf_export("setdiff1d")
+@deprecation.deprecated(
+    "2018-11-30",
+    "This op will be removed after the deprecation date. "
+    "Please switch to tf.sets.difference().")
+@tf_export(v1=["setdiff1d"])
 def setdiff1d(x, y, index_dtype=dtypes.int32, name=None):
   return gen_array_ops.list_diff(x, y, index_dtype, name)
 
@@ -166,7 +219,18 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__
 
 @tf_export("broadcast_dynamic_shape")
 def broadcast_dynamic_shape(shape_x, shape_y):
-  """Returns the broadcasted dynamic shape between `shape_x` and `shape_y`.
+  """Computes the shape of a broadcast given symbolic shapes.
+
+  When shape_x and shape_y are Tensors representing shapes (i.e. the result of
+  calling tf.shape on another Tensor) this computes a Tensor which is the shape
+  of the result of a broadcasting op applied in tensors of shapes shape_x and
+  shape_y.
+
+  For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a
+  Tensor whose value is [5, 2, 3].
+
+  This is useful when validating the result of a broadcasting operation when the
+  tensors do not have statically known shapes.
 
   Args:
     shape_x: A rank 1 integer `Tensor`, representing the shape of x.
@@ -180,7 +244,17 @@ def broadcast_dynamic_shape(shape_x, shape_y):
 
 @tf_export("broadcast_static_shape")
 def broadcast_static_shape(shape_x, shape_y):
-  """Returns the broadcasted static shape between `shape_x` and `shape_y`.
+  """Computes the shape of a broadcast given known shapes.
+
+  When shape_x and shape_y are fully known TensorShapes this computes a
+  TensorShape which is the shape of the result of a broadcasting op applied in
+  tensors of shapes shape_x and shape_y.
+
+  For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a
+  TensorShape whose value is [5, 2, 3].
+
+  This is useful when validating the result of a broadcasting operation when the
+  tensors have statically known shapes.
 
   Args:
     shape_x: A `TensorShape`
@@ -195,7 +269,13 @@ def broadcast_static_shape(shape_x, shape_y):
   return common_shapes.broadcast_shape(shape_x, shape_y)
 
 
-@tf_export("shape")
+@tf_export("shape", v1=[])
+def shape_v2(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  return shape(input, name, out_type)
+
+
+@tf_export(v1=["shape"])
 def shape(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the shape of a tensor.
@@ -268,7 +348,13 @@ def shape_n(input, out_type=dtypes.int32, name=None):
   return gen_array_ops.shape_n(input, out_type=out_type, name=name)
 
 
-@tf_export("size")
+@tf_export("size", v1=[])
+def size_v2(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  return size(input, name, out_type)
+
+
+@tf_export(v1=["size"])
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -392,6 +478,36 @@ def rank_internal(input, name=None, optimize=True):
       return gen_array_ops.rank(input, name=name)
 
 
+_SLICE_TYPE_ERROR = (
+    "Only integers, slices (`:`), ellipsis (`...`), "
+    "tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid "
+    "indices")
+
+_SUPPORTED_SLICE_DTYPES = (
+    dtypes.int32,
+    dtypes.int32_ref,
+    dtypes.int64,
+    dtypes.int64_ref
+)
+
+
+def _check_index(idx):
+  """Check if a given value is a valid index into a tensor."""
+  if isinstance(idx, (six.integer_types, tensor_shape.Dimension)):
+    return
+
+  # Optimistic check. Assumptions:
+  # * any object with a dtype is supported
+  # * any object with a dtype has a sizeable shape attribute.
+  dtype = getattr(idx, "dtype", None)
+  if (dtype is None or
+      dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or
+      idx.shape and len(idx.shape) == 1):
+    # TODO(slebedev): IndexError seems more appropriate here, but it
+    # will break `_slice_helper` contract.
+    raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx))
+
+
 def _slice_helper(tensor, slice_spec, var=None):
   """Overload for Tensor.__getitem__.
 
@@ -445,7 +561,8 @@ def _slice_helper(tensor, slice_spec, var=None):
 
   Raises:
     ValueError: If a slice range is negative size.
-    TypeError: If the slice indices aren't int, slice, or Ellipsis.
+    TypeError: If the slice indices aren't int, slice, ellipsis,
+      tf.newaxis or scalar int32/int64 tensors.
   """
 
   if not isinstance(slice_spec, (list, tuple)):
@@ -463,16 +580,19 @@ def _slice_helper(tensor, slice_spec, var=None):
       # for example a[:] gives slice(None,sys.maxsize,None)
       # whereas a[::1] gives slice(None,None,None)
       if s.start is not None and s.start is not sys.maxsize:
+        _check_index(s.start)
         begin.append(s.start)
       else:
         begin.append(0)
         begin_mask |= (1 << index)
       if s.stop is not None and s.stop != sys.maxsize:
+        _check_index(s.stop)
         end.append(s.stop)
       else:
         end.append(0)
         end_mask |= (1 << index)
       if s.step is not None:
+        _check_index(s.step)
         strides.append(s.step)
       else:
         strides.append(1)
@@ -487,6 +607,7 @@ def _slice_helper(tensor, slice_spec, var=None):
       strides.append(1)
       new_axis_mask |= (1 << index)
     else:
+      _check_index(s)
       begin.append(s)
       end.append(s + 1)
       strides.append(1)
@@ -756,7 +877,8 @@ def _SliceHelperVar(var, slice_spec):
 
   Raises:
     ValueError: If a slice range is negative size.
-    TypeError: If the slice indices aren't int, slice, or Ellipsis.
+    TypeError: TypeError: If the slice indices aren't int, slice,
+      ellipsis, tf.newaxis or int32/int64 tensors.
 
   """
 
@@ -1124,7 +1246,7 @@ def concat(values, axis, name="concat"):
   return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
 
 
-@tf_export("boolean_mask")
+@tf_export(v1=["boolean_mask"])
 def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
   """Apply boolean mask to tensor.  Numpy equivalent is `tensor[mask]`.
 
@@ -1204,6 +1326,54 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
     return _apply_mask_1d(tensor, mask, axis)
 
 
+@tf_export("boolean_mask", v1=[])
+def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"):
+  """Apply boolean mask to tensor.
+
+  Numpy equivalent is `tensor[mask]`.
+
+  ```python
+  # 1-D example
+  tensor = [0, 1, 2, 3]
+  mask = np.array([True, False, True, False])
+  boolean_mask(tensor, mask)  # [0, 2]
+  ```
+
+  In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match
+  the first K dimensions of `tensor`'s shape.  We then have:
+    `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]`
+  where `(i1,...,iK)` is the ith `True` entry of `mask` (row-major order).
+  The `axis` could be used with `mask` to indicate the axis to mask from.
+  In that case, `axis + dim(mask) <= dim(tensor)` and `mask`'s shape must match
+  the first `axis + dim(mask)` dimensions of `tensor`'s shape.
+
+  Args:
+    tensor:  N-D tensor.
+    mask:  K-D boolean tensor, K <= N and K must be known statically.
+    axis:  A 0-D int Tensor representing the axis in `tensor` to mask from. By
+      default, axis is 0 which will mask from the first dimension. Otherwise K +
+      axis <= N.
+    name:  A name for this operation (optional).
+
+  Returns:
+    (N-K+1)-dimensional tensor populated by entries in `tensor` corresponding
+    to `True` values in `mask`.
+
+  Raises:
+    ValueError:  If shapes do not conform.
+
+  Examples:
+
+  ```python
+  # 2-D example
+  tensor = [[1, 2], [3, 4], [5, 6]]
+  mask = np.array([True, False, True])
+  boolean_mask(tensor, mask)  # [[1, 2], [5, 6]]
+  ```
+  """
+  return boolean_mask(tensor, mask, name, axis)
+
+
 @tf_export("sparse.mask", v1=["sparse.mask", "sparse_mask"])
 @deprecation.deprecated_endpoints("sparse_mask")
 def sparse_mask(a, mask_indices, name=None):
@@ -1337,7 +1507,75 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
       value=value, size_splits=size_splits, axis=axis, num_split=num, name=name)
 
 
-@tf_export("transpose")
+@tf_export("transpose", v1=[])
+def transpose_v2(a, perm=None, conjugate=False, name="transpose"):
+  """Transposes `a`. Permutes the dimensions according to `perm`.
+
+  The returned tensor's dimension i will correspond to the input dimension
+  `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is
+  the rank of the input tensor. Hence by default, this operation performs a
+  regular matrix transpose on 2-D input Tensors. If conjugate is True and
+  `a.dtype` is either `complex64` or `complex128` then the values of `a`
+  are conjugated and transposed.
+
+  @compatibility(numpy)
+  In `numpy` transposes are memory-efficient constant time operations as they
+  simply return a new view of the same data with adjusted `strides`.
+
+  TensorFlow does not support strides, so `transpose` returns a new tensor with
+  the items permuted.
+  @end_compatibility
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.transpose(x)  # [[1, 4]
+                   #  [2, 5]
+                   #  [3, 6]]
+
+  # Equivalently
+  tf.transpose(x, perm=[1, 0])  # [[1, 4]
+                                #  [2, 5]
+                                #  [3, 6]]
+
+  # If x is complex, setting conjugate=True gives the conjugate transpose
+  x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j],
+                   [4 + 4j, 5 + 5j, 6 + 6j]])
+  tf.transpose(x, conjugate=True)  # [[1 - 1j, 4 - 4j],
+                                   #  [2 - 2j, 5 - 5j],
+                                   #  [3 - 3j, 6 - 6j]]
+
+  # 'perm' is more useful for n-dimensional tensors, for n > 2
+  x = tf.constant([[[ 1,  2,  3],
+                    [ 4,  5,  6]],
+                   [[ 7,  8,  9],
+                    [10, 11, 12]]])
+
+  # Take the transpose of the matrices in dimension-0
+  # (this common operation has a shorthand `linalg.transpose`)
+  tf.transpose(x, perm=[0, 2, 1])  # [[[1,  4],
+                                   #   [2,  5],
+                                   #   [3,  6]],
+                                   #  [[7, 10],
+                                   #   [8, 11],
+                                   #   [9, 12]]]
+  ```
+
+  Args:
+    a: A `Tensor`.
+    perm: A permutation of the dimensions of `a`.
+    conjugate: Optional bool. Setting it to `True` is mathematically equivalent
+      to tf.conj(tf.transpose(input)).
+    name: A name for the operation (optional).
+
+  Returns:
+    A transposed `Tensor`.
+  """
+  return transpose(a=a, perm=perm, name=name, conjugate=conjugate)
+
+
+@tf_export(v1=["transpose"])
 def transpose(a, perm=None, name="transpose", conjugate=False):
   """Transposes `a`. Permutes the dimensions according to `perm`.
 
@@ -1570,7 +1808,7 @@ def zeros(shape, dtype=dtypes.float32, name=None):
   return output
 
 
-@tf_export("zeros_like")
+@tf_export(v1=["zeros_like"])
 def zeros_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to zero.
 
@@ -1597,6 +1835,42 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   Returns:
     A `Tensor` with all elements set to zero.
   """
+  return zeros_like_impl(tensor, dtype, name, optimize)
+
+
+@tf_export("zeros_like", v1=[])
+def zeros_like_v2(
+    input,  # pylint: disable=redefined-builtin
+    dtype=None,
+    name=None):
+  """Creates a tensor with all elements set to zero.
+
+  Given a single tensor (`tensor`), this operation returns a tensor of the
+  same type and shape as `tensor` with all elements set to zero. Optionally,
+  you can use `dtype` to specify a new type for the returned tensor.
+
+  For example:
+
+  ```python
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.zeros_like(tensor)  # [[0, 0, 0], [0, 0, 0]]
+  ```
+
+  Args:
+    input: A `Tensor`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with all elements set to zero.
+  """
+  return zeros_like_impl(input, dtype, name, optimize=True)
+
+
+def zeros_like_impl(tensor, dtype, name, optimize=True):
+  """Internal implementation for the v1/v2 zeros_like API calls."""
   with ops.name_scope(name, "zeros_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
 
@@ -1623,7 +1897,7 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
       return gen_array_ops.zeros_like(tensor, name=name)
 
 
-@tf_export("ones_like")
+@tf_export(v1=["ones_like"])
 def ones_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to 1.
 
@@ -1650,6 +1924,42 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
   Returns:
     A `Tensor` with all elements set to 1.
   """
+  return ones_like_impl(tensor, dtype, name, optimize)
+
+
+@tf_export("ones_like", v1=[])
+def ones_like_v2(
+    input,  # pylint: disable=redefined-builtin
+    dtype=None,
+    name=None):
+  """Creates a tensor with all elements set to zero.
+
+  Given a single tensor (`tensor`), this operation returns a tensor of the
+  same type and shape as `tensor` with all elements set to zero. Optionally,
+  you can use `dtype` to specify a new type for the returned tensor.
+
+  For example:
+
+  ```python
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.ones_like(tensor)  # [[1, 1, 1], [1, 1, 1]]
+  ```
+
+  Args:
+    input: A `Tensor`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with all elements set to zero.
+  """
+  return ones_like_impl(input, dtype, name, optimize=True)
+
+
+def ones_like_impl(tensor, dtype, name, optimize=True):
+  """Internal implementation for the v1/v2 ones_like API calls."""
   with ops.name_scope(name, "ones_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
     ones_shape = shape_internal(tensor, optimize=optimize)
@@ -1707,7 +2017,7 @@ def ones(shape, dtype=dtypes.float32, name=None):
   return output
 
 
-@tf_export("placeholder")
+@tf_export(v1=["placeholder"])
 def placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a tensor that will be always fed.
 
@@ -1752,6 +2062,22 @@ def placeholder(dtype, shape=None, name=None):
   return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
 
 
+@tf_export(v1=["placeholder_with_default"])
+def placeholder_with_default(input, shape, name=None):  # pylint: disable=redefined-builtin
+  """A placeholder op that passes through `input` when its output is not fed.
+
+  Args:
+    input: A `Tensor`. The default value to produce when output is not fed.
+    shape: A `tf.TensorShape` or list of `int`s. The (possibly partial) shape
+      of the tensor.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  return gen_array_ops.placeholder_with_default(input, shape, name)
+
+
 # pylint: disable=redefined-outer-name
 def _normalize_sparse_shape(shape, name):
   """Returns a tuple of (Tensor or None, rank or None)."""
@@ -1763,8 +2089,7 @@ def _normalize_sparse_shape(shape, name):
   return (ops.convert_to_tensor(shape, dtype=dtypes.int64, name=name), rank)
 
 
-@tf_export(
-    "sparse.placeholder", v1=["sparse.placeholder", "sparse_placeholder"])
+@tf_export(v1=["sparse.placeholder", "sparse_placeholder"])
 @deprecation.deprecated_endpoints("sparse_placeholder")
 def sparse_placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a sparse tensor that will be always fed.
@@ -1832,7 +2157,65 @@ def sparse_placeholder(dtype, shape=None, name=None):
 # pylint: enable=redefined-outer-name
 
 
-@tf_export("pad")
+@tf_export("pad", v1=[])
+def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None):
+  """Pads a tensor.
+
+  This operation pads a `tensor` according to the `paddings` you specify.
+  `paddings` is an integer tensor with shape `[n, 2]`, where n is the rank of
+  `tensor`. For each dimension D of `input`, `paddings[D, 0]` indicates how
+  many values to add before the contents of `tensor` in that dimension, and
+  `paddings[D, 1]` indicates how many values to add after the contents of
+  `tensor` in that dimension. If `mode` is "REFLECT" then both `paddings[D, 0]`
+  and `paddings[D, 1]` must be no greater than `tensor.dim_size(D) - 1`. If
+  `mode` is "SYMMETRIC" then both `paddings[D, 0]` and `paddings[D, 1]` must be
+  no greater than `tensor.dim_size(D)`.
+
+  The padded size of each dimension D of the output is:
+
+  `paddings[D, 0] + tensor.dim_size(D) + paddings[D, 1]`
+
+  For example:
+
+  ```python
+  t = tf.constant([[1, 2, 3], [4, 5, 6]])
+  paddings = tf.constant([[1, 1,], [2, 2]])
+  # 'constant_values' is 0.
+  # rank of 't' is 2.
+  tf.pad(t, paddings, "CONSTANT")  # [[0, 0, 0, 0, 0, 0, 0],
+                                   #  [0, 0, 1, 2, 3, 0, 0],
+                                   #  [0, 0, 4, 5, 6, 0, 0],
+                                   #  [0, 0, 0, 0, 0, 0, 0]]
+
+  tf.pad(t, paddings, "REFLECT")  # [[6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1],
+                                  #  [6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1]]
+
+  tf.pad(t, paddings, "SYMMETRIC")  # [[2, 1, 1, 2, 3, 3, 2],
+                                    #  [2, 1, 1, 2, 3, 3, 2],
+                                    #  [5, 4, 4, 5, 6, 6, 5],
+                                    #  [5, 4, 4, 5, 6, 6, 5]]
+  ```
+
+  Args:
+    tensor: A `Tensor`.
+    paddings: A `Tensor` of type `int32`.
+    mode: One of "CONSTANT", "REFLECT", or "SYMMETRIC" (case-insensitive)
+    constant_values: In "CONSTANT" mode, the scalar pad value to use. Must be
+      same type as `tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `tensor`.
+
+  Raises:
+    ValueError: When mode is not one of "CONSTANT", "REFLECT", or "SYMMETRIC".
+  """
+  return pad(tensor, paddings, mode, name, constant_values)
+
+
+@tf_export(v1=["pad"])
 def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pylint: disable=invalid-name
   """Pads a tensor.
 
@@ -2633,7 +3016,7 @@ def where(condition, x=None, y=None, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("reverse_sequence")
+@tf_export(v1=["reverse_sequence"])
 @deprecation.deprecated_args(
     None, "seq_dim is deprecated, use seq_axis instead", "seq_dim")
 @deprecation.deprecated_args(
@@ -2657,14 +3040,31 @@ def reverse_sequence(input,
       name=name)
 
 
-# pylint: enable=redefined-builtin
-
 reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
         gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
     "seq_dim", "seq_axis")
 
 
+@tf_export("reverse_sequence", v1=[])
+def reverse_sequence_v2(
+    input, seq_lengths, seq_axis=None, batch_axis=None, name=None):
+  return gen_array_ops.reverse_sequence(
+      input=input,
+      seq_lengths=seq_lengths,
+      seq_dim=seq_axis,
+      batch_dim=batch_axis,
+      name=name)
+
+
+reverse_sequence_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
+    "seq_dim", "seq_axis")
+
+# pylint: enable=redefined-builtin
+
+
 @tf_export("gather")
 def gather(params, indices, validate_indices=None, name=None, axis=0):
   del validate_indices
@@ -2762,7 +3162,7 @@ def batch_gather(params, indices, name=None):
 
 # Define quantize_v2 here in order to make name the second-to-last attribute,
 # because round_mode was added later.
-@tf_export("quantize_v2")
+@tf_export(v1=["quantize_v2"])
 @deprecation.deprecated(
     "2017-10-25",
     "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` "
@@ -2783,7 +3183,7 @@ def quantize_v2(input,  # pylint: disable=redefined-builtin
                                    round_mode=round_mode)
 
 
-quantize_v2.__doc__ = """Please use `tf.quantize` instead."""
+quantize_v2.__doc__ = """Please use `tf.quantization.quantize` instead."""
 
 
 # We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
@@ -2869,3 +3269,48 @@ def searchsorted(sorted_sequence,
 
 
 quantize.__doc__ = gen_array_ops.quantize_v2.__doc__
+
+
+@tf_export("image.extract_image_patches", v1=[])
+def extract_image_patches_v2(
+    images,
+    sizes,
+    strides,
+    rates,
+    padding,
+    name=None):
+  # pylint: disable=line-too-long
+  r"""Extract `patches` from `images` and put them in the \"depth\" output dimension.
+
+  Args:
+    images: A 4-D Tensor with shape `[batch, in_rows, in_cols, depth]
+    sizes: The size of the sliding window for each dimension of `images`.
+    strides: A 1-D Tensor of length 4. How far the centers of two consecutive
+      patches are in the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+    rates: A 1-D Tensor of length 4. Must be: `[1, rate_rows, rate_cols, 1]`.
+      This is the input stride, specifying how far two consecutive patch samples
+      are in the input. Equivalent to extracting patches with `patch_sizes_eff =
+      patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by subsampling
+      them spatially by a factor of `rates`. This is equivalent to `rate` in
+      dilated (a.k.a. Atrous) convolutions.
+    padding: The type of padding algorithm to use.
+      We specify the size-related attributes as: ```python ksizes = [1,
+        ksize_rows, ksize_cols, 1] strides = [1, strides_rows, strides_cols, 1]
+        rates = [1, rates_rows, rates_cols, 1]
+    name: A name for the operation (optional).
+
+  Returns:
+    A 4-D Tensor. Has the same type as `images`, and with shape `[batch,
+    out_rows, out_cols, ksize_rows * ksize_cols * depth]` containing image
+    patches with size `ksize_rows x ksize_cols x depth` vectorized in the
+    \"depth\" dimension. Note `out_rows` and `out_cols` are the dimensions of
+    the output patches.
+  """
+  # pylint: enable=line-too-long
+  return gen_array_ops.extract_image_patches(
+      images, sizes, strides, rates, padding, name)
+
+extract_image_patches_deprecation = deprecation.deprecated_args(
+    None, "ksizes is deprecated, use sizes instead", "ksizes")
+tf_export(v1=["image.extract_image_patches", "extract_image_patches"])(
+    extract_image_patches_deprecation(gen_array_ops.extract_image_patches))
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 720f9f4d41e4cc627752be751a0c5b377b404523..37d649acf00c6905ae7330169321e5a5f8f487be 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -33,13 +33,17 @@ from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_quant
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_stats_summary as make_stats_summary
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_predict as predict
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_add_summaries as quantile_add_summaries
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_deserialize as quantile_resource_deserialize
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_flush as quantile_flush
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_get_bucket_boundaries as get_bucket_boundaries
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_handle_op as quantile_resource_handle_op
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_training_predict as training_predict
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_update_ensemble as update_ensemble
+from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantile_stream_resource_initialized as is_quantile_resource_initialized
 # pylint: enable=unused-import
 
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 
 class PruningMode(object):
@@ -57,6 +61,69 @@ class PruningMode(object):
           sorted(cls._map))))
 
 
+class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for QuantileAccumulator.
+
+     The bucket boundaries are serialized and deserialized from checkpointing.
+  """
+
+  def __init__(self,
+               epsilon,
+               num_streams,
+               num_quantiles,
+               name=None,
+               max_elements=None):
+    with ops.name_scope(name, 'QuantileAccumulator') as name:
+      self._eps = epsilon
+      self._num_streams = num_streams
+      self._num_quantiles = num_quantiles
+      self._resource_handle = quantile_resource_handle_op(
+          container='', shared_name=name, name=name)
+      self._create_op = create_quantile_stream_resource(self._resource_handle,
+                                                        epsilon, num_streams)
+      is_initialized_op = is_quantile_resource_initialized(
+          self._resource_handle)
+      resources.register_resource(self._resource_handle, self._create_op,
+                                  is_initialized_op)
+      self._make_saveable(name)
+
+  def _make_saveable(self, name):
+    bucket_boundaries = get_bucket_boundaries(self._resource_handle,
+                                              self._num_streams)
+    slice_spec = ''
+    specs = []
+    for i in range(self._num_streams):
+      specs.append(
+          saver.BaseSaverBuilder.SaveSpec(
+              bucket_boundaries[i], slice_spec,
+              name + '_bucket_boundaries_' + str(i)))
+    super(QuantileAccumulator, self).__init__(self._resource_handle, specs,
+                                              name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+  def restore(self, restored_tensors, unused_tensor_shapes):
+    bucket_boundaries = restored_tensors
+    with ops.control_dependencies([self._create_op]):
+      return quantile_resource_deserialize(
+          self._resource_handle, bucket_boundaries=bucket_boundaries)
+
+  def add_summaries(self, float_columns, example_weights):
+    summaries = make_quantile_summaries(float_columns, example_weights,
+                                        self._eps)
+    summary_op = quantile_add_summaries(self._resource_handle, summaries)
+    return summary_op
+
+  def flush(self):
+    return quantile_flush(self._resource_handle, self._num_quantiles)
+
+  def get_bucket_boundaries(self):
+    return get_bucket_boundaries(self._resource_handle, self._num_streams)
+
+  @property
+  def resource(self):
+    return self._resource_handle
+
+
 class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
   """SaveableObject implementation for TreeEnsemble."""
 
@@ -102,35 +169,52 @@ class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
           tree_ensemble_serialized=restored_tensors[1])
 
 
-class TreeEnsemble(object):
+class TreeEnsemble(tracking.TrackableResource):
   """Creates TreeEnsemble resource."""
 
   def __init__(self, name, stamp_token=0, is_local=False, serialized_proto=''):
+    self._stamp_token = stamp_token
+    self._serialized_proto = serialized_proto
+    self._is_local = is_local
     with ops.name_scope(name, 'TreeEnsemble') as name:
-      self._resource_handle = (
-          gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
-              container='', shared_name=name, name=name))
-      create_op = gen_boosted_trees_ops.boosted_trees_create_ensemble(
-          self.resource_handle,
-          stamp_token,
-          tree_ensemble_serialized=serialized_proto)
-      is_initialized_op = (
-          gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
-              self._resource_handle))
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
       # Adds the variable to the savable list.
       if not is_local:
-        saveable = _TreeEnsembleSavable(self.resource_handle, create_op,
-                                        self.resource_handle.name)
-        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+        self._saveable = _TreeEnsembleSavable(
+            self.resource_handle, self.initializer, self.resource_handle.name)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
       resources.register_resource(
           self.resource_handle,
-          create_op,
+          self.initializer,
           is_initialized_op,
           is_shared=not is_local)
 
+  def create_resource(self):
+    return gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
+        container='', shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return gen_boosted_trees_ops.boosted_trees_create_ensemble(
+        self.resource_handle,
+        self._stamp_token,
+        tree_ensemble_serialized=self._serialized_proto)
+
   @property
-  def resource_handle(self):
-    return self._resource_handle
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
+        self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    if not self._is_local:
+      return {'tree_ensemble': self._saveable}
 
   def get_stamp_token(self):
     """Returns the current stamp token of the resource."""
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index f0bfdb2b7a3c57b80e3ef01fa91da12b99cdb3d9..c64000b65d4f8cf58ec5d7be66936d9b87e9a1c2 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -208,7 +208,9 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.fixed_unigram_candidate_sampler')
+@tf_export('random.fixed_unigram_candidate_sampler',
+           'nn.fixed_unigram_candidate_sampler',
+           v1=['nn.fixed_unigram_candidate_sampler'])
 def fixed_unigram_candidate_sampler(true_classes,
                                     num_true,
                                     num_sampled,
@@ -300,7 +302,8 @@ def fixed_unigram_candidate_sampler(true_classes,
       unigrams=unigrams, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export('nn.all_candidate_sampler')
+@tf_export('random.all_candidate_sampler', 'nn.all_candidate_sampler',
+           v1=['nn.all_candidate_sampler'])
 def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
                           seed=None, name=None):
   """Generate the set of all classes.
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 40b111ea0c2bca773e3fe6744fcb1e7d95791371..f1f36269cf2bd9bcd3d25638a82d776850bc6bb8 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -13,11 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=g-short-docstring-punctuation
-"""Asserts and Boolean Checks.
-
-See the [Asserts and
-checks](https://tensorflow.org/api_guides/python/check_ops) guide.
-"""
+"""Asserts and Boolean Checks."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -123,9 +119,31 @@ def assert_proper_iterable(values):
         'Expected argument "values" to be iterable.  Found: %s' % type(values))
 
 
-@tf_export(
-    'debugging.assert_negative',
-    v1=['debugging.assert_negative', 'assert_negative'])
+@tf_export('debugging.assert_negative', v1=[])
+def assert_negative_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x < 0` holds element-wise.
+
+  This Op checks that `x[i] < 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not negative everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_negative".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] < 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_negative(x=x, message=message, summarize=summarize, name=name)
+
+
+@tf_export(v1=['debugging.assert_negative', 'assert_negative'])
 @deprecation.deprecated_endpoints('assert_negative')
 def assert_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < 0` holds element-wise.
@@ -167,9 +185,31 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less(x, zero, data=data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_positive',
-    v1=['debugging.assert_positive', 'assert_positive'])
+@tf_export('debugging.assert_positive', v1=[])
+def assert_positive_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x > 0` holds element-wise.
+
+  This Op checks that `x[i] > 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not positive everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional). Defaults to "assert_positive".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] > 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_positive(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_positive', 'assert_positive'])
 @deprecation.deprecated_endpoints('assert_positive')
 def assert_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > 0` holds element-wise.
@@ -210,9 +250,32 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less(zero, x, data=data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_non_negative',
-    v1=['debugging.assert_non_negative', 'assert_non_negative'])
+@tf_export('debugging.assert_non_negative', v1=[])
+def assert_non_negative_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x >= 0` holds element-wise.
+
+  This Op checks that `x[i] >= 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not >= 0 everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+      "assert_non_negative".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] >= 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_non_negative(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_non_negative', 'assert_non_negative'])
 @deprecation.deprecated_endpoints('assert_non_negative')
 def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x >= 0` holds element-wise.
@@ -255,9 +318,32 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(zero, x, data=data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_non_positive',
-    v1=['debugging.assert_non_positive', 'assert_non_positive'])
+@tf_export('debugging.assert_non_positive', v1=[])
+def assert_non_positive_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x <= 0` holds element-wise.
+
+  This Op checks that `x[i] <= 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not <= 0 everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+      "assert_non_positive".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] <= 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_non_positive(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_non_positive', 'assert_non_positive'])
 @deprecation.deprecated_endpoints('assert_non_positive')
 def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= 0` holds element-wise.
@@ -300,7 +386,33 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
 
-@tf_export('debugging.assert_equal', 'assert_equal')
+@tf_export('debugging.assert_equal', 'assert_equal', v1=[])
+def assert_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x == y` holds element-wise.
+
+  This Op checks that `x[i] == y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` and `y` are not equal, `message`, as well as the first `summarize`
+  entries of `x` and `y` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x == y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_equal', 'assert_equal'])
 def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x == y` holds element-wise.
 
@@ -400,9 +512,36 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_none_equal',
-    v1=['debugging.assert_none_equal', 'assert_none_equal'])
+@tf_export('debugging.assert_none_equal', v1=[])
+def assert_none_equal_v2(x, y, summarize=None, message=None, name=None):
+  """Assert the condition `x != y` holds for all elements.
+
+  This Op checks that `x[i] != y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If any elements of `x` and `y` are equal, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError`
+  is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to
+    "assert_none_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x != y` is False for any pair of elements in `x` and `y`. The check can
+      be performed immediately during eager execution or if `x` and `y` are
+      statically known.
+  """
+  assert_none_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_none_equal', 'assert_none_equal'])
 @deprecation.deprecated_endpoints('assert_none_equal')
 def assert_none_equal(
     x, y, data=None, summarize=None, message=None, name=None):
@@ -454,7 +593,52 @@ def assert_none_equal(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_near', v1=['debugging.assert_near', 'assert_near'])
+@tf_export('debugging.assert_near', v1=[])
+def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None,
+                   name=None):
+  """Assert the condition `x` and `y` are close element-wise.
+
+  This Op checks that `x[i] - y[i] < atol + rtol * tf.abs(y[i])` holds for every
+  pair of (possibly broadcast) elements of `x` and `y`. If both `x` and `y` are
+  empty, this is trivially satisfied.
+
+  If any elements of `x` and `y` are not close, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError`
+  is raised.
+
+  The default `atol` and `rtol` is `10 * eps`, where `eps` is the smallest
+  representable positive number such that `1 + eps != 1`.  This is about
+  `1.2e-6` in `32bit`, `2.22e-15` in `64bit`, and `0.00977` in `16bit`.
+  See `numpy.finfo`.
+
+  Args:
+    x: Float or complex `Tensor`.
+    y: Float or complex `Tensor`, same dtype as and broadcastable to `x`.
+    rtol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The relative tolerance.  Default is `10 * eps`.
+    atol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The absolute tolerance.  Default is `10 * eps`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_near".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x != y` is False for any pair of elements in `x` and `y`. The check can
+      be performed immediately during eager execution or if `x` and `y` are
+      statically known.
+
+  @compatibility(numpy)
+  Similar to `numpy.assert_allclose`, except tolerance depends on data type.
+  This is due to the fact that `TensorFlow` is often used with `32bit`, `64bit`,
+  and even `16bit` data.
+  @end_compatibility
+  """
+  assert_near(x=x, y=y, rtol=rtol, atol=atol, summarize=summarize,
+              message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_near', 'assert_near'])
 @deprecation.deprecated_endpoints('assert_near')
 def assert_near(
     x, y, rtol=None, atol=None, data=None, summarize=None, message=None,
@@ -533,7 +717,34 @@ def assert_near(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_less', 'assert_less')
+@tf_export('debugging.assert_less', 'assert_less', v1=[])
+def assert_less_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x < y` holds element-wise.
+
+  This Op checks that `x[i] < y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not less than `y` element-wise, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
+  raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_less".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x < y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_less(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_less', 'assert_less'])
 def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < y` holds element-wise.
 
@@ -581,9 +792,34 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_less_equal',
-    v1=['debugging.assert_less_equal', 'assert_less_equal'])
+@tf_export('debugging.assert_less_equal', v1=[])
+def assert_less_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x <= y` holds element-wise.
+
+  This Op checks that `x[i] <= y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not less or equal than `y` element-wise, `message`, as well as the
+  first `summarize` entries of `x` and `y` are printed, and
+  `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional). Defaults to "assert_less_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x <= y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_less_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_less_equal', 'assert_less_equal'])
 @deprecation.deprecated_endpoints('assert_less_equal')
 def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= y` holds element-wise.
@@ -632,7 +868,34 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_greater', 'assert_greater')
+@tf_export('debugging.assert_greater', 'assert_greater', v1=[])
+def assert_greater_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x > y` holds element-wise.
+
+  This Op checks that `x[i] > y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not greater than `y` element-wise, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
+  raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_greater".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x > y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_greater(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_greater', 'assert_greater'])
 def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > y` holds element-wise.
 
@@ -680,9 +943,36 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_greater_equal',
-    v1=['debugging.assert_greater_equal', 'assert_greater_equal'])
+@tf_export('debugging.assert_greater_equal', v1=[])
+def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x >= y` holds element-wise.
+
+  This Op checks that `x[i] >= y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not greater or equal to `y` element-wise, `message`, as well as the
+  first `summarize` entries of `x` and `y` are printed, and
+  `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+    "assert_greater_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x >= y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_greater_equal(x=x, y=y, summarize=summarize, message=message,
+                       name=name)
+
+
+@tf_export(v1=['debugging.assert_greater_equal', 'assert_greater_equal'])
 @deprecation.deprecated_endpoints('assert_greater_equal')
 def assert_greater_equal(x, y, data=None, summarize=None, message=None,
                          name=None):
@@ -781,7 +1071,31 @@ def _assert_rank_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_rank', 'assert_rank')
+@tf_export('debugging.assert_rank', 'assert_rank', v1=[])
+def assert_rank_v2(x, rank, message=None, name=None):
+  """Assert that `x` has rank equal to `rank`.
+
+  This Op checks that the rank of `x` is equal to `rank`.
+
+  If `x` has a different rank, `message`, as well as the shape of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    rank: Scalar integer `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to
+      "assert_rank".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x` does not have rank `rank`. The check can be performed immediately
+      during eager execution or if the shape of `x` is statically known.
+  """
+  assert_rank(x=x, rank=rank, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank', 'assert_rank'])
 def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank`.
 
@@ -796,7 +1110,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
     x:  Numeric `Tensor`.
     rank:  Scalar integer `Tensor`.
     data:  The tensors to print out if the condition is False.  Defaults to
-      error message and first few entries of `x`.
+      error message and the shape of `x`.
     summarize: Print this many entries of each tensor.
     message: A string to prefix to the default message.
     name: A name for this operation (optional).  Defaults to "assert_rank".
@@ -843,9 +1157,31 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   return assert_op
 
 
-@tf_export(
-    'debugging.assert_rank_at_least',
-    v1=['debugging.assert_rank_at_least', 'assert_rank_at_least'])
+@tf_export('debugging.assert_rank_at_least', v1=[])
+def assert_rank_at_least_v2(x, rank, message=None, name=None):
+  """Assert that `x` has rank of at least `rank`.
+
+  This Op checks that the rank of `x` is greater or equal to `rank`.
+
+  If `x` has a rank lower than `rank`, `message`, as well as the shape of `x`
+  are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    rank: Scalar integer `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to
+      "assert_rank_at_least".
+
+  Raises:
+    InvalidArgumentError: `x` does not have rank at least `rank`, but the rank
+      cannot be statically determined.
+    ValueError: If static checks determine `x` has mismatched rank.
+  """
+  assert_rank_at_least(x=x, rank=rank, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank_at_least', 'assert_rank_at_least'])
 @deprecation.deprecated_endpoints('assert_rank_at_least')
 def assert_rank_at_least(
     x, rank, data=None, summarize=None, message=None, name=None):
@@ -977,9 +1313,30 @@ def _assert_ranks_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_rank_in',
-    v1=['debugging.assert_rank_in', 'assert_rank_in'])
+@tf_export('debugging.assert_rank_in', v1=[])
+def assert_rank_in_v2(x, ranks, message=None, name=None):
+  """Assert that `x` has a rank in `ranks`.
+
+  This Op checks that the rank of `x` is in `ranks`.
+
+  If `x` has a different rank, `message`, as well as the shape of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    ranks: `Iterable` of scalar `Tensor` objects.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to "assert_rank_in".
+
+  Raises:
+    InvalidArgumentError: `x` does not have rank in `ranks`, but the rank cannot
+      be statically determined.
+    ValueError: If static checks determine `x` has mismatched rank.
+  """
+  assert_rank_in(x=x, ranks=ranks, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank_in', 'assert_rank_in'])
 @deprecation.deprecated_endpoints('assert_rank_in')
 def assert_rank_in(
     x, ranks, data=None, summarize=None, message=None, name=None):
@@ -1042,9 +1399,25 @@ def assert_rank_in(
   return assert_op
 
 
-@tf_export(
-    'debugging.assert_integer',
-    v1=['debugging.assert_integer', 'assert_integer'])
+@tf_export('debugging.assert_integer', v1=[])
+def assert_integer_v2(x, message=None, name=None):
+  """Assert that `x` is of integer dtype.
+
+  If `x` has a non-integer type, `message`, as well as the dtype of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: A `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to "assert_integer".
+
+  Raises:
+    TypeError:  If `x.dtype` is not a non-quantized integer type.
+  """
+  assert_integer(x=x, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_integer', 'assert_integer'])
 @deprecation.deprecated_endpoints('assert_integer')
 def assert_integer(x, message=None, name=None):
   """Assert that `x` is of integer dtype.
@@ -1083,13 +1456,30 @@ def assert_integer(x, message=None, name=None):
     return control_flow_ops.no_op('statically_determined_was_integer')
 
 
-@tf_export('debugging.assert_type', v1=['debugging.assert_type', 'assert_type'])
+@tf_export('debugging.assert_type', v1=[])
+def assert_type_v2(tensor, tf_type, message=None, name=None):
+  """Asserts that the given `Tensor` is of the specified type.
+
+  Args:
+    tensor: A `Tensor`.
+    tf_type: A tensorflow type (`dtypes.float32`, `tf.int64`, `dtypes.bool`,
+      etc).
+    message: A string to prefix to the default message.
+    name:  A name for this operation. Defaults to "assert_type"
+
+  Raises:
+    TypeError: If the tensor's data type doesn't match `tf_type`.
+  """
+  assert_type(tensor=tensor, tf_type=tf_type, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_type', 'assert_type'])
 @deprecation.deprecated_endpoints('assert_type')
 def assert_type(tensor, tf_type, message=None, name=None):
   """Statically asserts that the given `Tensor` is of the specified type.
 
   Args:
-    tensor: A tensorflow `Tensor`.
+    tensor: A `Tensor`.
     tf_type: A tensorflow type (`dtypes.float32`, `tf.int64`, `dtypes.bool`,
       etc).
     message: A string to prefix to the default message.
@@ -1140,9 +1530,13 @@ def is_numeric_tensor(tensor):
 
 
 @tf_export(
-    'debugging.is_non_decreasing',
-    v1=['debugging.is_non_decreasing', 'is_non_decreasing'])
-@deprecation.deprecated_endpoints('is_non_decreasing')
+    'math.is_non_decreasing',
+    v1=[
+        'math.is_non_decreasing', 'debugging.is_non_decreasing',
+        'is_non_decreasing'
+    ])
+@deprecation.deprecated_endpoints('debugging.is_non_decreasing',
+                                  'is_non_decreasing')
 def is_non_decreasing(x, name=None):
   """Returns `True` if `x` is non-decreasing.
 
@@ -1170,9 +1564,13 @@ def is_non_decreasing(x, name=None):
 
 
 @tf_export(
-    'debugging.is_strictly_increasing',
-    v1=['debugging.is_strictly_increasing', 'is_strictly_increasing'])
-@deprecation.deprecated_endpoints('is_strictly_increasing')
+    'math.is_strictly_increasing',
+    v1=[
+        'math.is_strictly_increasing', 'debugging.is_strictly_increasing',
+        'is_strictly_increasing'
+    ])
+@deprecation.deprecated_endpoints('debugging.is_strictly_increasing',
+                                  'is_strictly_increasing')
 def is_strictly_increasing(x, name=None):
   """Returns `True` if `x` is strictly increasing.
 
@@ -1264,8 +1662,10 @@ def assert_same_float_dtype(tensors=None, dtype=None):
     tensors: Tensors of input values. Can include `None` elements, which will be
         ignored.
     dtype: Expected type.
+
   Returns:
     Validated type.
+
   Raises:
     ValueError: if neither `tensors` nor `dtype` is supplied, or result is not
         float, or the common type of the inputs is not a floating point type.
@@ -1279,20 +1679,57 @@ def assert_same_float_dtype(tensors=None, dtype=None):
   return dtype
 
 
-@tf_export(
-    'debugging.assert_scalar', v1=['debugging.assert_scalar', 'assert_scalar'])
+@tf_export('debugging.assert_scalar', v1=[])
+def assert_scalar_v2(tensor, message=None, name=None):
+  """Asserts that the given `tensor` is a scalar.
+
+  This function raises `ValueError` unless it can be certain that the given
+  `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
+  unknown.
+
+  Args:
+    tensor: A `Tensor`.
+    message: A string to prefix to the default message.
+    name:  A name for this operation. Defaults to "assert_scalar"
+
+  Raises:
+    ValueError: If the tensor is not scalar (rank 0), or if its shape is
+      unknown.
+  """
+  assert_scalar(tensor=tensor, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_scalar', 'assert_scalar'])
 @deprecation.deprecated_endpoints('assert_scalar')
-def assert_scalar(tensor, name=None):
+def assert_scalar(tensor, name=None, message=None):
+  """Asserts that the given `tensor` is a scalar.
+
+  This function raises `ValueError` unless it can be certain that the given
+  `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
+  unknown.
+
+  Args:
+    tensor: A `Tensor`.
+    name:  A name for this operation. Defaults to "assert_scalar"
+    message: A string to prefix to the default message.
+
+  Returns:
+    The input tensor (potentially converted to a `Tensor`).
+
+  Raises:
+    ValueError: If the tensor is not scalar (rank 0), or if its shape is
+      unknown.
+  """
   with ops.name_scope(name, 'assert_scalar', [tensor]) as name_scope:
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
     shape = tensor.get_shape()
     if shape.ndims != 0:
       if context.executing_eagerly():
-        raise ValueError('Expected scalar shape, saw shape: %s.'
-                         % (shape,))
+        raise ValueError('%sExpected scalar shape, saw shape: %s.'
+                         % (message or '', shape,))
       else:
-        raise ValueError('Expected scalar shape for %s, saw shape: %s.'
-                         % (tensor.name, shape))
+        raise ValueError('%sExpected scalar shape for %s, saw shape: %s.'
+                         % (message or '', tensor.name, shape))
     return tensor
 
 
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 998c3e08f6f002b167310c607ac1960e993b6bd2..0f08c611bc0fc2a0a4312541d2a1d3f273f4bcb9 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -25,15 +25,15 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import nest
+
 
 # NOTE(skyewm): TensorFlow uses protected class methods and fields to signify
 # that they aren't part of the official public API. These protected members
@@ -108,23 +108,8 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
                                          false_graph.outputs),
         name=scope)
 
-    # Set the flag to enable lowering on the `if` op if necessary
-    # Lowering allows cond_v2 to avoid some of the limitations of Functions,
-    # allowing users to specify devices & colocation inside of cond_v2 branches,
-    # and enabling non-strict evaluation & partial pruning of cond_v2 branches.
-    # This brings cond_v2 closer to feature parity with tf.cond.
-    #
-    # However, we do not lower `If` in the XLA context because it is easier for
-    # XLA to apply its own optimizations when dealing with un-lowered `If`
-    # operators than with lowered switch/merge control flow.
-    #
     # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
-    if_op = tensors[0].op
-    if not control_flow_util.IsInXLAContext(if_op):
-      # pylint: disable=protected-access
-      if_op._set_attr("_lower_using_switch_merge",
-                      attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
+    util.maybe_set_lowering_attr(tensors[0].op)
 
     # Return identities for each output of the If op, rather than the output of
     # the If op directly. This makes pruning work if the output of cond() is
@@ -136,11 +121,8 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
     # correct output structure
     tensors = tuple(array_ops.identity(t) for t in tensors)
 
-    result = tuple(tensors[:num_cond_outputs])
-    if len(result) == 1:
-      return result[0]
-    else:
-      return result
+    return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
+                                              tensors[:num_cond_outputs])
 
 
 @ops.RegisterGradient("If")
@@ -199,6 +181,11 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
       output_shapes=_get_output_shapes(true_grad_graph.outputs,
                                        false_grad_graph.outputs))
 
+  util.maybe_set_lowering_attr(tensors[0].op)
+
+  # See comment in cond_v2.
+  tensors = [array_ops.identity(t) for t in tensors]
+
   # The predicate has no gradient.
   return [None] + tensors[:num_grad_outputs]
 
@@ -462,12 +449,20 @@ def _check_same_outputs(true_graph, false_graph):
   false_output_types = [t.dtype for t in false_graph.outputs]
   if (len(true_graph.outputs) != len(false_graph.outputs) or
       true_output_types != false_output_types):
-    raise ValueError(
+    raise TypeError(
         "true_fn() and false_fn() must return the same number and type of "
         "arguments, got:\n"
         "  true_fn: %s\n"
         "  false_fn: %s" % (true_output_types, false_output_types))
 
+  # Make sure `structured_outputs` for both graphs have the same structure.
+  try:
+    nest.assert_same_structure(true_graph.structured_outputs,
+                               false_graph.structured_outputs)
+  except (ValueError, TypeError) as e:
+    raise ValueError("Outputs of true_fn and false_fn must have the same "
+                     "structure: %s" % str(e))
+
 
 def _get_output_shapes(true_graph_outputs, false_graph_outputs):
   output_shapes = [
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 2694af3e78603f1adb21b24d1dd8c62a41f80a67..4417632e69215ffa9e55012e391b103fda7be7cd 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -14,8 +14,7 @@
 # ==============================================================================
 """Control Flow Operations.
 
-See the [Control
-Flow](https://tensorflow.org/api_guides/python/control_flow_ops) guide.
+See the [autograph](https://www.tensorflow.org/guide/autographs) guide.
 """
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -1977,7 +1976,7 @@ def _UnpackIfSingleton(res):
 
 # pylint: disable=redefined-outer-name
 # pylint: disable=g-doc-args
-@tf_export("cond")
+@tf_export(v1=["cond"])
 @deprecation.deprecated_args(
     None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
     "fn1", "fn2")
@@ -2174,6 +2173,77 @@ def cond(pred,
 # pylint: enable=redefined-outer-name
 
 
+@tf_export("cond", v1=[])
+def cond_for_tf_v2(pred,
+                   true_fn=None,
+                   false_fn=None,
+                   name=None):
+  """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
+
+  `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
+  `false_fn` must have the same non-zero number and type of outputs.
+
+  **WARNING**: Any Tensors or Operations created outside of `true_fn` and
+  `false_fn` will be executed regardless of which branch is selected at runtime.
+
+  Although this behavior is consistent with the dataflow model of TensorFlow,
+  it has frequently surprised users who expected a lazier semantics.
+  Consider the following simple program:
+
+  ```python
+  z = tf.multiply(a, b)
+  result = tf.cond(x < y, lambda: tf.add(x, z), lambda: tf.square(y))
+  ```
+
+  If `x < y`, the `tf.add` operation will be executed and `tf.square`
+  operation will not be executed. Since `z` is needed for at least one
+  branch of the `cond`, the `tf.multiply` operation is always executed,
+  unconditionally.
+
+  Note that `cond` calls `true_fn` and `false_fn` *exactly once* (inside the
+  call to `cond`, and not at all during `Session.run()`). `cond`
+  stitches together the graph fragments created during the `true_fn` and
+  `false_fn` calls with some additional graph nodes to ensure that the right
+  branch gets executed depending on the value of `pred`.
+
+  `tf.cond` supports nested structures as implemented in
+  `tensorflow.python.util.nest`. Both `true_fn` and `false_fn` must return the
+  same (possibly nested) value structure of lists, tuples, and/or named tuples.
+  Singleton lists and tuples form the only exceptions to this: when returned by
+  `true_fn` and/or `false_fn`, they are implicitly unpacked to single values.
+
+  Args:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix for the returned tensors.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`. If the
+    callables return a singleton list, the element is extracted from the list.
+
+  Raises:
+    TypeError: if `true_fn` or `false_fn` is not callable.
+    ValueError: if `true_fn` and `false_fn` do not return the same number of
+      tensors, or return tensors of different types.
+
+  Example:
+
+  ```python
+  x = tf.constant(2)
+  y = tf.constant(5)
+  def f1(): return tf.multiply(x, 17)
+  def f2(): return tf.add(y, 23)
+  r = tf.cond(tf.less(x, y), f1, f2)
+  # r is set to f1().
+  # Operations in f2 (e.g., tf.add) are not executed.
+  ```
+
+  """
+  return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
+
+
 def _resource_safe_shape(t):
   """Returns the shape of t or the variable it points to."""
   if t.dtype == dtypes.resource:
@@ -3066,7 +3136,193 @@ class WhileContext(ControlFlowContext):
 
 
 # pylint: disable=redefined-outer-name
-@tf_export("while_loop")
+@tf_export("while_loop", v1=[])
+def while_loop_v2(cond,
+                  body,
+                  loop_vars,
+                  shape_invariants=None,
+                  parallel_iterations=10,
+                  back_prop=True,
+                  swap_memory=False,
+                  maximum_iterations=None,
+                  return_same_structure=False,
+                  name=None):
+  """Repeat `body` while the condition `cond` is true.
+
+  `cond` is a callable returning a boolean scalar tensor. `body` is a callable
+  returning a (possibly nested) tuple, namedtuple or list of tensors of the same
+  arity (length and structure) and types as `loop_vars`. `loop_vars` is a
+  (possibly nested) tuple, namedtuple or list of tensors that is passed to both
+  `cond` and `body`. `cond` and `body` both take as many arguments as there are
+  `loop_vars`.
+
+  In addition to regular Tensors or IndexedSlices, the body may accept and
+  return TensorArray objects.  The flows of the TensorArray objects will
+  be appropriately forwarded between loops and during gradient calculations.
+
+  Note that `while_loop` calls `cond` and `body` *exactly once* (inside the
+  call to `while_loop`, and not at all during `Session.run()`). `while_loop`
+  stitches together the graph fragments created during the `cond` and `body`
+  calls with some additional graph nodes to create the graph flow that
+  repeats `body` until `cond` returns false.
+
+  For correctness, `tf.while_loop()` strictly enforces shape invariants for
+  the loop variables. A shape invariant is a (possibly partial) shape that
+  is unchanged across the iterations of the loop. An error will be raised
+  if the shape of a loop variable after an iteration is determined to be more
+  general than or incompatible with its shape invariant. For example, a shape
+  of [11, None] is more general than a shape of [11, 17], and [11, 21] is not
+  compatible with [11, 17]. By default (if the argument `shape_invariants` is
+  not specified), it is assumed that the initial shape of each tensor in
+  `loop_vars` is the same in every iteration. The `shape_invariants` argument
+  allows the caller to specify a less specific shape invariant for each loop
+  variable, which is needed if the shape varies between iterations. The
+  `tf.Tensor.set_shape`
+  function may also be used in the `body` function to indicate that
+  the output loop variable has a particular shape. The shape invariant for
+  SparseTensor and IndexedSlices are treated specially as follows:
+
+  a) If a loop variable is a SparseTensor, the shape invariant must be
+  TensorShape([r]) where r is the rank of the dense tensor represented
+  by the sparse tensor. It means the shapes of the three tensors of the
+  SparseTensor are ([None], [None, r], [r]). NOTE: The shape invariant here
+  is the shape of the SparseTensor.dense_shape property. It must be the shape of
+  a vector.
+
+  b) If a loop variable is an IndexedSlices, the shape invariant must be
+  a shape invariant of the values tensor of the IndexedSlices. It means
+  the shapes of the three tensors of the IndexedSlices are (shape, [shape[0]],
+  [shape.ndims]).
+
+  `while_loop` implements non-strict semantics, enabling multiple iterations
+  to run in parallel. The maximum number of parallel iterations can be
+  controlled by `parallel_iterations`, which gives users some control over
+  memory consumption and execution order. For correct programs, `while_loop`
+  should return the same result for any parallel_iterations > 0.
+
+  For training, TensorFlow stores the tensors that are produced in the
+  forward inference and are needed in back propagation. These tensors are a
+  main source of memory consumption and often cause OOM errors when training
+  on GPUs. When the flag swap_memory is true, we swap out these tensors from
+  GPU to CPU. This for example allows us to train RNN models with very long
+  sequences and large batches.
+
+  Args:
+    cond: A callable that represents the termination condition of the loop.
+    body: A callable that represents the loop body.
+    loop_vars: A (possibly nested) tuple, namedtuple or list of numpy array,
+      `Tensor`, and `TensorArray` objects.
+    shape_invariants: The shape invariants for the loop variables.
+    parallel_iterations: The number of iterations allowed to run in parallel. It
+      must be a positive integer.
+    back_prop: Whether backprop is enabled for this while loop.
+    swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
+    maximum_iterations: Optional maximum number of iterations of the while loop
+      to run.  If provided, the `cond` output is AND-ed with an additional
+      condition ensuring the number of iterations executed is no greater than
+      `maximum_iterations`.
+    return_same_structure: If True, output has same structure as `loop_vars`. If
+      eager execution is enabled, this is ignored (and always treated as True).
+    name: Optional name prefix for the returned tensors.
+
+  Returns:
+    The output tensors for the loop variables after the loop.
+     If `return_same_structure` is True, the return value has the same
+     structure as `loop_vars`.
+     If `return_same_structure` is False, the return value is a Tensor,
+     TensorArray or IndexedSlice if the length of `loop_vars` is 1, or a list
+     otherwise.
+
+  Raises:
+    TypeError: if `cond` or `body` is not callable.
+    ValueError: if `loop_vars` is empty.
+
+  Example:
+
+  ```python
+  i = tf.constant(0)
+  c = lambda i: tf.less(i, 10)
+  b = lambda i: tf.add(i, 1)
+  r = tf.while_loop(c, b, [i])
+  ```
+
+  Example with nesting and a namedtuple:
+
+  ```python
+  import collections
+  Pair = collections.namedtuple('Pair', 'j, k')
+  ijk_0 = (tf.constant(0), Pair(tf.constant(1), tf.constant(2)))
+  c = lambda i, p: i < 10
+  b = lambda i, p: (i + 1, Pair((p.j + p.k), (p.j - p.k)))
+  ijk_final = tf.while_loop(c, b, ijk_0)
+  ```
+
+  Example using shape_invariants:
+
+  ```python
+  i0 = tf.constant(0)
+  m0 = tf.ones([2, 2])
+  c = lambda i, m: i < 10
+  b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+  tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+  ```
+
+  Example which demonstrates non-strict semantics: In the following
+  example, the final value of the counter `i` does not depend on `x`. So
+  the `while_loop` can increment the counter parallel to updates of `x`.
+  However, because the loop counter at one loop iteration depends
+  on the value at the previous iteration, the loop counter itself cannot
+  be incremented in parallel. Hence if we just want the final value of the
+  counter (which we print on the line `print(sess.run(i))`), then
+  `x` will never be incremented, but the counter will be updated on a
+  single thread. Conversely, if we want the value of the output (which we
+  print on the line `print(sess.run(out).shape)`), then the counter may be
+  incremented on its own thread, while `x` can be incremented in
+  parallel on a separate thread. In the extreme case, it is conceivable
+  that the thread incrementing the counter runs until completion before
+  `x` is incremented even a single time. The only thing that can never
+  happen is that the thread updating `x` can never get ahead of the
+  counter thread because the thread incrementing `x` depends on the value
+  of the counter.
+
+  ```python
+  import tensorflow as tf
+
+  n = 10000
+  x = tf.constant(list(range(n)))
+  c = lambda i, x: i < n
+  b = lambda i, x: (tf.Print(i + 1, [i]), tf.Print(x + 1, [i], "x:"))
+  i, out = tf.while_loop(c, b, (0, x))
+  with tf.Session() as sess:
+      print(sess.run(i))  # prints [0] ... [9999]
+
+      # The following line may increment the counter and x in parallel.
+      # The counter thread may get ahead of the other thread, but not the
+      # other way around. So you may see things like
+      # [9996] x:[9987]
+      # meaning that the counter thread is on iteration 9996,
+      # while the other thread is on iteration 9987
+      print(sess.run(out).shape)
+  ```
+
+  """
+  return while_loop(
+      cond=cond,
+      body=body,
+      loop_vars=loop_vars,
+      shape_invariants=shape_invariants,
+      parallel_iterations=parallel_iterations,
+      back_prop=back_prop,
+      swap_memory=swap_memory,
+      name=name,
+      maximum_iterations=maximum_iterations,
+      return_same_structure=return_same_structure)
+
+
+# pylint: disable=redefined-outer-name
+@tf_export(v1=["while_loop"])
 def while_loop(cond,
                body,
                loop_vars,
@@ -3466,7 +3722,43 @@ def group(*inputs, **kwargs):
       return no_op(name=name)
 
 
-@tf_export("tuple")
+@tf_export("tuple", v1=[])
+def tuple_v2(tensors, control_inputs=None, name=None):
+  """Group tensors together.
+
+  This creates a tuple of tensors with the same values as the `tensors`
+  argument, except that the value of each tensor is only returned after the
+  values of all tensors have been computed.
+
+  `control_inputs` contains additional ops that have to finish before this op
+  finishes, but whose outputs are not returned.
+
+  This can be used as a "join" mechanism for parallel computations: all the
+  argument tensors can be computed in parallel, but the values of any tensor
+  returned by `tuple` are only available after all the parallel computations
+  are done.
+
+  See also `tf.group` and
+  `tf.control_dependencies`.
+
+  Args:
+    tensors: A list of `Tensor`s or `IndexedSlices`, some entries can be `None`.
+    control_inputs: List of additional ops to finish before returning.
+    name: (optional) A name to use as a `name_scope` for the operation.
+
+  Returns:
+    Same as `tensors`.
+
+  Raises:
+    ValueError: If `tensors` does not contain any `Tensor` or `IndexedSlices`.
+    TypeError: If `control_inputs` is not a list of `Operation` or `Tensor`
+      objects.
+
+  """
+  return tuple(tensors=tensors, name=name, control_inputs=control_inputs)  # pylint: disable=redefined-builtin
+
+
+@tf_export(v1=["tuple"])
 def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined-builtin
   """Group tensors together.
 
@@ -3617,12 +3909,22 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name,
   if isinstance(pred_fn_pairs, collections.OrderedDict):
     pred_fn_pairs = pred_fn_pairs.items()
   elif isinstance(pred_fn_pairs, dict):
-    pred_fn_pairs = sorted(pred_fn_pairs.items(), key=lambda item: item[0].name)
-    if not exclusive:
-      logging.warn(
-          "%s: An unordered dictionary of predicate/fn pairs was "
-          "provided, but exclusive=False. The order of conditional "
-          "tests is deterministic but not guaranteed.", name)
+    if context.executing_eagerly():
+      # No name to sort on in eager mode. Use dictionary traversal order,
+      # which is nondeterministic in versions of Python < 3.6
+      if not exclusive:
+        raise ValueError("Unordered dictionaries are not supported for the "
+                         "`pred_fn_pairs` argument when `exclusive=False` and "
+                         "eager mode is enabled.")
+      pred_fn_pairs = list(pred_fn_pairs.items())
+    else:
+      pred_fn_pairs = sorted(
+          pred_fn_pairs.items(), key=lambda item: item[0].name)
+      if not exclusive:
+        logging.warn(
+            "%s: An unordered dictionary of predicate/fn pairs was "
+            "provided, but exclusive=False. The order of conditional "
+            "tests is deterministic but not guaranteed.", name)
   for pred_fn_pair in pred_fn_pairs:
     if not isinstance(pred_fn_pair, _basetuple) or len(pred_fn_pair) != 2:
       raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
@@ -3718,7 +4020,7 @@ def case(pred_fn_pairs,
   operation returns the tensors generated by `default`.
 
   `tf.case` supports nested structures as implemented in
-  `tensorflow.python.util.nest`. All of the callables must return the same
+  `tf.contrib.framework.nest`. All of the callables must return the same
   (possibly nested) value structure of lists, tuples, and/or named tuples.
   Singleton lists and tuples form the only exceptions to this: when returned by
   a callable, they are implicitly unpacked to single values. This
@@ -3729,6 +4031,12 @@ def case(pred_fn_pairs,
   deterministic, so that variables created in conditional branches are created
   in fixed order across runs.
 
+  @compatibility{eager}
+  Unordered dictionaries are not supported in eager mode when `exclusive=False`.
+  Use a list of tuples instead.
+  @end_compatibility
+
+
   **Example 1:**
 
   Pseudocode:
@@ -3743,7 +4051,7 @@ def case(pred_fn_pairs,
   ```python
   f1 = lambda: tf.constant(17)
   f2 = lambda: tf.constant(23)
-  r = case([(tf.less(x, y), f1)], default=f2)
+  r = tf.case([(tf.less(x, y), f1)], default=f2)
   ```
 
   **Example 2:**
@@ -3751,7 +4059,7 @@ def case(pred_fn_pairs,
   Pseudocode:
 
   ```
-  if (x < y && x > z) raise OpError("Only one predicate may evaluate true");
+  if (x < y && x > z) raise OpError("Only one predicate may evaluate to True");
   if (x < y) return 17;
   else if (x > z) return 23;
   else return -1;
@@ -3763,7 +4071,7 @@ def case(pred_fn_pairs,
   def f1(): return tf.constant(17)
   def f2(): return tf.constant(23)
   def f3(): return tf.constant(-1)
-  r = case({tf.less(x, y): f1, tf.greater(x, z): f2},
+  r = tf.case({tf.less(x, y): f1, tf.greater(x, z): f2},
            default=f3, exclusive=True)
   ```
 
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index f4b28f0113bebf2677c65ec2449662ffbd84d575..260af95a3bd6f6e8aa80e9e930c4b0aca3294335 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -155,9 +155,9 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
           constant_op.constant(7))
       with self.cached_session():
         variables.global_variables_initializer().run()
-        self.assertEquals(0, counter.eval())
-        self.assertEquals(7, const_with_dep.eval())
-        self.assertEquals(1, counter.eval())
+        self.assertEquals(0, self.evaluate(counter))
+        self.assertEquals(7, self.evaluate(const_with_dep))
+        self.assertEquals(1, self.evaluate(counter))
 
   def testListDependencies(self):
     with ops.Graph().as_default():
@@ -169,9 +169,9 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
           constant_op.constant(7))
       with self.cached_session():
         variables.global_variables_initializer().run()
-        self.assertEquals(0, counter.eval())
-        self.assertEquals(7, const_with_dep.eval())
-        self.assertEquals(1, counter.eval())
+        self.assertEquals(0, self.evaluate(counter))
+        self.assertEquals(7, self.evaluate(const_with_dep))
+        self.assertEquals(1, self.evaluate(counter))
 
 
 class SwitchTestCase(test_util.TensorFlowTestCase):
@@ -209,7 +209,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       optimizer = momentum.MomentumOptimizer(0.1, 0.9)
       train_op = optimizer.minimize(cost)
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
           sess.run([train_op])
 
@@ -232,8 +232,8 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           cond, body, [constant_op.constant(0),
                        constant_op.constant(0.0)])
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertAllEqual(10.0, cost.eval())
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllEqual(10.0, self.evaluate(cost))
 
   def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
     with ops.Graph().as_default():
@@ -269,7 +269,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
                                           static_grads.indices)
 
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
 
   def testIndexedSlicesGradientInCondInWhileLoop(self):
@@ -398,9 +398,9 @@ class CondTest(test_util.TensorFlowTestCase):
             pred=bool_var,
             true_fn=lambda: state_ops.assign(bool_var, False),
             false_fn=lambda: True)
-        sess.run(bool_var.initializer)
-        self.assertEquals(sess.run(cond_on_bool_var), False)
-        self.assertEquals(sess.run(cond_on_bool_var), True)
+        self.evaluate(bool_var.initializer)
+        self.assertEquals(self.evaluate(cond_on_bool_var), False)
+        self.assertEquals(self.evaluate(cond_on_bool_var), True)
 
   def testCondMissingArg1(self):
     with ops.Graph().as_default():
@@ -947,6 +947,16 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
+  @test_util.run_in_graph_and_eager_modes
+  def testCase_dict(self):
+    x = constant_op.constant(2)
+    conditions = {
+        math_ops.equal(x, 1): lambda: constant_op.constant(2),
+        math_ops.equal(x, 2): lambda: constant_op.constant(4)
+    }
+    output = control_flow_ops.case(conditions, exclusive=True)
+    self.assertEqual(4, self.evaluate(output))
+
 
 class WhileLoopTestCase(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
index cab1d7b02e10812bba4cd6b1697a0da60031fa75..5f56850884a5e9e424c77515406ef8c9b513e972 100644
--- a/tensorflow/python/ops/control_flow_util_v2.py
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -19,10 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework.func_graph import FuncGraph
+from tensorflow.python.ops import control_flow_util
 
 
 class CondBranchFuncGraph(FuncGraph):
@@ -90,3 +92,31 @@ def unique_fn_name(scope, name):
 
 def unique_grad_fn_name(forward_name):
   return "%s_grad_%s" % (forward_name, ops.uid())
+
+
+def maybe_set_lowering_attr(op):
+  """Sets the flag to enable lowering on `op` if necessary.
+
+  Lowering allows cond_v2 and while_v2 to avoid some of the limitations of
+  Functions, allowing users to specify devices & colocation inside of cond_v2
+  and while_v2 input functions, and enabling non-strict evaluation & partial
+  pruning. This brings v2 control flow closer to feature parity with v1 control
+  flow.
+
+  However, we do not lower in the following cases:
+    - When the `If` or `While` ops are in the XLA context. Because it is easier
+      for XLA to apply its own optimizations when dealing with un-lowered
+      control flow operators than with low-level control flow primitives.
+    - When the eager execution context specifies the executor of functions to
+      be the single threaded executor (see context.function_executor_type()).
+      Because the single threaded executor does not support v1 control flow ops.
+
+  Args:
+    op: An `If` or `While` Operation.
+  """
+  if (not control_flow_util.IsInXLAContext(op) and
+      context.context().get_function_call_options().executor_type
+      != "SINGLE_THREADED_EXECUTOR"):
+    # pylint: disable=protected-access
+    op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
+    # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index e1071afd8e00728b896a7ac03eb2e07cea2dbe74..3a7eb9355a66a213d3d60f103b818ef22fd839bd 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -19,17 +19,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_ctc_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access, invalid-name
-@tf_export("nn.ctc_loss")
+@tf_export(v1=["nn.ctc_loss"])
 def ctc_loss(labels, inputs, sequence_length,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
@@ -336,6 +346,785 @@ def ctc_beam_search_decoder_v2(inputs, sequence_length, beam_width=100,
 
 
 ops.NotDifferentiable("CTCGreedyDecoder")
+ops.NotDifferentiable("CTCBeamSearchDecoder")
 
 
-ops.NotDifferentiable("CTCBeamSearchDecoder")
+def _ctc_state_trans(label_seq):
+  """Compute CTC alignment model transition matrix.
+
+  Args:
+    label_seq: tensor of shape [batch_size, max_seq_length]
+
+  Returns:
+    tensor of shape [batch_size, states, states] with a state transition matrix
+    computed for each sequence of the batch.
+  """
+
+  with ops.name_scope("ctc_state_trans"):
+    label_seq = ops.convert_to_tensor(label_seq, name="label_seq")
+    batch_size = _get_dim(label_seq, 0)
+    num_labels = _get_dim(label_seq, 1)
+
+    num_label_states = num_labels + 1
+    num_states = 2 * num_label_states
+
+    label_states = math_ops.range(num_label_states)
+    blank_states = label_states + num_label_states
+
+    # Start state to first label.
+    start_to_label = [[1, 0]]
+
+    # Blank to label transitions.
+    blank_to_label = array_ops.stack([label_states[1:], blank_states[:-1]], 1)
+
+    # Label to blank transitions.
+    label_to_blank = array_ops.stack([blank_states, label_states], 1)
+
+    # Scatter transitions that don't depend on sequence.
+    indices = array_ops.concat(
+        [start_to_label, blank_to_label, label_to_blank], 0)
+    values = array_ops.ones([_get_dim(indices, 0)])
+    trans = array_ops.scatter_nd(
+        indices, values, shape=[num_states, num_states])
+    trans += linalg_ops.eye(num_states)  # Self-loops.
+
+    # Label to label transitions. Disallow transitions between repeated labels
+    # with no blank state in between.
+    batch_idx = array_ops.zeros_like(label_states[2:])
+    indices = array_ops.stack(
+        [batch_idx, label_states[2:], label_states[1:-1]], 1)
+    indices = array_ops.tile(
+        array_ops.expand_dims(indices, 0), [batch_size, 1, 1])
+    batch_idx = array_ops.expand_dims(math_ops.range(batch_size), 1) * [1, 0, 0]
+    indices += array_ops.expand_dims(batch_idx, 1)
+    repeats = math_ops.equal(label_seq[:, :-1], label_seq[:, 1:])
+    values = 1.0 - math_ops.cast(repeats, dtypes.float32)
+    batched_shape = [batch_size, num_states, num_states]
+    label_to_label = array_ops.scatter_nd(indices, values, batched_shape)
+
+    return array_ops.expand_dims(trans, 0) + label_to_label
+
+
+def ctc_state_log_probs(seq_lengths, max_seq_length):
+  """Computes CTC alignment initial and final state log probabilities.
+
+  Create the initial/final state values directly as log values to avoid
+  having to take a float64 log on tpu (which does not exist).
+
+  Args:
+    seq_lengths: int tensor of shape [batch_size], seq lengths in the batch.
+    max_seq_length: int, max sequence length possible.
+
+  Returns:
+    initial_state_log_probs, final_state_log_probs
+  """
+
+  batch_size = _get_dim(seq_lengths, 0)
+  num_label_states = max_seq_length + 1
+  num_duration_states = 2
+  num_states = num_duration_states * num_label_states
+  log_0 = math_ops.cast(
+      math_ops.log(math_ops.cast(0, dtypes.float64) + 1e-307),
+      dtypes.float32)
+
+  initial_state_log_probs = array_ops.one_hot(
+      indices=array_ops.zeros([batch_size], dtype=dtypes.int32),
+      depth=num_states,
+      on_value=0.0,
+      off_value=log_0, axis=1)
+
+  label_final_state_mask = array_ops.one_hot(
+      seq_lengths, depth=num_label_states, axis=0)
+  duration_final_state_mask = array_ops.ones(
+      [num_duration_states, 1, batch_size])
+  final_state_mask = duration_final_state_mask * label_final_state_mask
+  final_state_log_probs = (1.0 - final_state_mask) * log_0
+  final_state_log_probs = array_ops.reshape(
+      final_state_log_probs, [num_states, batch_size])
+
+  return initial_state_log_probs, array_ops.transpose(final_state_log_probs)
+
+
+def _ilabel_to_state(labels, num_labels, ilabel_log_probs):
+  """Project ilabel log probs to state log probs."""
+
+  num_label_states = _get_dim(labels, 1)
+  blank = ilabel_log_probs[:, :, :1]
+  blank = array_ops.tile(blank, [1, 1, num_label_states + 1])
+  one_hot = array_ops.one_hot(labels, depth=num_labels)
+  one_hot = array_ops.expand_dims(one_hot, axis=0)
+  ilabel_log_probs = array_ops.expand_dims(ilabel_log_probs, axis=2)
+  state_log_probs = math_ops.reduce_sum(ilabel_log_probs * one_hot, axis=3)
+  state_log_probs = array_ops.concat([state_log_probs, blank], axis=2)
+  return array_ops.pad(
+      state_log_probs, [[0, 0], [0, 0], [1, 0]],
+      constant_values=math_ops.log(0.0))
+
+
+def _state_to_olabel(labels, num_labels, states):
+  """Sum state log probs to ilabel log probs."""
+
+  num_label_states = _get_dim(labels, 1) + 1
+  label_states = states[:, :, 1:num_label_states]
+  blank_states = states[:, :, num_label_states:]
+  one_hot = array_ops.one_hot(
+      labels - 1, depth=(num_labels - 1),
+      on_value=0.0, off_value=math_ops.log(0.0))
+  one_hot = array_ops.expand_dims(one_hot, axis=0)
+  label_states = array_ops.expand_dims(label_states, axis=3)
+  label_olabels = math_ops.reduce_logsumexp(label_states + one_hot, axis=2)
+  blank_olabels = math_ops.reduce_logsumexp(
+      blank_states, axis=2, keepdims=True)
+  return array_ops.concat([blank_olabels, label_olabels], axis=-1)
+
+
+# pylint: disable=redefined-outer-name
+def _state_to_olabel_unique(labels, num_labels, states, unique):
+  """Sum state log probs to ilabel log probs using unique label indices."""
+
+  num_label_states = _get_dim(labels, 1) + 1
+  label_states = states[:, :, 1:num_label_states]
+  blank_states = states[:, :, num_label_states:]
+
+  unique_y, unique_idx = unique
+  mul_reduce = _sum_states(unique_idx, label_states)
+
+  num_frames = states.shape[0]
+  batch_size = states.shape[1]
+  num_states = num_label_states - 1
+  batch_state_major = array_ops.transpose(mul_reduce, perm=[1, 2, 0])
+  batch_state_major = array_ops.reshape(
+      batch_state_major, [batch_size * num_states, num_frames])
+  batch_offset = math_ops.range(batch_size, dtype=unique_y.dtype) * num_labels
+  indices = unique_y + array_ops.expand_dims(batch_offset, axis=-1)
+  indices = array_ops.reshape(indices, [-1, 1])
+  scatter = array_ops.scatter_nd(
+      indices=indices,
+      updates=batch_state_major,
+      shape=[batch_size * num_labels, num_frames])
+  scatter = array_ops.reshape(scatter, [batch_size, num_labels, num_frames])
+  scatter = array_ops.where(
+      math_ops.equal(scatter, 0.0),
+      array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0)),
+      scatter)
+  label_olabels = array_ops.transpose(scatter, [2, 0, 1])
+  label_olabels = label_olabels[:, :, 1:]
+
+  blank_olabels = math_ops.reduce_logsumexp(
+      blank_states, axis=2, keepdims=True)
+
+  return array_ops.concat([blank_olabels, label_olabels], axis=-1)
+
+
+def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
+  """Computes the CTC loss and gradients.
+
+  Most users will want fwd_bwd.ctc_loss
+
+  This function returns the computed gradient, it does not have a gradient
+  of its own defined.
+
+  Args:
+    logits: tensor of shape [frames, batch_size, num_labels]
+    labels: tensor of shape [batch_size, max_label_seq_length]
+    label_length: tensor of shape [batch_size]
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    unique: (optional) unique label indices as computed by unique(labels)
+      If supplied, enables an implementation that is faster and more memory
+      efficient on TPU.
+
+  Returns:
+    loss: tensor of shape [batch_size]
+    gradient: tensor of shape [frames, batch_size, num_labels]
+  """
+
+  num_labels = _get_dim(logits, 2)
+  max_label_seq_length = _get_dim(labels, 1)
+
+  ilabel_log_probs = nn_ops.log_softmax(logits)
+  state_log_probs = _ilabel_to_state(labels, num_labels, ilabel_log_probs)
+  state_trans_probs = _ctc_state_trans(labels)
+  initial_state_log_probs, final_state_log_probs = ctc_state_log_probs(
+      label_length, max_label_seq_length)
+  fwd_bwd_log_probs, log_likelihood = _forward_backward_log(
+      state_trans_log_probs=math_ops.log(state_trans_probs),
+      initial_state_log_probs=initial_state_log_probs,
+      final_state_log_probs=final_state_log_probs,
+      observed_log_probs=state_log_probs,
+      sequence_length=logit_length)
+
+  if unique:
+    olabel_log_probs = _state_to_olabel_unique(
+        labels, num_labels, fwd_bwd_log_probs, unique)
+  else:
+    olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)
+
+  grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
+  loss = -log_likelihood
+  return loss, grad
+
+
+def _ctc_loss_grad(op, grad_loss, _):
+  grad = op.outputs[1]
+  grad = [array_ops.reshape(grad_loss, [1, -1, 1]) * grad]
+  grad += [None] * (len(op.inputs) - len(grad))
+  return grad
+
+
+def _ctc_loss_shape(op):
+  return [op.inputs[2].get_shape(), op.inputs[0].get_shape()]
+
+
+@tf_export("nn.ctc_loss", v1=["nn.ctc_loss_v2"])
+def ctc_loss_v2(labels, logits, label_length, logit_length,
+                logits_time_major=True, unique=None,
+                blank_index=None, name=None):
+  """Computes CTC (Connectionist Temporal Classification) loss.
+
+  This op implements the CTC loss as presented in the article:
+
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+
+  Notes:
+      - Same as the "Classic CTC" in TensorFlow 1.x's tf.nn.ctc_loss setting of
+        preprocess_collapse_repeated=False, ctc_merge_repeated=True
+      - Labels may be supplied as either a dense, zero-padded tensor with a
+        vector of label sequence lengths OR as a SparseTensor.
+      - On TPU and GPU:
+          - Only dense padded labels are supported.
+      - On CPU:
+          - Caller may use SparseTensor or dense padded labels but calling with
+            a SparseTensor will be significantly faster.
+      - Default blank label is 0 rather num_classes - 1, unless overridden by
+        blank_index.
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_seq_length] or SparseTensor
+    logits: tensor of shape [frames, batch_size, num_labels],
+      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size], None if labels is SparseTensor
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    logits_time_major: (optional) If True (default), logits is shaped
+      [time, batch, logits]. If False, shape is [batch, time, logits]
+    unique: (optional) Unique label indices as computed by
+      ctc_unique_labels(labels).  If supplied, enable a faster, memory
+      efficient implementation on TPU.
+    blank_index: (optional) Set the class index to use for the blank label.
+      Negative values will start from num_classes, ie, -1 will reproduce the
+      ctc_loss behavior of using num_classes - 1 for the blank symbol.
+      There is some memory/performance overhead to switching from the default
+      of 0 as an additional shifted copy of the logits may be created.
+    name: A name for this `Op`. Defaults to "ctc_loss_dense".
+
+  Returns:
+    loss: tensor of shape [batch_size], negative log probabilities.
+  """
+  if isinstance(labels, sparse_tensor.SparseTensor):
+    if blank_index is None:
+      raise ValueError(
+          "blank_index must be given when using SparseTensor labels.")
+
+    if blank_index < 0:
+      blank_index += _get_dim(logits, 2)
+
+    if blank_index != _get_dim(logits, 2) - 1:
+      logits = array_ops.concat([
+          logits[:, :, :blank_index],
+          logits[:, :, blank_index+1:],
+          logits[:, :, blank_index:blank_index+1],
+      ], axis=2)
+      labels = sparse_tensor.SparseTensor(
+          labels.indices,
+          array_ops.where(labels.values < blank_index,
+                          labels.values,
+                          labels.values - 1),
+          labels.dense_shape)
+
+    return ctc_loss(labels=labels,
+                    inputs=logits,
+                    sequence_length=logit_length,
+                    time_major=logits_time_major)
+
+  if blank_index is None:
+    blank_index = 0
+
+  return ctc_loss_dense(labels=labels,
+                        logits=logits,
+                        label_length=label_length,
+                        logit_length=logit_length,
+                        logits_time_major=logits_time_major,
+                        unique=unique,
+                        blank_index=blank_index,
+                        name=name)
+
+
+def ctc_loss_dense(labels, logits, label_length, logit_length,
+                   logits_time_major=True, unique=None,
+                   blank_index=0, name=None):
+  """Computes CTC (Connectionist Temporal Classification) loss.
+
+  This op implements the CTC loss as presented in the article:
+
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+
+  Using the batched forward backward algorithm described in:
+
+  [Sim, K. C., Narayanan, A., Bagby, T., Sainath, T. N., & Bacchiani, M.
+  Improving the efficiency of forward-backward algorithm using batched
+    computation in TensorFlow.
+  Automatic Speech Recognition and Understanding Workshop (ASRU),
+    2017 IEEE (pp. 258-264).
+  ](https://ieeexplore.ieee.org/iel7/8260578/8268903/08268944.pdf)
+
+  Notes:
+    Significant differences from tf.nn.ctc_loss:
+      Supports GPU and TPU (tf.nn.ctc_loss supports CPU only):
+        For batched operations, GPU and TPU are significantly faster than using
+        ctc_loss on CPU.
+        This implementation runs on CPU, but significantly slower than ctc_loss.
+      Blank label is 0 rather num_classes - 1, unless overridden by blank_index.
+      Logits and labels are dense arrays with padding rather than SparseTensor.
+      The only mode supported is the same as:
+        preprocess_collapse_repeated=False, ctc_merge_repeated=True
+        To collapse labels, the caller can preprocess label sequence first.
+
+    The dense implementation supports both CPU, GPU and TPU. A fast path is
+    provided that significantly improves memory use for large vocabulary if the
+    caller preprocesses label sequences to get unique label indices on the CPU
+    (eg. in the data input pipeline) using ctc_ops.unique and simplies this in
+    the optional "unique" kwarg. This is especially useful for TPU and GPU but
+    also works with if used on CPU.
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_seq_length]
+    logits: tensor of shape [frames, batch_size, num_labels],
+      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size]
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    logits_time_major: (optional) If True (default), logits is shaped
+      [time, batch, logits]. If False, shape is [batch, time, logits]
+    unique: (optional) Unique label indices as computed by unique(labels).
+      If supplied, enable a faster, memory efficient implementation on TPU.
+    blank_index: (optional) Set the class index to use for the blank label.
+      Negative values will start from num_classes, ie, -1 will reproduce the
+      ctc_loss behavior of using num_classes - 1 for the blank symbol.
+      There is some memory/performance overhead to switching from the default
+      of 0 as an additional shifted copy of the logits may be created.
+    name: A name for this `Op`. Defaults to "ctc_loss_dense".
+
+  Returns:
+    loss: tensor of shape [batch_size], negative log probabilities.
+  """
+
+  with ops.name_scope(name, "ctc_loss_dense",
+                      [logits, labels, label_length, logit_length]):
+    logits = ops.convert_to_tensor(logits, name="logits")
+    labels = ops.convert_to_tensor(labels, name="labels")
+    label_length = ops.convert_to_tensor(label_length, name="label_length")
+    logit_length = ops.convert_to_tensor(logit_length, name="logit_length")
+
+    if not logits_time_major:
+      logits = array_ops.transpose(logits, perm=[1, 0, 2])
+
+    if blank_index != 0:
+      if blank_index < 0:
+        blank_index += _get_dim(logits, 2)
+      logits = array_ops.concat([
+          logits[:, :, blank_index:blank_index+1],
+          logits[:, :, :blank_index],
+          logits[:, :, blank_index+1:],
+      ], axis=2)
+      labels = array_ops.where(labels < blank_index, labels + 1, labels)
+
+    args = [logits, labels, label_length, logit_length]
+
+    if unique:
+      unique_y, unique_idx = unique
+      args.extend([unique_y, unique_idx])
+
+    # TODO(tombagby): Update to tfe.defun
+    @function.Defun(*[x.dtype for x in args],
+                    python_grad_func=_ctc_loss_grad,
+                    shape_func=_ctc_loss_shape)
+    def compute_ctc_loss(logits_t, labels_t, label_length_t, logit_length_t,
+                         *unique_t):
+      """Compute CTC loss."""
+      logits_t.set_shape(logits.shape)
+      labels_t.set_shape(labels.shape)
+      label_length_t.set_shape(label_length.shape)
+      logit_length_t.set_shape(logit_length.shape)
+      kwargs = dict(
+          logits=logits_t,
+          labels=labels_t,
+          label_length=label_length_t,
+          logit_length=logit_length_t)
+      if unique_t:
+        kwargs["unique"] = unique_t
+      return ctc_loss_and_grad(**kwargs)
+
+    return compute_ctc_loss(*args)[0]
+
+
+@tf_export("nn.collapse_repeated")
+def collapse_repeated(labels, seq_length, name=None):
+  """Merge repeated labels into single labels.
+
+  Args:
+    labels: Tensor of shape (batch, max value in seq_length)
+    seq_length: Tensor of shape (batch), sequence length of each batch element.
+    name: A name for this `Op`. Defaults to "collapse_repeated_labels".
+
+  Returns:
+    tuple of Tensor of shape (batch, max_seq_length) with repeated labels
+    collapsed and padded to max_seq_length, eg:
+        [[A, A, B, B, A],
+         [A, B, C, D, E]] => [[A, B, A, 0, 0],
+                              [A, B, C, D, E]]
+    and int tensor of shape [batch] with new sequence lengths.
+  """
+
+  with ops.name_scope(name, "collapse_repeated_labels",
+                      [labels, seq_length]):
+    labels = ops.convert_to_tensor(labels, name="labels")
+    seq_length = ops.convert_to_tensor(seq_length, name="seq_length")
+
+    # Mask labels that don't equal previous label.
+    label_mask = array_ops.concat(
+        [array_ops.ones_like(labels[:, :1], dtypes.bool),
+         math_ops.not_equal(labels[:, 1:], labels[:, :-1])],
+        axis=1)
+
+    # Filter labels that aren't in the original sequence.
+    maxlen = _get_dim(labels, 1)
+    seq_mask = array_ops.sequence_mask(seq_length, maxlen=maxlen)
+    label_mask = math_ops.logical_and(label_mask, seq_mask)
+
+    # Count masks for new sequence lengths.
+    new_seq_len = math_ops.reduce_sum(
+        math_ops.cast(label_mask, dtypes.int32), axis=1)
+
+    # Mask indexes based on sequence length mask.
+    new_maxlen = math_ops.reduce_max(new_seq_len)
+    idx_mask = array_ops.sequence_mask(new_seq_len, maxlen=new_maxlen)
+
+    # Flatten everything and mask out labels to keep and sparse indices.
+    flat_labels = array_ops.reshape(labels, [-1])
+    flat_label_mask = array_ops.reshape(label_mask, [-1])
+    flat_idx_mask = array_ops.reshape(idx_mask, [-1])
+    idx = math_ops.range(_get_dim(flat_idx_mask, 0))
+
+    # Scatter to flat shape.
+    flat = array_ops.scatter_nd(
+        indices=array_ops.expand_dims(
+            array_ops.boolean_mask(idx, flat_idx_mask), axis=1),
+        updates=array_ops.boolean_mask(flat_labels, flat_label_mask),
+        shape=array_ops.shape(flat_idx_mask))
+
+    # Reshape back to square batch.
+    batch_size = _get_dim(labels, 0)
+    new_shape = [batch_size, new_maxlen]
+    return (array_ops.reshape(flat, new_shape),
+            math_ops.cast(new_seq_len, seq_length.dtype))
+
+
+def dense_labels_to_sparse(dense, length):
+  """Convert dense labels with sequence lengths to sparse tensor.
+
+  Args:
+    dense: tensor of shape [batch, max_length]
+    length: int tensor of shape [batch]
+      The length of each sequence in dense.
+
+  Returns:
+    tf.SparseTensor with values only for the valid elements of sequences.
+  """
+
+  flat_values = array_ops.reshape(dense, [-1])
+  flat_indices = math_ops.range(
+      array_ops.shape(flat_values, out_type=dtypes.int64)[0])
+  mask = array_ops.sequence_mask(length, maxlen=array_ops.shape(dense)[1])
+  flat_mask = array_ops.reshape(mask, [-1])
+  indices = array_ops.expand_dims(
+      array_ops.boolean_mask(flat_indices, flat_mask), 1)
+  values = array_ops.boolean_mask(flat_values, flat_mask)
+  sparse = sparse_tensor.SparseTensor(
+      indices=indices, values=math_ops.cast(values, dtypes.int32),
+      dense_shape=array_ops.shape(flat_values, out_type=dtypes.int64))
+  reshaped = sparse_ops.sparse_reshape(sparse, array_ops.shape(dense))
+  max_length = math_ops.reduce_max(length)
+  return sparse_tensor.SparseTensor(
+      indices=reshaped.indices,
+      values=reshaped.values,
+      dense_shape=[
+          math_ops.cast(reshaped.dense_shape[0], dtypes.int64),
+          math_ops.cast(max_length, dtypes.int64)])
+
+
+@tf_export("nn.ctc_unique_labels")
+def ctc_unique_labels(labels, name=None):
+  """Get unique labels and indices for batched labels for tf.nn.ctc_loss.
+
+  For use with tf.nn.ctc_loss_v2 optional argument `unique`: This op can be
+  used to preprocess labels in input pipeline to for better speed/memory use
+  computing the ctc loss on TPU.
+
+  Example:
+    ctc_unique_labels([[3, 4, 4, 3]]) ->
+      unique labels padded with 0: [[3, 4, 0, 0]]
+      indices of original labels in unique: [0, 1, 1, 0]
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_length] padded with 0.
+    name: A name for this `Op`. Defaults to "ctc_unique_labels".
+
+  Returns:
+    tuple of
+      - unique labels, tensor of shape `[batch_size, max_label_length]`
+      - indices into unique labels, shape `[batch_size, max_label_length]`
+  """
+
+  with ops.name_scope(name, "ctc_unique_labels", [labels]):
+    labels = ops.convert_to_tensor(labels, name="labels")
+    def _unique(x):
+      u = array_ops.unique(x)
+      y = array_ops.pad(
+          u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
+      y = math_ops.cast(y, dtypes.int64)
+      return [y, u.idx]
+    return functional_ops.map_fn(
+        _unique, labels, dtype=[dtypes.int64, dtypes.int32])
+
+
+def _sum_states(idx, states):
+  """Take logsumexp for each unique state out of all label states.
+
+  Args:
+    idx: tensor of shape [batch, label_length]
+      For each sequence, indices into a set of unique labels as computed by
+      calling unique.
+    states: tensor of shape [frames, batch, label_length]
+      Log probabilities for each label state.
+
+  Returns:
+    tensor of shape [frames, batch_size, label_length], log probabilites summed
+      for each unique label of the sequence.
+  """
+
+  with ops.name_scope("sum_states"):
+    idx = ops.convert_to_tensor(idx, name="idx")
+    num_states = _get_dim(states, 2)
+    states = array_ops.expand_dims(states, axis=2)
+    one_hot = array_ops.one_hot(
+        idx, depth=num_states, on_value=0.0, off_value=math_ops.log(0.0),
+        axis=1)
+    return math_ops.reduce_logsumexp(states + one_hot, axis=-1)
+
+
+def _forward_backward_log(state_trans_log_probs, initial_state_log_probs,
+                          final_state_log_probs, observed_log_probs,
+                          sequence_length):
+  """Forward-backward algorithm computed in log domain.
+
+  Args:
+    state_trans_log_probs: tensor of shape [states, states] or
+      if different transition matrix per batch [batch_size, states, states]
+    initial_state_log_probs: tensor of shape [batch_size, states]
+    final_state_log_probs: tensor of shape [batch_size, states]
+    observed_log_probs: tensor of shape [frames, batch_size, states]
+    sequence_length: tensor of shape [batch_size]
+
+  Returns:
+    forward backward log probabilites: tensor of shape [frames, batch, states]
+    log_likelihood: tensor of shape [batch_size]
+
+  Raises:
+    ValueError: If state_trans_log_probs has unknown or incorrect rank.
+  """
+
+  if state_trans_log_probs.shape.ndims == 2:
+    perm = [1, 0]
+  elif state_trans_log_probs.shape.ndims == 3:
+    perm = [0, 2, 1]
+  else:
+    raise ValueError(
+        "state_trans_log_probs rank must be known and == 2 or 3, is: %s" %
+        state_trans_log_probs.shape.ndims)
+
+  bwd_state_trans_log_probs = array_ops.transpose(state_trans_log_probs, perm)
+  batch_size = _get_dim(observed_log_probs, 1)
+
+  def _forward(state_log_prob, obs_log_prob):
+    state_log_prob = array_ops.expand_dims(state_log_prob, axis=1)  # Broadcast.
+    state_log_prob += state_trans_log_probs
+    state_log_prob = math_ops.reduce_logsumexp(state_log_prob, axis=-1)
+    state_log_prob += obs_log_prob
+    log_prob_sum = math_ops.reduce_logsumexp(
+        state_log_prob, axis=-1, keepdims=True)
+    state_log_prob -= log_prob_sum
+    return state_log_prob
+
+  fwd = _scan(_forward, observed_log_probs, initial_state_log_probs,
+              inclusive=True)
+
+  def _backward(accs, elems):
+    """Calculate log probs and cumulative sum masked for sequence length."""
+    state_log_prob, cum_log_sum = accs
+    obs_log_prob, mask = elems
+    state_log_prob += obs_log_prob
+    state_log_prob = array_ops.expand_dims(state_log_prob, axis=1)  # Broadcast.
+    state_log_prob += bwd_state_trans_log_probs
+    state_log_prob = math_ops.reduce_logsumexp(state_log_prob, axis=-1)
+
+    log_prob_sum = math_ops.reduce_logsumexp(
+        state_log_prob, axis=-1, keepdims=True)
+    state_log_prob -= log_prob_sum
+
+    cum_log_sum += array_ops.squeeze(log_prob_sum) * mask
+    batched_mask = array_ops.expand_dims(mask, axis=1)
+    out = state_log_prob * batched_mask
+    out += final_state_log_probs * (1.0 - batched_mask)
+    return out, cum_log_sum
+
+  zero_log_sum = array_ops.zeros([batch_size])
+  maxlen = _get_dim(observed_log_probs, 0)
+  mask = array_ops.sequence_mask(sequence_length, maxlen, dtypes.float32)
+  mask = array_ops.transpose(mask, perm=[1, 0])
+
+  bwd, cum_log_sum = _scan(_backward, (observed_log_probs, mask),
+                           (final_state_log_probs, zero_log_sum),
+                           reverse=True, inclusive=True)
+
+  fwd_bwd_log_probs = fwd[1:] + bwd[1:]
+  fwd_bwd_log_probs_sum = math_ops.reduce_logsumexp(
+      fwd_bwd_log_probs, axis=2, keepdims=True)
+  fwd_bwd_log_probs -= fwd_bwd_log_probs_sum
+  fwd_bwd_log_probs += math_ops.log(array_ops.expand_dims(mask, axis=2))
+
+  log_likelihood = bwd[0, :, 0] + cum_log_sum[0]
+
+  return fwd_bwd_log_probs, log_likelihood
+
+
+# TODO(tombagby): This is currently faster for the ctc implementation than using
+# functional_ops.scan, but could be replaced by that or something similar if
+# things change.
+def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
+  """Repeatedly applies callable `fn` to a sequence of elements.
+
+  Implemented by functional_ops.While, tpu friendly, no gradient.
+
+  This is similar to functional_ops.scan but significantly faster on tpu/gpu
+  for the forward backward use case.
+
+  Examples:
+    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 3.0, 4.0]
+
+    Multiple accumulators:
+      scan(lambda a, e: (a[0] + e, a[1] * e), [1.0, 2.0, 3.0], (0.0, 1.0))
+
+    Multiple inputs:
+      scan(lambda a, e: a + (e[0] * e[1]), (elems1, elems2), 0.0)
+
+  Args:
+    fn: callable, fn(accumulators, element) return new accumulator values.
+      The (possibly nested) sequence of accumulators is the same as `initial`
+      and the return value must have the same structure.
+    elems: A (possibly nested) tensor which will be unpacked along the first
+      dimension. The resulting slices will be the second argument to fn. The
+      first dimension of all nested input tensors must be the same.
+    initial: A tensor or (possibly nested) sequence of tensors with initial
+      values for the accumulators.
+    reverse: (optional) True enables scan and output elems in reverse order.
+    inclusive: (optional) True includes the initial accumulator values in the
+      output. Length of output will be len(elem sequence) + 1. Not meaningful
+      if final_only is True.
+    final_only: (optional) When True, return only the final accumulated values,
+      not the concatenation of accumulated values for each input.
+
+  Returns:
+    A (possibly nested) sequence of tensors with the results of applying fn
+    to tensors unpacked from elems and previous accumulator values.
+  """
+
+  flat_elems = [ops.convert_to_tensor(x) for x in nest.flatten(elems)]
+  num_elems = array_ops.shape(flat_elems[0])[0]
+  pack_elems = lambda x: nest.pack_sequence_as(structure=elems, flat_sequence=x)
+  flat_initial = [ops.convert_to_tensor(x) for x in nest.flatten(initial)]
+  pack = lambda x: nest.pack_sequence_as(structure=initial, flat_sequence=x)
+  accum_dtypes = [x.dtype for x in flat_initial]
+  num_accums = len(flat_initial)
+
+  # Types for counter, [outputs], [accumulators] loop arguments.
+  if final_only:
+    loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes
+  else:
+    loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes + accum_dtypes
+
+  # TODO(tombagby): Update to tfe.defun
+  @function.Defun(*loop_dtypes)
+  def cond(i, num_elems, *args):
+    del args
+    return i >= 0 if reverse else i < num_elems
+
+  # The loop *args are [output tensors] + [accumulator tensors] which must
+  # be paired. Each output corresponds to one accumulator.
+  @function.Defun(*loop_dtypes)
+  def body(i, num_elems, *args):
+    """Loop body."""
+    i.set_shape([])
+    if final_only:
+      accum = args
+    else:
+      out, accum = args[:num_accums], args[num_accums:]
+    slices = [array_ops.gather(e, i) for e in flat_elems]
+    accum = fn(pack(accum), pack_elems(slices))
+    flat_accum = nest.flatten(accum)
+    if final_only:
+      new_out = []
+    else:
+      update_i = i + 1 if inclusive and not reverse else i
+      new_out = [inplace_ops.alias_inplace_update(x, update_i, y)
+                 for x, y in zip(out, flat_accum)]
+    i = i - 1 if reverse else i + 1
+    return [i, num_elems] + new_out + flat_accum
+
+  init_i = (array_ops.shape(flat_elems[0])[0] - 1 if reverse
+            else constant_op.constant(0, dtype=dtypes.int32))
+  outputs = []
+  if not final_only:
+    num_outputs = array_ops.shape(flat_elems[0])[0] + (1 if inclusive else 0)
+    for initial_accum in flat_initial:
+      out_shape = array_ops.concat(
+          [[num_outputs], array_ops.shape(initial_accum)], 0)
+      out = inplace_ops.empty(out_shape, dtype=initial_accum.dtype, init=True)
+      if inclusive:
+        out = inplace_ops.alias_inplace_add(
+            out, init_i + (1 if reverse else 0), initial_accum)
+      outputs.append(out)
+  loop_in = [init_i, num_elems] + outputs + flat_initial
+  hostmem = [
+      i for i, x in enumerate(loop_in)
+      if x.dtype.base_dtype in (dtypes.int32, dtypes.int64)
+  ]
+
+  # TODO(tombagby): Update to while_v2.
+  loop_results = functional_ops.While(loop_in, cond, body, hostmem=hostmem)
+  out = loop_results[2:num_accums + 2]
+  return pack(out)
+
+
+def _get_dim(tensor, i):
+  """Get value of tensor shape[i] preferring static value if available."""
+  return tensor.shape[i].value or array_ops.shape(tensor)[i]
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 3f215dbcd0c375045cc9b739200f0e218d5c97b0..1426e8851c5f2a379c750f34d34f60fe0674cdf8 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -131,7 +131,15 @@ def custom_gradient(f):
          a list of `Tensor`s - the derivatives of `Tensor`s in `y` with respect
          to the `Tensor`s in `x`.  `grad_ys` is a `Tensor` or sequence of
          `Tensor`s the same size as `y` holding the initial value gradients for
-         each `Tensor` in `y`. If `f` uses `Variable`s (that are not part of the
+         each `Tensor` in `y`. In a pure mathematical sense, a vector-argument
+         vector-valued function `f`'s derivatives should be its Jacobian matrix
+         `J`. Here we are expressing the Jacobian `J` as a function `grad_fn`
+         which defines how `J` will transform a vector `grad_ys` when
+         left-multiplied with it (`grad_ys * J`). This functional representation
+         of a matrix is convenient to use for chain-rule calculation
+         (in e.g. the back-propagation algorithm).
+
+         If `f` uses `Variable`s (that are not part of the
          inputs), i.e. through `get_variable`, then `grad_fn` should have
          signature `g(*grad_ys, variables=None)`, where `variables` is a list of
          the `Variable`s, and return a 2-tuple `(grad_xs, grad_vars)`, where
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index cca8e12b43460917a51783e5e87322116403f5de..0fac7994cbc7f813a6856b23ae2f2c68e46f2307 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1148,7 +1148,7 @@ class Barrier(object):
         self._barrier_ref, name=name)
 
 
-@tf_export("ConditionalAccumulatorBase")
+@tf_export(v1=["ConditionalAccumulatorBase"])
 class ConditionalAccumulatorBase(object):
   """A conditional accumulator for aggregating gradients.
 
@@ -1227,7 +1227,7 @@ class ConditionalAccumulatorBase(object):
         name=name)
 
 
-@tf_export("ConditionalAccumulator")
+@tf_export(v1=["ConditionalAccumulator"])
 class ConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating gradients.
 
diff --git a/tensorflow/python/ops/dequantize_op_test.py b/tensorflow/python/ops/dequantize_op_test.py
index 13e50273d863f3c157ee7a089532df0c925c0e5f..794985b2dbb77e4d7691753432c53ddf3ad31377 100644
--- a/tensorflow/python/ops/dequantize_op_test.py
+++ b/tensorflow/python/ops/dequantize_op_test.py
@@ -35,7 +35,7 @@ class DequantizeOpTest(test.TestCase):
     with self.cached_session():
       input_op = constant_op.constant(inputs, shape=[len(inputs)], dtype=dtype)
       dequantized = array_ops.dequantize(input_op, min_range, max_range)
-      tf_ans = dequantized.eval()
+      tf_ans = self.evaluate(dequantized)
 
     # TODO(vrv): Add support for DT_QINT32 quantization if needed.
     type_dict = {
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index baecc321d3824d550d5b7d9fc86caf4ec93c6c64..4fb598aef4d725bfd0d5a1ce99af7e7a1ea86fb0 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -32,7 +32,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("distributions.Bernoulli")
+@tf_export(v1=["distributions.Bernoulli"])
 class Bernoulli(distribution.Distribution):
   """Bernoulli distribution.
 
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 51c4f6eb3d0ba9e4ec28bde5189b4dba44471990..1d1a666317f83c91ae3d3aaac77596ca8d0f8680 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -47,7 +47,7 @@ _beta_sample_note = """Note: `x` must have dtype `self.dtype` and be in
 `[0, 1].` It must have a shape compatible with `self.batch_shape()`."""
 
 
-@tf_export("distributions.Beta")
+@tf_export(v1=["distributions.Beta"])
 class Beta(distribution.Distribution):
   """Beta distribution.
 
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 09d7e0e6804425375c93f04ec221ce3ca815d606..33a843562506e3e57dcba9bc6922f6ae6fa1900a 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -59,7 +59,7 @@ def _broadcast_cat_event_and_params(event, params, base_dtype):
   return event, params
 
 
-@tf_export("distributions.Categorical")
+@tf_export(v1=["distributions.Categorical"])
 class Categorical(distribution.Distribution):
   """Categorical distribution.
 
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 675c30b383391cf6f3c9cbdf5405945b5e36c66e..971ce46efbc7aaa268c2a61a0da62d64d67668ee 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -45,7 +45,7 @@ dtype `self.dtype` and be in the `(self.event_shape() - 1)`-simplex, i.e.,
 `self.batch_shape() + self.event_shape()`."""
 
 
-@tf_export("distributions.Dirichlet")
+@tf_export(v1=["distributions.Dirichlet"])
 class Dirichlet(distribution.Distribution):
   """Dirichlet distribution.
 
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 2e3151a5ab4ac94dff6d25ef494f70d8e338482d..8ce01f6b95777248ab772f5903bf061efdcabdce 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -51,7 +51,7 @@ fractional components, and such that
 with `self.concentration` and `self.total_count`."""
 
 
-@tf_export("distributions.DirichletMultinomial")
+@tf_export(v1=["distributions.DirichletMultinomial"])
 class DirichletMultinomial(distribution.Distribution):
   """Dirichlet-Multinomial compound distribution.
 
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 11247a39bca2d53f3173e0ad290930b5c14abc0c..d551830fb84784fd6503e4386b587ef1fb3c8101 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -212,7 +212,7 @@ class _DistributionMeta(abc.ABCMeta):
     return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs)
 
 
-@tf_export("distributions.ReparameterizationType")
+@tf_export(v1=["distributions.ReparameterizationType"])
 class ReparameterizationType(object):
   """Instances of this class represent how sampling is reparameterized.
 
@@ -263,7 +263,7 @@ class ReparameterizationType(object):
 # reparameterized distribution support straight-through gradients with
 # respect to all parameters.
 FULLY_REPARAMETERIZED = ReparameterizationType("FULLY_REPARAMETERIZED")
-tf_export("distributions.FULLY_REPARAMETERIZED").export_constant(
+tf_export(v1=["distributions.FULLY_REPARAMETERIZED"]).export_constant(
     __name__, "FULLY_REPARAMETERIZED")
 
 
@@ -271,12 +271,12 @@ tf_export("distributions.FULLY_REPARAMETERIZED").export_constant(
 # reparameterized distribution do not support straight-through gradients for
 # at least some of the parameters.
 NOT_REPARAMETERIZED = ReparameterizationType("NOT_REPARAMETERIZED")
-tf_export("distributions.NOT_REPARAMETERIZED").export_constant(
+tf_export(v1=["distributions.NOT_REPARAMETERIZED"]).export_constant(
     __name__, "NOT_REPARAMETERIZED")
 
 
 @six.add_metaclass(_DistributionMeta)
-@tf_export("distributions.Distribution")
+@tf_export(v1=["distributions.Distribution"])
 class Distribution(_BaseDistribution):
   """A generic probability distribution base class.
 
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 6a52af8c33e8a62ef5ab18640a77660faefa52c2..8b79a5d4abdbb20086ac9cba49370a9b084fe2b6 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -37,7 +37,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Exponential")
+@tf_export(v1=["distributions.Exponential"])
 class Exponential(gamma.Gamma):
   """Exponential distribution.
 
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index 4a2db208d40982f7f2c6a933145deb4528a6853b..57505d1b1311054f4d837e5e0b958df855df4881 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -43,7 +43,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Gamma")
+@tf_export(v1=["distributions.Gamma"])
 class Gamma(distribution.Distribution):
   """Gamma distribution.
 
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index 12743fa23d6a3d69ee65d4d653f0d195d77fb0fc..5c6745b0fe0c4036f91885a30e428fe76f316b8f 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -60,7 +60,7 @@ def _registered_kl(type_a, type_b):
     "should update all references to use `tfp.distributions` "
     "instead of `tf.distributions`.",
     warn_once=True)
-@tf_export("distributions.kl_divergence")
+@tf_export(v1=["distributions.kl_divergence"])
 def kl_divergence(distribution_a, distribution_b,
                   allow_nan_stats=True, name=None):
   """Get the KL-divergence KL(distribution_a || distribution_b).
@@ -161,7 +161,7 @@ def cross_entropy(ref, other,
         ref, other, allow_nan_stats=allow_nan_stats)
 
 
-@tf_export("distributions.RegisterKL")
+@tf_export(v1=["distributions.RegisterKL"])
 class RegisterKL(object):
   """Decorator to register a KL divergence implementation function.
 
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index 4f6a8f587d1e921413069a82df9bedf58730c310..a96b58ba1a64246e6f7f2d4a44bdbdae1f8d0cf8 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -43,7 +43,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Laplace")
+@tf_export(v1=["distributions.Laplace"])
 class Laplace(distribution.Distribution):
   """The Laplace distribution with location `loc` and `scale` parameters.
 
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 8397353cd5ed38ac2afb1a773367d97db7185d90..97d2b1b26c68dc53f0a77120c9d3820c1d0f017b 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -52,7 +52,7 @@ fractional components, and such that
 with `self.probs` and `self.total_count`."""
 
 
-@tf_export("distributions.Multinomial")
+@tf_export(v1=["distributions.Multinomial"])
 class Multinomial(distribution.Distribution):
   """Multinomial distribution.
 
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 9f511709b90b039e739d6f14b4f293ef029a4dbf..9acc0469885c2463e84f875314f07d1f3d55481a 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -42,7 +42,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Normal")
+@tf_export(v1=["distributions.Normal"])
 class Normal(distribution.Distribution):
   """The Normal distribution with location `loc` and `scale` parameters.
 
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index b69e61925c122d6e29b76fe5de89a546b1695325..351f5605e24770c152ad01f09b9ee78b59c3ddf5 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -43,7 +43,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.StudentT")
+@tf_export(v1=["distributions.StudentT"])
 class StudentT(distribution.Distribution):
   """Student's t-distribution.
 
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index b6b24187cc537809dc167205baf4c2a76e06c8d5..8fac0167778b824c9621462ce4981f6d767bedf2 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -33,7 +33,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("distributions.Uniform")
+@tf_export(v1=["distributions.Uniform"])
 class Uniform(distribution.Distribution):
   """Uniform distribution with `low` and `high` parameters.
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 760e7a8a84bbf3316175136fd9203035165435d0..24314e8fc92b3aef2718dd6668ca5564764aa8f4 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -343,7 +343,7 @@ def embed_check_categorical_event_shape(
     x_dtype = x.dtype.base_dtype
     max_event_size = (_largest_integer_by_dtype(x_dtype)
                       if x_dtype.is_floating else 0)
-    if max_event_size is 0:
+    if max_event_size == 0:
       raise TypeError("Unable to validate size of unrecognized dtype "
                       "({}).".format(x_dtype.name))
     try:
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index f2701bc41bdd5f62316eeb5cf67fd1bf432222d4..57542e3c7baa0f4eb3dc53431c9a3060f0998c5b 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -13,17 +13,13 @@
 # limitations under the License.
 # =============================================================================
 
-"""Functional operations.
-
-See the [Higher Order
-Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
-"""
+"""Functional operations."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -806,6 +802,29 @@ def Gradient(inputs, f, name=None):
   return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name)
 
 
+def _LoopBodyCaptureWrapper(func):
+  """Returns a wrapper for `func` that handles loop-carried captured inputs."""
+
+  @function.Defun(
+      *func.declared_input_types, func_name="%s_Wrapper" % func.name)
+  def Wrapper(*args):
+    """A wrapper that handles loop-carried captured inputs."""
+    result = func(*args)
+    extra_args = tuple(function.get_extra_args())
+    # Nullary functions return an Operation. Normal functions can't do this
+    # because their return values are converted to Tensors.
+    if isinstance(result, ops.Operation):
+      return extra_args
+    # Unary functions return a single Tensor value.
+    elif not isinstance(result, tuple):
+      return (result,) + extra_args
+    # N-ary functions return a tuple of Tensors.
+    else:
+      return result + extra_args
+
+  return Wrapper
+
+
 # pylint: disable=invalid-name,protected-access
 def While(input_, cond, body, name=None, hostmem=None):
   r"""output = input; While (Cond(output)) { output = Body(output) }.
@@ -827,11 +846,41 @@ def While(input_, cond, body, name=None, hostmem=None):
     hostmem: A list of integer. If i is in the list, input[i] is a
       host memory tensor.
 
+  Raises:
+    ValueError: if `cond` has implicitly captured inputs or if `cond` and `body`
+      have different signatures.
+
   Returns:
     A list of `Tensor` objects. Has the same type as `input`.
     A list of output tensors whose types are T.
   """
-  ret = gen_functional_ops._while(input_, cond, body, name=name)
+  if cond.captured_inputs:
+    raise ValueError("While op 'cond' argument must be a function "
+                     "without implicitly captured inputs.")
+
+  if cond.declared_input_types != body.declared_input_types:
+    raise ValueError(
+        "While op 'cond' and 'body' signatures do not match. %r vs %r" %
+        (cond.declared_input_types, body.declared_input_types))
+
+  if body.captured_inputs:
+    cond_dtypes = list(
+        body.declared_input_types) + [t.dtype for t in body.captured_inputs]
+
+    @function.Defun(*cond_dtypes, func_name="%s_Wrapper" % cond.name)
+    def CondWrapper(*args):
+      """A wrapper that handles loop-carried captured inputs."""
+      return cond(*args[:len(body.declared_input_types)])
+
+    ret = gen_functional_ops._while(
+        input_ + body.captured_inputs,
+        CondWrapper,
+        _LoopBodyCaptureWrapper(body),
+        name=name)
+    # Slice off the loop-carried captured inputs.
+    ret = ret[:-len(body.captured_inputs)]
+  else:
+    ret = gen_functional_ops._while(input_, cond, body, name=name)
   if hostmem:
     input_attr = attr_value_pb2.AttrValue()
     input_attr.list.i.extend(hostmem)
@@ -880,11 +929,10 @@ def _ForUsingWhile(start,
   # must have identical inputs, we have to augment the cond signature to take
   # the same types as the carried loop variables.
   body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:]
-  cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs]
 
   cond_name = "%s_Cond" % forbody.name
 
-  @function.Defun(*cond_sig, func_name=cond_name)
+  @function.Defun(*body_sig, func_name=cond_name)
   def WhileCond(i, n, *args):
     del args
     return i < n
@@ -902,8 +950,7 @@ def _ForUsingWhile(start,
     # Unary functions return a single Tensor value.
     elif isinstance(for_result, ops.Tensor):
       for_result = (for_result,)
-    extra_args = tuple(function.get_extra_args())
-    return (i + 1, n, start, delta) + tuple(for_result) + extra_args
+    return (i + 1, n, start, delta) + tuple(for_result)
 
   if hostmem is not None:
     hostmem = [0, 1, 2, 3] + [(4 + _) for _ in hostmem]
@@ -911,13 +958,13 @@ def _ForUsingWhile(start,
     hostmem = [0, 1, 2, 3]
 
   results = While(
-      input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
+      input_=[0, n, start, delta] + inputs,
       cond=WhileCond,
       body=WhileBody,
       name=name,
       hostmem=hostmem)
   # Slice off the loop-carried captured inputs.
-  return list(results[4:len(results) - len(WhileBody.captured_inputs)])
+  return list(results[4:len(results)])
 
 
 def For(start,
@@ -951,29 +998,15 @@ def For(start,
   if rewrite_with_while:
     return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem)
   if body.captured_inputs:
-    wrapper_name = "%s_BodyWrapper" % body.name
-
-    @function.Defun(*body.declared_input_types, func_name=wrapper_name)
-    def BodyWrapper(*args):
-      """A wrapper for body that handles loop-carried captured inputs."""
-      body_result = body(*args)
-      extra_args = tuple(function.get_extra_args())
-      # Nullary functions return an Operation. Normal functions can't do this
-      # because their return values are converted to Tensors.
-      if isinstance(body_result, ops.Operation):
-        return extra_args
-      # Unary functions return a single Tensor value.
-      elif not isinstance(body_result, tuple):
-        return (body_result,) + extra_args
-      # N-ary functions return a tuple of Tensors.
-      else:
-        return body_result + extra_args
-
-    inputs += BodyWrapper.captured_inputs
     ret = gen_functional_ops._for(
-        start, limit, delta, inputs, BodyWrapper, name=name)
+        start,
+        limit,
+        delta,
+        inputs + body.captured_inputs,
+        _LoopBodyCaptureWrapper(body),
+        name=name)
     # Slice off the loop-carried captured inputs.
-    ret = ret[:-len(BodyWrapper.captured_inputs)]
+    ret = ret[:-len(body.captured_inputs)]
   else:
     ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name)
   if hostmem:
@@ -994,9 +1027,10 @@ _rewriter_config_optimizer_disabled = None
 def _get_disabled_rewriter_config():
   global _rewriter_config_optimizer_disabled
   if _rewriter_config_optimizer_disabled is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.disable_meta_optimizer = True
-    _rewriter_config_optimizer_disabled = rewriter_config.SerializeToString()
+    _rewriter_config_optimizer_disabled = config.SerializeToString()
   return _rewriter_config_optimizer_disabled
 
 
@@ -1015,7 +1049,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
       the signature of `f`.
     executing_eagerly: (Optional) A boolean indicating whether the context is
       executing eagerly. If `None`, fetched from the global context.
-    config: (Optional) A tensorflow::RewriterConfig proto, serialized. If
+    config: (Optional) A `tensorflow::ConfigProto` proto, serialized. If
       `None`, all optimizations are disabled. Currently only handled for eager
       defined functions.
     executor_type: (Optional) A string for the name of the executor to be used
@@ -1043,10 +1077,12 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
   if executing_eagerly or len(tout):
     if f.stateful_ops:
       outputs = gen_functional_ops.stateful_partitioned_call(
-          args=args, Tout=tout, f=f, config=config, executor_type=executor_type)
+          args=args, Tout=tout, f=f, config_proto=config,
+          executor_type=executor_type)
     else:
       outputs = gen_functional_ops.partitioned_call(
-          args=args, Tout=tout, f=f, config=config, executor_type=executor_type)
+          args=args, Tout=tout, f=f, config_proto=config,
+          executor_type=executor_type)
     return outputs if outputs else None
 
   # The generated binding returns an empty list for functions that don't
@@ -1065,7 +1101,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
   # When running in graph mode, the graph and function graphs are optimized
   # (i.e. run through grappler) per the session options, so we can disable any
   # eager-specific rewriting.
-  rewriter_config = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
+  config_proto = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
 
   graph = ops.get_default_graph()
   f.add_to_graph(graph)
@@ -1080,7 +1116,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
           "Tin": tin_attr,
           "Tout": tout_attr,
           "f": func_attr,
-          "config": rewriter_config,
+          "config_proto": config_proto,
           "executor_type": executor_type_attr,
       })
   outputs = op.outputs
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 1665219c80c4cc92e25d132f5d84e384b5b6a704..3926ff69003124d29ae1579151c5a05fa49e3dc3 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -266,7 +265,7 @@ def _compute_gradient_list(x,
   return ret
 
 
-@tf_export("test.compute_gradient")
+@tf_export(v1=["test.compute_gradient"])
 def compute_gradient(x,
                      x_shape,
                      y,
@@ -329,9 +328,6 @@ def compute_gradient(x,
 
 
 @tf_export(v1=["test.compute_gradient_error"])
-@deprecation.deprecated_args(
-    None, "init_targets will be deprecated in TensorFlow 2.0",
-    ("init_targets", None))  # Do not trigger warning in V2
 def compute_gradient_error(x,
                            x_shape,
                            y,
@@ -380,52 +376,3 @@ def compute_gradient_error(x,
     if j_t.size or j_n.size:  # Handle zero size tensors correctly
       error = np.maximum(error, np.fabs(j_t - j_n).max())
   return error
-
-
-@tf_export("test.compute_gradient_error", v1=[])
-def compute_gradient_error_v2(x,
-                              x_shape,
-                              y,
-                              y_shape,
-                              x_init_value=None,
-                              delta=1e-3,
-                              extra_feed_dict=None):
-  """Computes the gradient error.
-
-  Computes the maximum error for dy/dx between the computed Jacobian and the
-  numerically estimated Jacobian.
-
-  This function will modify the tensors passed in as it adds more operations
-  and hence changing the consumers of the operations of the input tensors.
-
-  This function adds operations to the current session. To compute the error
-  using a particular device, such as a GPU, use the standard methods for
-  setting a device (e.g. using with sess.graph.device() or setting a device
-  function in the session constructor).
-
-  Args:
-    x: a tensor or list of tensors
-    x_shape: the dimensions of x as a tuple or an array of ints. If x is a list,
-      then this is the list of shapes.
-    y: a tensor
-    y_shape: the dimensions of y as a tuple or an array of ints.
-    x_init_value: (optional) a numpy array of the same shape as "x" representing
-      the initial value of x. If x is a list, this should be a list of numpy
-      arrays.  If this is none, the function will pick a random tensor as the
-      initial value.
-    delta: (optional) the amount of perturbation.
-    extra_feed_dict: dict that allows fixing specified tensor values during the
-      Jacobian calculation.
-
-  Returns:
-    The maximum error in between the two Jacobians.
-  """
-  return compute_gradient_error(
-      x,
-      x_shape,
-      y,
-      y_shape,
-      x_init_value=x_init_value,
-      delta=delta,
-      init_targets=None,
-      extra_feed_dict=extra_feed_dict)
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 4f0fb54dcab8553043d464392ccd011a90a6bb62..53c0709e326a5d3b4e0e8f1284e01d7a35b75f8f 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -51,7 +51,6 @@ from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import spectral_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
 from tensorflow.python.platform import tf_logging as logging
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 103e3902b60f153531fa899d8f3d92df25a9e11c..a9058c4a341dda3a19a7f5390da1455981ee5d4c 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -144,7 +144,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
                                  gate_gradients=True)[0]
       with session.Session():
         # Make sure the placer doesn't complain.
-        gz_x.eval()
+        self.evaluate(gz_x)
 
   def testBoundaryStop(self):
     # Test that we don't differentiate 'x'. The gradient function for 'x' is
@@ -365,7 +365,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(
           [y], [x], unconnected_gradients="zero")
       with self.cached_session() as sess:
-        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], sess.run(grads)[0])
+        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], self.evaluate(grads)[0])
 
   def testUnconnectedGradientsZeroConnectedGradients(self):
     with ops.Graph().as_default():
@@ -374,7 +374,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grad = gradients.gradients(
           [y], [x], unconnected_gradients="zero")
       with self.cached_session() as sess:
-        self.assertEquals(3.0, sess.run(grad)[0])
+        self.assertEquals(3.0, self.evaluate(grad)[0])
 
   def testUnknownUnconnectedGradientsValueGiven(self):
     with ops.Graph().as_default():
@@ -438,8 +438,8 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(y, [x, b1])
 
       with self.cached_session() as sess:
-        self.assertAllEqual([40.0], sess.run(grads)[0])
-        self.assertAllEqual([10.0], sess.run(grads)[1])
+        self.assertAllEqual([40.0], self.evaluate(grads)[0])
+        self.assertAllEqual([10.0], self.evaluate(grads)[1])
 
   def testFunctionGradientsWithGradFunc(self):
     g = ops.Graph()
@@ -487,7 +487,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(f), 2.0)
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testGradientOfCaptured(self):
     with ops.Graph().as_default():
@@ -501,7 +501,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(f), 2.0)
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testCapturedResourceVariable(self):
     with ops.Graph().as_default():
@@ -515,8 +515,8 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertEqual(sess.run(f), 2.0)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testCapturedNested(self):
     with ops.Graph().as_default():
@@ -541,9 +541,9 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       x1_grad, x2_grad = Outer()
       with self.cached_session() as sess:
         # 1.0 + None + 2.0 + 1.0 = 4.0
-        self.assertEqual(sess.run(x1_grad), 4.0)
+        self.assertEqual(self.evaluate(x1_grad), 4.0)
         # None + 1.0 + 1.0 + None = 2.0
-        self.assertEqual(sess.run(x2_grad), 2.0)
+        self.assertEqual(self.evaluate(x2_grad), 2.0)
 
   def testCapturedFromFunction(self):
     with ops.Graph().as_default():
@@ -563,7 +563,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       z_grad = Outer()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(z_grad), 3.0)
+        self.assertEqual(self.evaluate(z_grad), 3.0)
 
   def testCapturedEagerTensors(self):
     # Test that we can handle captured eager tensors unrelated to the gradient
@@ -628,7 +628,7 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
         mat_x = math_ops.matmul(mat, x, name="Ax")
         x_mat_x = math_ops.matmul(array_ops.transpose(x), mat_x, name="xAx")
         hess_v = gradients_impl._hessian_vector_product(x_mat_x, [x], [v])[0]
-        hess_v_actual = hess_v.eval()
+        hess_v_actual = self.evaluate(hess_v)
       self.assertAllClose(hess_v_value, hess_v_actual)
 
 
@@ -648,7 +648,7 @@ class HessianTest(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_value)
       x_mat_x = math_ops.reduce_sum(x[:, None] * mat * x[None, :])
       hess = gradients.hessians(x_mat_x, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     self.assertAllClose(hess_value, hess_actual)
 
   def testHessian1D_multi(self):
@@ -692,7 +692,7 @@ class HessianTest(test_util.TensorFlowTestCase):
           math_ops.matmul(array_ops.transpose(x), x) * 0.5
       )
       hess = gradients.hessians(x_square, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     hess_value = np.bmat([
         [elem*np.ones((m, m)) for elem in vec]
         for vec in np.eye(m)
@@ -711,7 +711,7 @@ class HessianTest(test_util.TensorFlowTestCase):
           math_ops.matmul(array_ops.transpose(x), x) * 0.5
       )
       hess = gradients.hessians(x_square, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     hess_value = np.bmat([
         [elem*np.ones((n, n)) for elem in vec]
         for vec in np.eye(m)
@@ -729,7 +729,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       c_sparse = math_ops._as_indexed_slices(c)
       self.assertAllEqual(np_val.shape, c_sparse.dense_shape.eval())
       c_dense = math_ops.multiply(c_sparse, 1.0)
-      self.assertAllClose(np_val, c_dense.eval())
+      self.assertAllClose(np_val, self.evaluate(c_dense))
 
   def testIndexedSlicesToTensorList(self):
     with self.cached_session():
@@ -745,7 +745,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
         sparse_list.append(c_sparse)
       packed_dense = array_ops.stack(dense_list)
       packed_sparse = array_ops.stack(sparse_list)
-      self.assertAllClose(packed_dense.eval(), packed_sparse.eval())
+      self.assertAllClose(packed_dense.eval(), self.evaluate(packed_sparse))
 
   def testInt64Indices(self):
     with self.cached_session():
@@ -757,7 +757,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
           math_ops.cast(c_sparse.indices, dtypes.int64), c_sparse.dense_shape)
       self.assertAllEqual(np_val.shape, c_sparse.dense_shape.eval())
       c_dense = math_ops.multiply(c_sparse, 1.0)
-      self.assertAllClose(np_val, c_dense.eval())
+      self.assertAllClose(np_val, self.evaluate(c_dense))
 
   def testWarnings(self):
     # TODO(gunan) Reenable after this issue is fixed:
@@ -853,7 +853,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       y = MyIdentity(MyIdentity(x))
       dy = gradients.gradients(y, x)[0]
       with session.Session():
-        self.assertEqual(9., dy.eval())
+        self.assertEqual(9., self.evaluate(dy))
 
   def testCustomGradient(self):
 
@@ -873,7 +873,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       y = MyMultiply(x1, x2)
       dy = gradients.gradients(y, [x1, x2])
       with session.Session() as sess:
-        self.assertAllEqual([3., 5.], sess.run(dy))
+        self.assertAllEqual([3., 5.], self.evaluate(dy))
 
   def testCustomGradientErrors(self):
 
@@ -914,7 +914,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       for g in grads:
         self.assertTrue(g is not None)
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         dw = sess.run(math_ops.reduce_sum(grads[1]))
         self.assertEqual(12., dw)
 
@@ -1074,7 +1074,7 @@ class TensorListGradientsTest(test_util.TensorFlowTestCase):
 
       grad = gradients.gradients(tl, a, grad_ys=grad_tl)[0]
       with self.cached_session() as sess:
-        self.assertEquals(sess.run(grad), 5.)
+        self.assertEquals(self.evaluate(grad), 5.)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index e7fe0efba4e5a1e7216b471c248af650b3736328..cd1abbfc132005e2b2f855bbc1792dfde31f7170 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -39,7 +39,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_1d_values_int32_output(self):
     # Bins will be:
@@ -51,7 +51,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_1d_float64_values_int32_output(self):
     # Bins will be:
@@ -63,7 +63,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_2d_values(self):
     # Bins will be:
@@ -76,7 +76,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
 
 class HistogramFixedWidthTest(test.TestCase):
@@ -110,7 +110,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_1d_values_int64_output(self):
     # Bins will be:
@@ -122,7 +122,7 @@ class HistogramFixedWidthTest(test.TestCase):
       hist = histogram_ops.histogram_fixed_width(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int64, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_1d_float64_values(self):
     # Bins will be:
@@ -133,7 +133,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_2d_values(self):
     # Bins will be:
@@ -144,7 +144,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_shape_inference(self):
     value_range = [0.0, 5.0]
@@ -155,7 +155,7 @@ class HistogramFixedWidthTest(test.TestCase):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertAllEqual(hist.shape.as_list(), (5,))
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
       hist = histogram_ops.histogram_fixed_width(
           values, value_range, nbins=placeholder)
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index 32c2f37c0b769de6564f968c44df2bb552cd7edc..0ea15b0d23f5ed35538d74c7cceca1ff7292f6a9 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -44,7 +44,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
                                                        out_shape[1:3])
         self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-        resize_out = sess.run(resize_out)
+        resize_out = self.evaluate(resize_out)
       self.assertEqual(out_shape, list(resize_out.shape))
 
   def testGradFromResizeToLargerInBothDims(self):
@@ -113,7 +113,7 @@ class ResizeBilinearOpTest(test.TestCase):
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-      resize_out = sess.run(resize_out)
+      resize_out = self.evaluate(resize_out)
       self.assertEqual(out_shape, list(resize_out.shape))
 
   def testGradFromResizeToLargerInBothDims(self):
@@ -196,7 +196,7 @@ class ResizeBicubicOpTest(test.TestCase):
                                               align_corners=align_corners)
         self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-        resize_out = sess.run(resize_out)
+        resize_out = self.evaluate(resize_out)
         self.assertEqual(out_shape, list(resize_out.shape))
 
   def testGradFromResizeToLargerInBothDims(self):
@@ -273,7 +273,7 @@ class CropAndResizeOpTest(test.TestCase):
           constant_op.constant(
               crop_size, shape=[2]))
       self.assertEqual(crops_shape, list(crops.get_shape()))
-      crops = sess.run(crops)
+      crops = self.evaluate(crops)
       self.assertEqual(crops_shape, list(crops.shape))
 
   def _randomUniformAvoidAnchors(self, low, high, anchors, radius, num_samples):
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 3ab3695a03c080c9f1491a9c871a62808ee3f2cb..1618b7950472f04f016f1622f5adc51b3c46fca3 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -37,6 +38,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 ops.NotDifferentiable('RandomCrop')
@@ -511,15 +513,20 @@ def _rot90_4D(images, k, name_scope):
   result.set_shape([shape[0], None, None, shape[3]])
   return result
 
-@tf_export('image.transpose_image')
+
+@tf_export(v1=['image.transpose', 'image.transpose_image'])
 def transpose_image(image):
-  """Transpose image(s) by swapping the height and width dimension.
+  return transpose(image=image, name=None)
+
 
-  See also `transpose()`.
+@tf_export('image.transpose', v1=[])
+def transpose(image, name=None):
+  """Transpose image(s) by swapping the height and width dimension.
 
   Args:
     image: 4-D Tensor of shape `[batch, height, width, channels]` or
            3-D Tensor of shape `[height, width, channels]`.
+    name: A name for this operation (optional).
 
   Returns:
     If `image` was 4-D, a 4-D float Tensor of shape
@@ -530,14 +537,14 @@ def transpose_image(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'transpose_image', [image]):
+  with ops.name_scope(name, 'transpose', [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
+      return array_ops.transpose(image, [1, 0, 2], name=name)
     elif shape.ndims == 4:
-      return array_ops.transpose(image, [0, 2, 1, 3], name='transpose_image')
+      return array_ops.transpose(image, [0, 2, 1, 3], name=name)
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
@@ -938,12 +945,28 @@ class ResizeMethod(object):
   AREA = 3
 
 
-@tf_export('image.resize_images')
+@tf_export(v1=['image.resize_images', 'image.resize'])
 def resize_images(images,
                   size,
                   method=ResizeMethod.BILINEAR,
                   align_corners=False,
                   preserve_aspect_ratio=False):
+  return resize_images_v2(
+      images=images,
+      size=size,
+      method=method,
+      align_corners=align_corners,
+      preserve_aspect_ratio=preserve_aspect_ratio,
+      name=None)
+
+
+@tf_export('image.resize', v1=[])
+def resize_images_v2(images,
+                     size,
+                     method=ResizeMethod.BILINEAR,
+                     align_corners=False,
+                     preserve_aspect_ratio=False,
+                     name=None):
   """Resize `images` to `size` using the specified `method`.
 
   Resized images will be distorted if their original aspect ratio is not
@@ -979,6 +1002,7 @@ def resize_images(images,
       then `images` will be resized to a size that fits in `size` while
       preserving the aspect ratio of the original image. Scales up the image if
       `size` is bigger than the current size of the `image`. Defaults to False.
+    name: A name for this operation (optional).
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
@@ -992,7 +1016,7 @@ def resize_images(images,
     If `images` was 3-D, a 3-D float Tensor of shape
     `[new_height, new_width, channels]`.
   """
-  with ops.name_scope(None, 'resize_images', [images, size]):
+  with ops.name_scope(name, 'resize', [images, size]):
     images = ops.convert_to_tensor(images, name='images')
     if images.get_shape().ndims is None:
       raise ValueError('\'images\' contains no shape.')
@@ -1942,7 +1966,114 @@ def total_variation(images, name=None):
   return tot_var
 
 
-@tf_export('image.sample_distorted_bounding_box')
+@tf_export('image.sample_distorted_bounding_box', v1=[])
+def sample_distorted_bounding_box_v2(image_size,
+                                     bounding_boxes,
+                                     seed=0,
+                                     min_object_covered=0.1,
+                                     aspect_ratio_range=None,
+                                     area_range=None,
+                                     max_attempts=None,
+                                     use_image_if_no_bounding_boxes=None,
+                                     name=None):
+  """Generate a single randomly distorted bounding box for an image.
+
+  Bounding box annotations are often supplied in addition to ground-truth labels
+  in image recognition or object localization tasks. A common technique for
+  training such a system is to randomly distort an image while preserving
+  its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+  localization of an object, i.e. bounding box, given an `image_size`,
+  `bounding_boxes` and a series of constraints.
+
+  The output of this Op is a single bounding box that may be used to crop the
+  original image. The output is returned as 3 tensors: `begin`, `size` and
+  `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to
+  visualize what the bounding box looks like.
+
+  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`.
+  The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width
+  and height of the underlying image.
+
+  For example,
+
+  ```python
+      # Generate a single distorted bounding box.
+      begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+          tf.shape(image),
+          bounding_boxes=bounding_boxes,
+          min_object_covered=0.1)
+
+      # Draw the bounding box in an image summary.
+      image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                    bbox_for_draw)
+      tf.summary.image('images_with_box', image_with_box)
+
+      # Employ the bounding box to distort the image.
+      distorted_image = tf.slice(image, begin, size)
+  ```
+
+  Note that if no bounding box information is available, setting
+  `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+  bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+  false and no bounding boxes are supplied, an error is raised.
+
+  Args:
+    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
+      `int16`, `int32`, `int64`.
+      1-D, containing `[height, width, channels]`.
+    bounding_boxes: A `Tensor` of type `float32`.
+      3-D with shape `[batch, N, 4]` describing the N bounding boxes
+      associated with the image.
+    seed: An optional `int`. Defaults to `0`.
+      If either `seed` or `seed2` are set to non-zero, the random number
+      generator is seeded by the given `seed`.  Otherwise, it is seeded by a
+      random seed.
+    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
+      The cropped area of the image must contain at least this
+      fraction of any bounding box supplied. The value of this parameter should
+      be non-negative. In the case of 0, the cropped area does not need to
+      overlap any of the bounding boxes supplied.
+    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
+      1.33]`.
+      The cropped area of the image must have an aspect `ratio =
+      width / height` within this range.
+    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
+      The cropped area of the image must contain a fraction of the
+      supplied image within this range.
+    max_attempts: An optional `int`. Defaults to `100`.
+      Number of attempts at generating a cropped region of the image
+      of the specified constraints. After `max_attempts` failures, return the
+      entire image.
+    use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
+      Controls behavior if no bounding boxes supplied.
+      If true, assume an implicit bounding box covering the whole input. If
+      false, raise an error.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `Tensor` objects (begin, size, bboxes).
+
+    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[offset_height, offset_width, 0]`. Provide as input to
+      `tf.slice`.
+    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[target_height, target_width, -1]`. Provide as input to
+      `tf.slice`.
+    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing
+    the distorted bounding box.
+    Provide as input to `tf.image.draw_bounding_boxes`.
+  """
+  seed1, seed2 = random_seed.get_seed(seed) if seed else (0, 0)
+  return sample_distorted_bounding_box(
+      image_size, bounding_boxes, seed1, seed2, min_object_covered,
+      aspect_ratio_range, area_range, max_attempts,
+      use_image_if_no_bounding_boxes, name)
+
+
+@tf_export(v1=['image.sample_distorted_bounding_box'])
+@deprecation.deprecated(date=None, instructions='`seed2` arg is deprecated.'
+                        'Use sample_distorted_bounding_box_v2 instead.')
 def sample_distorted_bounding_box(image_size,
                                   bounding_boxes,
                                   seed=None,
@@ -2808,3 +2939,102 @@ def sobel_edges(image):
   output = array_ops.reshape(output, shape=shape)
   output.set_shape(static_image_shape.concatenate([num_kernels]))
   return output
+
+
+resize_area_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.AREA...)` instead.'))
+tf_export(v1=['image.resize_area'])(
+    resize_area_deprecation(gen_image_ops.resize_area))
+
+resize_bicubic_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.BICUBIC...)` instead.'))
+tf_export(v1=['image.resize_bicubic'])(
+    resize_bicubic_deprecation(gen_image_ops.resize_bicubic))
+
+resize_bilinear_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.'))
+tf_export(v1=['image.resize_bilinear'])(
+    resize_bilinear_deprecation(gen_image_ops.resize_bilinear))
+
+resize_nearest_neighbor_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.NEAREST_NEIGHBOR...)` '
+        'instead.'))
+tf_export(v1=['image.resize_nearest_neighbor'])(
+    resize_nearest_neighbor_deprecation(gen_image_ops.resize_nearest_neighbor))
+
+
+@tf_export('image.crop_and_resize', v1=[])
+def crop_and_resize_v2(
+    image,
+    boxes,
+    box_indices,
+    crop_size,
+    method='bilinear',
+    extrapolation_value=0,
+    name=None):
+  """Extracts crops from the input image tensor and resizes them.
+
+  Extracts crops from the input image tensor and resizes them using bilinear
+  sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
+  common output size specified by `crop_size`. This is more general than the
+  `crop_to_bounding_box` op which extracts a fixed size slice from the input
+  image and does not allow resizing or aspect ratio change.
+
+  Returns a tensor with `crops` from the input `image` at positions defined at
+  the bounding box locations in `boxes`. The cropped boxes are all resized (with
+  bilinear or nearest neighbor interpolation) to a fixed
+  `size = [crop_height, crop_width]`. The result is a 4-D tensor
+  `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
+  In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
+  results to using `tf.image.resize_bilinear()` or
+  `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+  `align_corners=True`.
+
+  Args:
+    image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+      Both `image_height` and `image_width` need to be positive.
+    boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+      specifies the coordinates of a box in the `box_ind[i]` image and is
+      specified in normalized coordinates `[y1, x1, y2, x2]`. A normalized
+      coordinate value of `y` is mapped to the image coordinate at `y *
+      (image_height - 1)`, so as the `[0, 1]` interval of normalized image
+      height is mapped to `[0, image_height - 1]` in image height coordinates.
+      We do allow `y1` > `y2`, in which case the sampled crop is an up-down
+      flipped version of the original image. The width dimension is treated
+      similarly. Normalized coordinates outside the `[0, 1]` range are allowed,
+      in which case we use `extrapolation_value` to extrapolate the input image
+      values.
+    box_indices: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0,
+      batch)`. The value of `box_ind[i]` specifies the image that the `i`-th box
+      refers to.
+    crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`.
+      All cropped image patches are resized to this size. The aspect ratio of
+      the image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    method: An optional string specifying the sampling method for resizing. It
+      can be either `"bilinear"` or `"nearest"` and default to `"bilinear"`.
+      Currently two sampling methods are supported: Bilinear and Nearest
+      Neighbor.
+    extrapolation_value: An optional `float`. Defaults to `0`. Value used for
+      extrapolation, when applicable.
+    name: A name for the operation (optional).
+
+  Returns:
+    A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+  """
+  return gen_image_ops.crop_and_resize(
+      image, boxes, box_indices, crop_size, method, extrapolation_value, name)
+
+
+crop_and_resize_deprecation = deprecation.deprecated_args(
+    None, 'box_ind is deprecated, use box_indices instead', 'box_ind')
+tf_export(v1=['image.crop_and_resize'])(
+    crop_and_resize_deprecation(gen_image_ops.crop_and_resize))
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index a3aeb79586be2cad6eb5d6e84f9a19dcc582c07a..de82f4fc2705a98d400efd5cdde351f00530d0c3 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -84,7 +84,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=True):
         hsv = image_ops.rgb_to_hsv(rgb_np)
         rgb = image_ops.hsv_to_rgb(hsv)
-        rgb_tf = rgb.eval()
+        rgb_tf = self.evaluate(rgb)
       self.assertAllClose(rgb_tf, rgb_np)
 
 
@@ -173,7 +173,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.rgb_to_grayscale(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBasicRGBToGrayscale(self):
@@ -195,7 +195,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
     # 3-D input with no batch dimension.
@@ -205,7 +205,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testShapeInference(self):
@@ -245,7 +245,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_gamma(x, gamma=1)
 
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       y_np = x_np
 
       self.assertAllClose(y_tf, y_np, 1e-6)
@@ -281,7 +281,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       err_msg = "Gamma should be a non-negative real number."
       try:
-        image.eval()
+        self.evaluate(image)
       except Exception as e:
         if err_msg not in str(e):
           raise
@@ -297,7 +297,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_gamma(x, gamma=0)
 
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
 
       dtype = x.dtype.as_numpy_dtype
       y_np = np.array([dtypes.dtype_range[dtype][1]] * x_np.size)
@@ -360,7 +360,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testAdjustPositiveHue(self):
@@ -375,7 +375,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBatchAdjustHue(self):
@@ -390,7 +390,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjustHueNp(self, x_np, delta_h):
@@ -415,7 +415,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_hue(x, delta_h)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
     return y_tf
 
   def testAdjustRandomHue(self):
@@ -488,11 +488,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -518,11 +518,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -548,11 +548,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -610,11 +610,11 @@ class AdjustHueBenchmark(test.Benchmark):
       delta = constant_op.constant(0.1, dtype=dtypes.float32)
       outputs = image_ops.adjust_hue(inputs, delta)
       run_op = control_flow_ops.group(outputs)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for i in xrange(warmup_rounds + benchmark_rounds):
         if i == warmup_rounds:
           start = time.time()
-        sess.run(run_op)
+        self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -653,12 +653,12 @@ class AdjustSaturationBenchmark(test.Benchmark):
       delta = constant_op.constant(0.1, dtype=dtypes.float32)
       outputs = image_ops.adjust_saturation(inputs, delta)
       run_op = control_flow_ops.group(outputs)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in xrange(warmup_rounds):
-        sess.run(run_op)
+        self.evaluate(run_op)
       start = time.time()
       for _ in xrange(benchmark_rounds):
-        sess.run(run_op)
+        self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -698,7 +698,7 @@ class ResizeBilinearBenchmark(test.Benchmark):
       benchmark_op = control_flow_ops.group(*deps)
 
     with self.benchmark_session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -746,7 +746,7 @@ class ResizeBicubicBenchmark(test.Benchmark):
       benchmark_op = control_flow_ops.group(*deps)
 
     with self.benchmark_session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -803,7 +803,7 @@ class ResizeAreaBenchmark(test.Benchmark):
       benchmark_op = control_flow_ops.group(*deps)
 
     with self.benchmark_session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -846,7 +846,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTwiceSaturation(self):
@@ -861,7 +861,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBatchSaturation(self):
@@ -876,7 +876,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjust_saturation(self, image, saturation_factor):
@@ -899,7 +899,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTwiceSaturationFused(self):
@@ -914,7 +914,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjustSaturationNp(self, x_np, scale):
@@ -980,7 +980,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionLeftRightWithBatch(self):
@@ -990,7 +990,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testLeftRight(self):
@@ -1001,7 +1001,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       self.assertTrue(y.op.name.startswith("flip_left_right"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testLeftRightWithBatch(self):
@@ -1015,7 +1015,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testRandomFlipLeftRight(self):
@@ -1031,7 +1031,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
           count_unflipped += 1
@@ -1070,7 +1070,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
 
         # check every element of the batch
         for i in range(batch_size):
@@ -1096,7 +1096,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionUpDownWithBatch(self):
@@ -1107,7 +1107,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testUpDown(self):
@@ -1118,7 +1118,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       self.assertTrue(y.op.name.startswith("flip_up_down"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testUpDownWithBatch(self):
@@ -1132,7 +1132,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testRandomFlipUpDown(self):
@@ -1148,7 +1148,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
           count_unflipped += 1
@@ -1187,7 +1187,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
 
         # check every element of the batch
         for i in range(batch_size):
@@ -1213,7 +1213,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionTransposeWithBatch(self):
@@ -1224,7 +1224,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testTranspose(self):
@@ -1234,8 +1234,8 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      self.assertTrue(y.op.name.startswith("transpose_image"))
-      y_tf = y.eval()
+      self.assertTrue(y.op.name.startswith("transpose"))
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTransposeWithBatch(self):
@@ -1250,7 +1250,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testPartialShapes(self):
@@ -1301,7 +1301,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
-      self.assertAllEqual(image, rotated.eval())
+      self.assertAllEqual(image, self.evaluate(rotated))
 
   def testRot90GroupOrderWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
@@ -1309,7 +1309,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
-      self.assertAllEqual(image, rotated.eval())
+      self.assertAllEqual(image, self.evaluate(rotated))
 
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
@@ -1335,7 +1335,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_contrast(x, contrast_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testDoubleContrastUint8(self):
@@ -1390,7 +1390,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_contrast(x, contrast_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
     return y_tf
 
   def testRandomContrast(self):
@@ -1423,7 +1423,7 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_brightness(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testPositiveDeltaUint8(self):
@@ -1480,7 +1480,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
       self.assertTrue(y.op.name.startswith("per_image_standardization"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, atol=1e-4)
 
   def testUniformImage(self):
@@ -1488,7 +1488,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     im = constant_op.constant(im_np)
     whiten = image_ops.per_image_standardization(im)
     with self.test_session(use_gpu=True):
-      whiten_np = whiten.eval()
+      whiten_np = self.evaluate(whiten)
       self.assertFalse(np.any(np.isnan(whiten_np)))
 
   def testBatchWhitening(self):
@@ -1497,7 +1497,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       imgs = constant_op.constant(imgs_np)
       whiten = image_ops.per_image_standardization(imgs)
-      whiten_tf = whiten.eval()
+      whiten_tf = self.evaluate(whiten)
       for w_tf, w_np in zip(whiten_tf, whiten_np):
         self.assertAllClose(w_tf, w_np, atol=1e-4)
 
@@ -1696,7 +1696,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         with self.test_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           y = image_ops.central_crop(x, 1.0)
-          y_tf = y.eval()
+          y_tf = self.evaluate(y)
           self.assertAllEqual(y_tf, x_np)
           self.assertEqual(y.op.name, x.op.name)
 
@@ -1711,7 +1711,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         y = image_ops.central_crop(x, 0.5)
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         self.assertAllEqual(y_tf, y_np)
         self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1727,7 +1727,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.central_crop(x, 0.5)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
       self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1897,7 +1897,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     i = constant_op.constant([1, 0, 4, 3], dtype=dtypes.int64)
     y_tf = image_ops.pad_to_bounding_box(x, i[0], i[1], i[2], i[3])
     with self.test_session(use_gpu=True):
-      self.assertAllClose(y, y_tf.eval())
+      self.assertAllClose(y, self.evaluate(y_tf))
 
   def testNoOp(self):
     x_shape = [10, 10, 10]
@@ -2040,7 +2040,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       y = array_ops.strided_slice(image_tf, begin, begin + size)
 
       for _ in xrange(num_iter):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         crop_height = y_tf.shape[0]
         crop_width = y_tf.shape[1]
         aspect_ratio = float(crop_width) / float(crop_height)
@@ -2171,9 +2171,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
       # Actual run to make sure shape is correct inside Compute().
-      begin = begin.eval()
-      end = end.eval()
-      bbox_for_drawing = bbox_for_drawing.eval()
+      begin = self.evaluate(begin)
+      end = self.evaluate(end)
+      bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
@@ -2207,9 +2207,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
       # Actual run to make sure shape is correct inside Compute().
-      begin = begin.eval()
-      end = end.eval()
-      bbox_for_drawing = bbox_for_drawing.eval()
+      begin = self.evaluate(begin)
+      end = self.evaluate(end)
+      bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
@@ -2276,7 +2276,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         y = image_ops.resize_images(image, [target_height, target_width],
                                     self.OPTIONS[0])
         yshape = array_ops.shape(y)
-        newshape = yshape.eval()
+        newshape = self.evaluate(yshape)
         self.assertAllEqual(single_shape, newshape)
 
   def testTensorArguments(self):
@@ -2411,7 +2411,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               y = image_ops.resize_images(image, [target_height, target_width],
                                           opt)
               expected = np.array(expected_data).reshape(target_shape)
-              resized = y.eval()
+              resized = self.evaluate(y)
               self.assertAllClose(resized, expected, atol=1e-5)
 
   def testResizeUpAlignCornersFalse(self):
@@ -2446,7 +2446,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
               image, [target_height, target_width], opt, align_corners=False)
-          resized = y.eval()
+          resized = self.evaluate(y)
           expected = np.array(expected_data[opt]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
@@ -2482,7 +2482,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
               image, [target_height, target_width], opt, align_corners=True)
-          resized = y.eval()
+          resized = self.evaluate(y)
           expected = np.array(expected_data[opt]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
@@ -2509,7 +2509,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
                                   image_ops.ResizeMethod.BICUBIC)
-      resized = y.eval()
+      resized = self.evaluate(y)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
       self.assertAllClose(resized, expected, atol=1)
@@ -2534,7 +2534,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                   image_ops.ResizeMethod.AREA)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
-      resized = y.eval()
+      resized = self.evaluate(y)
       self.assertAllClose(resized, expected, atol=1)
 
   def testCompareNearestNeighbor(self):
@@ -2554,7 +2554,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 new_size,
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
-            gpu_val = out_op.eval()
+            gpu_val = self.evaluate(out_op)
           with self.test_session(use_gpu=False):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
@@ -2563,7 +2563,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 new_size,
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
-            cpu_val = out_op.eval()
+            cpu_val = self.evaluate(out_op)
           self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
 
   def testCompareBilinear(self):
@@ -2585,7 +2585,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                   new_size,
                   image_ops.ResizeMethod.BILINEAR,
                   align_corners=align_corners)
-              value[use_gpu] = out_op.eval()
+              value[use_gpu] = self.evaluate(out_op)
           self.assertAllClose(value[True], value[False], rtol=1e-5, atol=1e-5)
 
   def testShapeInference(self):
@@ -2613,7 +2613,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
-      self.assertTrue(y.op.name.startswith("resize_images"))
+      self.assertTrue(y.op.name.startswith("resize"))
 
   def _ResizeImageCall(self, x, max_h, max_w, preserve_aspect_ratio,
                        use_tensor_inputs):
@@ -3234,7 +3234,7 @@ class PngTest(test_util.TensorFlowTestCase):
           self.assertEqual(image0.shape, (26, 51, channels or channels_in))
           if channels == channels_in:
             image1 = image_ops.decode_png(image_ops.encode_png(image0))
-            self.assertAllEqual(image0, image1.eval())
+            self.assertAllEqual(image0, self.evaluate(image1))
 
   def testSynthetic(self):
     with self.test_session(use_gpu=True) as sess:
@@ -3431,7 +3431,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
       y = image_ops.total_variation(images=x_tf)
 
       # Run the TensorFlow session to calculate the result.
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
 
       # Assert that the results are as expected within
       # some small error-bound in case they are float-values.
@@ -3709,7 +3709,7 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
         iou_threshold = constant_op.constant(iou_threshold_np)
         selected_indices, _ = gen_image_ops.non_max_suppression_v4(
             boxes, scores, max_output_size, iou_threshold, score_threshold)
-        selected_indices = selected_indices.eval()
+        selected_indices = self.evaluate(selected_indices)
         self.assertAllClose(selected_indices, [3, 0, 5])
 
 
@@ -3916,7 +3916,8 @@ class PSNRTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     psnr_float32 = image_ops.psnr(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(psnr_uint8.eval(), psnr_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          psnr_uint8.eval(), self.evaluate(psnr_float32), atol=0.001)
 
 
 class SSIMTest(test_util.TensorFlowTestCase):
@@ -3969,7 +3970,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+      self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   def testBroadcast(self):
     img = self._LoadTestImages()[:2]
@@ -3981,7 +3982,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+      self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   def testNegative(self):
     """Tests against negative SSIM index."""
@@ -4007,7 +4008,8 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
 
 class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
@@ -4077,7 +4079,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
                                        constant_op.constant(img2), 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, msssim.eval(), 1e-4)
+      self.assertAllClose(expected, self.evaluate(msssim), 1e-4)
 
   def testBroadcast(self):
     """Tests MS-SSIM broadcasting."""
@@ -4090,7 +4092,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, score_tensor.eval(), 1e-4)
+      self.assertAllClose(expected, self.evaluate(score_tensor), 1e-4)
 
   def testRange(self):
     """Tests against low MS-SSIM score.
@@ -4108,7 +4110,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
       images = [ops.convert_to_tensor(x, dtype=dtypes.float32) for x in images]
       msssim_ops = [image_ops.ssim_multiscale(x, y, 1.0)
                     for x, y in itertools.combinations(images, 2)]
-      msssim = sess.run(msssim_ops)
+      msssim = self.evaluate(msssim_ops)
       msssim = np.squeeze(msssim)
 
     self.assertTrue(np.all(msssim >= 0.0))
@@ -4124,7 +4126,8 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
 
 class ImageGradientsTest(test_util.TensorFlowTestCase):
@@ -4139,8 +4142,8 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
 
     dy, dx = image_ops.image_gradients(img)
     with self.cached_session():
-      actual_dy = dy.eval()
-      actual_dx = dx.eval()
+      actual_dy = self.evaluate(dy)
+      actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
       self.assertAllClose(expected_dx, actual_dx)
 
@@ -4164,8 +4167,8 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
     assert batch.get_shape().as_list() == [2, 2, 3, 2]
     dy, dx = image_ops.image_gradients(batch)
     with self.test_session(use_gpu=True):
-      actual_dy = dy.eval()
-      actual_dx = dx.eval()
+      actual_dy = self.evaluate(dy)
+      actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
       self.assertAllClose(expected_dx, actual_dx)
 
@@ -4185,7 +4188,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
                            [[0, 0], [0, 12], [0, 0]]], [1, 2, 3, 1, 2])
     sobel = image_ops.sobel_edges(img)
     with self.test_session(use_gpu=True):
-      actual_sobel = sobel.eval()
+      actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected, actual_sobel)
 
   def testSobelEdges5x3x4x2(self):
@@ -4207,7 +4210,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
 
     sobel = image_ops.sobel_edges(img)
     with self.test_session(use_gpu=True):
-      actual_sobel = sobel.eval()
+      actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected_batch, actual_sobel)
 
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 4fe6d05620f6a9d1e29ddc0831642335f893ad7d..5a1ac675dbb9795d48903090cad9866d8fcbb154 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -580,9 +580,9 @@ class ConvolutionDeltaOrthogonal(Initializer):
 
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
     dtype: The data type.
@@ -613,7 +613,7 @@ class ConvolutionDeltaOrthogonal(Initializer):
     d = array_ops.diag_part(r)
     q *= math_ops.sign(d)
     q = q[:shape[-2], :]
-    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    q *= math_ops.cast(self.gain, dtype=dtype)
     if len(shape) == 3:
       weight = array_ops.scatter_nd([[(shape[0]-1)//2]],
                                     array_ops.expand_dims(q, 0), shape)
@@ -636,9 +636,9 @@ class ConvolutionOrthogonal(Initializer):
   Base class used to construct 1D, 2D and 3D orthogonal kernels for convolution.
 
   Args:
-    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
     dtype: The data type.
@@ -701,9 +701,9 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
   See algorithm 1 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      This has the effect of scaling the output 2-norm by a factor of
-      `sqrt(gain)`.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. This has the effect of scaling the output 2-norm by
+      a factor of `gain`.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
     dtype: The data type.
@@ -722,7 +722,7 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
       raise ValueError("Kernel sizes must be equal.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k1, k2):
@@ -837,9 +837,9 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
   See algorithm 1 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
@@ -856,7 +856,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
       raise ValueError("In_filters cannot be greater than out_filters.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k):
@@ -954,9 +954,9 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
   See algorithm 1 [Xiao et al., 2018] in: https://arxiv.org/abs/1806.05393
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
     dtype: The data type.
@@ -975,7 +975,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
       raise ValueError("Kernel sizes must be equal.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k1, k2, k3):
@@ -1148,9 +1148,7 @@ class GlorotUniform(VarianceScaling):
     dtype: The data type. Only floating point types are supported.
   """
 
-  def __init__(self,
-               seed=None,
-               dtype=dtypes.float32):
+  def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotUniform, self).__init__(
         scale=1.0,
         mode="fan_avg",
@@ -1159,10 +1157,7 @@ class GlorotUniform(VarianceScaling):
         dtype=dtype)
 
   def get_config(self):
-    return {
-        "seed": self.seed,
-        "dtype": self.dtype.name
-    }
+    return {"seed": self.seed, "dtype": self.dtype.name}
 
 
 @tf_export(
@@ -1185,14 +1180,11 @@ class GlorotNormal(VarianceScaling):
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.set_random_seed` for behavior.
     dtype: The data type. Only floating point types are supported.
   """
 
-  def __init__(self,
-               seed=None,
-               dtype=dtypes.float32):
+  def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotNormal, self).__init__(
         scale=1.0,
         mode="fan_avg",
@@ -1201,10 +1193,7 @@ class GlorotNormal(VarianceScaling):
         dtype=dtype)
 
   def get_config(self):
-    return {
-        "seed": self.seed,
-        "dtype": self.dtype.name
-    }
+    return {"seed": self.seed, "dtype": self.dtype.name}
 
 
 # Aliases.
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index 5693c3caaf5ca80fd6528c94bb952acc7bc8957c..1f22248004697438d2c8c05dc0c6762a20902d31 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -45,8 +45,8 @@ class InitializersTest(test.TestCase):
       output = variable.numpy()
     else:
       sess = ops.get_default_session()
-      sess.run(variable.initializer)
-      output = sess.run(variable)
+      self.evaluate(variable.initializer)
+      output = self.evaluate(variable)
     lim = 3e-2
     if target_std is not None:
       self.assertGreater(lim, abs(output.std() - target_std))
diff --git a/tensorflow/python/ops/linalg/BUILD b/tensorflow/python/ops/linalg/BUILD
index c7314d77749130e4696d58896249b73cc2de4a12..5df2d6b83816334f46ef45eec675ed9b7e35bd00 100644
--- a/tensorflow/python/ops/linalg/BUILD
+++ b/tensorflow/python/ops/linalg/BUILD
@@ -18,6 +18,7 @@ py_library(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/ops/signal",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/python/ops/linalg/cholesky_registrations.py b/tensorflow/python/ops/linalg/cholesky_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5284cf22ac2981f79c0d3c7a6a60635c9d0bf02
--- /dev/null
+++ b/tensorflow/python/ops/linalg/cholesky_registrations.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.cholesky."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+
+
+# By default, compute the Cholesky of the dense matrix, and return a
+# LowerTriangular operator. Methods below specialize this registration.
+@linear_operator_algebra.RegisterCholesky(linear_operator.LinearOperator)
+def _cholesky_linear_operator(linop):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      linalg_ops.cholesky(linop.to_dense()),
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_diag.LinearOperatorDiag)
+def _cholesky_diag(diag_operator):
+  return linear_operator_diag.LinearOperatorDiag(
+      math_ops.sqrt(diag_operator.diag),
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_identity.LinearOperatorIdentity)
+def _cholesky_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      batch_shape=identity_operator.batch_shape,
+      dtype=identity_operator.dtype,
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _cholesky_scaled_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=math_ops.sqrt(identity_operator.multiplier),
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _cholesky_block_diag(block_diag_operator):
+    # We take the cholesky of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.cholesky() for operator in block_diag_operator.operators],
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _cholesky_kronecker(kronecker_operator):
+    # Cholesky decomposition of a Kronecker product is the Kronecker product
+    # of cholesky decompositions.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.cholesky() for operator in kronecker_operator.operators],
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index c29b5033bb137e8376e1c19985755b4fc72e8834..ac4fd4ebc6059a187828c757c852a470d8ee69a8 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -20,6 +20,9 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
+from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
+from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
 from tensorflow.python.ops.linalg.linear_operator import *
 from tensorflow.python.ops.linalg.linear_operator_block_diag import *
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 704ac11d0134eaca22088ccafe9d538477e38008..6fb7a57e4d90d2c60850676372acc4800433e669 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator_algebra
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
@@ -284,7 +285,7 @@ class LinearOperator(object):
     `[B1,...,Bb, M, N]`, equivalent to `tf.shape(A)`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`
@@ -318,7 +319,7 @@ class LinearOperator(object):
     `[B1,...,Bb]`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`
@@ -340,7 +341,7 @@ class LinearOperator(object):
     `A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       Python integer, or None if the tensor rank is undefined.
@@ -356,7 +357,7 @@ class LinearOperator(object):
     `A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`, determined at runtime.
@@ -581,16 +582,29 @@ class LinearOperator(object):
     ```
 
     Args:
-      x: `Tensor` with compatible shape and same `dtype` as `self`.
-        See class docstring for definition of compatibility.
+      x: `LinearOperator` or `Tensor` with compatible shape and same `dtype` as
+        `self`. See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
       adjoint_arg:  Python `bool`.  If `True`, compute `A x^H` where `x^H` is
         the hermitian transpose (transposition and complex conjugation).
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
-      A `Tensor` with shape `[..., M, R]` and same `dtype` as `self`.
+      A `LinearOperator` or `Tensor` with shape `[..., M, R]` and same `dtype`
+        as `self`.
     """
+    if isinstance(x, LinearOperator):
+      if adjoint or adjoint_arg:
+        raise ValueError(".matmul not supported with adjoints.")
+      if (x.range_dimension is not None and
+          self.domain_dimension is not None and
+          x.range_dimension != self.domain_dimension):
+        raise ValueError(
+            "Operators are incompatible. Expected `x` to have dimension"
+            " {} but got {}.".format(self.domain_dimension, x.range_dimension))
+      with self._name_scope(name):
+        return linear_operator_algebra.matmul(self, x)
+
     with self._name_scope(name, values=[x]):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
@@ -630,7 +644,7 @@ class LinearOperator(object):
         dimensions, the last dimension defines a vector.
         See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       A `Tensor` with shape `[..., M]` and same `dtype` as `self`.
@@ -649,13 +663,13 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     if self._can_use_cholesky():
       return math_ops.exp(self.log_abs_determinant())
-    return linalg_ops.matrix_determinant(self._matrix)
+    return linalg_ops.matrix_determinant(self.to_dense())
 
   def determinant(self, name="det"):
     """Determinant for every batch member.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `Tensor` with shape `self.batch_shape` and same `dtype` as `self`.
@@ -677,14 +691,14 @@ class LinearOperator(object):
     if self._can_use_cholesky():
       diag = array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense()))
       return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
-    _, log_abs_det = linalg.slogdet(self._matrix)
+    _, log_abs_det = linalg.slogdet(self.to_dense())
     return log_abs_det
 
   def log_abs_determinant(self, name="log_abs_det"):
     """Log absolute value of determinant for every batch member.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `Tensor` with shape `self.batch_shape` and same `dtype` as `self`.
@@ -830,6 +844,31 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def cholesky(self, name="cholesky"):
+    """Returns a Cholesky factor as a `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, if `A` is positive definite
+    self-adjoint, return `L`, where `A = L L^T`, i.e. the cholesky
+    decomposition.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      `LinearOperator` which represents the lower triangular matrix
+      in the Cholesky decomposition.
+
+    Raises:
+      ValueError: When the `LinearOperator` is not hinted to be positive
+        definite and self adjoint.
+    """
+
+    if not self._can_use_cholesky():
+      raise ValueError("Cannot take the Cholesky decomposition: "
+                       "Not a positive definite self adjoint matrix.")
+    with self._name_scope(name):
+      return linear_operator_algebra.cholesky(self)
+
   def _to_dense(self):
     """Generic and often inefficient implementation.  Override often."""
     logging.warn("Using (possibly slow) default implementation of to_dense."
@@ -922,6 +961,4 @@ class LinearOperator(object):
       return self._add_to_tensor(x)
 
   def _can_use_cholesky(self):
-    # TODO(langmore) Add complex types when tf.cholesky can use them.
-    return (not self.dtype.is_complex and self.is_self_adjoint and
-            self.is_positive_definite)
+    return self.is_self_adjoint and self.is_positive_definite
diff --git a/tensorflow/python/ops/linalg/linear_operator_adjoint.py b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..858e224b9adda57b4d472ae2f61b2b6cda74c243
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Takes the adjoint of a `LinearOperator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = []
+
+
+@tf_export("linalg.LinearOperatorAdjoint")
+class LinearOperatorAdjoint(linear_operator.LinearOperator):
+  """`LinearOperator` representing the adjoint of another operator.
+
+  This operator represents the adjoint of another operator.
+
+  ```python
+  # Create a 2 x 2 linear operator.
+  operator = LinearOperatorFullMatrix([[1 - i., 3.], [0., 1. + i]])
+  operator_adjoint = LinearOperatorAdjoint(operator)
+
+  operator_adjoint.to_dense()
+  ==> [[1. + i, 0.]
+       [3., 1 - i]]
+
+  operator_adjoint.shape
+  ==> [2, 2]
+
+  operator_adjoint.log_abs_determinant()
+  ==> - log(2)
+
+  x = ... Shape [2, 4] Tensor
+  operator_adjoint.matmul(x)
+  ==> Shape [2, 4] Tensor, equal to operator.matmul(x, adjoint=True)
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorAdjoint` depends on the underlying
+  operators performance.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operator,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorAdjoint`.
+
+    `LinearOperatorAdjoint` is initialized with an operator `A`.  The `solve`
+    and `matmul` methods effectively flip the `adjoint` argument.  E.g.
+
+    ```
+    A = MyLinearOperator(...)
+    B = LinearOperatorAdjoint(A)
+    x = [....]  # a vector
+
+    assert A.matvec(x, adjoint=True) == B.matvec(x, adjoint=False)
+    ```
+
+    Args:
+      operator: `LinearOperator` object.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`. Default is `operator.name +
+        "_adjoint"`.
+
+    Raises:
+      ValueError:  If `operator.is_non_singular` is False.
+    """
+
+    self._operator = operator
+
+    # The congruency of is_non_singular and is_self_adjoint was checked in the
+    # base operator.
+    def _combined_hint(hint_str, provided_hint_value, message):
+      """Get combined hint in the case where operator.hint should equal hint."""
+      op_hint = getattr(operator, hint_str)
+      if op_hint is False and provided_hint_value:
+        raise ValueError(message)
+      if op_hint and provided_hint_value is False:
+        raise ValueError(message)
+      return (op_hint or provided_hint_value) or None
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its adjoint is square.")
+
+    is_non_singular = _combined_hint(
+        "is_non_singular", is_non_singular,
+        "An operator is non-singular if and only if its adjoint is "
+        "non-singular.")
+
+    is_self_adjoint = _combined_hint(
+        "is_self_adjoint", is_self_adjoint,
+        "An operator is self-adjoint if and only if its adjoint is "
+        "self-adjoint.")
+
+    is_positive_definite = _combined_hint(
+        "is_positive_definite", is_positive_definite,
+        "An operator is positive-definite if and only if its adjoint is "
+        "positive-definite.")
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its adjoint is square.")
+
+    # Initialization.
+    if name is None:
+      name = operator.name + "_adjoint"
+    with ops.name_scope(name, values=operator.graph_parents):
+      super(LinearOperatorAdjoint, self).__init__(
+          dtype=operator.dtype,
+          graph_parents=operator.graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operator(self):
+    """The operator before taking the adjoint."""
+    return self._operator
+
+  def _assert_non_singular(self):
+    return self.operator.assert_non_singular()
+
+  def _assert_positive_definite(self):
+    return self.operator.assert_positive_definite()
+
+  def _assert_self_adjoint(self):
+    return self.operator.assert_self_adjoint()
+
+  def _shape(self):
+    return self.operator.shape
+
+  def _shape_tensor(self):
+    return self.operator.shape_tensor()
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.matmul(
+        x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
+  def _determinant(self):
+    if self.is_self_adjoint:
+      return self.operator.determinant()
+    return math_ops.conj(self.operator.determinant())
+
+  def _log_abs_determinant(self):
+    return self.operator.log_abs_determinant()
+
+  def _trace(self):
+    if self.is_self_adjoint:
+      return self.operator.trace()
+    return math_ops.conj(self.operator.trace())
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.solve(
+        rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
+  def _to_dense(self):
+    if self.is_self_adjoint:
+      return self.operator.to_dense()
+    return linalg.adjoint(self.operator.to_dense())
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b99066e4c121ebd7546dfad1039c0dfa46bca11
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Registration mechanisms for various n-ary operations on LinearOperators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util import tf_inspect
+
+
+_CHOLESKY_DECOMPS = {}
+_MATMUL = {}
+
+
+def _registered_function(type_list, registry):
+  """Given a list of classes, finds the most specific function registered."""
+  enumerated_hierarchies = [enumerate(tf_inspect.getmro(t)) for t in type_list]
+  # Get all possible combinations of hierarchies.
+  cls_combinations = list(itertools.product(*enumerated_hierarchies))
+
+  def hierarchy_distance(cls_combination):
+    candidate_distance = sum(c[0] for c in cls_combination)
+    if tuple(c[1] for c in cls_combination) in registry:
+      return candidate_distance
+    return 10000
+
+  registered_combination = min(cls_combinations, key=hierarchy_distance)
+  return registry.get(tuple(r[1] for r in registered_combination), None)
+
+
+def _registered_cholesky(type_a):
+  """Get the Cholesky function registered for class a."""
+  return _registered_function([type_a], _CHOLESKY_DECOMPS)
+
+
+def _registered_matmul(type_a, type_b):
+  """Get the Matmul function registered for classes a and b."""
+  return _registered_function([type_a, type_b], _MATMUL)
+
+
+def cholesky(lin_op_a, name=None):
+  """Get the Cholesky factor associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to decompose.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the lower Cholesky factor of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Cholesky method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  cholesky_fn = _registered_cholesky(type(lin_op_a))
+  if cholesky_fn is None:
+    raise ValueError("No cholesky decomposition registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Cholesky"):
+    return cholesky_fn(lin_op_a)
+
+
+def matmul(lin_op_a, lin_op_b, name=None):
+  """Compute lin_op_a.matmul(lin_op_b).
+
+  Args:
+    lin_op_a: The LinearOperator on the left.
+    lin_op_b: The LinearOperator on the right.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the matmul between `lin_op_a` and
+      `lin_op_b`.
+
+  Raises:
+    NotImplementedError: If no matmul method is defined between types of
+      `lin_op_a` and `lin_op_b`.
+  """
+  matmul_fn = _registered_matmul(type(lin_op_a), type(lin_op_b))
+  if matmul_fn is None:
+    raise ValueError("No matmul registered for {}.matmul({})".format(
+        type(lin_op_a), type(lin_op_b)))
+
+  with ops.name_scope(name, "Matmul"):
+    return matmul_fn(lin_op_a, lin_op_b)
+
+
+class RegisterCholesky(object):
+  """Decorator to register a Cholesky implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterCholesky(lin_op.LinearOperatorIdentity)
+  def _cholesky_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, cholesky_fn):
+    """Perform the Cholesky registration.
+
+    Args:
+      cholesky_fn: The function to use for the Cholesky.
+
+    Returns:
+      cholesky_fn
+
+    Raises:
+      TypeError: if cholesky_fn is not a callable.
+      ValueError: if a Cholesky function has already been registered for
+        the given argument classes.
+    """
+    if not callable(cholesky_fn):
+      raise TypeError(
+          "cholesky_fn must be callable, received: {}".format(cholesky_fn))
+    if self._key in _CHOLESKY_DECOMPS:
+      raise ValueError("Cholesky({}) has already been registered to: {}".format(
+          self._key[0].__name__, _CHOLESKY_DECOMPS[self._key]))
+    _CHOLESKY_DECOMPS[self._key] = cholesky_fn
+    return cholesky_fn
+
+
+class RegisterMatmul(object):
+  """Decorator to register a Matmul implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterMatmul(
+    lin_op.LinearOperatorIdentity,
+    lin_op.LinearOperatorIdentity)
+  def _matmul_identity(a, b):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a, lin_op_cls_b):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to multiply.
+      lin_op_cls_b: the class of the second LinearOperator to multiply.
+    """
+    self._key = (lin_op_cls_a, lin_op_cls_b)
+
+  def __call__(self, matmul_fn):
+    """Perform the Matmul registration.
+
+    Args:
+      matmul_fn: The function to use for the Matmul.
+
+    Returns:
+      matmul_fn
+
+    Raises:
+      TypeError: if matmul_fn is not a callable.
+      ValueError: if a Matmul function has already been registered for
+        the given argument classes.
+    """
+    if not callable(matmul_fn):
+      raise TypeError(
+          "matmul_fn must be callable, received: {}".format(matmul_fn))
+    if self._key in _MATMUL:
+      raise ValueError("Matmul({}, {}) has already been registered.".format(
+          self._key[0].__name__,
+          self._key[1].__name__))
+    _MATMUL[self._key] = matmul_fn
+    return matmul_fn
diff --git a/tensorflow/python/ops/linalg/linear_operator_block_diag.py b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
index 438c3496bdf4277e239c488d947ac743165179a5..b0b418c99706ad9468668d52e48e79f2add7552d 100644
--- a/tensorflow/python/ops/linalg/linear_operator_block_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
@@ -29,9 +29,7 @@ from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
-__all__ = [
-    "LinearOperatorBlockDiag",
-]
+__all__ = ["LinearOperatorBlockDiag"]
 
 
 @tf_export("linalg.LinearOperatorBlockDiag")
diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py
index 021ef47383673dd1ccd42e58d04631ef2f3b2e7a..09f0c518e7ab75fe0c716388dfd4d9b6aaab646f 100644
--- a/tensorflow/python/ops/linalg/linear_operator_circulant.py
+++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -39,8 +40,8 @@ __all__ = [
 ]
 
 # Different FFT Ops will be used for different block depths.
-_FFT_OP = {1: math_ops.fft, 2: math_ops.fft2d, 3: math_ops.fft3d}
-_IFFT_OP = {1: math_ops.ifft, 2: math_ops.ifft2d, 3: math_ops.ifft3d}
+_FFT_OP = {1: fft_ops.fft, 2: fft_ops.fft2d, 3: fft_ops.fft3d}
+_IFFT_OP = {1: fft_ops.ifft, 2: fft_ops.ifft2d, 3: fft_ops.ifft3d}
 
 # This is the only dtype allowed with fft ops.
 # TODO(langmore) Add other types once available.
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index 0292bc51dcf9809941087dd4aa1ea4c760c064d1..f499b3066129bce83706a94d93d943422ccc1ffd 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -275,6 +275,3 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
     for operator in solve_order_list[1:]:
       solution = operator.solve(solution, adjoint=adjoint)
     return solution
-
-  def _add_to_tensor(self, x):
-    return self.to_dense() + x
diff --git a/tensorflow/python/ops/linalg/linear_operator_inversion.py b/tensorflow/python/ops/linalg/linear_operator_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aa4b40e16bd82941357e394101a0a9d55c7a7fe
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_inversion.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inverts a non-singular `LinearOperator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = []
+
+
+@tf_export("linalg.LinearOperatorInversion")
+class LinearOperatorInversion(linear_operator.LinearOperator):
+  """`LinearOperator` representing the inverse of another operator.
+
+  This operator represents the inverse of another operator.
+
+  ```python
+  # Create a 2 x 2 linear operator.
+  operator = LinearOperatorFullMatrix([[1., 0.], [0., 2.]])
+  operator_inv = LinearOperatorInversion(operator)
+
+  operator_inv.to_dense()
+  ==> [[1., 0.]
+       [0., 0.5]]
+
+  operator_inv.shape
+  ==> [2, 2]
+
+  operator_inv.log_abs_determinant()
+  ==> - log(2)
+
+  x = ... Shape [2, 4] Tensor
+  operator_inv.matmul(x)
+  ==> Shape [2, 4] Tensor, equal to operator.solve(x)
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorInversion` depends on the underlying
+  operators performance:  `solve` and `matmul` are swapped, and determinant is
+  inverted.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operator,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorInversion`.
+
+    `LinearOperatorInversion` is initialized with an operator `A`.  The `solve`
+    and `matmul` methods are effectively swapped.  E.g.
+
+    ```
+    A = MyLinearOperator(...)
+    B = LinearOperatorInversion(A)
+    x = [....]  # a vector
+
+    assert A.matvec(x) == B.solvevec(x)
+    ```
+
+    Args:
+      operator: `LinearOperator` object. If `operator.is_non_singular == False`,
+        an exception is raised.  We do allow `operator.is_non_singular == None`,
+        in which case this operator will have `is_non_singular == None`.
+        Similarly for `is_self_adjoint` and `is_positive_definite`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`. Default is `operator.name +
+        "_inv"`.
+
+    Raises:
+      ValueError:  If `operator.is_non_singular` is False.
+    """
+
+    self._operator = operator
+
+    # Auto-set and check hints.
+    if operator.is_non_singular is False or is_non_singular is False:
+      raise ValueError(
+          "operator and supplied hints must have `is_non_singular` equal to "
+          "`True` or `None`.  Found %s, %s" % (operator.is_non_singular,
+                                               is_non_singular))
+    if operator.is_square is False or is_square is False:
+      raise ValueError(
+          "operator and supplied hints must have `is_square` equal to "
+          "`True` or `None`.  Found %s, %s" % (operator.is_square, is_square))
+
+    # The congruency of is_non_singular and is_self_adjoint was checked in the
+    # base operator.  Other hints are, in this special case of inversion, ones
+    # that must be the same for base/derived operator.
+    def _combined_hint(hint_str, provided_hint_value, message):
+      """Get combined hint in the case where operator.hint should equal hint."""
+      op_hint = getattr(operator, hint_str)
+      if op_hint is False and provided_hint_value:
+        raise ValueError(message)
+      if op_hint and provided_hint_value is False:
+        raise ValueError(message)
+      return (op_hint or provided_hint_value) or None
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its inverse is square.")
+
+    is_non_singular = _combined_hint(
+        "is_non_singular", is_non_singular,
+        "An operator is non-singular if and only if its inverse is "
+        "non-singular.")
+
+    is_self_adjoint = _combined_hint(
+        "is_self_adjoint", is_self_adjoint,
+        "An operator is self-adjoint if and only if its inverse is "
+        "self-adjoint.")
+
+    is_positive_definite = _combined_hint(
+        "is_positive_definite", is_positive_definite,
+        "An operator is positive-definite if and only if its inverse is "
+        "positive-definite.")
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its inverse is square.")
+
+    # Initialization.
+    if name is None:
+      name = operator.name + "_inv"
+    with ops.name_scope(name, values=operator.graph_parents):
+      super(LinearOperatorInversion, self).__init__(
+          dtype=operator.dtype,
+          graph_parents=operator.graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operator(self):
+    """The operator before inversion."""
+    return self._operator
+
+  def _assert_non_singular(self):
+    return self.operator.assert_non_singular()
+
+  def _assert_positive_definite(self):
+    return self.operator.assert_positive_definite()
+
+  def _assert_self_adjoint(self):
+    return self.operator.assert_self_adjoint()
+
+  def _shape(self):
+    return self.operator.shape
+
+  def _shape_tensor(self):
+    return self.operator.shape_tensor()
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.solve(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def _determinant(self):
+    return 1. / self.operator.determinant()
+
+  def _log_abs_determinant(self):
+    return -1. * self.operator.log_abs_determinant()
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.matmul(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index 1fd5073c17832f0689616f2842c33c95d186e487..f7e785caa5d8cc290f037944378f709633423a74 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -30,9 +30,7 @@ from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.util.tf_export import tf_export
 
-__all__ = [
-    "LinearOperatorKronecker",
-]
+__all__ = ["LinearOperatorKronecker"]
 
 
 def _vec(x):
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 76d659f1097579a9b5c92a90938f71b90268503f..e50f572b5f431ae8b7cf3470ee799f170e83656c 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -102,7 +102,9 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("operator_build_infos has not been implemented.")
 
   @abc.abstractmethod
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     """Build a batch matrix and an Operator that should have similar behavior.
 
     Every operator acts like a (batch) matrix.  This method returns both
@@ -114,6 +116,11 @@ class LinearOperatorDerivedClassTest(test.TestCase):
       dtype:  Numpy dtype.  Data type of returned array/operator.
       use_placeholder:  Python bool.  If True, initialize the operator with a
         placeholder of undefined shape and correct dtype.
+      ensure_self_adjoint_and_pd: If `True`,
+        construct this operator to be Hermitian Positive Definite, as well
+        as ensuring the hints `is_positive_definite` and `is_self_adjoint`
+        are set.
+        This is useful for testing methods such as `cholesky`.
 
     Returns:
       operator:  `LinearOperator` subclass instance.
@@ -271,6 +278,21 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
     self._test_matmul(with_batch=False)
 
+  def test_cholesky(self):
+    self._skip_if_tests_to_skip_contains("cholesky")
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder,
+                ensure_self_adjoint_and_pd=True)
+            op_chol = operator.cholesky().to_dense()
+            mat_chol = linalg_ops.cholesky(mat)
+            op_chol_v, mat_chol_v = sess.run([op_chol, mat_chol])
+            self.assertAC(mat_chol_v, op_chol_v)
+
   def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
@@ -441,7 +463,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "solve_with_broadcast", "det", "log_abs_det"]
+    return ["cholesky", "solve", "solve_with_broadcast", "det", "log_abs_det"]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/ops/linalg/matmul_registrations.py b/tensorflow/python/ops/linalg/matmul_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ac988ba274dd99b03733eff38b07055d68543b
--- /dev/null
+++ b/tensorflow/python/ops/linalg/matmul_registrations.py
@@ -0,0 +1,252 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.matmul."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_composition
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+from tensorflow.python.ops.linalg import linear_operator_zeros
+
+
+def _combined_self_adjoint_hint(operator_a, operator_b):
+  """Get combined hint for self-adjoint-ness."""
+  # Note: only use this method in the commuting case.
+  # The property is preserved under composition when the operators commute.
+  if operator_a.is_self_adjoint and operator_b.is_self_adjoint:
+    return True
+
+  # The property is not preserved when an operator with the property is composed
+  # with an operator without the property.
+  if ((operator_a.is_self_adjoint is True and
+       operator_b.is_self_adjoint is False) or
+      (operator_a.is_self_adjoint is False and
+       operator_b.is_self_adjoint is True)):
+    return False
+
+  # The property is not known when operators are not known to have the property
+  # or both operators don't have the property (the property for the complement
+  # class is not closed under composition).
+  return None
+
+
+def _is_square(operator_a, operator_b):
+  """Return a hint to whether the composition is square."""
+  if operator_a.is_square and operator_b.is_square:
+    return True
+  if operator_a.is_square is False and operator_b.is_square is False:
+    # Let A have shape [B, M, N], B have shape [B, N, L].
+    m = operator_a.range_dimension
+    l = operator_b.domain_dimension
+    if m is not None and l is not None:
+      return m == l
+
+    return None
+
+
+def _combined_positive_definite_hint(operator_a, operator_b):
+  """Get combined PD hint for compositions."""
+  # Note: Positive definiteness is only guaranteed to be preserved
+  # when the operators commute and are symmetric. Only use this method in
+  # commuting cases.
+
+  if (operator_a.is_positive_definite is True and
+      operator_a.is_self_adjoint is True and
+      operator_b.is_positive_definite is True and
+      operator_b.is_self_adjoint is True):
+    return True
+
+  return None
+
+
+def _combined_non_singular_hint(operator_a, operator_b):
+  """Get combined hint for when ."""
+  # If either operator is not-invertible the composition isn't.
+  if (operator_a.is_non_singular is False or
+      operator_b.is_non_singular is False):
+    return False
+
+  return operator_a.is_non_singular and operator_b.is_non_singular
+
+
+# By default, use a LinearOperatorComposition to delay the computation.
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator, linear_operator.LinearOperator)
+def _matmul_linear_operator(linop_a, linop_b):
+  """Generic matmul of two `LinearOperator`s."""
+  is_square = _is_square(linop_a, linop_b)
+  is_non_singular = None
+  is_self_adjoint = None
+  is_positive_definite = None
+
+  if is_square:
+    is_non_singular = _combined_non_singular_hint(linop_a, linop_b)
+    is_self_adjoint = _combined_self_adjoint_hint(linop_a, linop_b)
+  elif is_square is False:
+    is_non_singular = False
+    is_self_adjoint = False
+    is_positive_definite = False
+
+  return linear_operator_composition.LinearOperatorComposition(
+      operators=[linop_a, linop_b],
+      is_non_singular=is_non_singular,
+      is_self_adjoint=is_self_adjoint,
+      is_positive_definite=is_positive_definite,
+      is_square=is_square,
+  )
+
+# Identity
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_identity.LinearOperatorIdentity,
+    linear_operator.LinearOperator)
+def _matmul_linear_operator_identity_left(identity, linop):
+  del identity
+  return linop
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator,
+    linear_operator_identity.LinearOperatorIdentity)
+def _matmul_linear_operator_identity_right(linop, identity):
+  del identity
+  return linop
+
+
+# Zeros
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator,
+    linear_operator_zeros.LinearOperatorZeros)
+def _matmul_linear_operator_zeros_right(linop, zeros):
+  if not zeros.is_square or not linop.is_square:
+    raise ValueError("Matmul with non-square `LinearOperator`s or non-square "
+                     "`LinearOperatorZeros` not supported at this time.")
+  return zeros
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_zeros.LinearOperatorZeros,
+    linear_operator.LinearOperator)
+def _matmul_linear_operator_zeros_left(zeros, linop):
+  if not zeros.is_square or not linop.is_square:
+    raise ValueError("Matmul with non-square `LinearOperator`s or non-square "
+                     "`LinearOperatorZeros` not supported at this time.")
+  return zeros
+
+
+# Diag.
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_diag(linop_a, linop_b):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_a.diag * linop_b.diag,
+      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_a, linop_b),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_a, linop_b),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _matmul_linear_operator_diag_scaled_identity_right(
+    linop_diag, linop_scaled_identity):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag * linop_scaled_identity.multiplier,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_diag, linop_scaled_identity),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_identity.LinearOperatorScaledIdentity,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_diag_scaled_identity_left(
+    linop_scaled_identity, linop_diag):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag * linop_scaled_identity.multiplier,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_diag, linop_scaled_identity),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular)
+def _matmul_linear_operator_diag_tril(linop_diag, linop_triangular):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_diag.diag[..., None] * linop_triangular.to_dense(),
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_tril_diag(linop_triangular, linop_diag):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_triangular.to_dense() * linop_diag.diag,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+# Circulant.
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_circulant.LinearOperatorCirculant,
+    linear_operator_circulant.LinearOperatorCirculant)
+def _matmul_linear_operator_circulant_circulant(linop_a, linop_b):
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=linop_a.spectrum * linop_b.spectrum,
+      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
+      is_self_adjoint=_combined_self_adjoint_hint(linop_a, linop_b),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_a, linop_b),
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index bbccc7e0369886a0d6bc5eac139c09b8f399d366..1a9e7112b45cacb711ac176b92cb3bef0dc72f00 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -423,7 +423,78 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export('norm', 'linalg.norm')
+@tf_export('norm', 'linalg.norm', v1=[])
+def norm_v2(tensor,
+            ord='euclidean',
+            axis=None,
+            keepdims=None,
+            name=None):
+  r"""Computes the norm of vectors, matrices, and tensors.
+
+  This function can compute several different vector norms (the 1-norm, the
+  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
+  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
+
+  Args:
+    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
+    ord: Order of the norm. Supported values are 'fro', 'euclidean',
+      `1`, `2`, `np.inf` and any positive real number yielding the corresponding
+      p-norm. Default is 'euclidean' which is equivalent to Frobenius norm if
+      `tensor` is a matrix and equivalent to 2-norm for vectors.
+      Some restrictions apply:
+        a) The Frobenius norm `fro` is not defined for vectors,
+        b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
+           `2`, `np.inf` are supported.
+      See the description of `axis` on how to compute norms for a batch of
+      vectors or matrices stored in a tensor.
+    axis: If `axis` is `None` (the default), the input is considered a vector
+      and a single vector norm is computed over the entire set of values in the
+      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
+      `norm(reshape(tensor, [-1]), ord=ord)`.
+      If `axis` is a Python integer, the input is considered a batch of vectors,
+      and `axis` determines the axis in `tensor` over which to compute vector
+      norms.
+      If `axis` is a 2-tuple of Python integers it is considered a batch of
+      matrices and `axis` determines the axes in `tensor` over which to compute
+      a matrix norm.
+      Negative indices are supported. Example: If you are passing a tensor that
+      can be either a matrix or a batch of matrices at runtime, pass
+      `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
+      computed.
+    keepdims: If True, the axis indicated in `axis` are kept with size 1.
+      Otherwise, the dimensions in `axis` are removed from the output shape.
+    name: The name of the op.
+
+  Returns:
+    output: A `Tensor` of the same type as tensor, containing the vector or
+      matrix norms. If `keepdims` is True then the rank of output is equal to
+      the rank of `tensor`. Otherwise, if `axis` is none the output is a scalar,
+      if `axis` is an integer, the rank of `output` is one less than the rank
+      of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less
+      than the rank of `tensor`.
+
+  Raises:
+    ValueError: If `ord` or `axis` is invalid.
+
+  @compatibility(numpy)
+  Mostly equivalent to numpy.linalg.norm.
+  Not supported: ord <= 0, 2-norm for matrices, nuclear norm.
+  Other differences:
+    a) If axis is `None`, treats the flattened `tensor` as a vector
+     regardless of rank.
+    b) Explicitly supports 'euclidean' norm as the default, including for
+     higher order tensors.
+  @end_compatibility
+  """
+  return norm(tensor=tensor,
+              ord=ord,
+              axis=axis,
+              keepdims=keepdims,
+              name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export(v1=['norm', 'linalg.norm'])
 @deprecation.deprecated_args(
     None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims')
 def norm(tensor,
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index 386626e6a951b189f5e77bdf7b9a308b60c1c842..515926002d80782b0279b1f57854f577df6327ad 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_list_ops
 # go/tf-wildcard-import
@@ -33,6 +34,35 @@ ops.NotDifferentiable("TensorListConcat")
 ops.NotDifferentiable("TensorListPushBackBatch")
 
 
+def empty_tensor_list(element_shape,
+                      element_dtype,
+                      max_num_elements=None,
+                      name=None):
+  if max_num_elements is None:
+    max_num_elements = -1
+
+  return gen_list_ops.empty_tensor_list(
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      max_num_elements=max_num_elements,
+      name=name)
+
+
+def tensor_list_reserve(element_shape, num_elements, element_dtype, name=None):
+  return gen_list_ops.tensor_list_reserve(
+      element_shape=_build_element_shape(element_shape),
+      num_elements=num_elements,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_from_tensor(tensor, element_shape, name=None):
+  return gen_list_ops.tensor_list_from_tensor(
+      tensor=tensor,
+      element_shape=_build_element_shape(element_shape),
+      name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
@@ -42,7 +72,7 @@ def _PushBackGrad(op, dresult):
 @ops.RegisterGradient("TensorListPopBack")
 def _PopBackGrad(op, dlist, delement):
   if dlist is None:
-    dlist = gen_list_ops.empty_tensor_list(
+    dlist = empty_tensor_list(
         element_dtype=delement.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
@@ -51,19 +81,18 @@ def _PopBackGrad(op, dlist, delement):
 
 @ops.RegisterGradient("TensorListStack")
 def _TensorListStackGrad(unused_op, dtensor):
-  return gen_list_ops.tensor_list_from_tensor(dtensor,
-                                              element_shape=dtensor.shape[1:])
+  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
 
 
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape.dims[0].value is not None:
+  if op.inputs[0].shape.dims and op.inputs[0].shape.dims[0].value is not None:
     num_elements = op.inputs[0].shape.dims[0].value
   else:
     num_elements = None
   if dlist is None:
-    dlist = gen_list_ops.empty_tensor_list(
+    dlist = empty_tensor_list(
         element_dtype=op.inputs[0].dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
@@ -112,3 +141,40 @@ def _TensorListScatterGrad(op, dlist):
   t, indices, _ = op.inputs
   return gen_list_ops.tensor_list_gather(
       dlist, indices, element_dtype=t.dtype), None
+
+
+def _build_element_shape(shape):
+  """Converts shape to a format understood by list_ops for element_shape.
+
+  If `shape` is already a `Tensor` it is returned as-is. We do not perform a
+  type check here.
+
+  If shape is None or a TensorShape with unknown rank, -1 is returned.
+
+  If shape is a scalar, an int32 tensor with empty list is returned. Note we
+  do directly return an empty list since ops.convert_to_tensor would conver it
+  to a float32 which is not a valid type for element_shape.
+
+  If shape is a sequence of dims, None's in the list are replaced with -1. We
+  do not check the dtype of the other dims.
+
+  Args:
+    shape: Could be None, Tensor, TensorShape or a list of dims (each dim could
+      be a None, scalar or Tensor).
+
+  Returns:
+    A None-free shape that can be converted to a tensor.
+  """
+  if isinstance(shape, ops.Tensor):
+    return shape
+  if isinstance(shape, tensor_shape.TensorShape):
+    # `TensorShape.as_list` requires rank to be known.
+    shape = shape.as_list() if shape else None
+  # Shape is unknown.
+  if shape is None:
+    return -1
+  # Shape is a scalar.
+  if not shape:
+    return ops.convert_to_tensor(shape, dtype=dtypes.int32)
+  # Shape is a sequence of dimensions. Convert None dims to -1.
+  return [d if d is not None else -1 for d in shape]
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index fd532a9be2d625f4d0d68540fb4a1caeebf4cd83..5a948a21946d0b9ce867901a00425857e4f06b1f 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -629,4 +629,6 @@ ops.NotDifferentiable("AudioSummary")
 ops.NotDifferentiable("AudioSummaryV2")
 ops.NotDifferentiable("MergeSummary")
 ops.NotDifferentiable("ScalarSummary")
+ops.NotDifferentiable("TensorSummary")
+ops.NotDifferentiable("TensorSummaryV2")
 ops.NotDifferentiable("Timestamp")
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 89109469b7b9e770e9f7d8b048592d3f8f88fd82..397d56ef40936c02d879c719027ceb5cfd10d93a 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -45,7 +45,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("initialize_all_tables")
+@tf_export(v1=["initialize_all_tables"])
 @deprecated(None, "Use `tf.tables_initializer` instead.")
 def initialize_all_tables(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
@@ -60,7 +60,7 @@ def initialize_all_tables(name="init_all_tables"):
   return tables_initializer(name)
 
 
-@tf_export("initializers.tables_initializer", "tables_initializer")
+@tf_export(v1=["initializers.tables_initializer", "tables_initializer"])
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
 
@@ -171,6 +171,11 @@ class InitializableLookupTableBase(LookupInterface):
   def initializer(self):
     return self._init_op
 
+  @property
+  @deprecated("2018-12-15", "Use `initializer` instead.")
+  def init(self):
+    return self.initializer
+
   @property
   def default_value(self):
     """The default value of the table."""
@@ -830,6 +835,11 @@ class IdTableWithHashBuckets(LookupInterface):
     with ops.name_scope(None, "init"):
       return control_flow_ops.no_op()
 
+  @property
+  @deprecated("2018-12-15", "Use `initializer` instead.")
+  def init(self):
+    return self.initializer
+
   @property
   def resource_handle(self):
     if self._table is not None:
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 53c09ee8ddf22420b768ec58056933088d9e7881..e8cadf931bc8612993da2277afc84e78b8608cff 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -34,28 +33,45 @@ from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.Reduction")
-class Reduction(object):
+@tf_export("losses.Reduction", v1=[])
+class ReductionV2(object):
   """Types of loss reduction.
 
   Contains the following values:
   `NONE`: Un-reduced weighted losses with the same shape as input.
   `SUM`: Scalar sum of weighted losses.
-  `MEAN`: Scalar `SUM` divided by sum of weights.
   `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
-     weights.
-  `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
   """
 
   NONE = "none"
-
   SUM = "weighted_sum"
+  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
 
-  MEAN = "weighted_mean"
+  @classmethod
+  def all(cls):
+    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
 
-  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError("Invalid Reduction Key %s." % key)
+
+
+@tf_export(v1=["losses.Reduction"])
+class Reduction(ReductionV2):
+  """Types of loss reduction.
 
+  Contains the following values:
+  `NONE`: Un-reduced weighted losses with the same shape as input.
+  `SUM`: Scalar sum of weighted losses.
+  `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
+  `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+  `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
+     weights. DEPRECATED.
+  `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+  """
+
+  MEAN = "weighted_mean"
   SUM_BY_NONZERO_WEIGHTS = "weighted_sum_by_nonzero_weights"
   SUM_OVER_NONZERO_WEIGHTS = SUM_BY_NONZERO_WEIGHTS
 
@@ -72,35 +88,7 @@ class Reduction(object):
   @classmethod
   def validate(cls, key):
     if key not in cls.all():
-      raise ValueError("Invalid ReductionKey %s." % key)
-
-
-def _safe_div(numerator, denominator, name="value"):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-    name: An optional name for the returned op.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator, name=name)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator),
-      name=name)
+      raise ValueError("Invalid Reduction Key %s." % key)
 
 
 def _safe_mean(losses, num_present):
@@ -115,7 +103,7 @@ def _safe_mean(losses, num_present):
       then zero is returned.
   """
   total_loss = math_ops.reduce_sum(losses)
-  return _safe_div(total_loss, num_present)
+  return math_ops.div_no_nan(total_loss, num_present, name="value")
 
 
 def _num_present(losses, weights, per_batch=False):
@@ -603,18 +591,19 @@ def mean_pairwise_squared_error(
           keepdims=True)
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-      term1 = 2.0 * _safe_div(
+      term1 = 2.0 * math_ops.div_no_nan(
           sum_squares_diff_per_batch,
-          math_ops.maximum(num_present_per_batch - 1, 0))
+          math_ops.maximum(num_present_per_batch - 1, 0),
+          name="value")
 
       sum_diff = math_ops.reduce_sum(
           diffs, reduction_indices=reduction_indices, keepdims=True)
-      term2 = 2.0 * _safe_div(
+      term2 = 2.0 * math_ops.div_no_nan(
           math_ops.square(sum_diff),
           math_ops.maximum(
               math_ops.multiply(num_present_per_batch,
-                                num_present_per_batch - 1),
-              0))
+                                num_present_per_batch - 1), 0),
+          name="value")
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index c9374006ba3db757744ac42dea3c2f0ccb64d3cd..952a2a1e798c42052130ac3f1573e1ce9354dfdc 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
-from tensorflow.python.ops import gen_spectral_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_math_ops import *
@@ -50,10 +49,10 @@ from tensorflow.python.util.tf_export import tf_export
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
 
-arg_max = deprecation.deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
-arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
-tf_export("arg_max")(arg_max)
-tf_export("arg_min")(arg_min)
+arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
+arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
+tf_export(v1=["arg_max"])(arg_max)
+tf_export(v1=["arg_min"])(arg_min)
 
 # This is set by resource_variable_ops.py. It is included in this way since
 # there is a circular dependency between math_ops and resource_variable_ops
@@ -70,7 +69,7 @@ def _set_doc(doc):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("math.argmax", "argmax")
+@tf_export(v1=["math.argmax", "argmax"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -85,10 +84,37 @@ def argmax(input,
       "axis", axis, "dimension", dimension)
   if axis is None:
     axis = 0
+  return argmax_v2(input, axis, output_type, name)
+
+
+@tf_export("math.argmax", "argmax", v1=[])
+def argmax_v2(input,
+              axis=None,
+              output_type=dtypes.int64,
+              name=None):
+  """Returns the index with the largest value across axes of a tensor.
+
+  Note that in case of ties the identity of the return value is not guaranteed.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+    `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, `quint8`,
+    `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, `uint64`.
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      int32 or int64, must be in the range `-rank(input), rank(input))`.
+      Describes which axis of the input Tensor to reduce across. For vectors,
+      use axis = 0.
+    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`.
+      Defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `output_type`.
+  """
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
-@tf_export("math.argmin", "argmin")
+@tf_export(v1=["math.argmin", "argmin"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -103,6 +129,33 @@ def argmin(input,
       "axis", axis, "dimension", dimension)
   if axis is None:
     axis = 0
+  return argmin_v2(input, axis, output_type, name)
+
+
+@tf_export("math.argmin", "argmin", v1=[])
+def argmin_v2(input,
+              axis=None,
+              output_type=dtypes.int64,
+              name=None):
+  """Returns the index with the smallest value across axes of a tensor.
+
+  Note that in case of ties the identity of the return value is not guaranteed.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+    `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, `quint8`,
+    `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, `uint64`.
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      int32 or int64, must be in the range `-rank(input), rank(input))`.
+      Describes which axis of the input Tensor to reduce across. For vectors,
+      use axis = 0.
+    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`.
+      Defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `output_type`.
+  """
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
 
@@ -713,8 +766,8 @@ def saturate_cast(value, dtype, name=None):
                                        name="max"))
     return cast(value, dtype, name=name)
 
-
-@tf_export("to_float")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_float"])
 def to_float(x, name="ToFloat"):
   """Casts a tensor to type `float32`.
 
@@ -732,7 +785,8 @@ def to_float(x, name="ToFloat"):
   return cast(x, dtypes.float32, name=name)
 
 
-@tf_export("to_double")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_double"])
 def to_double(x, name="ToDouble"):
   """Casts a tensor to type `float64`.
 
@@ -750,7 +804,8 @@ def to_double(x, name="ToDouble"):
   return cast(x, dtypes.float64, name=name)
 
 
-@tf_export("to_int32")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_int32"])
 def to_int32(x, name="ToInt32"):
   """Casts a tensor to type `int32`.
 
@@ -768,7 +823,8 @@ def to_int32(x, name="ToInt32"):
   return cast(x, dtypes.int32, name=name)
 
 
-@tf_export("to_int64")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_int64"])
 def to_int64(x, name="ToInt64"):
   """Casts a tensor to type `int64`.
 
@@ -786,7 +842,8 @@ def to_int64(x, name="ToInt64"):
   return cast(x, dtypes.int64, name=name)
 
 
-@tf_export("to_bfloat16")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_bfloat16"])
 def to_bfloat16(x, name="ToBFloat16"):
   """Casts a tensor to type `bfloat16`.
 
@@ -804,7 +861,8 @@ def to_bfloat16(x, name="ToBFloat16"):
   return cast(x, dtypes.bfloat16, name=name)
 
 
-@tf_export("to_complex64")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_complex64"])
 def to_complex64(x, name="ToComplex64"):
   """Casts a tensor to type `complex64`.
 
@@ -822,7 +880,8 @@ def to_complex64(x, name="ToComplex64"):
   return cast(x, dtypes.complex64, name=name)
 
 
-@tf_export("to_complex128")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_complex128"])
 def to_complex128(x, name="ToComplex128"):
   """Casts a tensor to type `complex128`.
 
@@ -1031,7 +1090,10 @@ def truediv(x, y, name=None):
   return _truediv_python3(x, y, name)
 
 
-@tf_export("div")
+@deprecation.deprecated(
+    date=None,
+    instructions="Deprecated in favor of operator or tf.math.divide.")
+@tf_export(v1=["div"])
 def div(x, y, name=None):
   """Divides x / y elementwise (using Python 2 division operator semantics).
 
@@ -1345,7 +1407,7 @@ def reduce_sum(input_tensor,
                                    name=name))
 
 
-@tf_export("math.count_nonzero", "count_nonzero")
+@tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
@@ -1406,20 +1468,79 @@ def count_nonzero(input_tensor,
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis,
+      "reduction_indices", reduction_indices
+      )
   if keepdims is None:
     keepdims = False
 
-  with ops.name_scope(name, "count_nonzero", [input_tensor]):
-    input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
+  return count_nonzero_v2(input_tensor, axis, keepdims, dtype, name)
+
+
+@tf_export("math.count_nonzero", v1=[])
+def count_nonzero_v2(input,  # pylint: disable=redefined-builtin
+                     axis=None,
+                     keepdims=None,
+                     dtype=dtypes.int64,
+                     name=None):
+  """Computes number of nonzero elements across dimensions of a tensor.
+
+  Reduces `input` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  **NOTE** Floating point comparison to zero is done by exact floating point
+  equality check.  Small values are **not** rounded to zero for purposes of
+  the nonzero check.
+
+  For example:
+
+  ```python
+  x = tf.constant([[0, 1, 0], [1, 1, 0]])
+  tf.count_nonzero(x)  # 3
+  tf.count_nonzero(x, 0)  # [1, 2, 0]
+  tf.count_nonzero(x, 1)  # [1, 2]
+  tf.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
+  tf.count_nonzero(x, [0, 1])  # 3
+  ```
+
+  **NOTE** Strings are compared against zero-length empty string `""`. Any
+  string with a size greater than zero is already considered as nonzero.
+
+  For example:
+  ```python
+  x = tf.constant(["", "a", "  ", "b", ""])
+  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  ```
+
+  Args:
+    input: The tensor to reduce. Should be of numeric type, `bool`,
+      or `string`.
+    axis: The dimensions to reduce. If `None` (the default),
+      reduces all dimensions. Must be in the range
+      `[-rank(input), rank(input))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    dtype: The output dtype; defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor (number of nonzero values).
+  """
+  with ops.name_scope(name, "count_nonzero", [input]):
+    input = ops.convert_to_tensor(input, name="input")
     # A scalar of 'zero' is enough as `not_equal` will broadcast.
-    zero = array_ops.zeros([], dtype=input_tensor.dtype)
+    zero = array_ops.zeros([], dtype=input.dtype)
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-            to_int64(gen_math_ops.not_equal(input_tensor, zero)),
+            to_int64(gen_math_ops.not_equal(input, zero)),
             axis=axis,
-            keepdims=keepdims,
-            reduction_indices=reduction_indices),
+            keepdims=keepdims),
         dtype=dtype)
 
 
@@ -1495,6 +1616,100 @@ def reduce_mean(input_tensor,
                                    name=name))
 
 
+@tf_export("math.reduce_variance")
+def reduce_variance(input_tensor, axis=None, keepdims=None, name=None):
+  """Computes the variance of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1., 2.], [3., 4.]])
+  tf.reduce_variance(x)  # 1.25
+  tf.reduce_variance(x, 0)  # [1., 1.]
+  tf.reduce_variance(x, 1)  # [0.25,  0.25]
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name scope for the associated operations (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.var
+
+  Please note that `np.var` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_variance` has an aggressive type inference from
+  `input_tensor`,
+  @end_compatibility
+  """
+  name = name if name else "reduce_variance"
+  with ops.name_scope(name):
+    means = reduce_mean(input_tensor, axis=axis, keepdims=True)
+    squared_deviations = square(input_tensor - means)
+    return reduce_mean(squared_deviations, axis=axis, keepdims=keepdims)
+
+
+@tf_export("math.reduce_std")
+def reduce_std(input_tensor, axis=None, keepdims=None, name=None):
+  """Computes the standard deviation of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1., 2.], [3., 4.]])
+  tf.reduce_std(x)  # 1.1180339887498949
+  tf.reduce_std(x, 0)  # [1., 1.]
+  tf.reduce_std(x, 1)  # [0.5,  0.5]
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name scope for the associated operations (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.std
+
+  Please note that `np.std` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_std` has an aggressive type inference from `input_tensor`,
+  @end_compatibility
+  """
+  name = name if name else "reduce_std"
+  with ops.name_scope(name):
+    variance = reduce_variance(input_tensor, axis=axis, keepdims=keepdims)
+    return sqrt(variance)
+
+
 @tf_export("math.reduce_prod", "reduce_prod")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -2057,10 +2272,108 @@ def matmul(a,
           a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
 
 
+@tf_export("linalg.matvec")
+def matvec(a,
+           b,
+           transpose_a=False,
+           adjoint_a=False,
+           a_is_sparse=False,
+           b_is_sparse=False,
+           name=None):
+  """Multiplies matrix `a` by vector `b`, producing `a` * `b`.
+
+  The matrix `a` must, following any transpositions, be a tensor of rank >= 2,
+  and we must have `shape(b) = shape(a)[:-2] + [shape(a)[-1]]`.
+
+  Both `a` and `b` must be of the same type. The supported types are:
+  `float16`, `float32`, `float64`, `int32`, `complex64`, `complex128`.
+
+  Matrix `a` can be transposed or adjointed (conjugated and transposed) on
+  the fly by setting one of the corresponding flag to `True`. These are `False`
+  by default.
+
+  If one or both of the inputs contain a lot of zeros, a more efficient
+  multiplication algorithm can be used by setting the corresponding
+  `a_is_sparse` or `b_is_sparse` flag to `True`. These are `False` by default.
+  This optimization is only available for plain matrices/vectors (rank-2/1
+  tensors) with datatypes `bfloat16` or `float32`.
+
+  For example:
+
+  ```python
+  # 2-D tensor `a`
+  # [[1, 2, 3],
+  #  [4, 5, 6]]
+  a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3])
+
+  # 1-D tensor `b`
+  # [7, 9, 11]
+  b = tf.constant([7, 9, 11], shape=[3])
+
+  # `a` * `b`
+  # [ 58,  64]
+  c = tf.matvec(a, b)
+
+
+  # 3-D tensor `a`
+  # [[[ 1,  2,  3],
+  #   [ 4,  5,  6]],
+  #  [[ 7,  8,  9],
+  #   [10, 11, 12]]]
+  a = tf.constant(np.arange(1, 13, dtype=np.int32),
+                  shape=[2, 2, 3])
+
+  # 2-D tensor `b`
+  # [[13, 14, 15],
+  #  [16, 17, 18]]
+  b = tf.constant(np.arange(13, 19, dtype=np.int32),
+                  shape=[2, 3])
+
+  # `a` * `b`
+  # [[ 86, 212],
+  #  [410, 563]]
+  c = tf.matvec(a, b)
+  ```
+
+  Args:
+    a: `Tensor` of type `float16`, `float32`, `float64`, `int32`, `complex64`,
+      `complex128` and rank > 1.
+    b: `Tensor` with same type and rank = `rank(a) - 1`.
+    transpose_a: If `True`, `a` is transposed before multiplication.
+    adjoint_a: If `True`, `a` is conjugated and transposed before
+      multiplication.
+    a_is_sparse: If `True`, `a` is treated as a sparse matrix.
+    b_is_sparse: If `True`, `b` is treated as a sparse matrix.
+    name: Name for the operation (optional).
+
+  Returns:
+    A `Tensor` of the same type as `a` and `b` where each inner-most vector is
+    the product of the corresponding matrices in `a` and vectors in `b`, e.g. if
+    all transpose or adjoint attributes are `False`:
+
+    `output`[..., i] = sum_k (`a`[..., i, k] * `b`[..., k]), for all indices i.
+
+    Note: This is matrix-vector product, not element-wise product.
+
+
+  Raises:
+    ValueError: If transpose_a and adjoint_a are both set to True.
+  """
+  with ops.name_scope(name, "MatVec", [a, b]) as name:
+    output = matmul(
+        a,
+        array_ops.expand_dims(b, axis=-1),
+        transpose_a=transpose_a,
+        adjoint_a=adjoint_a,
+        a_is_sparse=a_is_sparse,
+        b_is_sparse=b_is_sparse)
+    return array_ops.squeeze(output, axis=-1)
+
+
 _OverrideBinaryOperatorHelper(matmul, "matmul")
 
 sparse_matmul = gen_math_ops.sparse_mat_mul
-tf_export("sparse_matmul")(sparse_matmul)
+tf_export(v1=["sparse_matmul"])(sparse_matmul)
 
 
 @ops.RegisterStatistics("MatMul", "flops")
@@ -2671,8 +2984,7 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
     return summed / gen_math_ops.sqrt(N)
 
 
-@tf_export(
-    "sparse.segment_sum", v1=["sparse.segment_sum", "sparse_segment_sum"])
+@tf_export(v1=["sparse.segment_sum", "sparse_segment_sum"])
 @deprecation.deprecated_endpoints("sparse_segment_sum")
 def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
@@ -2746,8 +3058,17 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export(
-    "sparse.segment_mean", v1=["sparse.segment_mean", "sparse_segment_mean"])
+@tf_export("sparse.segment_sum", v1=[])
+def sparse_segment_sum_v2(data,
+                          indices,
+                          segment_ids,
+                          num_segments=None,
+                          name=None):
+  return sparse_segment_mean(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
+@tf_export(v1=["sparse.segment_mean", "sparse_segment_mean"])
 @deprecation.deprecated_endpoints("sparse_segment_mean")
 def sparse_segment_mean(data,
                         indices,
@@ -2793,9 +3114,44 @@ def sparse_segment_mean(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export(
-    "sparse.segment_sqrt_n",
-    v1=["sparse.segment_sqrt_n", "sparse_segment_sqrt_n"])
+@tf_export("sparse.segment_mean", v1=[])
+def sparse_segment_mean_v2(data,
+                           indices,
+                           segment_ids,
+                           num_segments=None,
+                           name=None):
+  r"""Computes the mean along sparse segments of a tensor.
+
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
+
+  Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  return sparse_segment_mean(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
+@tf_export(v1=["sparse.segment_sqrt_n", "sparse_segment_sqrt_n"])
 @deprecation.deprecated_endpoints("sparse_segment_sqrt_n")
 def sparse_segment_sqrt_n(data,
                           indices,
@@ -2833,6 +3189,35 @@ def sparse_segment_sqrt_n(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
+@tf_export("sparse.segment_sqrt_n", v1=[])
+def sparse_segment_sqrt_n_v2(data,
+                             indices,
+                             segment_ids,
+                             num_segments=None,
+                             name=None):
+  r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
+
+  `N` is the size of the segment being reduced.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  return sparse_segment_sqrt_n(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
 @tf_export("tensordot", "linalg.tensordot")
 def tensordot(a, b, axes, name=None):
   r"""Tensor contraction of a and b along specified axes.
@@ -2866,12 +3251,11 @@ def tensordot(a, b, axes, name=None):
     a: `Tensor` of type `float32` or `float64`.
     b: `Tensor` with the same type as `a`.
     axes: Either a scalar `N`, or a list or an `int32` `Tensor` of shape [2, k].
-     If axes is a scalar, sum over the last N axes of a and the first N axes
-     of b in order.
-     If axes is a list or `Tensor` the first and second row contain the set of
-     unique integers specifying axes along which the contraction is computed,
-     for `a` and `b`, respectively. The number of axes for `a` and `b` must
-     be equal.
+      If axes is a scalar, sum over the last N axes of a and the first N axes of
+      b in order. If axes is a list or `Tensor` the first and second row contain
+      the set of unique integers specifying axes along which the contraction is
+      computed, for `a` and `b`, respectively. The number of axes for `a` and
+      `b` must be equal.
     name: A name for the operation (optional).
 
   Returns:
@@ -3103,13 +3487,3 @@ def bessel_i1e(x, name=None):
           indices=x.indices, values=x_i1e, dense_shape=x.dense_shape)
     else:
       return gen_math_ops.bessel_i1e(x, name=name)
-
-
-# FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
-# 1.0 API so we leave these here for backwards compatibility.
-fft = gen_spectral_ops.fft
-ifft = gen_spectral_ops.ifft
-fft2d = gen_spectral_ops.fft2d
-ifft2d = gen_spectral_ops.ifft2d
-fft3d = gen_spectral_ops.fft3d
-ifft3d = gen_spectral_ops.ifft3d
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 10b87b3fccf440da09679f6d4916da6f2754f710..e0329f66ff34bc4dfaf55307eeb50e7ad8604e54 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -69,6 +69,26 @@ class ReduceTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "must be at most rank 1"):
       math_ops.reduce_sum(x, axis)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testReduceVar(self):
+    x = np.array([[0, 0, 0], [0, 0, 0]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_variance(x)), 0)
+    self.assertAllClose(
+        self.evaluate(math_ops.reduce_variance(x, axis=0)), [0, 0, 0])
+
+    x = np.array([[0, 2, 1, 1], [1, 2, 0, 1]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_variance(x)), 0.5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testReduceStd(self):
+    x = np.array([[0, 0, 0], [0, 0, 0]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_std(x)), 0)
+    self.assertAllClose(
+        self.evaluate(math_ops.reduce_std(x, axis=0)), [0, 0, 0])
+
+    x = np.array([[1, 2, 1, 1], [1, 1, 0, 1]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_std(x)), 0.5)
+
 
 class LogSumExpTest(test_util.TensorFlowTestCase):
 
@@ -87,7 +107,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         y_tf = math_ops.reduce_logsumexp(x_np, reduction_indices=[0])
         y_np = log(np.sum(exp(x_np), axis=0))
         self.assertShapeEqual(y_np, y_tf)
-        y_tf_np = y_tf.eval()
+        y_tf_np = self.evaluate(y_tf)
         self.assertAllClose(y_tf_np, y_np)
 
   def testReductionIndices2(self):
@@ -97,7 +117,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         y_tf = math_ops.reduce_logsumexp(x_np, reduction_indices=0)
         y_np = log(np.sum(exp(x_np), axis=0))
         self.assertShapeEqual(y_np, y_tf)
-        y_tf_np = y_tf.eval()
+        y_tf_np = self.evaluate(y_tf)
         self.assertAllClose(y_tf_np, y_np)
 
   def testKeepDims(self):
@@ -175,7 +195,7 @@ class ModTest(test_util.TensorFlowTestCase):
         with self.cached_session(use_gpu=True):
           x_tf = constant_op.constant(x_np, shape=x_np.shape)
           y_tf = math_ops.mod(x_tf, denom)
-          y_tf_np = y_tf.eval()
+          y_tf_np = self.evaluate(y_tf)
           y_np = np.fmod(x_np, denom)
         self.assertAllClose(y_tf_np, y_np, atol=1e-2)
 
@@ -188,7 +208,7 @@ class ModTest(test_util.TensorFlowTestCase):
         with self.cached_session(use_gpu=True):
           x_tf = constant_op.constant(x_np, shape=x_np.shape)
           y_tf = math_ops.mod(x_tf, denom)
-          y_tf_np = y_tf.eval()
+          y_tf_np = self.evaluate(y_tf)
           y_np = np.mod(x_np, denom)
         self.assertAllClose(y_tf_np, y_np)
 
@@ -353,7 +373,7 @@ class AddNTest(test_util.TensorFlowTestCase):
             for i in range(0, num_inputs)
         ]
         addn = math_ops.add_n(input_vars)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         add_n_grad = gradients.gradients(addn, input_vars)
         self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
                             [g.eval() for g in add_n_grad])
@@ -441,14 +461,15 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       a = variables.Variable(2.)
       b = variables.Variable(4.)
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         c_grad = gradients.gradients(math_ops.divide(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
         c_grad = gradients.gradients(math_ops.div(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
         c_grad = gradients.gradients(math_ops.floordiv(a, b), [a, b])
-        self.assertAllEqual([None if x is None else x.eval()
-                             for x in c_grad], [None, None])
+        self.assertAllEqual(
+            [None if x is None else self.evaluate(x) for x in c_grad],
+            [None, None])
 
   def testConsistent(self):
     nums, divs = self.intTestData()
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index d0919bdbe46d204be9ca75250b6f2a756b767312..0b91b8dde8eb15d49e85cf021e916f19957f3e26 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -213,26 +212,6 @@ def _maybe_expand_labels(labels, predictions):
         lambda: array_ops.expand_dims(labels, -1, name=scope), lambda: labels)
 
 
-def _safe_div(numerator, denominator, name):
-  """Divides two tensors element-wise, returning 0 if the denominator is <= 0.
-
-  Args:
-    numerator: A real `Tensor`.
-    denominator: A real `Tensor`, with dtype matching `numerator`.
-    name: Name for the returned op.
-
-  Returns:
-    0 if `denominator` <= 0, else `numerator` / `denominator`
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator)
-  t = math_ops.truediv(numerator, denominator)
-  zero = array_ops.zeros_like(t, dtype=denominator.dtype)
-  condition = math_ops.greater(denominator, zero)
-  zero = math_ops.cast(zero, t.dtype)
-  return array_ops.where(condition, t, zero, name=name)
-
-
 def _safe_scalar_div(numerator, denominator, name):
   """Divides two values, returning 0 if the denominator is 0.
 
@@ -246,7 +225,7 @@ def _safe_scalar_div(numerator, denominator, name):
   """
   numerator.get_shape().with_rank_at_most(1)
   denominator.get_shape().with_rank_at_most(1)
-  return _safe_div(numerator, denominator, name=name)
+  return math_ops.div_no_nan(numerator, denominator, name=name)
 
 
 def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
@@ -302,7 +281,7 @@ def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
   """Aggregate metric value across replicas."""
   def fn(distribution, *a):
     """Call `metric_value_fn` in the correct control flow context."""
-    if hasattr(distribution, '_outer_control_flow_context'):
+    if hasattr(distribution.extended, '_outer_control_flow_context'):
       # If there was an outer context captured before this method was called,
       # then we enter that context to create the metric value op. If the
       # caputred context is `None`, ops.control_dependencies(None) gives the
@@ -315,13 +294,13 @@ def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
       # once the update ops have been evaluted.
 
       # pylint: disable=protected-access
-      if distribution._outer_control_flow_context is None:
+      if distribution.extended._outer_control_flow_context is None:
         with ops.control_dependencies(None):
           metric_value = metric_value_fn(distribution, *a)
       else:
-        distribution._outer_control_flow_context.Enter()
+        distribution.extended._outer_control_flow_context.Enter()
         metric_value = metric_value_fn(distribution, *a)
-        distribution._outer_control_flow_context.Exit()
+        distribution.extended._outer_control_flow_context.Exit()
         # pylint: enable=protected-access
     else:
       metric_value = metric_value_fn(distribution, *a)
@@ -330,10 +309,10 @@ def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
     return metric_value
 
   return distribution_strategy_context.get_replica_context().merge_call(
-      fn, *args)
+      fn, args=args)
 
 
-@tf_export('metrics.mean')
+@tf_export(v1=['metrics.mean'])
 def mean(values,
          weights=None,
          metrics_collections=None,
@@ -401,13 +380,12 @@ def mean(values,
       update_count_op = state_ops.assign_add(count, num_values)
 
     def compute_mean(_, t, c):
-      return _safe_div(t, math_ops.maximum(c, 0), name='value')
+      return math_ops.div_no_nan(t, math_ops.maximum(c, 0), name='value')
 
     mean_t = _aggregate_across_replicas(
         metrics_collections, compute_mean, total, count)
-    update_op = _safe_div(update_total_op,
-                          math_ops.maximum(update_count_op, 0),
-                          name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_total_op, math_ops.maximum(update_count_op, 0), name='update_op')
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -415,7 +393,7 @@ def mean(values,
     return mean_t, update_op
 
 
-@tf_export('metrics.accuracy')
+@tf_export(v1=['metrics.accuracy'])
 def accuracy(labels,
              predictions,
              weights=None,
@@ -779,19 +757,19 @@ def auc(labels,
       """
       dtp = tp[:num_thresholds - 1] - tp[1:]
       p = tp + fp
-      prec_slope = _safe_div(
+      prec_slope = math_ops.div_no_nan(
           dtp,
           math_ops.maximum(p[:num_thresholds - 1] - p[1:], 0),
           name='prec_slope')
       intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
       safe_p_ratio = array_ops.where(
           math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
-          _safe_div(p[:num_thresholds - 1],
-                    math_ops.maximum(p[1:], 0),
-                    name='recall_relative_ratio'),
-          array_ops.ones_like(p[1:]))
+          math_ops.div_no_nan(
+              p[:num_thresholds - 1],
+              math_ops.maximum(p[1:], 0),
+              name='recall_relative_ratio'), array_ops.ones_like(p[1:]))
       return math_ops.reduce_sum(
-          _safe_div(
+          math_ops.div_no_nan(
               prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
               math_ops.maximum(tp[1:] + fn[1:], 0),
               name='pr_auc_increment'),
@@ -1074,7 +1052,7 @@ def mean_per_class_accuracy(labels,
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
 
     def compute_mean_accuracy(_, count, total):
-      per_class_accuracy = _safe_div(
+      per_class_accuracy = math_ops.div_no_nan(
           count, math_ops.maximum(total, 0), name=None)
       mean_accuracy_v = math_ops.reduce_mean(
           per_class_accuracy, name='mean_accuracy')
@@ -1083,9 +1061,8 @@ def mean_per_class_accuracy(labels,
     mean_accuracy_v = _aggregate_across_replicas(
         metrics_collections, compute_mean_accuracy, count, total)
 
-    update_op = _safe_div(update_count_op,
-                          math_ops.maximum(update_total_op, 0),
-                          name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_count_op, math_ops.maximum(update_total_op, 0), name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -1394,15 +1371,14 @@ def mean_tensor(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    compute_mean = lambda _, t, c: _safe_div(
+    compute_mean = lambda _, t, c: math_ops.div_no_nan(  # pylint: disable=g-long-lambda
         t, math_ops.maximum(c, 0), name='value')
 
     mean_t = _aggregate_across_replicas(
         metrics_collections, compute_mean, total, count)
 
-    update_op = _safe_div(update_total_op,
-                          math_ops.maximum(update_count_op, 0),
-                          name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_total_op, math_ops.maximum(update_count_op, 0), name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index c8a5b58e4584e6deeb33380b53c02be564989206..b50bccfde229537a1a3f6d3265f8d6733db24284 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -507,8 +507,8 @@ class MomentsTest(test.TestCase):
       expected_variance = expected_x_squared - expected_mean_squared
 
       # Check that the moments are correct.
-      self.assertAllCloseAccordingToType(expected_mean, mean.eval())
-      self.assertAllCloseAccordingToType(expected_variance, var.eval())
+      self.assertAllCloseAccordingToType(expected_mean, self.evaluate(mean))
+      self.assertAllCloseAccordingToType(expected_variance, self.evaluate(var))
 
   def testBasic(self):
     for keep_dims in [False, True]:
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 5ac8eba6f7345af79fda2a68dad3e289ba5df5e9..552b274b833e4a3a5423eae99046d18c12c522ee 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -50,7 +50,7 @@ class BatchNormalizationTest(test.TestCase):
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
-    return y.eval()
+    return self.evaluate(y)
 
   def _test_inference(self,
                       x_shape,
@@ -82,7 +82,7 @@ class BatchNormalizationTest(test.TestCase):
           epsilon=epsilon,
           data_format=data_format,
           is_training=False)
-      y_val = sess.run(y)
+      y_val = self.evaluate(y)
       y_ref = self._inference_ref(x, scale, offset, mean, var, epsilon,
                                   data_format)
     # An atol value of 1e-3 is too small for float16's, because some adjacent
@@ -102,7 +102,7 @@ class BatchNormalizationTest(test.TestCase):
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
-    return y.eval(), mean.eval(), var.eval()
+    return self.evaluate(y), self.evaluate(mean), self.evaluate(var)
 
   def _test_training(self,
                      x_shape,
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 902653befc44eb9b2a8c0df9d5ce69a7a0138fed..34404edc9a1250710d4cd7a50e04ad8d187a5d7f 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 
@@ -948,10 +948,14 @@ def _FusedBatchNormGradGrad(op, *grad):
   grad_grad_x = grad[0]
   grad_grad_scale = grad[1]
   grad_grad_offset = grad[2]
-  grad_x, grad_scale, grad_offset = _BatchNormGrad(
-      grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
-  grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
-  grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
+  with backprop.GradientTape() as tape:
+    tape.watch(grad_y)
+    tape.watch(x)
+    tape.watch(scale)
+    grad_x, grad_scale, grad_offset = _BatchNormGrad(
+        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+    grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
+  grad_grad_y, grad_x, grad_scale = tape.gradient(
       [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
   return grad_grad_y, grad_x, grad_scale, None, None
 
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index ef763a4b6147fdf58d4eb9f6b55d789d74aac086..9cf53f191a317f795ea752316304e7d792740022 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -424,7 +424,7 @@ def zero_fraction(value, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("nn.depthwise_conv2d")
+@tf_export(v1=["nn.depthwise_conv2d"])
 def depthwise_conv2d(input,
                      filter,
                      strides,
@@ -497,11 +497,68 @@ def depthwise_conv2d(input,
         op=op)
 
 
+@tf_export("nn.depthwise_conv2d", v1=[])
+def depthwise_conv2d_v2(input,
+                        filter,
+                        strides,
+                        padding,
+                        data_format=None,
+                        dilations=None,
+                        name=None):
+  """Depthwise 2-D convolution.
+
+  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
+  and a filter tensor of shape
+  `[filter_height, filter_width, in_channels, channel_multiplier]`
+  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
+  applies a different filter to each input channel (expanding from 1 channel
+  to `channel_multiplier` channels for each), then concatenates the results
+  together.  The output has `in_channels * channel_multiplier` channels.
+
+  In detail,
+
+      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
+           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
+                                           strides[2] * j + rate[1] * dj, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
+  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+  If any value in `rate` is greater than 1, we perform atrous depthwise
+  convolution, in which case all values in the `strides` tensor must be equal
+  to 1.
+
+  Args:
+    input: 4-D with shape according to `data_format`.
+    filter: 4-D with shape
+      `[filter_height, filter_width, in_channels, channel_multiplier]`.
+    strides: 1-D of size 4.  The stride of the sliding window for each
+      dimension of `input`.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: 1-D of size 2. The dilation rate in which we sample input values
+      across the `height` and `width` dimensions in atrous convolution. If it is
+      greater than 1, then all values of strides must be 1.
+    name: A name for this operation (optional).
+
+  Returns:
+    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
+    "NHWC" format, shape is
+    `[batch, out_height, out_width, in_channels * channel_multiplier].`
+  """
+  return depthwise_conv2d(input=input,
+                          filter=filter,
+                          strides=strides,
+                          padding=padding,
+                          rate=dilations,
+                          name=name,
+                          data_format=data_format)
+
 # pylint: enable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin,line-too-long
-@tf_export("nn.separable_conv2d")
+@tf_export(v1=["nn.separable_conv2d"])
 def separable_conv2d(input,
                      depthwise_filter,
                      pointwise_filter,
@@ -599,10 +656,76 @@ def separable_conv2d(input,
         name=name)
 
 
+@tf_export("nn.separable_conv2d", v1=[])
+def separable_conv2d_v2(
+    input,
+    depthwise_filter,
+    pointwise_filter,
+    strides,
+    padding,
+    data_format=None,
+    dilations=None,
+    name=None,
+):
+  """2-D convolution with separable filters.
+
+  Performs a depthwise convolution that acts separately on channels followed by
+  a pointwise convolution that mixes channels.  Note that this is separability
+  between dimensions `[1, 2]` and `3`, not spatial separability between
+  dimensions `1` and `2`.
+
+  In detail,
+
+      output[b, i, j, k] = sum_{di, dj, q, r}
+          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+          depthwise_filter[di, dj, q, r] *
+          pointwise_filter[0, 0, q * channel_multiplier + r, k]
+
+  `strides` controls the strides for the depthwise convolution only, since
+  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
+  `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+  If any value in `rate` is greater than 1, we perform atrous depthwise
+  convolution, in which case all values in the `strides` tensor must be equal
+  to 1.
+
+  Args:
+    input: 4-D `Tensor` with shape according to `data_format`.
+    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
+      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
+      filters of depth 1.
+    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
+      in_channels, out_channels]`.  Pointwise filter to mix channels after
+      `depthwise_filter` has convolved spatially.
+    strides: 1-D of size 4.  The strides for the depthwise convolution for each
+      dimension of `input`.
+    padding: A string, either `'VALID'` or `'SAME'`.  The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: 1-D of size 2. The dilation rate in which we sample input values
+      across the `height` and `width` dimensions in atrous convolution. If it is
+      greater than 1, then all values of strides must be 1.
+    name: A name for this operation (optional).
+
+  Returns:
+    A 4-D `Tensor` with shape according to 'data_format'. For
+      example, with data_format="NHWC", shape is [batch, out_height,
+      out_width, out_channels].
+  """
+  return separable_conv2d(
+      input,
+      depthwise_filter,
+      pointwise_filter,
+      strides,
+      padding,
+      rate=dilations,
+      name=name,
+      data_format=data_format)
+
 # pylint: enable=redefined-builtin,line-too-long
 
 
-@tf_export("nn.sufficient_statistics")
+@tf_export(v1=["nn.sufficient_statistics"])
 def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.
 
@@ -652,6 +775,35 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   return counts, m_ss, v_ss, shift
 
 
+@tf_export("nn.sufficient_statistics", v1=[])
+def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None):
+  """Calculate the sufficient statistics for the mean and variance of `x`.
+
+  These sufficient statistics are computed using the one pass algorithm on
+  an input that's optionally shifted. See:
+  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
+
+  Args:
+    x: A `Tensor`.
+    axes: Array of ints. Axes along which to compute mean and variance.
+    shift: A `Tensor` containing the value by which to shift the data for
+      numerical stability, or `None` if no shift is to be performed. A shift
+      close to the true mean provides the most numerically stable results.
+    keepdims: produce statistics with the same dimensionality as the input.
+    name: Name used to scope the operations that compute the sufficient stats.
+
+  Returns:
+    Four `Tensor` objects of the same type as `x`:
+
+    * the count (number of elements to average over).
+    * the (possibly shifted) sum of the elements in the array.
+    * the (possibly shifted) sum of squares of the elements in the array.
+    * the shift by which the mean must be corrected or None if `shift` is None.
+  """
+  return sufficient_statistics(
+      x=x, axes=axes, shift=shift, keep_dims=keepdims, name=name)
+
+
 @tf_export("nn.normalize_moments")
 def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance of based on the sufficient statistics.
@@ -743,7 +895,7 @@ def moments(
       return (mean, variance)
 
 
-@tf_export("nn.weighted_moments")
+@tf_export(v1=["nn.weighted_moments"])
 def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
   """Returns the frequency-weighted mean and variance of `x`.
 
@@ -815,6 +967,30 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     return weighted_mean, weighted_variance
 
 
+@tf_export("nn.weighted_moments", v1=[])
+def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None):
+  """Returns the frequency-weighted mean and variance of `x`.
+
+  Args:
+    x: A tensor.
+    axes: 1-d tensor of int32 values; these are the axes along which
+      to compute mean and variance.
+    frequency_weights: A tensor of positive weights which can be
+      broadcast with x.
+    keepdims: Produce moments with the same dimensionality as the input.
+    name: Name used to scope the operation.
+
+  Returns:
+    Two tensors: `weighted_mean` and `weighted_variance`.
+  """
+  return weighted_moments(
+      x=x,
+      axes=axes,
+      frequency_weights=frequency_weights,
+      name=name,
+      keep_dims=keepdims)
+
+
 @tf_export("nn.batch_normalization")
 def batch_normalization(x,
                         mean,
@@ -946,7 +1122,7 @@ def fused_batch_norm(
   return y, batch_mean, batch_var
 
 
-@tf_export("nn.batch_norm_with_global_normalization")
+@tf_export(v1=["nn.batch_norm_with_global_normalization"])
 def batch_norm_with_global_normalization(t,
                                          m,
                                          v,
@@ -984,6 +1160,53 @@ def batch_norm_with_global_normalization(t,
                              else None, variance_epsilon, name)
 
 
+# pylint: disable=redefined-builtin,line-too-long
+@tf_export("nn.batch_norm_with_global_normalization", v1=[])
+def batch_norm_with_global_normalization_v2(input,
+                                            mean,
+                                            variance,
+                                            beta,
+                                            gamma,
+                                            variance_epsilon,
+                                            scale_after_normalization,
+                                            name=None):
+  """Batch normalization.
+
+  This op is deprecated. See `tf.nn.batch_normalization`.
+
+  Args:
+    input: A 4D input Tensor.
+    mean: A 1D mean Tensor with size matching the last dimension of t.
+      This is the first output from tf.nn.moments,
+      or a saved moving average thereof.
+    variance: A 1D variance Tensor with size matching the last dimension of t.
+      This is the second output from tf.nn.moments,
+      or a saved moving average thereof.
+    beta: A 1D beta Tensor with size matching the last dimension of t.
+      An offset to be added to the normalized tensor.
+    gamma: A 1D gamma Tensor with size matching the last dimension of t.
+      If "scale_after_normalization" is true, this tensor will be multiplied
+      with the normalized tensor.
+    variance_epsilon: A small float number to avoid dividing by 0.
+    scale_after_normalization: A bool indicating whether the resulted tensor
+      needs to be multiplied with gamma.
+    name: A name for this operation (optional).
+
+  Returns:
+     A batch-normalized `t`.
+  """
+  return batch_norm_with_global_normalization(t=input,
+                                              m=mean,
+                                              v=variance,
+                                              beta=beta,
+                                              gamma=gamma,
+                                              variance_epsilon=variance_epsilon,
+                                              scale_after_normalization=scale_after_normalization,
+                                              name=name)
+
+# pylint: enable=redefined-builtin,line-too-long
+
+
 def _sum_rows(x):
   """Returns a vector summing up each row of the matrix x."""
   # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index e31d162285beb8293e50064d68792545cedb005e..755c8ffcd25ad069b7e8ed67a82a7f5c4406a5fc 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -643,7 +645,7 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   return strides, dilation_rate
 
 
-@tf_export("nn.convolution")
+@tf_export(v1=["nn.convolution"])
 def convolution(
     input,  # pylint: disable=redefined-builtin
     filter,  # pylint: disable=redefined-builtin
@@ -781,6 +783,30 @@ def convolution(
     return op(input, filter)
 
 
+@tf_export("nn.convolution", v1=[])
+def convolution_v2(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  return convolution(
+      input,  # pylint: disable=redefined-builtin
+      filters,
+      padding=padding,
+      strides=strides,
+      dilation_rate=dilations,
+      name=name,
+      data_format=data_format)
+
+convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        convolution.__doc__, "dilation_rate", "dilations"),
+    "filter", "filters")
+
+
 class Convolution(object):
   """Helper class for convolution.
 
@@ -872,7 +898,7 @@ class Convolution(object):
     return self.conv_op(inp, filter)
 
 
-@tf_export("nn.pool")
+@tf_export(v1=["nn.pool"])
 def pool(
     input,  # pylint: disable=redefined-builtin
     window_shape,
@@ -1043,6 +1069,105 @@ def pool(
         filter_shape=window_shape)
 
 
+@tf_export("nn.pool", v1=[])
+def pool_v2(
+    input,  # pylint: disable=redefined-builtin
+    window_shape,
+    pooling_type,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  # pylint: disable=line-too-long
+  """Performs an N-D pooling operation.
+
+  In the case that `data_format` does not start with "NC", computes for
+      0 <= b < batch_size,
+      0 <= x[i] < output_spatial_shape[i],
+      0 <= c < num_channels:
+
+  ```
+    output[b, x[0], ..., x[N-1], c] =
+      REDUCE_{z[0], ..., z[N-1]}
+        input[b,
+              x[0] * strides[0] - pad_before[0] + dilation_rate[0]*z[0],
+              ...
+              x[N-1]*strides[N-1] - pad_before[N-1] + dilation_rate[N-1]*z[N-1],
+              c],
+  ```
+
+  where the reduction function REDUCE depends on the value of `pooling_type`,
+  and pad_before is defined based on the value of `padding` as described in
+  the "returns" section of `tf.nn.convolution` for details.
+  The reduction never includes out-of-bounds positions.
+
+  In the case that `data_format` starts with `"NC"`, the `input` and output are
+  simply transposed as follows:
+
+  ```
+    pool(input, data_format, **kwargs) =
+      tf.transpose(pool(tf.transpose(input, [0] + range(2,N+2) + [1]),
+                        **kwargs),
+                   [0, N+1] + range(1, N+1))
+  ```
+
+  Args:
+    input: Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if data_format does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC".  Pooling happens over the spatial dimensions only.
+    window_shape: Sequence of N ints >= 1.
+    pooling_type: Specifies pooling operation, must be "AVG" or "MAX".
+    strides: Optional. Sequence of N ints >= 1.  Defaults to [1]*N. If any value of
+      strides is > 1, then all values of dilation_rate must be 1.
+    padding: The padding algorithm, must be "SAME" or "VALID". Defaults to "SAME".
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW". For
+      N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: Optional.  Dilation rate.  List of N ints >= 1. Defaults to
+      [1]*N.  If any value of dilation_rate is > 1, then all values of strides
+      must be 1.
+    name: Optional. Name of the op.
+
+  Returns:
+    Tensor of rank N+2, of shape
+      [batch_size] + output_spatial_shape + [num_channels]
+
+    if data_format is None or does not start with "NC", or
+
+      [batch_size, num_channels] + output_spatial_shape
+
+    if data_format starts with "NC",
+    where `output_spatial_shape` depends on the value of padding:
+
+    If padding = "SAME":
+      output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
+
+    If padding = "VALID":
+      output_spatial_shape[i] =
+        ceil((input_spatial_shape[i] - (window_shape[i] - 1) * dilation_rate[i])
+             / strides[i]).
+
+  Raises:
+    ValueError: if arguments are invalid.
+
+  """
+  return pool(
+      input=input,
+      window_shape=window_shape,
+      pooling_type=pooling_type,
+      padding=padding,
+      dilation_rate=dilations,
+      strides=strides,
+      name=name,
+      data_format=data_format)
+
+
 @tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
   """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
@@ -1180,7 +1305,208 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
-@tf_export("nn.conv2d_transpose")
+@tf_export("nn.conv2d", v1=[])
+def conv2d_v2(input,  # pylint: disable=redefined-builtin
+              filters,
+              strides,
+              padding,
+              data_format="NHWC",
+              dilations=None,
+              name=None):
+  # pylint: disable=line-too-long
+  r"""Computes a 2-D convolution given 4-D `input` and `filters` tensors.
+
+  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+  and a filter / kernel tensor of shape
+  `[filter_height, filter_width, in_channels, out_channels]`, this op
+  performs the following:
+
+  1. Flattens the filter to a 2-D matrix with shape
+     `[filter_height * filter_width * in_channels, output_channels]`.
+  2. Extracts image patches from the input tensor to form a *virtual*
+     tensor of shape `[batch, out_height, out_width,
+     filter_height * filter_width * in_channels]`.
+  3. For each patch, right-multiplies the filter matrix and the image patch
+     vector.
+
+  In detail, with the default NHWC format,
+
+      output[b, i, j, k] =
+          sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+                          filter[di, dj, q, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      A 4-D tensor. The dimension order is interpreted according to the value
+      of `data_format`, see below for details.
+    filters: A `Tensor`. Must have the same type as `input`.
+      A 4-D tensor of shape
+      `[filter_height, filter_width, in_channels, out_channels]`
+    strides: A list of `ints`.
+      1-D tensor of length 4.  The stride of the sliding window for each
+      dimension of `input`. The dimension order is determined by the value of
+      `data_format`, see below for details.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by the
+      value of `data_format`, see above for details. Dilations in the batch and
+      depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  # pylint: enable=line-too-long
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
+                           filters,
+                           strides,
+                           padding,
+                           use_cudnn_on_gpu=True,
+                           data_format=data_format,
+                           dilations=dilations,
+                           name=name)
+tf_export(v1=["nn.conv2d"])(gen_nn_ops.conv2d)
+
+
+@tf_export("nn.conv2d_backprop_filter", v1=[])
+def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
+                              filter_sizes,
+                              out_backprop,
+                              strides,
+                              padding,
+                              data_format="NHWC",
+                              dilations=None,
+                              name=None):
+  r"""Computes the gradients of convolution with respect to the filter.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape `[batch, in_height, in_width, in_channels]`.
+    filter_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the tensor shape of `filter`,
+      where `filter` is a 4-D
+      `[filter_height, filter_width, in_channels, out_channels]` tensor.
+    out_backprop: A `Tensor`. Must have the same type as `input`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
+                                           filter_sizes,
+                                           out_backprop,
+                                           strides,
+                                           padding,
+                                           use_cudnn_on_gpu=True,
+                                           data_format=data_format,
+                                           dilations=dilations,
+                                           name=name)
+tf_export(v1=["nn.conv2d_backprop_filter"])(
+    gen_nn_ops.conv2d_backprop_filter)
+
+
+@tf_export("nn.conv2d_backprop_input", v1=[])
+def conv2d_backprop_input_v2(input_sizes,
+                             filters,
+                             out_backprop,
+                             strides,
+                             padding,
+                             data_format="NHWC",
+                             dilations=None,
+                             name=None):
+  r"""Computes the gradients of convolution with respect to the input.
+
+  Args:
+    input_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the shape of `input`,
+      where `input` is a 4-D `[batch, height, width, channels]` tensor.
+    filters: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape
+      `[filter_height, filter_width, in_channels, out_channels]`.
+    out_backprop: A `Tensor`. Must have the same type as `filters`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `filters`.
+  """
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d_backprop_input(input_sizes,
+                                          filters,
+                                          out_backprop,
+                                          strides,
+                                          padding,
+                                          use_cudnn_on_gpu=True,
+                                          data_format=data_format,
+                                          dilations=dilations,
+                                          name=name)
+tf_export(v1=["nn.conv2d_backprop_input"])(
+    gen_nn_ops.conv2d_backprop_input)
+
+
+@tf_export(v1=["nn.conv2d_transpose"])
 def conv2d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -1260,6 +1586,31 @@ def conv2d_transpose(
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.conv2d_transpose", v1=[])
+def conv2d_transpose_v2(
+    input,
+    filters,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NHWC",
+    name=None):
+  return conv2d_transpose(
+      input,
+      filters,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+conv2d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        conv2d_transpose.__doc__, "filter", "filters"),
+    "value", "input")
+
+
 @tf_export("nn.atrous_conv2d_transpose")
 def atrous_conv2d_transpose(value,
                             filters,
@@ -1408,7 +1759,29 @@ def atrous_conv2d_transpose(value,
         input=value, crops=batch_to_space_crop, block_size=rate)
 
 
-@tf_export("nn.conv3d_transpose")
+@tf_export("nn.conv3d", v1=[])
+def conv3d_v2(input,  # pylint: disable=redefined-builtin,missing-docstring
+              filters,
+              strides,
+              padding,
+              data_format="NDHWC",
+              dilations=None,
+              name=None):
+  if dilations is None:
+    dilations = [1, 1, 1, 1, 1]
+  return gen_nn_ops.conv3d(input,  # pylint: disable=redefined-builtin
+                           filters,
+                           strides,
+                           padding,
+                           data_format=data_format,
+                           dilations=dilations,
+                           name=name)
+tf_export(v1=["nn.conv3d"])(gen_nn_ops.conv3d)
+conv3d_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    gen_nn_ops.conv3d.__doc__, "filter", "filters")
+
+
+@tf_export(v1=["nn.conv3d_transpose"])
 def conv3d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -1486,6 +1859,31 @@ def conv3d_transpose(
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.conv3d_transpose", v1=[])
+def conv3d_transpose_v2(
+    input,
+    filters,
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NDHWC",
+    name=None):
+  return conv3d_transpose(
+      input,
+      filters,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+conv3d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        conv3d_transpose.__doc__, "filter", "filters"),
+    "value", "input")
+
+
 @tf_export("nn.bias_add")
 def bias_add(value, bias, data_format=None, name=None):
   """Adds `bias` to `value`.
@@ -1541,7 +1939,7 @@ def bias_add_v1(value, bias, name=None):
     return gen_nn_ops.bias_add_v1(value, bias, name=name)
 
 
-@tf_export("nn.crelu")
+@tf_export(v1=["nn.crelu"])
 def crelu(features, name=None, axis=-1):
   """Computes Concatenated ReLU.
 
@@ -1567,6 +1965,12 @@ def crelu(features, name=None, axis=-1):
     return gen_nn_ops.relu(c)
 
 
+@tf_export("nn.crelu", v1=[])
+def crelu_v2(features, axis=-1, name=None):
+  return crelu(features, name=name, axis=axis)
+crelu_v2.__doc__ = crelu.__doc__
+
+
 @tf_export("nn.relu6")
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
@@ -1609,6 +2013,8 @@ def leaky_relu(features, alpha=0.2, name=None):
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
     if compat.forward_compatible(2018, 11, 1):
+      if isinstance(alpha, np.ndarray):
+        alpha = np.asscalar(alpha)
       return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features, name=name)
@@ -1682,6 +2088,16 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   if is_last_dim:
     return compute_op(logits, name=name)
 
+  dim_val = dim
+  if isinstance(dim, ops.Tensor):
+    dim_val = tensor_util.constant_value(dim)
+  if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims):
+    raise errors_impl.InvalidArgumentError(
+        None, None,
+        "Dimension (%d) must be in the range [%d, %d) where %d is the number of"
+        " dimensions in the input." % (dim_val, -shape.ndims, shape.ndims,
+                                       shape.ndims))
+
   # If dim is not the last dimension, we have to do a transpose so that we can
   # still perform softmax on its last dimension.
 
@@ -1702,7 +2118,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
-@tf_export("nn.softmax", "math.softmax")
+@tf_export(v1=["nn.softmax", "math.softmax"])
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
@@ -1732,7 +2148,32 @@ def softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.softmax, axis, name)
 
 
-@tf_export("nn.log_softmax", "math.log_softmax")
+@tf_export("nn.softmax", "math.softmax", v1=[])
+def softmax_v2(logits, axis=None, name=None):
+  """Computes softmax activations.
+
+  This function performs the equivalent of
+
+      softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
+
+  Args:
+    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    axis: The dimension softmax would be performed on. The default is -1 which
+      indicates the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type and shape as `logits`.
+
+  Raises:
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
+      dimension of `logits`.
+  """
+  return _softmax(logits, gen_nn_ops.softmax, axis, name)
+
+
+@tf_export(v1=["nn.log_softmax", "math.log_softmax"])
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
@@ -1762,6 +2203,31 @@ def log_softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
 
 
+@tf_export("nn.log_softmax", "math.log_softmax", v1=[])
+def log_softmax_v2(logits, axis=None, name=None):
+  """Computes log softmax activations.
+
+  For each batch `i` and class `j` we have
+
+      logsoftmax = logits - log(reduce_sum(exp(logits), axis))
+
+  Args:
+    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    axis: The dimension softmax would be performed on. The default is -1 which
+      indicates the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `logits`. Same shape as `logits`.
+
+  Raises:
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
+      dimension of `logits`.
+  """
+  return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
+
+
 def _ensure_xent_args(name, sentinel, labels, logits):
   # Make sure that all arguments were passed as named arguments.
   if sentinel is not None:
@@ -2186,7 +2652,7 @@ def _calc_bias_add_flops(graph, node):
   return ops.OpStats("flops", input_count)
 
 
-@tf_export("nn.xw_plus_b")
+@tf_export(v1=["nn.xw_plus_b"])
 def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
   """Computes matmul(x, weights) + biases.
 
@@ -2389,7 +2855,293 @@ def nth_element(input, n, reverse=False, name=None):  # pylint: disable=redefine
   return gen_nn_ops.nth_element(input, n, reverse=reverse, name=name)
 
 
-@tf_export("nn.conv1d")
+@tf_export(v1=["nn.fractional_max_pool"])
+@deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` "
+                        "args are deprecated.  Use fractional_max_pool_v2.")
+def fractional_max_pool(value,
+                        pooling_ratio,
+                        pseudo_random=False,
+                        overlapping=False,
+                        deterministic=False,
+                        seed=0,
+                        seed2=0,
+                        name=None):   # pylint: disable=redefined-builtin
+  r"""Performs fractional max pooling on the input.
+
+  This is a deprecated version of `fractional_max_pool`.
+
+  Fractional max pooling is slightly different than regular max pooling.  In
+  regular max pooling, you downsize an input set by taking the maximum value of
+  smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+  a factor of N, where N is an integer.  Fractional max pooling, as you might
+  expect from the word "fractional", means that the overall reduction ratio N
+  does not have to be an integer.
+
+  The sizes of the pooling regions are generated randomly but are fairly
+  uniform.  For example, let's look at the height dimension, and the constraints
+  on the list of rows that will be pool boundaries.
+
+  First we define the following:
+
+  1.  input_row_length : the number of rows from the input set
+  2.  output_row_length : which will be smaller than the input
+  3.  alpha = input_row_length / output_row_length : our reduction ratio
+  4.  K = floor(alpha)
+  5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+  Then, row_pooling_sequence should satisfy:
+
+  1.  a[0] = 0 : the first value of the sequence is 0
+  2.  a[end] = input_row_length : the last value of the sequence is the size
+  3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+  4.  length(row_pooling_sequence) = output_row_length+1
+
+  For more details on fractional max pooling, see this paper: [Benjamin Graham,
+  Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional max pooling.
+    deterministic: An optional `bool`.  Deprecated; use `fractional_max_pool_v2`
+      instead.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    seed2: An optional `int`.  Deprecated; use `fractional_max_pool_v2` instead.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional max pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                        overlapping, deterministic, seed, seed2,
+                                        name)
+
+
+@tf_export("nn.fractional_max_pool", v1=[])
+def fractional_max_pool_v2(value,
+                           pooling_ratio,
+                           pseudo_random=False,
+                           overlapping=False,
+                           seed=0,
+                           name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional max pooling on the input.
+
+  Fractional max pooling is slightly different than regular max pooling.  In
+  regular max pooling, you downsize an input set by taking the maximum value of
+  smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+  a factor of N, where N is an integer.  Fractional max pooling, as you might
+  expect from the word "fractional", means that the overall reduction ratio N
+  does not have to be an integer.
+
+  The sizes of the pooling regions are generated randomly but are fairly
+  uniform.  For example, let's look at the height dimension, and the constraints
+  on the list of rows that will be pool boundaries.
+
+  First we define the following:
+
+  1.  input_row_length : the number of rows from the input set
+  2.  output_row_length : which will be smaller than the input
+  3.  alpha = input_row_length / output_row_length : our reduction ratio
+  4.  K = floor(alpha)
+  5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+  Then, row_pooling_sequence should satisfy:
+
+  1.  a[0] = 0 : the first value of the sequence is 0
+  2.  a[end] = input_row_length : the last value of the sequence is the size
+  3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+  4.  length(row_pooling_sequence) = output_row_length+1
+
+  For more details on fractional max pooling, see this paper: [Benjamin Graham,
+  Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional max pooling.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional max pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  if seed == 0:
+    return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=False,
+                                          seed=0, seed2=0, name=name)
+  else:
+    seed1, seed2 = random_seed.get_seed(seed)
+    return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=True,
+                                          seed=seed1, seed2=seed2, name=name)
+
+
+@tf_export(v1=["nn.fractional_avg_pool"])
+@deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` "
+                        "args are deprecated.  Use fractional_avg_pool_v2.")
+def fractional_avg_pool(value,
+                        pooling_ratio,
+                        pseudo_random=False,
+                        overlapping=False,
+                        deterministic=False,
+                        seed=0,
+                        seed2=0,
+                        name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional average pooling on the input.
+
+  This is a deprecated version of `fractional_avg_pool`.
+
+  Fractional average pooling is similar to Fractional max pooling in the pooling
+  region generation step. The only difference is that after pooling regions are
+  generated, a mean operation is performed instead of a max operation in each
+  pooling region.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional avg pooling.
+    deterministic: An optional `bool`.  Deprecated; use `fractional_avg_pool_v2`
+      instead.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    seed2: An optional `int`.  Deprecated; use `fractional_avg_pool_v2` instead.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional avg pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                        overlapping, deterministic, seed, seed2,
+                                        name=name)
+
+
+@tf_export("nn.fractional_avg_pool", v1=[])
+def fractional_avg_pool_v2(value,
+                           pooling_ratio,
+                           pseudo_random=False,
+                           overlapping=False,
+                           seed=0,
+                           name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional average pooling on the input.
+
+  Fractional average pooling is similar to Fractional max pooling in the pooling
+  region generation step. The only difference is that after pooling regions are
+  generated, a mean operation is performed instead of a max operation in each
+  pooling region.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional avg pooling.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional avg pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  if seed == 0:
+    return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=False,
+                                          seed=0, seed2=0, name=name)
+  else:
+    seed1, seed2 = random_seed.get_seed(seed)
+    return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=True,
+                                          seed=seed1, seed2=seed2, name=name)
+
+
+@tf_export(v1=["nn.conv1d"])
 @deprecation.deprecated_arg_values(
     None,
     "`NCHW` for data_format is deprecated, use `NCW` instead",
@@ -2474,6 +3226,64 @@ def conv1d(value,
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
+@tf_export("nn.conv1d", v1=[])
+def conv1d_v2(input,  # pylint: disable=redefined-builtin
+              filters,
+              stride,
+              padding,
+              data_format=None,
+              name=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `input`.
+    stride: An `integer`.  The number of entries by which
+      the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
+      to `"NWC"`, the data is stored in the order of
+      [batch, in_width, in_channels].  The `"NCW"` format stores
+      data as [batch, in_channels, in_width].
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  return conv1d(input,  # pylint: disable=redefined-builtin
+                filters,
+                stride,
+                padding,
+                use_cudnn_on_gpu=True,
+                data_format=data_format,
+                name=name)
+
+
 def conv1d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -2680,3 +3490,10 @@ def in_top_k(predictions, targets, k, name=None):
   """
   with ops.name_scope(name, "in_top_k"):
     return gen_nn_ops.in_top_kv2(predictions, targets, k, name=name)
+
+
+tf_export(v1=["nn.quantized_avg_pool"])(gen_nn_ops.quantized_avg_pool)
+tf_export(v1=["nn.quantized_conv2d"])(gen_nn_ops.quantized_conv2d)
+tf_export(v1=["nn.quantized_relu_x"])(gen_nn_ops.quantized_relu_x)
+tf_export(v1=["nn.quantized_max_pool"])(gen_nn_ops.quantized_max_pool)
+
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 312e1f08791c934a5271b76dd65527dcab2b933d..96b9d6fc0da4ff02dab25eea21302306ff687c66 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -53,31 +53,29 @@ class ZeroFractionTest(test_lib.TestCase):
     x_shape = [5, 17]
     x_np = np.random.randint(0, 2, size=x_shape).astype(np.float32)
     y_np = self._ZeroFraction(x_np)
-    with self.cached_session():
-      x_tf = constant_op.constant(x_np)
-      x_tf.set_shape(x_shape)
-      y_tf = nn_impl.zero_fraction(x_tf)
-      y_tf_np = y_tf.eval()
+
+    x_tf = constant_op.constant(x_np)
+    x_tf.set_shape(x_shape)
+    y_tf = nn_impl.zero_fraction(x_tf)
+    y_tf_np = self.evaluate(y_tf)
+
     eps = 1e-8
     self.assertAllClose(y_tf_np, y_np, eps)
 
   def testZeroFractionEmpty(self):
-    with self.cached_session():
-      x = np.zeros(0)
-      y = nn_impl.zero_fraction(x).eval()
-      self.assertTrue(np.isnan(y))
+    x = np.zeros(0)
+    y = self.evaluate(nn_impl.zero_fraction(x))
+    self.assertTrue(np.isnan(y))
 
   def testZeroFraction2_27Zeros(self):
     sparsity = nn_impl.zero_fraction(
         array_ops.zeros([int(2**27 * 1.01)], dtype=dtypes.int8))
-    with self.cached_session():
-      self.assertAllClose(1.0, sparsity.eval())
+    self.assertAllClose(1.0, self.evaluate(sparsity))
 
   def testZeroFraction2_27Ones(self):
     sparsity = nn_impl.zero_fraction(
         array_ops.ones([int(2**27 * 1.01)], dtype=dtypes.int8))
-    with self.cached_session():
-      self.assertAllClose(0.0, sparsity.eval())
+    self.assertAllClose(0.0, self.evaluate(sparsity))
 
   def testUnknownSize(self):
     value = array_ops.placeholder(dtype=dtypes.float32)
@@ -115,7 +113,7 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     arr = np.linspace(0., 1, 12).reshape(3, 4)
     x_neg_axis = nn_ops.softmax(arr, axis=-2)
     y_pos_axis = nn_ops.softmax(arr, axis=0)
-    z_gt_axis = nn_ops.softmax(arr, axis=4)
+    z_gt_axis = nn_ops.softmax(arr, axis=0)
     x_neg_axis_tf = self.evaluate(x_neg_axis)
     y_pos_axis_tf = self.evaluate(y_pos_axis)
     z_gt_axis_tf = self.evaluate(z_gt_axis)
@@ -200,7 +198,7 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     arr = np.linspace(0., 1, 12).reshape(3, 4)
     x_neg_axis = nn_ops.log_softmax(arr, axis=-2)
     y_pos_axis = nn_ops.log_softmax(arr, axis=0)
-    z_gt_axis = nn_ops.log_softmax(arr, axis=4)
+    z_gt_axis = nn_ops.log_softmax(arr, axis=0)
     x_neg_axis_tf = self.evaluate(x_neg_axis)
     y_pos_axis_tf = self.evaluate(y_pos_axis)
     z_gt_axis_tf = self.evaluate(z_gt_axis)
@@ -302,19 +300,18 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob)
-        final_count = 0
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob)
+      final_count = 0
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -330,19 +327,18 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        final_count = 0
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      final_count = 0
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -355,17 +351,15 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          # Verifies that each y column as only one type of activation.
-          for i in xrange(x_dim):
-            sorted_value = np.unique(np.sort(value[i, :]))
-            self.assertEqual(sorted_value.size, 1)
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        # Verifies that each y column as only one type of activation.
+        for i in xrange(x_dim):
+          sorted_value = np.unique(np.sort(value[i, :]))
+          self.assertEqual(sorted_value.size, 1)
 
   def testDropoutPlaceholderKeepProb(self):
     # Runs dropout with 0-1 tensor 10 times, sum the number of ones and validate
@@ -409,20 +403,19 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        # Set noise_shape=[None, 1] which means [x_dim, 1].
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[None, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        final_count = 0
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      # Set noise_shape=[None, 1] which means [x_dim, 1].
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[None, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      final_count = 0
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -563,78 +556,78 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
           initializer=constant_op.constant(biases))
       with self.session(graph=g) as sess:
         variables.global_variables_initializer().run()
-        return sess.run([list(sharded_weights), list(sharded_biases)])
+        return self.evaluate([list(sharded_weights), list(sharded_biases)])
 
   def testShapes(self):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_basic_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
-        self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_basic_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
+      self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
 
   def testBasic(self):
     """Without accidental hit removal or subtract_log_q."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_basic_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_basic_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testAccidentalHitRemoval(self):
     """With accidental hit removal, no subtract_log_q."""
@@ -642,118 +635,118 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     num_classes = 5
     batch_size = 3
     sampled = [1, 0, 2, 3]
-    with self.cached_session():
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, _,
-         _) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=sampled,
-             subtract_log_q=False)
-        logits_tensor, _ = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=len(sampled),
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=True,
-            partition_strategy="div",
-            name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
-        # Test that the exponentiated logits of accidental hits are near 0.
-        # First we need to find the hits in this random test run:
-        labels_reshape = labels.reshape((batch_size, num_true))
-        got_logits = logits_tensor.eval()
-        for row in xrange(batch_size):
-          row_labels = labels_reshape[row, :]
-          for col in xrange(len(sampled)):
-            if sampled[col] in row_labels:
-              # We need to add the num_true_test offset into logits_*
-              self.assertNear(
-                  np.exp(got_logits[row, col + num_true]), 0., self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, _,
+       _) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=sampled,
+           subtract_log_q=False)
+      logits_tensor, _ = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=len(sampled),
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=True,
+          partition_strategy="div",
+          name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
+      # Test that the exponentiated logits of accidental hits are near 0.
+      # First we need to find the hits in this random test run:
+      labels_reshape = labels.reshape((batch_size, num_true))
+      got_logits = self.evaluate(logits_tensor)
+      for row in xrange(batch_size):
+        row_labels = labels_reshape[row, :]
+        for col in xrange(len(sampled)):
+          if sampled[col] in row_labels:
+            # We need to add the num_true_test offset into logits_*
+            self.assertNear(
+                np.exp(got_logits[row, col + num_true]), 0., self._eps)
 
   def testSubtractLogQ(self):
     """With subtract_log_q, no accidental hit removal."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=True)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=True,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=True)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=True,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testSharded(self):
     """With sharded weights and sharded biases."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        weight_shards, bias_shards = self._ShardTestEmbeddings(
-            weights, biases, num_shards=3)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=[constant_op.constant(shard) for shard in weight_shards],
-            biases=[constant_op.constant(shard) for shard in bias_shards],
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_sharded_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      weight_shards, bias_shards = self._ShardTestEmbeddings(
+          weights, biases, num_shards=3)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=[constant_op.constant(shard) for shard in weight_shards],
+          biases=[constant_op.constant(shard) for shard in bias_shards],
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_sharded_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testNCELoss(self):
     # A simple test to verify the numerics.
@@ -782,35 +775,34 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_nce_loss = np.sum(
         _SigmoidCrossEntropyWithLogits(exp_logits, exp_labels), 1)
 
-    with self.cached_session():
-      got_nce_loss = nn_impl.nce_loss(
-          weights=constant_op.constant(weights),
-          biases=constant_op.constant(biases),
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
-
-      # Test with sharded weights and sharded biases.
-      weight_shards, bias_shards = self._ShardTestEmbeddings(
-          weights, biases, num_shards=3)
-      got_nce_loss = nn_impl.nce_loss(
-          weights=[constant_op.constant(shard) for shard in weight_shards],
-          biases=[constant_op.constant(shard) for shard in bias_shards],
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
+    got_nce_loss = nn_impl.nce_loss(
+        weights=constant_op.constant(weights),
+        biases=constant_op.constant(biases),
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        partition_strategy="div")
+
+    self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
+
+    # Test with sharded weights and sharded biases.
+    weight_shards, bias_shards = self._ShardTestEmbeddings(
+        weights, biases, num_shards=3)
+    got_nce_loss = nn_impl.nce_loss(
+        weights=[constant_op.constant(shard) for shard in weight_shards],
+        biases=[constant_op.constant(shard) for shard in bias_shards],
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        partition_strategy="div")
+
+    self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
 
   def testSampledSoftmaxLoss(self):
     # A simple test to verify the numerics.
@@ -839,39 +831,38 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.cached_session():
-      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
-          weights=constant_op.constant(weights),
-          biases=constant_op.constant(biases),
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          remove_accidental_hits=False,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-4)
-
-      # Test with sharded weights and sharded biases.
-      weight_shards, bias_shards = self._ShardTestEmbeddings(
-          weights, biases, num_shards=3)
-      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
-          weights=[constant_op.constant(shard) for shard in weight_shards],
-          biases=[constant_op.constant(shard) for shard in bias_shards],
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          remove_accidental_hits=False,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-4)
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
+        weights=constant_op.constant(weights),
+        biases=constant_op.constant(biases),
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        remove_accidental_hits=False,
+        partition_strategy="div")
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-4)
+
+    # Test with sharded weights and sharded biases.
+    weight_shards, bias_shards = self._ShardTestEmbeddings(
+        weights, biases, num_shards=3)
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
+        weights=[constant_op.constant(shard) for shard in weight_shards],
+        biases=[constant_op.constant(shard) for shard in bias_shards],
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        remove_accidental_hits=False,
+        partition_strategy="div")
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-4)
 
   def testSampledSoftmaxLossBf16(self):
     # A simple test to verify the numerics for bfloat16.
@@ -900,29 +891,30 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.cached_session():
-      true_exp_bf16 = np.full(
-          [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
-      sampled_exp_bf16 = np.full(
-          [len(sampled)], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
-      sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
-
-      got_sampled_softmax_loss = math_ops.cast(
-          nn_impl.sampled_softmax_loss(
-              weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
-              biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
-              labels=constant_op.constant(
-                  labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
-              inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
-              num_sampled=4,
-              num_classes=num_classes,
-              num_true=1,
-              sampled_values=sampled_vals_bf16,
-              remove_accidental_hits=False,
-              partition_strategy="div"), dtypes.float32)
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-1)
+    true_exp_bf16 = np.full([batch_size, 1],
+                            fill_value=0.5,
+                            dtype=dtypes.bfloat16.as_numpy_dtype)
+    sampled_exp_bf16 = np.full([len(sampled)],
+                               fill_value=0.5,
+                               dtype=dtypes.bfloat16.as_numpy_dtype)
+    sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
+
+    got_sampled_softmax_loss = math_ops.cast(
+        nn_impl.sampled_softmax_loss(
+            weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
+            biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
+            labels=constant_op.constant(
+                labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
+            inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=1,
+            sampled_values=sampled_vals_bf16,
+            remove_accidental_hits=False,
+            partition_strategy="div"), dtypes.float32)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-1)
 
 
 class CReluTest(test_lib.TestCase):
@@ -931,9 +923,9 @@ class CReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.concatenate([x * (x > 0), -x * (x < 0)], axis=1)
-    with self.cached_session():
-      z = nn_ops.crelu(constant_op.constant(x)).eval()
-      self.assertAllClose(y, z, 1e-4)
+
+    z = self.evaluate(nn_ops.crelu(constant_op.constant(x)))
+    self.assertAllClose(y, z, 1e-4)
 
 
 class ReluTest(test_lib.TestCase):
@@ -942,9 +934,9 @@ class ReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.maximum(x, 0.0)
-    with self.cached_session():
-      z = nn_ops.relu(constant_op.constant(x)).eval()
-      self.assertAllEqual(y, z)
+
+    z = self.evaluate(nn_ops.relu(constant_op.constant(x)))
+    self.assertAllEqual(y, z)
 
   def testNaNs(self):
     # Test that relu(nan) = nan for various sizes.
@@ -967,8 +959,9 @@ class LeakyReluTest(test_lib.TestCase):
 
     outputs = nn_ops.leaky_relu(inputs)
     self.assertEquals(inputs.shape, outputs.shape)
-    with self.cached_session() as sess:
-      inputs, outputs = sess.run([inputs, outputs])
+
+    inputs, outputs = self.evaluate([inputs, outputs])
+
     self.assertGreaterEqual(outputs.min(), 0.0)
     self.assertLessEqual(outputs.max(), 1.0)
     self.assertAllClose(inputs, outputs)
@@ -977,8 +970,9 @@ class LeakyReluTest(test_lib.TestCase):
     for dtype in [np.int32, np.int64, np.float16, np.float32, np.float64]:
       np_values = np.array([-2, -1, 0, 1, 2], dtype=dtype)
       outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
-      with self.cached_session() as sess:
-        outputs = sess.run(outputs)
+
+      outputs = self.evaluate(outputs)
+
       tol = 2e-3 if dtype == np.float16 else 1e-6
       self.assertAllClose(
           outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
@@ -1004,9 +998,10 @@ class SwishTest(test_lib.TestCase):
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
     expected_tf_outputs = tf_values * math_ops.sigmoid(tf_values)
-    with self.cached_session() as sess:
-      actual_outputs, expected_outputs = sess.run(
-          [actual_tf_outputs, expected_tf_outputs])
+
+    actual_outputs, expected_outputs = self.evaluate(
+        [actual_tf_outputs, expected_tf_outputs])
+
     self.assertAllClose(actual_outputs, expected_outputs)
 
   def testGradients(self):
@@ -1051,7 +1046,7 @@ class MomentsTest(test_lib.TestCase):
                 self.assertLess(err, 1e-3)
 
               # Evaluate.
-              [mean, variance] = sess.run([mean, variance])
+              [mean, variance] = self.evaluate([mean, variance])
               # Make sure that there are no NaNs
               self.assertFalse(np.isnan(mean).any())
               self.assertFalse(np.isnan(variance).any())
@@ -1094,9 +1089,9 @@ class DataFormatDimMapTest(test_lib.TestCase):
   def _test(self, x_val, y_val_expected):
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x)
-    with self.cached_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
-      self.assertAllEqual(y_val, y_val_expected)
+
+    y_val = self.evaluate(y)
+    self.assertAllEqual(y_val, y_val_expected)
 
   def test(self):
     self._test(0, 0)
@@ -1117,8 +1112,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [2, 2, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testNHWCtoHWNC(self):
@@ -1126,8 +1121,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testNHWCtoWHCN(self):
@@ -1135,8 +1130,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testArbitraryASCII(self):
@@ -1144,8 +1139,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
 
@@ -1155,64 +1150,64 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x)
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [7, 3, 4, 9])
 
   def testNCHWToNHWC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [7, 9, 3, 4])
 
   def testNHWCToHWNC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [4, 9, 7, 3])
 
   def testHWNCToNHWC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [9, 7, 4, 3])
 
   def testNHWCToNCHW2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x)
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[7, 4], [5, 1], [9, 3], [4, 5]])
 
   def testNHWCToHWNC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[9, 3], [4, 5], [7, 4], [5, 1]])
 
   def testHWNCToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[4, 5], [7, 4], [9, 3], [5, 1]])
 
   def testNCHWToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
diff --git a/tensorflow/python/ops/nn_xent_test.py b/tensorflow/python/ops/nn_xent_test.py
index 57ce4fd0a995f5fe04de3c8e9bbc371412687c32..7bf18c47feec8d8d732c0908ab62451934667a30 100644
--- a/tensorflow/python/ops/nn_xent_test.py
+++ b/tensorflow/python/ops/nn_xent_test.py
@@ -68,7 +68,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
           loss = nn_impl.sigmoid_cross_entropy_with_logits(
               labels=targets, logits=logits)
           np_loss = np.array(losses).astype(np.float32)
-          tf_loss = loss.eval()
+          tf_loss = self.evaluate(loss)
         self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testLogisticOutputMultiDim(self):
@@ -79,7 +79,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
           loss = nn_impl.sigmoid_cross_entropy_with_logits(
               labels=targets, logits=logits)
           np_loss = np.array(losses).astype(np.float32)
-          tf_loss = loss.eval()
+          tf_loss = self.evaluate(loss)
         self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testGradient(self):
@@ -143,7 +143,7 @@ class WeightedCrossEntropyTest(test.TestCase):
         loss = nn_impl.weighted_cross_entropy_with_logits(
             targets=targets, logits=logits, pos_weight=pos_weight)
         np_loss = np.array(losses).astype(np.float32)
-        tf_loss = loss.eval()
+        tf_loss = self.evaluate(loss)
       self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testOutputMultiDim(self):
@@ -154,7 +154,7 @@ class WeightedCrossEntropyTest(test.TestCase):
         loss = nn_impl.weighted_cross_entropy_with_logits(
             targets=targets, logits=logits, pos_weight=pos_weight)
         np_loss = np.array(losses).astype(np.float32)
-        tf_loss = loss.eval()
+        tf_loss = self.evaluate(loss)
       self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testGradient(self):
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index 0c0d81afb63fd330cbc1e0244b4c6ba387ce97a1..0ab39ad0a8edd60c78a6bea3ae31e4f025c9e0bd 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -28,9 +28,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(
-    "debugging.assert_all_finite",
-    v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
+@tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
 @deprecation.deprecated_endpoints("verify_tensor_all_finite")
 def verify_tensor_all_finite(t, msg, name=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
@@ -43,15 +41,30 @@ def verify_tensor_all_finite(t, msg, name=None):
   Returns:
     Same tensor as `t`.
   """
-  with ops.name_scope(name, "VerifyFinite", [t]) as name:
-    t = ops.convert_to_tensor(t, name="t")
-    with ops.colocate_with(t):
-      verify_input = array_ops.check_numerics(t, message=msg)
-      out = control_flow_ops.with_dependencies([verify_input], t)
+  return verify_tensor_all_finite_v2(t, msg, name)
+
+
+@tf_export("debugging.assert_all_finite", v1=[])
+def verify_tensor_all_finite_v2(x, message, name=None):
+  """Assert that the tensor does not contain any NaN's or Inf's.
+
+  Args:
+    x: Tensor to check.
+    message: Message to log on failure.
+    name: A name for this operation (optional).
+
+  Returns:
+    Same tensor as `x`.
+  """
+  with ops.name_scope(name, "VerifyFinite", [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    with ops.colocate_with(x):
+      verify_input = array_ops.check_numerics(x, message=message)
+      out = control_flow_ops.with_dependencies([verify_input], x)
   return out
 
 
-@tf_export("add_check_numerics_ops")
+@tf_export(v1=["add_check_numerics_ops"])
 def add_check_numerics_ops():
   """Connect a `check_numerics` to every floating point tensor.
 
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py
index ead7ae5478c74aad4f67296ed68895c1f54f7333..3c818f3d6cde52d7722e66c22de375e33574efdd 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py
@@ -19,14 +19,16 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.parallel_for.pfor import PFor
 from tensorflow.python.util import nest
 
 
-def for_loop(loop_fn, loop_fn_dtypes, iters):
+def for_loop(loop_fn, loop_fn_dtypes, iters, parallel_iterations=None):
   """Runs `loop_fn` `iters` times and stacks the outputs.
 
 
@@ -39,6 +41,8 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       objects. The shape of these outputs should not depend on the input.
     loop_fn_dtypes: dtypes for the outputs of loop_fn.
     iters: Number of iterations for which to run loop_fn.
+    parallel_iterations: The number of iterations that can be dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     Returns a nested structure of stacked output tensor objects with the same
@@ -66,11 +70,16 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       outputs.append(ta)
     return tuple([i + 1] + outputs)
 
+  if parallel_iterations is not None:
+    extra_args = {"parallel_iterations": parallel_iterations}
+  else:
+    extra_args = {}
   ta_list = control_flow_ops.while_loop(
-      lambda i, *ta: i < iters, while_body, [0] + [
-          tensor_array_ops.TensorArray(dtype, iters)
-          for dtype in flat_loop_fn_dtypes
-      ])[1:]
+      lambda i, *ta: i < iters,
+      while_body,
+      [0] + [tensor_array_ops.TensorArray(dtype, iters)
+             for dtype in flat_loop_fn_dtypes],
+      **extra_args)[1:]
 
   # TODO(rachelim): enable this for sparse tensors
 
@@ -79,7 +88,15 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
   return nest.pack_sequence_as(loop_fn_dtypes, output)
 
 
-def pfor(loop_fn, iters):
+def _flatten_first_two_dims(x):
+  """Flattens the first two dimensions of x into a single dimension."""
+  old_shape = array_ops.shape(x)
+  new_shape = array_ops.concat([[old_shape[0] * old_shape[1]], old_shape[2:]],
+                               axis=0)
+  return array_ops.reshape(x, new_shape)
+
+
+def pfor(loop_fn, iters, parallel_iterations=None):
   """Equivalent to running `loop_fn` `iters` times and stacking the outputs.
 
   `pfor` has functionality similar to `for_loop`, i.e. running `loop_fn` `iters`
@@ -99,8 +116,8 @@ def pfor(loop_fn, iters):
       reads, etc).
     - Conversion works only on a limited set of kernels for which a converter
       has been registered.
-    - loop_fn cannot currently contain control flow operations like
-      tf.while_loop or tf.cond.
+    - loop_fn has limited support for control flow operations. tf.cond in
+      particular is not supported.
     - `loop_fn` should return nested structure of Tensors or Operations. However
       if an Operation is returned, it should have zero outputs.
     - The shape and dtype of `loop_fn` outputs should not depend on the input
@@ -109,12 +126,21 @@ def pfor(loop_fn, iters):
   Args:
     loop_fn: A function that takes an int32 scalar tf.Tensor object representing
       the iteration number, and returns a possibly nested structure of Tensor or
-      Operation objects.
+      Operation objects. Note that if setting `parallel_iterations` argument to
+      something other than None, `loop_fn` may be called more than once during
+      graph construction. So it may need to avoid mutating global state.
     iters: Number of iterations for which to run loop_fn.
+    parallel_iterations: A knob to control how many iterations are vectorized
+      and dispatched in parallel. The default value of None corresponds to
+      vectorizing all the iterations.  If `parallel_iterations` is smaller than
+      `iters`, then chunks of at most that many iterations are dispatched in
+      sequence. This knob can be used to control the total memory usage.
 
   Returns:
     Returns a nested structure of stacked tensor objects with the same nested
     structure as the output of `loop_fn`.
+  Raises:
+    ValueError: If parallel_iterations is not None and not an integer > 1.
   """
   existing_ops = set(ops.get_default_graph().get_operations())
   with ops.name_scope("loop_body"):
@@ -122,9 +148,61 @@ def pfor(loop_fn, iters):
     loop_fn_outputs = loop_fn(loop_var)
   new_ops = set(ops.get_default_graph().get_operations()) - existing_ops
   iters = ops.convert_to_tensor(iters)
-  with ops.name_scope("pfor"):
-    converter = PFor(loop_var, iters, new_ops)
-    outputs = []
-    for loop_fn_output in nest.flatten(loop_fn_outputs):
-      outputs.append(converter.convert(loop_fn_output))
-    return nest.pack_sequence_as(loop_fn_outputs, outputs)
+  if parallel_iterations is not None:
+    if parallel_iterations < 1:
+      raise ValueError("parallel_iterations must be None or a positive integer")
+    if parallel_iterations == 1:
+      raise ValueError("Found parallel_iterations == 1. Use for_loop instead.")
+    iters_value = tensor_util.constant_value(iters)
+    if iters_value is not None and iters_value < parallel_iterations:
+      parallel_iterations = None
+  if parallel_iterations is None:
+    with ops.name_scope("pfor"):
+      converter = PFor(loop_var, iters, new_ops)
+      outputs = []
+      for loop_fn_output in nest.flatten(loop_fn_outputs):
+        outputs.append(converter.convert(loop_fn_output))
+      return nest.pack_sequence_as(loop_fn_outputs, outputs)
+  else:
+    num_tiled_iterations = iters // parallel_iterations
+    num_remaining_iterations = iters % parallel_iterations
+    # TODO(agarwal): Avoid calling loop_fn twice. Generate the loop body inside
+    # a tf.function and extract the graph from there to vectorize it.
+    with ops.name_scope("pfor_untiled"):
+      converter = PFor(loop_var, num_remaining_iterations, new_ops)
+      remaining_outputs = []
+      flattened_loop_fn_outputs = nest.flatten(loop_fn_outputs)
+      for loop_fn_output in flattened_loop_fn_outputs:
+        remaining_outputs.append(converter.convert(loop_fn_output))
+
+    with ops.name_scope("pfor_tiled"):
+      loop_fn_dtypes = [ops.convert_to_tensor(x).dtype
+                        for x in flattened_loop_fn_outputs]
+
+      def tiled_loop_body(j):
+        offset = j * parallel_iterations + num_remaining_iterations
+
+        def tiled_loop_fn(i):
+          return nest.flatten(loop_fn(i + offset))
+
+        return pfor(tiled_loop_fn, parallel_iterations)
+
+      tiled_outputs = for_loop(tiled_loop_body, loop_fn_dtypes,
+                               num_tiled_iterations, parallel_iterations=1)
+      tiled_outputs = [_flatten_first_two_dims(y) for y in tiled_outputs]
+
+    with ops.name_scope("pfor"):
+      iters_value = tensor_util.constant_value(iters)
+      if iters_value is None or iters_value % parallel_iterations:
+        outputs = control_flow_ops.cond(
+            math_ops.equal(num_remaining_iterations, 0),
+            lambda: tiled_outputs,
+            lambda: [array_ops.concat([x, y], axis=0)
+                     for x, y in zip(remaining_outputs, tiled_outputs)])
+      else:
+        outputs = tiled_outputs
+      return nest.pack_sequence_as(loop_fn_outputs, nest.flatten(outputs))
+
+
+
+
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index e86f409d68b8b9f116149f750ed3967a8c91303f..4470c0b95807a24d0edaebfbbda249dd886ffc37 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -73,9 +73,13 @@ class PForTest(test.TestCase):
       else:
         self.assertAllEqual(outputs[i + n], outputs[i])
 
-  def _test_loop_fn(self, loop_fn, iters, loop_fn_dtypes=dtypes.float32):
-    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters)
-    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters)
+  def _test_loop_fn(self, loop_fn, iters,
+                    loop_fn_dtypes=dtypes.float32,
+                    parallel_iterations=None):
+    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
+                                    parallel_iterations=parallel_iterations)
+    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
+                                        parallel_iterations=parallel_iterations)
     self.run_and_assert_equal(t1, t2)
 
   def test_op_conversion_fallback_to_while_loop(self):
@@ -96,6 +100,30 @@ class PForTest(test.TestCase):
         loop_fn, 3, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
     flags.FLAGS.op_conversion_fallback_to_while_loop = False
 
+  def test_parallel_iterations(self):
+    for parallel_iterations in [2, 3, 8, 10]:
+      x = random_ops.random_uniform([8, 3])
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return array_ops.gather(x, i)
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 8, parallel_iterations=parallel_iterations)
+      self._test_loop_fn(loop_fn, 4 * constant_op.constant(2),
+                         parallel_iterations=parallel_iterations)
+
+  def test_parallel_iterations_zero(self):
+    with self.assertRaisesRegexp(ValueError, "positive integer"):
+      pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=0)
+    with self.assertRaisesRegexp(TypeError, "positive integer"):
+      pfor_control_flow_ops.for_loop(lambda i: 1, dtypes.int32, 8,
+                                     parallel_iterations=0)
+
+  def test_parallel_iterations_one(self):
+    with self.assertRaisesRegexp(ValueError, "Use for_loop instead"):
+      pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=1)
+
 
 class ArrayTest(PForTest):
 
@@ -309,6 +337,14 @@ class ArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
+  def test_matrix_diag_part(self):
+    x = random_ops.random_uniform([3, 4, 2])
+
+    def loop_fn(i):
+      return array_ops.matrix_diag_part(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
+
   def test_strided_slice(self):
     x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
 
@@ -786,15 +822,36 @@ class NNTest(PForTest):
   def test_max_pool(self):
     x = random_ops.random_uniform([3, 2, 12, 12, 3])
     ksize = [1, 3, 3, 1]
+    strides = [1, 2, 2, 1]
 
     def loop_fn(i):
       x1 = array_ops.gather(x, i)
       output = nn.max_pool(
-          x1, ksize, strides=[1, 2, 2, 1], padding="VALID", data_format="NHWC")
+          x1, ksize, strides=strides, padding="VALID", data_format="NHWC")
       loss = nn.l2_loss(output)
-      return output, gradient_ops.gradients(loss, x1)
+      ones = array_ops.ones_like(output)
+      grad = gradient_ops.gradients(loss, x1, grad_ys=ones)
+      grad_grad = gradient_ops.gradients(grad, ones)
+      return output, grad, grad_grad
 
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_max_pool3d(self):
+    x = random_ops.random_uniform([3, 3, 2, 12, 12, 3])
+    ksize = [1, 1, 3, 3, 1]
+    strides = [1, 1, 2, 2, 1]
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      output = nn.max_pool3d(
+          x1, ksize, strides=strides, padding="VALID", data_format="NDHWC")
+      loss = nn.l2_loss(output)
+      ones = array_ops.ones_like(output)
+      grad = gradient_ops.gradients(loss, x1, grad_ys=ones)
+      grad_grad = gradient_ops.gradients(grad, ones)
+      return output, grad, grad_grad
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
 
   def test_fused_batch_norm(self):
     data_formats = ["NHWC"]
diff --git a/tensorflow/python/ops/parallel_for/gradients.py b/tensorflow/python/ops/parallel_for/gradients.py
index 1f026b3660c39066b3a8cf741b0fbd1929b22665..3ba1bde347698acf3b1229808fe63cef2e3255af 100644
--- a/tensorflow/python/ops/parallel_for/gradients.py
+++ b/tensorflow/python/ops/parallel_for/gradients.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops.parallel_for import control_flow_ops
 from tensorflow.python.util import nest
 
 
-def jacobian(output, inputs, use_pfor=True):
+def jacobian(output, inputs, use_pfor=True, parallel_iterations=None):
   """Computes jacobian of `output` w.r.t. `inputs`.
 
   Args:
@@ -33,6 +33,8 @@ def jacobian(output, inputs, use_pfor=True):
     inputs: A tensor or a nested structure of tensor objects.
     use_pfor: If true, uses pfor for computing the jacobian. Else uses
       tf.while_loop.
+    parallel_iterations: A knob to control how many iterations and dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     A tensor or a nested strucutre of tensors with the same structure as
@@ -56,10 +58,14 @@ def jacobian(output, inputs, use_pfor=True):
     output_size = array_ops.shape(output)[0]
 
   if use_pfor:
-    pfor_outputs = control_flow_ops.pfor(loop_fn, output_size)
+    pfor_outputs = control_flow_ops.pfor(
+        loop_fn, output_size, parallel_iterations=parallel_iterations)
   else:
     pfor_outputs = control_flow_ops.for_loop(
-        loop_fn, [output.dtype] * len(flat_inputs), output_size)
+        loop_fn,
+        [output.dtype] * len(flat_inputs),
+        output_size,
+        parallel_iterations=parallel_iterations)
 
   for i, out in enumerate(pfor_outputs):
     if out is not None:
@@ -72,7 +78,7 @@ def jacobian(output, inputs, use_pfor=True):
   return nest.pack_sequence_as(inputs, pfor_outputs)
 
 
-def batch_jacobian(output, inp, use_pfor=True):
+def batch_jacobian(output, inp, use_pfor=True, parallel_iterations=None):
   """Computes and stacks jacobians of `output[i,...]` w.r.t. `input[i,...]`.
 
   e.g.
@@ -87,6 +93,8 @@ def batch_jacobian(output, inp, use_pfor=True):
     inp: A tensor with shape [b, x1, ..., x_m]
     use_pfor: If true, uses pfor for computing the Jacobian. Else uses a
       tf.while_loop.
+    parallel_iterations: A knob to control how many iterations and dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
@@ -118,10 +126,13 @@ def batch_jacobian(output, inp, use_pfor=True):
     return gradient_ops.gradients(y, inp)[0]
 
   if use_pfor:
-    pfor_output = control_flow_ops.pfor(loop_fn, output_row_size)
+    pfor_output = control_flow_ops.pfor(loop_fn, output_row_size,
+                                        parallel_iterations=parallel_iterations)
   else:
-    pfor_output = control_flow_ops.for_loop(loop_fn, output.dtype,
-                                            output_row_size)
+    pfor_output = control_flow_ops.for_loop(
+        loop_fn, output.dtype,
+        output_row_size,
+        parallel_iterations=parallel_iterations)
   if pfor_output is None:
     return None
   pfor_output = array_ops.reshape(pfor_output,
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 5a058bae82554eac98ec69ea4b9e809a0c06b223..545c482df8b6b692c7779104773c181fba72d564 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -416,6 +416,12 @@ class GradientsTest(test.TestCase):
       self.assertAllClose(ans, pfor_value)
       self.assertAllClose(ans, while_value)
 
+  def test_jacobian_parallel_iterations(self):
+    x = constant_op.constant([[1., 2], [3, 4]])
+    y = math_ops.matmul(x, x)
+    self.assertAllClose(gradients.jacobian(y, x, parallel_iterations=2),
+                        gradients.jacobian(y, x, parallel_iterations=3))
+
   def test_batch_jacobian_bad_shapes(self):
     x = random_ops.random_uniform([2, 2])
     y = random_ops.random_uniform([3, 2])
@@ -459,6 +465,13 @@ class GradientsTest(test.TestCase):
       self.assertAllClose(ans, pfor_value)
       self.assertAllClose(ans, while_value)
 
+  def test_batch_jacobian_parallel_iterations(self):
+    x = constant_op.constant([[1., 2], [3, 4]])
+    w = constant_op.constant([[1., 2, 3, 4], [5, 6, 7, 8]])
+    y = math_ops.matmul(x, w)
+    self.assertAllClose(gradients.batch_jacobian(y, x, parallel_iterations=2),
+                        gradients.batch_jacobian(y, x, parallel_iterations=3))
+
   def test_fc_batch_jacobian(self):
     pfor_jacobian, while_jacobian = create_fc_batch_jacobian(8, 4, 2)
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
@@ -472,7 +485,7 @@ class GradientsTest(test.TestCase):
     with session.Session() as sess:
       init = variables.global_variables_initializer()
       sess.run(init)
-      pfor = sess.run(pfor_jacobian)
+      pfor = self.evaluate(pfor_jacobian)
       for i in range(4):
         while_i = sess.run(while_gradients[i])
         self.assertAllClose(while_i, pfor[:, i, ...])
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index 5d10860e94399314719b5381f42624e135d40459..d789dc65b13a5eeff75ce21e9e72049c475f9221 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -1303,7 +1303,11 @@ def _inputs_with_flattening(pfor_input, input_indices):
 @RegisterPForWithArgs("Conv2D", dims=[0])
 @RegisterPForWithArgs("AvgPool", dims=[0])
 @RegisterPForWithArgs("MaxPool", dims=[0])
+@RegisterPForWithArgs("MaxPool3D", dims=[0])
+@RegisterPForWithArgs("MaxPool3DGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("MaxPoolGrad", dims=[0, 1, 2])
+@RegisterPForWithArgs("MaxPool3DGradGrad", dims=[0, 1, 2])
+@RegisterPForWithArgs("MaxPoolGradGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("SoftmaxCrossEntropyWithLogits", dims=[0, 1])
 def _convert_flatten_batch(pfor_input, op_type, dims):
   del op_type
@@ -1532,6 +1536,7 @@ def _convert_conv2d_backprop_filter(pfor_input):
 
 @RegisterPForWithArgs("Identity", array_ops.identity)
 @RegisterPForWithArgs("StopGradient", array_ops.stop_gradient)
+@RegisterPForWithArgs("MatrixDiagPart", array_ops.matrix_diag_part)
 def _convert_identity(pfor_input, op_type, op_func):
   del op_type
   return wrap(op_func(*[x.t for x in pfor_input.inputs]), True)
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index a99fa7f15263604f9984ce4a19f7630555c253a6..484caf017968103c0949dce2205cf60fbc494439 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -46,7 +46,7 @@ ops.NotDifferentiable("SerializeTensor")
 ops.NotDifferentiable("StringToNumber")
 
 
-@tf_export("io.VarLenFeature", "VarLenFeature")
+@tf_export("io.VarLenFeature", v1=["VarLenFeature", "io.VarLenFeature"])
 class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   """Configuration for parsing a variable-length input feature.
 
@@ -56,7 +56,7 @@ class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   pass
 
 
-@tf_export("io.SparseFeature", "SparseFeature")
+@tf_export("io.SparseFeature", v1=["io.SparseFeature", "SparseFeature"])
 class SparseFeature(
     collections.namedtuple(
         "SparseFeature",
@@ -131,7 +131,7 @@ class SparseFeature(
         cls, index_key, value_key, dtype, size, already_sorted)
 
 
-@tf_export("io.FixedLenFeature", "FixedLenFeature")
+@tf_export("io.FixedLenFeature", v1=["io.FixedLenFeature", "FixedLenFeature"])
 class FixedLenFeature(collections.namedtuple(
     "FixedLenFeature", ["shape", "dtype", "default_value"])):
   """Configuration for parsing a fixed-length input feature.
@@ -151,7 +151,8 @@ class FixedLenFeature(collections.namedtuple(
         cls, shape, dtype, default_value)
 
 
-@tf_export("io.FixedLenSequenceFeature", "FixedLenSequenceFeature")
+@tf_export("io.FixedLenSequenceFeature",
+           v1=["io.FixedLenSequenceFeature", "FixedLenSequenceFeature"])
 class FixedLenSequenceFeature(collections.namedtuple(
     "FixedLenSequenceFeature",
     ["shape", "dtype", "allow_missing", "default_value"])):
@@ -362,7 +363,7 @@ def _prepend_none_dimension(features):
     return features
 
 
-@tf_export("io.parse_example", "parse_example")
+@tf_export("io.parse_example", v1=["io.parse_example", "parse_example"])
 def parse_example(serialized, features, name=None, example_names=None):
   # pylint: disable=line-too-long
   """Parses `Example` protos into a `dict` of tensors.
@@ -763,7 +764,8 @@ def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types,
           dense_shapes_as_proto, dense_shapes)
 
 
-@tf_export("io.parse_single_example", "parse_single_example")
+@tf_export("io.parse_single_example",
+           v1=["io.parse_single_example", "parse_single_example"])
 def parse_single_example(serialized, features, name=None, example_names=None):
   """Parses a single `Example` proto.
 
@@ -1246,7 +1248,9 @@ def _parse_sequence_example_raw(serialized,
 
 # TODO(sundberg): rewrite this method to call the batch version, which is more
 # efficient especially for large inputs.
-@tf_export("io.parse_single_sequence_example", "parse_single_sequence_example")
+@tf_export("io.parse_single_sequence_example",
+           v1=["io.parse_single_sequence_example",
+               "parse_single_sequence_example"])
 def parse_single_sequence_example(
     serialized, context_features=None, sequence_features=None,
     example_name=None, name=None):
diff --git a/tensorflow/python/ops/partitioned_variables.py b/tensorflow/python/ops/partitioned_variables.py
index 7743b634e8fa418572f334130f2072dcfe8d029c..6174979d5e15af9b2b79ae980d06cbc3a5143ec4 100644
--- a/tensorflow/python/ops/partitioned_variables.py
+++ b/tensorflow/python/ops/partitioned_variables.py
@@ -154,7 +154,7 @@ def variable_axis_size_partitioner(
   return _partitioner
 
 
-@tf_export("min_max_variable_partitioner")
+@tf_export(v1=["min_max_variable_partitioner"])
 def min_max_variable_partitioner(max_partitions=1, axis=0,
                                  min_slice_size=256 << 10,
                                  bytes_per_string_element=16):
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index f7fa264461ee7a1a80d8e7c0cf7d71c4d23225bf..6b469a954f6531641f4bc61396563581b7c368fe 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -73,7 +73,7 @@ class Conv2DTest(test.TestCase):
           max_input=x1_max,
           min_filter=x2_min,
           max_filter=x2_max)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     quantized_output = value[0]
     output_min = value[1]
     output_max = value[2]
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
index 0f3b04e4ad07749e53042216d5abbff8551dc04b..b81843d17482bdff910827125ed8affd4094b942 100644
--- a/tensorflow/python/ops/quantized_ops_test.py
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -41,7 +41,7 @@ class QuantizedOpsTest(test.TestCase):
       x_min = 0.0
       x_max = 255.0
       op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
-      value = sess.run(op)
+      value = self.evaluate(op)
       self.assertArrayNear(expected_output, value.output, 0.1)
 
   def testDequantizeOp(self):
@@ -52,7 +52,7 @@ class QuantizedOpsTest(test.TestCase):
       x_min = 0.0
       x_max = 255.0
       op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
-      value = sess.run(op)
+      value = self.evaluate(op)
       self.assertArrayNear(expected_output, value, 0.1)
 
 
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9ff5f26804deae1a355cebe1896d9795891cbfec
--- /dev/null
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -0,0 +1,692 @@
+package(
+    default_visibility = [
+        "//intelligence/datum/prensor:__pkg__",
+        "//learning/brain/contrib/text:__pkg__",
+        "//nlp/projects/atc/tf/ops:__pkg__",
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+#-------------------------------------------------------------------------------
+# RaggedTensor
+#-------------------------------------------------------------------------------
+
+py_library(
+    name = "ragged",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    tags = ["nofixdeps"],
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_elementwise_ops",
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_getitem",
+        ":ragged_map_ops",
+        ":ragged_math_ops",
+        ":ragged_operators",
+        ":ragged_tensor",
+        ":ragged_tensor_value",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_array_ops",
+    srcs = ["ragged_array_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_array_ops_gen",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_conversion_ops",
+    srcs = ["ragged_conversion_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_conversion_ops_gen",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "ragged_factory_ops",
+    srcs = ["ragged_factory_ops.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/ops/ragged:ragged_tensor",
+        "//tensorflow/python/ops/ragged:ragged_tensor_value",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "ragged_functional_ops",
+    srcs = ["ragged_functional_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_getitem",
+    srcs = ["ragged_getitem.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_math_ops",
+    srcs = ["ragged_math_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_math_ops_gen",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_elementwise_ops",
+    srcs = ["ragged_elementwise_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_operators",
+    srcs = ["ragged_operators.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_elementwise_ops",
+        ":ragged_getitem",
+        ":ragged_tensor",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_tensor",
+    srcs = ["ragged_tensor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor_value",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "ragged_tensor_value",
+    srcs = ["ragged_tensor_value.py"],
+    srcs_version = "PY2AND3",
+    deps = ["//third_party/py/numpy"],
+)
+
+py_library(
+    name = "ragged_util",
+    srcs = ["ragged_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "segment_id_ops",
+    srcs = ["segment_id_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_map_ops",
+    srcs = ["ragged_map_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+#-------------------------------------------------------------------------------
+# RaggedTensor Tests
+#-------------------------------------------------------------------------------
+
+py_test(
+    name = "ragged_tensor_test",
+    size = "medium",
+    srcs = ["ragged_tensor_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_eager_test",
+    size = "medium",
+    srcs = ["ragged_eager_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_range_op_test",
+    srcs = ["ragged_range_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_tensor_bounding_shape_op_test",
+    srcs = ["ragged_tensor_bounding_shape_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_row_lengths_op_test",
+    srcs = ["ragged_row_lengths_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_gather_op_test",
+    srcs = ["ragged_gather_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_batch_gather_op_test",
+    srcs = ["ragged_batch_gather_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_gather_nd_op_test",
+    srcs = ["ragged_gather_nd_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_row_splits_to_segment_ids_op_test",
+    srcs = ["ragged_row_splits_to_segment_ids_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_segment_ids_to_row_splits_op_test",
+    srcs = ["ragged_segment_ids_to_row_splits_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_from_tensor_op_test",
+    srcs = ["ragged_from_tensor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_to_sparse_op_test",
+    srcs = ["ragged_to_sparse_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients_impl",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_from_sparse_op_test",
+    srcs = ["ragged_from_sparse_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
+    ],
+)
+
+py_test(
+    name = "ragged_to_tensor_op_test",
+    srcs = ["ragged_to_tensor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_segment_op_test",
+    srcs = ["ragged_segment_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_reduce_op_test",
+    srcs = ["ragged_reduce_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_map_inner_values_op_test",
+    srcs = ["ragged_map_inner_values_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_const_op_test",
+    srcs = ["ragged_const_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_constant_value_op_test",
+    srcs = ["ragged_constant_value_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "convert_to_tensor_or_ragged_tensor_op_test",
+    srcs = ["convert_to_tensor_or_ragged_tensor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_boolean_mask_op_test",
+    srcs = ["ragged_boolean_mask_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_concat_op_test",
+    srcs = ["ragged_concat_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_stack_op_test",
+    srcs = ["ragged_stack_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_tile_op_test",
+    srcs = ["ragged_tile_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_util_test",
+    srcs = ["ragged_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_expand_dims_op_test",
+    srcs = ["ragged_expand_dims_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_where_op_test",
+    srcs = ["ragged_where_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_elementwise_ops_test",
+    srcs = ["ragged_elementwise_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_operators_test",
+    srcs = ["ragged_operators_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_map_fn_op_test",
+    size = "small",
+    srcs = ["ragged_map_fn_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python/keras:backend",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a2884854546493a2ba1bd0f7be4986c27fd3482
--- /dev/null
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -0,0 +1,226 @@
+"""Ragged Tensors.
+
+This package defines the [`RaggedTensor`](ragged/RaggedTensor.md) class, which
+represents tensors with non-uniform shapes.  In particular, each `RaggedTensor`
+has one or more *ragged dimensions*, which are dimensions whose slices may have
+different lengths.  For example, the inner (column) dimension of
+`rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged, since the column slices
+(`rt[0, :]`, ..., `rt[4, :]`) have different lengths.  For a more detailed
+description of ragged tensors, see the [`RaggedTensor`](ragged/RaggedTensor.md)
+class documentation.
+
+## RaggedTensor Operations
+
+This package also defines a collection of operations for manipulating
+ragged tensors.
+
+### RaggedTensor Versions of Standard Tensor Operations
+
+Many of the operations defined by this package are analogous to
+[`Tensor`](https://www.tensorflow.org/api_docs/python/tf/Tensor)
+operations, but they accept `RaggedTensor`s as input and can return
+`RaggedTensor`s as output.  For example, `ragged.add` performs elementwise
+addition just like `tf.add`, but can be used on `RaggedTensor`s.
+
+These `RaggedTensor` versions of the standard `Tensor` operations can also be
+used with standard `Tensors`; and for the most part, they will return the same
+value that the standard `Tensor` operation would return.  However, there are
+a few notable exceptions:
+
+* For [`ragged.stack(...)`](ragged/stack.md) and
+  [`ragged.concat(...)`](ragged/concat.md), the input tensors are not required
+  to have matching shapes.  In the returned tensor, all dimensions up to the
+  `axis` dimension will be ragged.
+
+### Ragged-Tensor Specific Operations
+
+The following operations are specific to ragged tensors:
+
+* **Factory ops**:
+  [`constant(...)`](ragged/constant.md),
+  [`from_row_splits(...)`](ragged/from_row_splits.md),
+  [`from_row_lengths(...)`](ragged/from_row_lengths.md),
+  [`from_row_starts(...)`](ragged/from_row_starts.md),
+  [`from_row_limits(...)`](ragged/from_row_limits.md),
+  [`from_value_rowids(...)`](ragged/from_value_rowids.md),
+  [`from_nested_row_splits(...)`](ragged/from_nested_row_splits.md),
+  [`from_nested_value_rowids(...)`](ragged/from_nested_value_rowids.md).
+
+* **Conversion ops**:
+  [`from_tensor(...)`](ragged/from_tensor.md),
+  [`to_tensor(...)`](ragged/to_tensor.md),
+  [`from_sparse(...)`](ragged/from_sparse.md),
+  [`to_sparse(...)`](ragged/to_sparse.md),
+  [`from_variant(...)`](ragged/from_variant.md),
+  [`to_variant(...)`](ragged/to_variant.md),
+  [`convert_to_tensor_or_ragged_tensor(...)`](
+  ragged/convert_to_tensor_or_ragged_tensor.md).
+
+* **Shape ops**:
+  [`row_splits(...)`](ragged/row_splits.md),
+  [`row_lengths(...)`](ragged/row_lengths.md),
+  [`row_starts(...)`](ragged/row_starts.md),
+  [`row_limits(...)`](ragged/row_limits.md),
+  [`value_rowids(...)`](ragged/value_rowids.md),
+  [`nrows(...)`](ragged/nrows.md),
+  [`nested_row_splits(...)`](ragged/nested_row_splits.md),
+  [`row_splits_to_segment_ids(...)`](ragged/row_splits_to_segment_ids.md),
+  [`segment_ids_to_row_splits(...)`](ragged/segment_ids_to_row_splits.md),
+  [`bounding_shape(...)`](ragged/bounding_shape.md).
+
+* **Functional ops**:
+  [`map_inner_values(...)`](ragged/map_inner_values.md),
+  [`make_elementwise_op(...)`](ragged/make_elementwise_op.md).
+
+
+<!-- Ragged Classes & related helper functions -->
+@@RaggedTensor
+@@RaggedTensorType
+@@RaggedTensorValue
+@@is_ragged
+
+<!-- Factory Ops -->
+@@constant
+@@constant_value
+@@from_row_splits
+@@from_row_lengths
+@@from_row_starts
+@@from_row_limits
+@@from_value_rowids
+@@from_nested_row_splits
+@@from_nested_value_rowids
+@@convert_to_tensor_or_ragged_tensor
+
+<!-- Conversion Ops -->
+@@from_tensor
+@@to_tensor
+@@from_sparse
+@@to_sparse
+@@row_splits_to_segment_ids
+@@segment_ids_to_row_splits
+
+<!-- Array Ops -->
+@@row_splits
+@@row_lengths
+@@row_starts
+@@row_limits
+@@value_rowids
+@@nrows
+@@nested_row_splits
+@@bounding_shape
+@@gather
+@@batch_gather
+@@gather_nd
+@@boolean_mask
+@@concat
+@@stack
+@@tile
+@@expand_dims
+@@where
+
+<!-- Math Ops -->
+@@range
+
+@@segment_sum
+@@segment_prod
+@@segment_min
+@@segment_max
+@@segment_mean
+@@segment_sqrt_n
+
+@@reduce_sum
+@@reduce_prod
+@@reduce_min
+@@reduce_max
+@@reduce_mean
+@@reduce_all
+@@reduce_any
+
+<!-- Functional Ops -->
+@@map_inner_values
+@@map_fn
+
+<!-- Elementwise Ops -->
+@@make_elementwise_op
+
+<!-- Symbols from  ragged_elementwise_ops._symbols_to_export are whitelisted -->
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.ragged import ragged_operators
+
+from tensorflow.python.ops.ragged.ragged_array_ops import batch_gather
+from tensorflow.python.ops.ragged.ragged_array_ops import boolean_mask
+from tensorflow.python.ops.ragged.ragged_array_ops import bounding_shape
+from tensorflow.python.ops.ragged.ragged_array_ops import concat
+from tensorflow.python.ops.ragged.ragged_array_ops import expand_dims
+from tensorflow.python.ops.ragged.ragged_array_ops import gather
+from tensorflow.python.ops.ragged.ragged_array_ops import gather_nd
+from tensorflow.python.ops.ragged.ragged_array_ops import nrows
+from tensorflow.python.ops.ragged.ragged_array_ops import row_lengths
+from tensorflow.python.ops.ragged.ragged_array_ops import row_limits
+from tensorflow.python.ops.ragged.ragged_array_ops import row_starts
+from tensorflow.python.ops.ragged.ragged_array_ops import stack
+from tensorflow.python.ops.ragged.ragged_array_ops import tile
+from tensorflow.python.ops.ragged.ragged_array_ops import value_rowids
+from tensorflow.python.ops.ragged.ragged_array_ops import where
+
+from tensorflow.python.ops.ragged.ragged_conversion_ops import from_sparse
+from tensorflow.python.ops.ragged.ragged_conversion_ops import from_tensor
+from tensorflow.python.ops.ragged.ragged_conversion_ops import to_sparse
+from tensorflow.python.ops.ragged.ragged_conversion_ops import to_tensor
+
+# pylint: disable=protected-access, wildcard-import
+from tensorflow.python.ops.ragged.ragged_elementwise_ops import *
+from tensorflow.python.ops.ragged.ragged_elementwise_ops import _symbols_to_export as _elementwise_ops
+# pylint: enable=protected-access, wildcard-import
+
+from tensorflow.python.ops.ragged.ragged_factory_ops import constant
+from tensorflow.python.ops.ragged.ragged_factory_ops import constant_value
+from tensorflow.python.ops.ragged.ragged_factory_ops import convert_to_tensor_or_ragged_tensor
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_nested_row_splits
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_nested_value_rowids
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_lengths
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_limits
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_splits
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_starts
+from tensorflow.python.ops.ragged.ragged_factory_ops import from_value_rowids
+
+from tensorflow.python.ops.ragged.ragged_functional_ops import map_inner_values
+
+from tensorflow.python.ops.ragged.ragged_map_ops import map_fn
+
+from tensorflow.python.ops.ragged.ragged_math_ops import range  # pylint: disable=redefined-builtin
+
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_all
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_any
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_max
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_mean
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_min
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_prod
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_sum
+
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_max
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_mean
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_min
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_prod
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_sqrt_n
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_sum
+
+from tensorflow.python.ops.ragged.ragged_tensor import is_ragged
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensorType
+
+from tensorflow.python.ops.ragged.ragged_tensor_value import RaggedTensorValue
+
+from tensorflow.python.ops.ragged.segment_id_ops import row_splits_to_segment_ids
+from tensorflow.python.ops.ragged.segment_id_ops import segment_ids_to_row_splits
+
+from tensorflow.python.util import all_util as _all_util
+
+# Any symbol that is not referenced (with "@@name") in the module docstring
+# above, or included in the "_elementwise_ops" whitelist, will be removed.
+_all_util.remove_undocumented(__name__, _elementwise_ops)
diff --git a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..243fa34c4b855a81b54f932164752376fe97c9d7
--- /dev/null
+++ b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
@@ -0,0 +1,209 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.convert_to_tensor_or_ragged_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
+                                              parameterized.TestCase):
+
+  #=============================================================================
+  # Tests where the 'value' param is a RaggedTensor
+  #=============================================================================
+  @parameterized.parameters([
+      dict(pylist=[[1, 2], [3]]),
+      dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.string),
+  ])
+  def testConvertRaggedTensor(self, pylist, dtype=None, preferred_dtype=None):
+    rt = ragged.constant(pylist)
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        rt, dtype, preferred_dtype)
+    self.assertIs(converted, rt)
+
+  @parameterized.parameters([
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.float32,
+          message=('Tensor conversion requested dtype float32 for '
+                   'RaggedTensor with dtype int32')),
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.string,
+          message=('Tensor conversion requested dtype string for '
+                   'RaggedTensor with dtype .*')),
+  ])
+  def testConvertRaggedTensorError(self,
+                                   pylist,
+                                   message,
+                                   dtype=None,
+                                   preferred_dtype=None):
+    rt = ragged.constant(pylist)
+
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(rt, dtype, preferred_dtype)
+
+  #=============================================================================
+  # Tests where the 'value' param is a RaggedTensorValue
+  #=============================================================================
+  @parameterized.parameters([
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          expected_dtype=dtypes.int32),
+      dict(
+          value=ragged.constant_value([[b'a', b'b'], [b'c']]),
+          expected_dtype=dtypes.string),
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          preferred_dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          preferred_dtype=dtypes.string,
+          expected_dtype=dtypes.int32),
+  ])
+  def testConvertRaggedTensorValue(self,
+                                   value,
+                                   dtype=None,
+                                   preferred_dtype=None,
+                                   expected_dtype=None):
+    if expected_dtype is None:
+      expected_dtype = value.dtype if dtype is None else dtype
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        value, dtype, preferred_dtype)
+    self.assertEqual(value.ragged_rank, converted.ragged_rank)
+    self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
+    with self.test_session():
+      self.assertEqual(value.tolist(), self.evaluate(converted).tolist())
+
+  @parameterized.parameters([
+      dict(
+          value=ragged.constant_value([['a', 'b'], ['c']], dtype=str),
+          dtype=dtypes.int32,
+          message=r"invalid literal for int\(\) with base 10: 'a'"),
+  ])
+  def testConvertRaggedTensorValueError(self,
+                                        value,
+                                        message,
+                                        dtype=None,
+                                        preferred_dtype=None):
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+
+  #=============================================================================
+  # Tests where the 'value' param is a Tensor
+  #=============================================================================
+  @parameterized.parameters([
+      dict(pylist=[[1, 2], [3, 4]]),
+      dict(pylist=[[1, 2], [3, 4]], preferred_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3, 4]], preferred_dtype=dtypes.string),
+  ])
+  def testConvertTensor(self, pylist, dtype=None, preferred_dtype=None):
+    tensor = constant_op.constant(pylist)
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        tensor, dtype, preferred_dtype)
+    with self.test_session():
+      self.assertIs(tensor, converted)
+
+  @parameterized.parameters([
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.float32,
+          message=('Tensor conversion requested dtype float32 for '
+                   'Tensor with dtype int32')),
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.string,
+          message=('Tensor conversion requested dtype string for '
+                   'Tensor with dtype int32')),
+  ])
+  def testConvertTensorError(self,
+                             pylist,
+                             message,
+                             dtype=None,
+                             preferred_dtype=None):
+    tensor = constant_op.constant(pylist)
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(tensor, dtype, preferred_dtype)
+
+  #=============================================================================
+  # Tests where the 'value' param is a np.array
+  #=============================================================================
+  @parameterized.parameters([
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          expected_dtype=dtypes.int32),
+      dict(
+          value=np.array([[b'a', b'b'], [b'c', b'd']]),
+          expected_dtype=dtypes.string),
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          preferred_dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          preferred_dtype=dtypes.string,
+          expected_dtype=dtypes.int32),
+  ])
+  def testConvertNumpyArray(self,
+                            value,
+                            dtype=None,
+                            preferred_dtype=None,
+                            expected_dtype=None):
+    if expected_dtype is None:
+      expected_dtype = value.dtype if dtype is None else dtype
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        value, dtype, preferred_dtype)
+    self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
+    with self.test_session():
+      self.assertAllEqual(value, converted)
+
+  @parameterized.parameters([
+      dict(
+          value=np.array([['a', 'b'], ['c', 'd']], dtype=str),
+          dtype=dtypes.int32,
+          message=r"invalid literal for int\(\) with base 10: 'a'"),
+  ])
+  def testConvertNumpyArrayError(self,
+                                 value,
+                                 message,
+                                 dtype=None,
+                                 preferred_dtype=None):
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..425f3957c38550f43ceb74fff7f236bff1ace69c
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -0,0 +1,1493 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Array operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_ragged_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
+
+#===============================================================================
+# Row Partitioning
+#===============================================================================
+
+
+def value_rowids(rt_input, name=None):
+  """Returns the row indices for the `values` in the given ragged tensor.
+
+  `value_rowids(rt)` corresponds one-to-one with the outermost dimension of
+  `rt.values`, and specifies the row containing each value.  In particular,
+  the row `rt[row]` consists of the values `rt.values[j]` where
+  `value_rowids(rt)[j] == row`.
+
+  Args:
+    rt_input: The RaggedTensor whose row indices should be returned.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
+    The returned tensor is nonnegative, and is sorted in ascending order.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    >>> rt.values.eval()
+    [3, 1, 4, 1, 5, 9, 2, 6]
+    >>> ragged.value_rowids(rt).eval()
+    [0, 0, 0, 0, 2, 2, 2, 3]  # corresponds 1:1 with rt.values
+    ```
+  """
+  if not ragged_tensor.is_ragged(rt_input):
+    raise TypeError(
+        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
+  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
+      rt_input.cached_value_rowids is not None):
+    return rt_input.cached_value_rowids
+
+  with ops.name_scope(name, 'RaggedValueRowIds', [rt_input]):
+    return segment_id_ops.row_splits_to_segment_ids(rt_input.row_splits)
+
+
+def nrows(rt_input, out_type=dtypes.int64, name=None):
+  """Returns the number of rows in the given potentially ragged tensor.
+
+  I.e., the size of the outermost dimension of the tensor.
+
+  Args:
+    rt_input: The potentially ragged tensor whose number of rows should be
+      returned.
+    out_type: `dtype` for the returned tensor.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A scalar `Tensor` with dtype `out_type`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    >>> ragged.nrows(rt).eval()  # rt has 5 rows.
+    5
+    ```
+  """
+  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
+      rt_input.cached_nrows is not None):
+    return rt_input.cached_nrows
+
+  with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+    if ragged_tensor.is_ragged(rt_input):
+      return array_ops.shape(rt_input.row_splits, out_type=out_type)[0] - 1
+    else:
+      return array_ops.shape(rt_input, out_type=out_type)[0]
+
+
+def row_starts(rt_input, name=None):
+  """Returns the start indices for rows in the given ragged tensor.
+
+  These indices specify where the values for each row begin in
+  `rt_input.values`.  `ragged.row_starts(rt_input)` is equal to
+  `rt_input.row_splits[:-1]`.
+
+  Args:
+    rt_input: The RaggedTensor whose row starts should be returned.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A 1-D Tensor of int64 with shape `[nrows]`.
+    The returned tensor is nonnegative, and is sorted in ascending order.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    >>> ragged.values(rt).eval()
+    [3, 1, 4, 1, 5, 9, 2, 6]
+    >>> ragged.row_starts(rt).eval()  # indices of row starts in ragged.values
+    [0, 4, 4, 7, 8]
+    ```
+  """
+  if not ragged_tensor.is_ragged(rt_input):
+    raise TypeError(
+        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
+  with ops.name_scope(name, 'RaggedRowStarts', [rt_input]):
+    return rt_input.row_splits[:-1]
+
+
+def row_limits(rt_input, name=None):
+  """Returns the limit indices for rows in the given ragged tensor.
+
+  These indices specify where the values for each row end in
+  `rt_input.values`.  `ragged.row_limits(rt_input)` is equal to
+  `rt_input.row_splits[:-1]`.
+
+  Args:
+    rt_input: The RaggedTensor whose row limits should be returned.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A 1-D Tensor of int64 with shape `[nrows]`.
+    The returned tensor is nonnegative, and is sorted in ascending order.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    >>> ragged.values(rt).eval()
+    [3, 1, 4, 1, 5, 9, 2, 6]
+    >>> ragged.row_limits(rt).eval()  # indices of row limits in ragged.values
+    [4, 4, 7, 8, 8]
+    ```
+  """
+  if not ragged_tensor.is_ragged(rt_input):
+    raise TypeError(
+        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
+  with ops.name_scope(name, 'RaggedRowLimits', [rt_input]):
+    return rt_input.row_splits[1:]
+
+
+def row_lengths(rt_input, axis=1, name=None):
+  """Returns the lengths of the rows in the given potentially ragged tensor.
+
+  `ragged.row_lengths(rt_input)[i]` indicates the number of values in the
+  `i`th row of `rt_input`.
+
+  Args:
+    rt_input: The potentially ragged tensor whose row lengths should be
+      returned.  Must have at least `axis+1` dimensions.
+    axis: An integer constant indicating the axis whose row lengths should be
+      returned.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A potentially Tensor of int64 with shape `rt_input.shape[:axis]`.
+
+  Raises:
+    ValueError: If rt_input is a scalar, or `axis` is out of bounds.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []])
+    >>> ragged.row_lengths(rt).eval()  # lengths of rows in rt
+    [2, 0, 2, 1, 0]
+    >>> ragged.row_lengths(rt, axis=2).eval()  # lengths of axis=2 rows.
+    [[3, 1], [], [2, 1], [1], []]
+    ```
+  """
+  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
+      rt_input.cached_row_lengths is not None):
+    return rt_input.cached_row_lengths
+
+  with ops.name_scope(name, 'RaggedRowLengths', [rt_input]):
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+    ndims = rt_input.shape.ndims
+    if ndims is not None:
+      if ndims == 0:
+        raise ValueError('rt_input may not be a scalar.')
+      elif not -ndims <= axis < ndims:
+        raise ValueError('axis=%d out of bounds: expected %d<=axis<%d.' %
+                         (axis, -ndims, ndims))
+    if ragged_tensor.is_ragged(rt_input):
+      axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
+      if axis == 0:
+        return nrows(rt_input)
+      elif axis == 1:
+        splits = rt_input.row_splits
+        return splits[1:] - splits[:-1]
+      else:
+        return rt_input.with_values(row_lengths(rt_input.values, axis - 1))
+    else:
+      shape = array_ops.shape(rt_input, out_type=dtypes.int64)
+      return array_ops.ones(shape[:axis], dtypes.int64) * shape[axis]
+
+
+#===============================================================================
+# Bounding Shape
+#===============================================================================
+def bounding_shape(rt_input, axis=None, name=None):
+  """Returns the tight bounding box shape for a potentially ragged tensor.
+
+  Args:
+    rt_input: A potentially ragged tensor.
+    axis: An integer scalar or vector indicating which axes to return the
+      bounding box for.  If not specified, then the full bounding box is
+      returned.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    An int64 `Tensor`.  If `axis` is not specified, then `output`
+    is a vector with `output.shape=[rt_input.shape.ndims]`.  If `axis` is a
+    scalar, then the `output` is a scalar.  If `axis` is a vector, then
+    `output` is a vector, where `output[i]` is the bounding size for
+    dimension `axis[i]`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+    >>> ragged.bounding_shape(rt).eval().tolist()
+    [5, 4]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedBoundingBox', [rt_input, axis]):
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+    if not ragged_tensor.is_ragged(rt_input):
+      bbox = array_ops.shape(rt_input)
+      return bbox if axis is None else array_ops.gather(bbox, axis)
+
+    nested_splits = rt_input.nested_row_splits
+    rt_inner_values = rt_input.inner_values
+
+    # Optimized special cases for when axis=0 or axis=1:
+    if isinstance(axis, int):
+      if axis == 0:
+        return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
+      elif axis == 1:
+        return math_ops.maximum(math_ops.reduce_max(row_lengths(rt_input)), 0)
+
+    splits_shape = array_ops.shape(rt_input.row_splits, out_type=dtypes.int64)
+    inner_values_shape = array_ops.shape(rt_inner_values, out_type=dtypes.int64)
+
+    ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
+        math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
+        for splits in nested_splits
+    ])
+    inner_dimensions = inner_values_shape[1:]
+
+    bbox = array_ops.concat([ragged_dimensions, inner_dimensions], axis=0)
+    return bbox if axis is None else array_ops.gather(bbox, axis)
+
+
+#===============================================================================
+# ragged_gather
+#===============================================================================
+# TODO(edloper): Add an `axis` argument
+def gather(params, indices, name=None):
+  """Gathers ragged slices from `params` axis `0` according to `indices`.
+
+  Returns `RaggedTensor` output, such that:
+
+  ```python
+  output.shape = indices.shape + params.shape[1:]
+  output.ragged_rank = indices.shape.ndims + params.ragged_rank
+  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+  ```
+
+  `params` may be ragged.  `indices` may be ragged.
+  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
+  then an error is returned.
+
+  Examples:
+
+  ```python
+  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
+  >>> indices = tf.constant([3, 1, 2, 1, 0])
+  >>> ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+  >>> ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
+
+  >>> print ragged.gather(params, ragged_indices).eval().tolist()
+  [['d', 'b', 'c'], ['b'], [], ['a']]
+
+  >>> print ragged.gather(ragged_params, indices).eval().tolist()
+  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
+
+  >>> print ragged.gather(ragged_params, ragged_indices).eval().tolist()
+  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
+  ```
+
+  Args:
+    params: The potentially ragged tensor from which to gather values. Must be
+      at least rank 1.
+    indices: The potentially ragged tensor indicating which values to gather.
+      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
+      params.shape[0]]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor`, where `output.dtype=params.dtype` and
+    `output.shape=indices.shape + params.shape[1:]` and
+    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
+
+  Raises:
+    ValueError: If indices.shape.ndims is not known statically.
+  """
+  with ops.name_scope(name, 'RaggedGather', [params, indices]):
+    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+
+    if ragged_tensor.is_ragged(indices):
+      return indices.with_values(gather(params, indices.values))
+
+    if not ragged_tensor.is_ragged(params):
+      return array_ops.gather(params, indices)
+
+    indices = ops.convert_to_tensor(indices)
+    if indices.shape.ndims is None:
+      raise ValueError('indices.shape.ndims must be known statically')
+
+    result = gen_ragged_array_ops.ragged_gather(
+        indices=indices,
+        params_dense_values=params.inner_values,
+        params_nested_splits=params.nested_row_splits,
+        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
+        1)
+
+    # Compose the RaggedTensor from splits & values.
+    return ragged_factory_ops.from_nested_row_splits(
+        result.output_dense_values, result.output_nested_splits)
+
+
+#===============================================================================
+# ragged.batch_gather
+#===============================================================================
+def batch_gather(params, indices, name=None):
+  """Gathers slices from `params` according to `indices` with batch dims.
+
+  This operation is similar to `gather`, but it assumes that the leading `N`
+  dimensions of `indices` and `params` are batch dimensions, and performs a
+  gather within each batch.  In particular, when using this operation with `N`
+  batch dimensions `B1...BN`:
+
+  * `indices` has shape `[B1...BN, I]`
+  * `params` has shape `[B1...BN, P1...PM]`.
+  * `result` has shape `[B1...BN, I, P2...PM]`.
+  * `result[b1...bN, i, p2...pM] =
+    params[b1...bN, indices[b1...bN, i], p2...pM]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    >>> indices = ragged.constant([[1, 2, 0], [], [], [0, 0]])
+    >>> ragged.batch_gather(params, indices)
+    [['b', 'c', 'a'], [], [], ['e', 'e']]
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.batch_gather(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
+    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_ndims = indices.shape.ndims
+    if indices_ndims is None:
+      raise ValueError(
+          'batch_gather does not allow indices with unknown shape.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+
+    if ragged_tensor.is_ragged(indices):
+      # If the outermost ragged dimension is a batch dimension, recurse.
+      if indices_ndims > 2:
+        if not ragged_tensor.is_ragged(params):
+          raise ValueError('batch shape from indices does '
+                           'not match params shape')
+        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
+        with ops.control_dependencies(checks):
+          return ragged_factory_ops.from_row_splits(
+              batch_gather(params.values, indices.values), indices.row_splits)
+
+      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
+      else:
+        # Ensure that `params` is ragged and has at least 2 dimensions.
+        if not ragged_tensor.is_ragged(params):
+          if params.shape.ndims is not None and params.shape.ndims < 2:
+            raise ValueError('batch shape from indices does '
+                             'not match params shape')
+          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
+
+        # Adjust indices from within-batch to global (in params.values), and
+        # then use ragged.gather to gather them.
+        num_indices = row_lengths(indices)
+        params_starts = row_starts(params)
+        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
+        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
+        return ragged_factory_ops.from_row_splits(
+            gather(params.values, adjusted_index_values), indices.row_splits)
+
+    else:  # params is a RaggedTensor and indices is a Tensor.
+      if indices_ndims == 1:
+        return gather(params, indices)
+      elif indices_ndims == 2:
+        # Adjust indices from batch-local to global (in params.values)
+        adjustments = array_ops.expand_dims(row_starts(params), 1)
+        adjusted_indices = math_ops.to_int64(indices) + adjustments
+        return gather(params.values, adjusted_indices)
+      else:
+        raise ValueError(
+            'batch shape from indices does not match params shape')
+
+
+#===============================================================================
+# ragged.gather_nd
+#===============================================================================
+def gather_nd(params, indices, name=None):
+  """Gather slices from `params` using `n`-dimensional indices.
+
+  This operation is similar to `gather`, but it uses the innermost dimension
+  of `indices` to define a slice into `params`.  In particular, if:
+
+  * `indices` has shape `[A1...AN, I]`
+  * `params` has shape `[B1...BM]`
+
+  Then:
+
+  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
+  * `result[a1...aN] = params[indices[a1...aN, :]]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[A1...AN, I]`.
+    indices: A potentially ragged tensor with shape `[B1...BM]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
+
+  #### Examples:
+    ```python
+    >>> params = tf.ragged.constant_value(
+    ...     [ [ ['000', '001'], ['010'              ]          ],
+    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
+    ...       [ [            ], ['210'              ]          ] ])
+
+    >>> # Gather 2D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2], [0]])
+    [ [ [            ], ['210'] ]
+      [ ['000', '001'], ['010'] ] ]
+
+    >>> # Gather 1D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
+    [['210'], ['000', '001']]
+
+    >>> # Gather scalars from a 3D tensor
+    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
+    ['001', '112']
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.gather_nd(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
+
+    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_shape = indices.shape
+    indices_ndims = indices_shape.ndims
+    if indices_ndims is None:
+      raise ValueError('indices.rank be statically known.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+    if (ragged_tensor.is_ragged(indices) and
+        indices_ndims == indices.ragged_rank + 1):
+      raise ValueError('The innermost dimension of indices may not be ragged')
+
+    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
+    # that each index slices into.
+    index_size = indices_shape[-1].value
+    if index_size is None:
+      raise ValueError('indices.shape[-1] must be statically known.')
+
+    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
+    # dense, then we convert it to ragged before recursing, and then convert
+    # the result back to `dense` if appropriate.
+    if indices_ndims > 2:
+      indices_is_dense = not ragged_tensor.is_ragged(indices)
+      if indices_is_dense:
+        indices = ragged_conversion_ops.from_tensor(
+            indices, ragged_rank=indices_ndims - 2)
+      result = indices.with_inner_values(
+          gather_nd(params, indices.inner_values))
+      if (indices_is_dense and ragged_tensor.is_ragged(result) and
+          result.ragged_rank == indices_ndims - 2):
+        result = ragged_conversion_ops.to_tensor(result)
+      return result
+
+    # indices_ndims <= 2, and the innermost dimension of indices may not be
+    # ragged, so `indices` must not be ragged.
+    assert not ragged_tensor.is_ragged(indices)
+    assert ragged_tensor.is_ragged(params)
+
+    # Handle corner case: An empty index tuple selects the entire `params`
+    # value.  So if `index_size` is zero, then tile `params`.
+    if index_size == 0:
+      params_ndims = params.ragged_rank + array_ops.rank(params.inner_values)
+      for dim in range(indices_ndims - 1):
+        params = expand_dims(params, axis=0)
+      multiples = array_ops.concat([
+          array_ops.shape(indices)[:-1],
+          array_ops.ones([params_ndims], dtypes.int32)
+      ],
+                                   axis=0)
+      return tile(params, multiples)
+
+    # When index_size=1, we can just flatten the index tuples and use gather.
+    elif index_size == 1:
+      flattened_index_tuples = array_ops.reshape(indices, [-1])
+      return gather(params, flattened_index_tuples)
+
+    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
+    # Flatten both the index tuples and the params, such that the flattened
+    # index tuples point to the correct values in the flattened params; and
+    # then use ragged.gather on the flattened index tuples & params.
+    else:
+      indices = math_ops.to_int64(indices)
+
+      # Flatten the outermost 2 dimensions of the index tuples & params.
+      flattened_index_tuples = array_ops.gather(params.row_splits,
+                                                indices[..., 0])
+      flattened_index_tuples += indices[..., 1]
+      flattened_params = params.values
+
+      # Flatten any remaining dimensions.
+      for dim in range(2, index_size):
+        if not ragged_tensor.is_ragged(flattened_params):
+          flattened_index_tuples = array_ops.expand_dims(
+              flattened_index_tuples, axis=1)
+          flattened_index_tuples = array_ops.concat(
+              [flattened_index_tuples, indices[..., dim:]], axis=1)
+          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
+
+        flattened_index_tuples = array_ops.gather(
+            row_starts(flattened_params), flattened_index_tuples)
+        flattened_index_tuples += indices[..., dim]
+        flattened_params = flattened_params.values
+
+      # Gather using the flattened index tuples and params.
+      return gather(flattened_params, flattened_index_tuples)
+
+
+#===============================================================================
+# Masking
+#===============================================================================
+def boolean_mask(data, mask, keepdims=False, name=None):
+  """Applies a boolean mask to `data`.
+
+  Returns a potentially ragged tensor that is formed by retaining the elements
+  in `data` where the corresponding value in `mask` is `True`.
+
+  If `keepdims` is true then outer dimensions (corresponding to the `mask`
+  dimensions) are preserved, and:
+
+  * `output[a1...aA, i, b1...bB] = data[a1...aA, j, b1...bB]`
+
+     Where `j` is the `i`th `True` entry of `mask[a1...aA]`.
+
+  If `keepdims` is false, then the outer dimensions are collapsed (similar to
+  the behavior of `tf.boolean_mask`), and:
+
+  * `output[i, b1...bB] = data[a1...aA, b1...bB]`
+
+     Where `(a1...aA)` is the `i`th `True` entry of `mask`
+     (in row-major order).
+
+  Args:
+    data: A potentially ragged tensor.
+    mask: A potentially ragged boolean tensor.  `mask`'s shape must be a prefix
+      of `data`'s shape.  `rank(mask)` must be known statically.
+    keepdims: Whether to preserve the outer dimensions (`keepdims=True`) or
+      flatten them (`keepdims=False`).
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A potentially ragged tensor that is formed by retaining the elements in
+    `data` where the corresponding value in `mask` is `True`.
+
+    If `keepdims` is false:
+
+    * `rank(output) = rank(data) - rank(mask) + 1`.
+    * `output.ragged_rank = max(data.ragged_rank - rank(mask) + 1, 0)`.
+
+    If `keepdims` is true:
+
+    * `rank(output) = rank(data)`.
+    * `output.ragged_rank = max(data.ragged_rank, rank(mask) - 1)`.
+
+  Raises:
+    ValueError: if `rank(mask)` is not known statically; or if `mask.shape` is
+      not a prefix of `data.shape`.
+
+  #### Examples:
+    ```python
+    >>> # Aliases for True & False so data and mask line up.
+    >>> T, F = (True, False)
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Flatten outer dims.
+    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
+    ...     keepdims=False).tolist()
+    [1, 3, 7]
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Preserve outer dims.
+    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
+    ...     keepdims=True).tolist()
+    [[1, 3], [], [7]]
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Flatten outer dims.
+    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
+    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
+    ...     keepdims=False).tolist()
+    [3, 5, 6]
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Preserve outer dims.
+    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
+    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
+    ...     keepdims=True).tolist()
+    [[3], [], [5, 6]]
+
+    >>> tf.ragged.boolean_mask(  # Mask rows of a 2D RaggedTensor.
+    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
+    ...     tf.ragged.constant([True, False, True]),
+    ...     keepdims=True).tolist()
+    [[1, 2, 3], [5, 6]]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedMask', [data, mask]):
+    # Convert inputs to tensors.
+    data = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        data, name='data')
+    mask = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        mask, dtypes.bool, name='mask')
+
+    # Get static rank of mask.
+    if mask.shape.ndims is None:
+      raise ValueError('mask.shape.ndims must be kown statically.')
+    elif mask.shape.ndims == 0:
+      raise ValueError('mask cannot be scalar.')
+
+    # If mask is ragged, then recurse with a non-ragged mask.
+    if ragged_tensor.is_ragged(mask):
+      if not ragged_tensor.is_ragged(data):
+        data = ragged_conversion_ops.from_tensor(
+            data, ragged_rank=mask.ragged_rank)
+      # Check that mask.nested_row_splits is a prefix of
+      # data.nested_row_splits.
+      splits_list = [
+          mask.nested_row_splits, data.nested_row_splits[:mask.ragged_rank]
+      ]
+      with ops.control_dependencies(
+          ragged_util.assert_splits_match(splits_list)):
+        # Strip off ragged `splits` until `mask` is non-ragged.  Keep the splits
+        # that we strip off in `splits`, so we can add them back on after
+        # we recursively mask the non-ragged data.
+        splits = []
+        while ragged_tensor.is_ragged(mask):
+          if mask.shape.ndims > 2:
+            splits.append(mask.row_splits)
+          else:
+            # Count the number of True mask values in each row to find the
+            # lengths of the filtered rows; then convert to splits.
+            int_mask = ragged_functional_ops.map_inner_values(
+                math_ops.cast, mask, dtype=dtypes.int64)
+            masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
+            splits.append(_lengths_to_splits(masked_row_lengths))
+          mask = mask.values
+          data = data.values
+
+        # Recursively apply the nested non-ragged mask to the nested data.
+        masked_values = boolean_mask(data, mask, keepdims)
+
+        # Add the ragged `splits` back to the result.
+        if keepdims:
+          masked_values = ragged_factory_ops.from_nested_row_splits(
+              masked_values, splits)
+
+        return masked_values
+
+    # If mask is non-ragged and has rank 1, and data is ragged, then build a
+    # ragged tensor with the indicated rows.
+    elif ragged_tensor.is_ragged(data) and mask.shape.ndims == 1:
+      # Get the masked splits: first get the length of each row, then filter
+      # out the rows that we are deleting, and convert that filtered set of
+      # masks back to a splits tensor.
+      lengths = row_lengths(data)
+      masked_lengths = array_ops.boolean_mask(lengths, mask)
+      masked_splits = _lengths_to_splits(masked_lengths)
+
+      # Get the masked values: first get row ids corresponding to each
+      # value, then use tf.gather to build a boolean mask that's false for
+      # values that come from rows that we are deleting, and use that mask to
+      # construct the masked values tensor.
+      segment_ids = segment_id_ops.row_splits_to_segment_ids(data.row_splits)
+      segment_mask = array_ops.gather(mask, segment_ids)
+      masked_values = boolean_mask(data.values, segment_mask, keepdims=False)
+
+      return ragged_factory_ops.from_row_splits(masked_values, masked_splits)
+
+    # If mask is non-ragged and has rank>1, then convert it to be ragged,
+    # with a ragged rank matching data.
+    if ragged_tensor.is_ragged(data):
+      mask = ragged_conversion_ops.from_tensor(
+          mask, ragged_rank=min(data.ragged_rank, mask.shape.ndims - 1))
+      return boolean_mask(data, mask, keepdims)
+
+    # Otherwise, data and mask are both `Tensor`s.
+    else:
+      # Apply `boolean_mask` to get the masked values.
+      masked_values = array_ops.boolean_mask(data, mask)
+
+      if mask.shape.ndims >= 2 and keepdims:
+        # Add the innermost ragged dimension.  For each innermost cell, get the
+        # number of values it contains.  Then flatten that to get a list of
+        # cell lengths, and convert it to splits.  Finally, combine the splits
+        # and values to get the innermost ragged tensor.
+        masked_lengths = math_ops.count_nonzero(mask, axis=-1)
+        flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
+        masked_values = ragged_factory_ops.from_row_lengths(
+            masked_values, flattened_masked_lengths)
+
+        # Wrap remaining ragged dimensions.
+        if mask.shape.ndims > 2 and keepdims:
+          mask_shape = array_ops.shape(mask, out_type=dtypes.int64)
+          split_size = math_ops.cumprod(mask_shape) + 1
+          for dim in range(mask.shape.ndims - 3, -1, -1):
+            elt_size = mask_shape[dim + 1]
+            masked_splits = math_ops.range(split_size[dim]) * elt_size
+            masked_values = ragged_factory_ops.from_row_splits(
+                masked_values, masked_splits)
+
+      return masked_values
+
+
+#===============================================================================
+# Concatenation and Stacking
+#===============================================================================
+def concat(rt_inputs, axis, name=None):
+  """Concatenates potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  concatenation of `[rt[i0...iaxis] for rt in rt_inputs]`.
+
+  Args:
+    rt_inputs: A list of potentially ragged tensors.  May not be empty. All
+      `rt_inputs` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to concatenate.
+      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `rt_inputs` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in rt_inputs]))`.
+
+  Raises:
+    ValueError: If `rt_inputs` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.concat([t1, t2], axis=0)
+    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
+    >>> ragged.concat([t1, t2], axis=1)
+    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
+    ```
+  """
+  if not isinstance(rt_inputs, (list, tuple)):
+    rt_inputs = [rt_inputs]
+  with ops.name_scope(name, 'RaggedConcat', rt_inputs):
+    return _ragged_stack_concat_helper(rt_inputs, axis, stack_values=False)
+
+
+def stack(rt_inputs, axis, name=None):
+  """Stacks potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  list `[rt[i0...iaxis] for rt in rt_inputs]`.
+
+  Args:
+    rt_inputs: A list of potentially ragged tensors.  May not be empty. All
+      `rt_inputs` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to stack.
+      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `rt_inputs` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K+1`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in rt_inputs]))`.
+
+  Raises:
+    ValueError: If `rt_inputs` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.stack([t1, t2], axis=0)
+    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
+    >>> ragged.stack([t1, t2], axis=1)
+    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
+    ```
+  """
+  if not isinstance(rt_inputs, (list, tuple)):
+    rt_inputs = [rt_inputs]
+  with ops.name_scope(name, 'RaggedConcat', rt_inputs):
+    return _ragged_stack_concat_helper(rt_inputs, axis, stack_values=True)
+
+
+def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
+  """Helper function to concatenate or stack ragged tensors.
+
+  Args:
+    rt_inputs: A list of RaggedTensors or Tensors to combine.
+    axis: The axis along which to concatenate or stack.
+    stack_values: A boolean -- if true, then stack values; otherwise,
+      concatenate them.
+
+  Returns:
+    A RaggedTensor.
+  Raises:
+    ValueError: If rt_inputs is empty, or if axis is out of range.
+  """
+  # Validate parameters.
+  if not rt_inputs:
+    raise ValueError('rt_inputs may not be empty.')
+
+  # Convert input tensors.
+  rt_inputs = [
+      ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+          rt_input, name='rt_input') for rt_input in rt_inputs
+  ]
+
+  # Special case: if there's only one input, then return it as-is.
+  if len(rt_inputs) == 1:
+    if stack_values:
+      return expand_dims(rt_inputs[0], axis=0)
+    else:
+      return rt_inputs[0]
+
+  # Check the rank (number of dimensions) of the input tensors.
+  ndims = None
+  for rt in rt_inputs:
+    if ndims is None:
+      ndims = rt.shape.ndims
+    else:
+      rt.shape.assert_has_rank(ndims)
+
+  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
+  axis = ragged_util.get_positive_axis(axis, out_ndims)
+
+  # If all the inputs are Tensors, and we're combining the final dimension,
+  # then we can delegate to the tf.stack/tf.concat operation, and return a
+  # Tensor.
+  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
+    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
+      if stack_values:
+        return array_ops.stack(rt_inputs, axis)
+      else:
+        return array_ops.concat(rt_inputs, axis)
+
+  # Convert any Tensor inputs to RaggedTensors.  This makes it
+  # possible to concatenate Tensors and RaggedTensors together.
+  for i in range(len(rt_inputs)):
+    if not ragged_tensor.is_ragged(rt_inputs[i]):
+      rt_inputs[i] = ragged_conversion_ops.from_tensor(
+          rt_inputs[i], ragged_rank=1)
+
+  # Convert the input tensors to all have the same ragged_rank.
+  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
+  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
+
+  if axis == 0:
+    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
+  elif axis == 1:
+    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
+  else:  # axis > 1: recurse.
+    values = [rt.values for rt in rt_inputs]
+    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
+    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
+      return ragged_factory_ops.from_row_splits(
+          _ragged_stack_concat_helper(values, axis - 1, stack_values),
+          splits[0][0])
+
+
+def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 0.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  # Concatenate the inner values together.
+  inner_values = [rt.inner_values for rt in rt_inputs]
+  concatenated_inner_values = array_ops.concat(inner_values, axis=0)
+
+  # Concatenate the splits together for each ragged dimension (adjusting
+  # split offsets as necessary).
+  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
+  ragged_rank = rt_inputs[0].ragged_rank
+  concatenated_nested_splits = [
+      _concat_ragged_splits([ns[dim]
+                             for ns in nested_splits])
+      for dim in range(ragged_rank)
+  ]
+
+  # If we are performing a stack operation, then add another splits.
+  if stack_values:
+    stack_lengths = array_ops.stack([nrows(rt) for rt in rt_inputs])
+    stack_splits = _lengths_to_splits(stack_lengths)
+    concatenated_nested_splits.insert(0, stack_splits)
+
+  return ragged_factory_ops.from_nested_row_splits(concatenated_inner_values,
+                                                   concatenated_nested_splits)
+
+
+def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 1.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  num_inputs = len(rt_inputs)
+
+  rt_nrows = nrows(rt_inputs[0])
+  nrows_msg = 'Input tensors have incompatible shapes.'
+  nrows_checks = [
+      check_ops.assert_equal(nrows(rt), rt_nrows, message=nrows_msg)
+      for rt in rt_inputs[1:]
+  ]
+
+  with ops.control_dependencies(nrows_checks):
+    # Concatentate the inputs together to put them in a single ragged tensor.
+    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
+
+    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
+    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
+    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
+    #                      ...,
+    #                  rt_inputs[0][M], ..., rt_input[N][M]]
+    # where `N=num_inputs-1` and `M=rt_nrows-1`.
+    row_indices = math_ops.range(rt_nrows * num_inputs)
+    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
+    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
+    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
+    permuted_rt = gather(concatenated_rt, row_permutation)
+
+    if stack_values:
+      # Add a new splits tensor to group together the values.
+      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
+      _copy_row_shape(rt_inputs, stack_splits)
+      return ragged_factory_ops.from_row_splits(permuted_rt, stack_splits)
+    else:
+      # Merge together adjacent rows by dropping the row-split indices that
+      # separate them.
+      concat_splits = permuted_rt.row_splits[::num_inputs]
+      _copy_row_shape(rt_inputs, concat_splits)
+      return ragged_factory_ops.from_row_splits(permuted_rt.values,
+                                                concat_splits)
+
+
+def _copy_row_shape(rt_inputs, splits):
+  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
+  for rt in rt_inputs:
+    if rt.shape[0] is not None:
+      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
+
+
+#===============================================================================
+# Tiling
+#===============================================================================
+def tile(rt_input, multiples, name=None):
+  """Constructs a `RaggedTensor` by tiling a given `RaggedTensor`.
+
+  The values of `rt_input` are replicated `multiples[i]` times along the
+  `i`th dimension (for each dimension `i`).  For every dimension `axis` in
+  `rt_input`, the length of each output element in that dimension is the
+  length of corresponding input element multiplied by `multiples[axis]`.
+
+  Args:
+    rt_input: A `RaggedTensor`.
+    multiples: A 1-D integer `Tensor`.  Length must be the same as the number of
+      dimensions in `rt_input`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor` with the same type, rank, and ragged_rank as `rt_input`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> ragged.tile(rt, [3, 2]).eval().tolist()
+    [[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedTile', [rt_input, multiples]):
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+    multiples = ragged_util.convert_to_int_tensor(
+        multiples, name='multiples', dtype=dtypes.int64)
+    multiples.shape.assert_has_rank(1)
+    if not ragged_tensor.is_ragged(rt_input):
+      return array_ops.tile(rt_input, multiples, name)
+
+    # If the constant value of `multiples` is available, then we can use it
+    # to skip tiling dimensions where `multiples=1`.
+    const_multiples = tensor_util.constant_value(multiples)
+
+    return ragged_factory_ops.from_nested_row_splits(
+        _tile_ragged_values(rt_input, multiples, const_multiples),
+        _tile_ragged_splits(rt_input, multiples, const_multiples))
+
+
+def _tile_ragged_values(rt_input, multiples, const_multiples=None):
+  """Builds inner_values tensor for a tiled `RaggedTensor`.
+
+  Returns a tensor that repeats the values in
+  `rt_input.inner_values` in the
+  appropriate pattern to construct a `RaggedTensor` that tiles `rt_input` as
+  specified by `multiples`.
+
+  Args:
+    rt_input: The `RaggedTensor` whose values should be repeated.
+    multiples: A 1-D integer `tensor`, indicating how many times each dimension
+      should be repeated.
+    const_multiples: Optional constant value for multiples.  Used to skip tiling
+      dimensions where `multiples=1`.
+
+  Returns:
+    A `Tensor` with the same type and rank as `rt_input.inner_values`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> _tile_ragged_values(rt, [3, 2]).eval().tolist()
+    [1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3]
+    ```
+  """
+  ragged_rank = rt_input.ragged_rank
+  nested_splits = rt_input.nested_row_splits
+
+  # Pointers to the values in `rt_input.inner_values`.
+  inner_value_ids = math_ops.range(nested_splits[-1][-1])
+
+  # For each ragged dimension (working from the innermost to outermost),
+  # expand `inner_value_ids` as necessary to tile that dimension.
+  prev_splits = None
+  for axis in range(ragged_rank, 0, -1):
+    # Ragged splits for this dimension.
+    splits = nested_splits[axis - 1]
+
+    # Adjust splits so they point into `inner_value_ids` (instead of just
+    # pointing into the next dimension's values).
+    if prev_splits is not None:  # Not the first pass through the loop.
+      splits = array_ops.gather(prev_splits * multiples[axis + 1], splits)
+
+    # Repeat each element in this ragged dimension `multiples[axis]` times.
+    if const_multiples is None or const_multiples[axis] != 1:
+      inner_value_ids = _repeat_ranges(inner_value_ids, splits, multiples[axis])
+
+    prev_splits = splits
+
+  # Gather the tiled inner values.
+  ragged_tiled_values = array_ops.gather(rt_input.inner_values, inner_value_ids)
+
+  # Tile the inner_values for the uniform dimensions (i.e., for `axis=0` plus
+  # `axis=range(ragged_rank, rank)`).
+  inner_repeats = array_ops.concat([multiples[:1], multiples[ragged_rank + 1:]],
+                                   axis=0)
+  return array_ops.tile(ragged_tiled_values, inner_repeats)
+
+
+def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
+  """Builds nested_split tensors for a tiled `RaggedTensor`.
+
+  Returns a list of split tensors that can be used to construct the
+  `RaggedTensor` that tiles `rt_input` as specified by `multiples`.
+
+  Args:
+    rt_input: The `RaggedTensor` that is being tiled.
+    multiples: A 1-D integer `tensor`, indicating how many times each dimension
+      should be repeated.
+    const_multiples: Optional constant value for multiples.  Used to skip tiling
+      dimensions where `multiples=1`.
+
+  Returns:
+    A list of 1-D `int64` `Tensor`s (one for each ragged dimension in
+    `rt_input`).
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> _tile_ragged_splits(rt, [3, 2]).eval().tolist()
+    [0, 4, 6, 10, 12, 16, 18]
+    ```
+  """
+  ragged_rank = rt_input.ragged_rank
+  nested_splits = rt_input.nested_row_splits
+
+  # For each ragged dimension: nested_splits[axis] -> result_splits[axis].
+  result_splits = []
+  for axis in range(ragged_rank):
+    # Get the length of each row for the input tensor for this dimension.
+    input_lengths = nested_splits[axis][1:] - nested_splits[axis][:-1]
+
+    # Multiply those lengths by the `multiples` of dimension axis+1, since
+    # each value will be repeated that number of times.
+    output_lengths = input_lengths * multiples[axis + 1]
+
+    # Repeat ranges of the row lengths as necessary for them to be tiled in
+    # each ragged dimension `d < axis`.  (Start with dimension d=axis-1, and
+    # work our way up to dimension d=0.)
+    repeats = 1
+    for d in range(axis - 1, -1, -1):
+      if const_multiples is None or const_multiples[d + 1] != 1:
+        splits = nested_splits[d] * repeats
+        output_lengths = _repeat_ranges(output_lengths, splits,
+                                        multiples[d + 1])
+      repeats *= multiples[d + 1]
+
+    # Tile splits for the outermost (uniform) dimension.
+    output_lengths = array_ops.tile(output_lengths, multiples[:1])
+
+    # Convert to splits.
+    result_splits.append(_lengths_to_splits(output_lengths))
+
+  return result_splits
+
+
+#===============================================================================
+# Reshaping
+#===============================================================================
+
+
+def expand_dims(rt_input, axis, name=None):
+  """Inserts a dimension with shape 1 into a potentially ragged tensor's shape.
+
+  Given a potentially ragged tenor `rt_input`, this operation inserts a
+  dimension with size 1 at the dimension `axis` of `rt_input`'s shape.
+
+  * If `rt_input` is a `Tensor`, then this is equivalent to
+    `tf.expand_dims`.
+  * If `rt_input` is ragged, and `axis=0`, then the new dimension will be
+    uniform; but the previously outermost dimension will become ragged.
+  * If `rt_input` is ragged, and `0 < axis < rt_input.ragged_rank`, then the
+    new dimension will be ragged.
+  * If `rt_input` is ragged, and axis >= rt_input.ragged_rank`, then the new
+    dimension will be uniform.
+
+  The following table gives some examples showing how `ragged.expand_dims`
+  impacts the shapes of different input tensors.  Ragged dimensions are
+  indicated by enclosing them in parentheses.
+
+  rt_input.shape          | axis | result.shape
+  ----------------------- | ---- | -----------------------------
+  `[D1, D2]`              |  `0` | `[1, D1, D2]`
+  `[D1, D2]`              |  `1` | `[D1, 1, D2]`
+  `[D1, D2]`              |  `2` | `[D1, D2, 1]`
+  `[D1, (D2), (D3), D4]`  |  `0` | `[1, (D1), (D2), (D3), D4]`
+  `[D1, (D2), (D3), D4]`  |  `1` | `[D1, (1), (D2), (D3), D4]`
+  `[D1, (D2), (D3), D4]`  |  `2` | `[D1, (D2), (1), (D3), D4]`
+  `[D1, (D2), (D3), D4]`  |  `3` | `[D1, (D2), (D3), 1, D4]`
+  `[D1, (D2), (D3), D4]`  |  `4` | `[D1, (D2), (D3), D4, 1]`
+
+  Args:
+    rt_input: The potentially tensor that should be expanded with a new
+      dimension.
+    axis: An integer constant indicating where the new dimension should be
+      inserted.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor with the same values as `rt_input`, with an added dimension of
+    size 1 at `axis`.
+
+  #### Examples:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> print rt.shape
+    TensorShape([2, None])
+
+    >>> expanded = ragged.expand_dims(rt, axis=0)
+    >>> print(expanded.shape, expanded.eval().tolist())
+    TensorShape([1, None, None]) [[[1, 2], [3]]]
+
+    >>> expanded = ragged.expand_dims(rt, axis=1)
+    >>> print(expanded.shape, expanded.eval().tolist())
+    TensorShape([2, None, None]) [[[1, 2]], [[3]]]
+
+    >>> expanded = ragged.expand_dims(rt, axis=2)
+    >>> print(expanded.shape, expanded.eval().tolist())
+    TensorShape([2, None, 1]) [[[1], [2]], [[3]]]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedExpandDims', [rt_input]):
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+
+    if not ragged_tensor.is_ragged(rt_input):
+      return array_ops.expand_dims(rt_input, axis)
+
+    ndims = None if rt_input.shape.ndims is None else rt_input.shape.ndims + 1
+    axis = ragged_util.get_positive_axis(axis, ndims)
+    if axis == 0:
+      values = rt_input
+      splits = array_ops.stack([0, nrows(rt_input)])
+    elif axis == 1:
+      values = rt_input
+      splits = math_ops.range(nrows(rt_input) + 1)
+    else:
+      values = expand_dims(rt_input.values, axis - 1)
+      splits = rt_input.row_splits
+
+    return ragged_factory_ops.from_row_splits(values, splits)
+
+
+#===============================================================================
+# ragged.where
+#===============================================================================
+def where(condition, x=None, y=None, name=None):
+  """Return the elements, either from `x` or `y`, depending on the `condition`.
+
+  : If both `x` and `y` are `None`:
+    Returns the coordinates of true elements of `condition`. The coordinates
+    are returned in a 2-D tensor with shape
+    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
+    coordinates of the `i`th true value (in row-major order).
+
+  : If both `x` and `y` are non-`None`:
+    Returns a tensor formed by selecting values from `x` where condition is
+    true, and from `y` when condition is false.  In particular:
+
+    : If `condition`, `x`, and `y` all have the same shape:
+
+      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
+      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+
+    : Otherwise:
+
+      * `condition` must be a vector.
+      * `x` and `y` must have the same number of dimensions.
+      * The outermost dimensions of `condition`, `x`, and `y` must all have the
+        same size.
+      * `result[i] = x[i]` if `condition[i]` is true.
+      * `result[i] = y[i]` if `condition[i]` is false.
+
+  Args:
+    condition: A potentially ragged tensor of type `bool`
+    x: A potentially ragged tensor (optional).
+    y: A potentially ragged tensor (optional).  Must be specified if `x` is
+      specified.  Must have the same rank and type as `x`.
+    name: A name of the operation (optional)
+
+  Returns:
+    : If both `x` and `y` are `None`:
+      A `Tensor` with shape `(num_true, dim_size(condition))`.
+    : Otherwise:
+      A potentially ragged tensor with the same type, rank, and outermost
+      dimension size as `x` and `y`.
+      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+
+  Raises:
+    ValueError: When exactly one of `x` or `y` is non-`None`; or when
+      `condition`, `x`, and `y` have incompatible shapes.
+
+  #### Examples:
+    ```python
+    >>> # Coordinates where condition is true.
+    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
+    >>> ragged.where(condition)
+    [[0, 0], [0, 2], [1, 1]]
+
+    >>> # Elementwise selection between x and y, based on condition.
+    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
+    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'b', 'C'], ['d', 'E']]
+
+    >>> # Row selection between x and y, based on condition.
+    >>> condition = [True, False]
+    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'B', 'C'], ['d', 'e']]
+    ```
+  """
+  if (x is None) != (y is None):
+    raise ValueError('x and y must be either both None or both non-None')
+  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
+    condition = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        condition, name='condition')
+    if x is None:
+      return _coordinate_where(condition)
+    else:
+      x = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(y, name='y')
+      return _elementwise_where(condition, x, y)
+
+
+def _elementwise_where(condition, x, y):
+  """Ragged version of tf.where(condition, x, y)."""
+  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
+  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
+  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
+
+  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
+    return array_ops.where(condition, x, y)
+
+  elif condition_is_ragged and x_is_ragged and y_is_ragged:
+    return ragged_functional_ops.map_inner_values(array_ops.where, condition, x,
+                                                  y)
+  elif not condition_is_ragged:
+    # Concatenate x and y, and then use `gather` to assemble the selected rows.
+    condition.shape.assert_has_rank(1)
+    x_nrows = nrows(x)
+    x_and_y = concat([x, y], axis=0)
+    indices = array_ops.where(condition, math_ops.range(x_nrows),
+                              x_nrows + math_ops.range(nrows(y)))
+    return gather(x_and_y, indices)
+
+  else:
+    raise ValueError('Input shapes do not match.')
+
+
+def _coordinate_where(condition):
+  """Ragged version of tf.where(condition)."""
+  if not isinstance(condition, ragged_tensor.RaggedTensor):
+    return array_ops.where(condition)
+
+  # The coordinate for each `true` value in condition.values.
+  selected_coords = _coordinate_where(condition.values)
+
+  # Convert the first index in each coordinate to a row index and column index.
+  first_index = selected_coords[:, 0]
+  selected_rows = array_ops.gather(value_rowids(condition), first_index)
+  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
+  selected_cols = first_index - selected_row_starts
+
+  # Assemble the row & column index with the indices for inner dimensions.
+  return array_ops.concat([
+      array_ops.expand_dims(selected_rows, 1),
+      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
+  ],
+                          axis=1)
+
+
+#===============================================================================
+# Internal Helper Functions
+#===============================================================================
+
+
+def _lengths_to_splits(lengths):
+  """Returns splits corresponding to the given lengths."""
+  return array_ops.concat([[0], math_ops.cumsum(lengths)], axis=0)
+
+
+def _increase_ragged_rank_to(rt_input, ragged_rank):
+  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
+  if ragged_rank > 0:
+    if not ragged_tensor.is_ragged(rt_input):
+      rt_input = ragged_conversion_ops.from_tensor(rt_input)
+    if rt_input.ragged_rank < ragged_rank:
+      rt_input = rt_input.with_values(
+          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
+  return rt_input
+
+
+def _concat_ragged_splits(splits_list):
+  """Concatenates a list of RaggedTensor splits to form a single splits."""
+  pieces = [splits_list[0]]
+  splits_offset = splits_list[0][-1]
+  for splits in splits_list[1:]:
+    pieces.append(splits[1:] + splits_offset)
+    splits_offset += splits[-1]
+  return array_ops.concat(pieces, axis=0)
+
+
+def _repeat_ranges(params, splits, multiple):
+  """Repeats each range of `params` (as specified by `splits`) `multiple` times.
+
+  Let the `i`th range of `params` be defined as
+  `params[splits[i]:splits[i + 1]]`.  Then this function returns a tensor
+  containing range 0 repeated `multiple` times, followed by range 1 repeated
+  `multiple`, ..., followed by the last range repeated `multiple` times.
+
+  Args:
+    params: The `Tensor` whose values should be repeated.
+    splits: A splits tensor indicating the ranges of `params` that should be
+      repeated.
+    multiple: The number of times each range should be repeated.
+
+  Returns:
+    A `Tensor` with the same rank and type as `params`.
+
+  #### Example:
+    ```python
+    >>> _repeat_ranges(['a', 'b', 'c'], [0, 2, 3], 3)
+    ['a', 'b', 'a', 'b', 'a', 'b', 'c', 'c', 'c']
+    ```
+  """
+  # Repeat each split value `multiple` times.  E.g., if `splits=[0 3 4]` and
+  # `multiples=3`, then `repeated_splits=[0 0 0 3 3 3 4 4 4]`.
+  repeated_splits = array_ops.tile(
+      array_ops.expand_dims(splits, axis=1), array_ops.stack([1, multiple]))
+  repeated_splits = array_ops.reshape(repeated_splits, [-1])
+
+  # Divide the splits into repeated starts & repeated limits.  E.g., if
+  # `repeated_splits=[0 0 0 3 3 3 4 4 4]` then `repeated_starts=[0 0 0 3 3 3]`
+  # and `repeated_limits=[3 3 3 4 4 4]`.
+  n_splits = array_ops.shape(repeated_splits, out_type=dtypes.int64)[0]
+  repeated_starts = repeated_splits[:n_splits - multiple]
+  repeated_limits = repeated_splits[multiple:]
+
+  # Get indices for each range from starts to limits, and use those to gather
+  # the values in the desired repetition pattern.
+  offsets = ragged_math_ops.range(repeated_starts, repeated_limits).values
+  return array_ops.gather(params, offsets)
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..79a2ecd87ae11b2c6aadb888074bc8721123cba3
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -0,0 +1,199 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged.batch_gather."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
+                              parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Example
+      #=========================================================================
+      dict(
+          descr='Docstring example',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d'], [], ['e']]),
+          indices=ragged.constant_value([[1, 2, 0], [], [], [0, 0]]),
+          expected=ragged.constant_value([[b'b', b'c', b'a'], [], [],
+                                          [b'e', b'e']])),
+      #=========================================================================
+      # 0 Batch Dimensions
+      #=========================================================================
+      dict(
+          descr='params: [P1], indices: [I], result: [I]',
+          params=['a', 'b', 'c', 'd'],
+          indices=[3, 2],
+          expected=[b'd', b'c']),
+      dict(
+          descr='params: [P1, (P2)], indices: [I], result: [I, (P2)]',
+          params=ragged.constant_value([['a', 'b'], [], ['c'], ['d', 'e']]),
+          indices=[3, 2],
+          expected=ragged.constant_value([[b'd', b'e'], [b'c']])),
+      #=========================================================================
+      # 1 Batch Dimension
+      #=========================================================================
+      dict(
+          descr='params: [B1, P1], indices: [B1, I], result: [B1, I]',
+          params=[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']],
+          indices=[[2, 0], [0, 1], [1, 0]],
+          expected=[[b'c', b'a'], [b'd', b'e'], [b'h', b'g']]),
+      dict(
+          descr='params: [B1, (P1)], indices: [B1, I], result: [B1, I]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d', 'e'], ['g']]),
+          indices=[[2, 0], [0, 1], [0, 0]],
+          expected=[[b'c', b'a'], [b'd', b'e'], [b'g', b'g']]),
+      dict(
+          descr='params: [B1, P1], indices: [B1, (I)], result: [B1, (I)]',
+          params=[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']],
+          indices=ragged.constant_value([[2, 0, 2], [0], [1]]),
+          expected=ragged.constant_value([[b'c', b'a', b'c'], [b'd'], [b'h']])),
+      dict(
+          descr=('params: [B1, (P1), (P2), P3], indices: [B1, I], '
+                 'result: [B1, I, (P2), P3]'),
+          params=ragged.constant_value(
+              [[[['a']], [['b'], ['c']]], [[['d'], ['e']], [['f']]], [[['g']]]],
+              ragged_rank=2),
+          indices=[[1, 0], [0, 1], [0, 0]],
+          expected=ragged.constant_value(
+              [[[[b'b'], [b'c']], [[b'a']]], [[[b'd'], [b'e']], [[b'f']]],
+               [[[b'g']], [[b'g']]]],
+              ragged_rank=2)),
+      #=========================================================================
+      # 2 Batch Dimensions
+      #=========================================================================
+      dict(
+          descr=('params: [B1, B2, P1], indices: [B1, B2, I], '
+                 'result: [B1, B2, I]'),
+          params=[[['a', 'b', 'c']], [['d', 'e', 'f']], [['g', 'h', 'i']]],
+          indices=[[[2, 0]], [[0, 1]], [[1, 0]]],
+          expected=[[[b'c', b'a']], [[b'd', b'e']], [[b'h', b'g']]]),
+      dict(
+          descr=('params: [B1, (B2), P1], indices: [B1, (B2), I], '
+                 'result: [B1, (B2), I]'),
+          params=ragged.constant_value(
+              [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
+              ragged_rank=1),
+          indices=ragged.constant_value([[[2, 0], [0, 1]], [[1, 0]]],
+                                        ragged_rank=1),
+          expected=ragged.constant_value(
+              [[[b'c', b'a'], [b'd', b'e']], [[b'h', b'g']]], ragged_rank=1)),
+      dict(
+          descr=('params: [B1, (B2), (P1)], indices: [B1, (B2), I], '
+                 'result: [B1, (B2), I]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']], [['e', 'f']]],
+                                       ragged_rank=2),
+          indices=ragged.constant_value([[[2, 0], [0, 0]], [[1, 0]]],
+                                        ragged_rank=1),
+          expected=ragged.constant_value(
+              [[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]], ragged_rank=1)),
+      dict(
+          descr=('params: [B1, (B2), P1], indices: [B1, (B2), (I)], '
+                 'result: [B1, (B2), (I)]'),
+          params=ragged.constant_value(
+              [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
+              ragged_rank=1),
+          indices=ragged.constant_value([[[2, 1, 0], [0]], [[1, 1]]],
+                                        ragged_rank=2),
+          expected=ragged.constant_value(
+              [[[b'c', b'b', b'a'], [b'd']], [[b'h', b'h']]], ragged_rank=2)),
+      #=========================================================================
+      # 3 Batch Dimensions
+      #=========================================================================
+      dict(
+          descr=(
+              'params: [B1, (B2), (B3), (P1)], indices: [B1, (B2), (B3), I], '
+              'result: [B1, (B2), (B3), I]'),
+          params=ragged.constant_value(
+              [[[['a', 'b', 'c'], ['d']], [['e', 'f']]]], ragged_rank=3),
+          indices=ragged.constant_value([[[[2, 0], [0, 0]], [[1, 0]]]],
+                                        ragged_rank=2),
+          expected=ragged.constant_value(
+              [[[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]]], ragged_rank=2)),
+  ])
+  def testRaggedBatchGather(self, descr, params, indices, expected):
+    result = ragged.batch_gather(params, indices)
+    self.assertEqual(
+        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    with self.test_session():
+      if hasattr(expected, 'tolist'):
+        expected = expected.tolist()
+      self.assertEqual(result.eval().tolist(), expected)
+
+  def testRaggedBatchGatherUnknownRankError(self):
+    params = [['a', 'b'], ['c', 'd']]
+    indices = array_ops.placeholder(dtypes.int32, shape=None)
+    ragged_indices = ragged.from_row_splits(indices, [0, 2, 4])
+
+    with self.assertRaisesRegexp(
+        ValueError, 'batch_gather does not allow indices with unknown shape.'):
+      ragged.batch_gather(params, indices)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'batch_gather does not allow indices with unknown shape.'):
+      ragged.batch_gather(params, ragged_indices)
+
+  @parameterized.parameters([
+      dict(
+          params=ragged.constant([['a'], ['b'], ['c']]),
+          indices=ragged.constant([[0], [0]]),
+          message='Dimensions 3 and 2 are not compatible'),
+      dict(
+          params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          indices=ragged.constant([[[0, 0], [0, 0, 0]], [[0]]]),
+          message='batch shape from indices does not match params shape'),
+      dict(
+          params=ragged.constant([[[0, 0], [0, 0, 0]], [[0]]]),
+          indices=ragged.constant([[[0, 0]], [[0, 0, 0]], [[0]]]),
+          message='Dimensions must be equal, but are 3 and 4'),
+      dict(
+          params=ragged.constant([[[0, 0], [0, 0, 0]], [[0]], [[0]]]),
+          indices=ragged.constant([[[0, 0]], [[0, 0, 0]], [[0]]]),
+          error=errors.InvalidArgumentError,
+          message='Condition x == y did not hold element-wise'),
+      dict(
+          params=ragged.constant(['a', 'b', 'c']),
+          indices=ragged.constant([[0], [0]]),
+          message='batch shape from indices does not match params shape'),
+      dict(params=ragged.constant_value([['a']]),
+           indices=0,
+           message='indices.rank must be at least 1.'),
+      dict(params=ragged.constant_value([['a']]),
+           indices=[[[0]]],
+           message='batch shape from indices does not match params shape'),
+  ])
+  def testRaggedBatchGatherStaticError(self,
+                                       params,
+                                       indices,
+                                       message,
+                                       error=ValueError):
+    with self.assertRaisesRegexp(error, message):
+      ragged.batch_gather(params, indices)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3279c1e84036d605443fee1f82a426ec2b5340b
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
@@ -0,0 +1,351 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.boolean_mask."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
+                              parameterized.TestCase):
+  # Define short constants for true & false, so the data & mask can be lined
+  # up in the examples below.  This makes it easier to read the examples, to
+  # see which values should be kept vs. masked.
+  T = True
+  F = False
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring examples
+      #=========================================================================
+      dict(
+          descr='Docstring example 1',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          mask=[[T, F, T], [F, F, F], [T, F, F]],
+          keepdims=False,
+          expected=[1, 3, 7]),
+      dict(
+          descr='Docstring example 2',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          mask=[[T, F, T], [F, F, F], [T, F, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[1, 3], [], [7]])),
+      dict(
+          descr='Docstring example 3',
+          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          keepdims=False,
+          expected=[3, 5, 6]),
+      dict(
+          descr='Docstring example 4',
+          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          keepdims=True,
+          expected=ragged.constant_value([[3], [], [5, 6]])),
+      dict(
+          descr='Docstring example 5',
+          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=[True, False, True],
+          keepdims=False,
+          expected=ragged.constant_value([[1, 2, 3], [5, 6]])),
+      #=========================================================================
+      # Uniform data and uniform mask.
+      #=========================================================================
+      dict(
+          descr='data.shape=[7]; mask.shape=[7]; keepdims=True',
+          data=[1, 2, 3, 4, 5, 6, 7],
+          mask=[T, F, T, T, F, F, F],
+          keepdims=True,
+          expected=[1, 3, 4]),
+      dict(
+          descr='data.shape=[5, 3]; mask.shape=[5]; keepdims=True',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]],
+          mask=[True, False, True, True, False],
+          keepdims=True,
+          expected=[[1, 2, 3], [7, 8, 9], [10, 11, 12]]),
+      dict(
+          descr='data.shape=[5, 3]; mask.shape=[5, 3]; keepdims=True',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 1, 2], [3, 4, 5]],
+          mask=[[F, F, F], [T, F, T], [T, T, T], [F, F, F], [T, T, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[], [4, 6], [7, 8, 9], [], [3, 4]])),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3]; keepdims=True',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[F, F, T],
+          keepdims=True,
+          expected=[[[2, 4], [6, 8]]]),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3]; keepdims=False',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[F, F, T],
+          keepdims=False,
+          expected=[[[2, 4], [6, 8]]]),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3, 2]; keepdims=True',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[[T, F], [T, T], [F, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[[1, 2]], [[5, 6], [7, 8]], []],
+                                         ragged_rank=1)),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3, 2]; keepdims=False',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[[T, F], [T, T], [F, F]],
+          keepdims=False,
+          expected=[[1, 2], [5, 6], [7, 8]]),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3, 2, 2]; keepdims=True',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[[[T, T], [F, T]], [[F, F], [F, F]], [[T, F], [T, T]]],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2], [4]], [[], []], [[2], [6, 8]]])),
+      dict(
+          descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=True',
+          data=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                [[[2, 4], [6, 8]], [[1, 3], [5, 7]]]],
+          mask=[[[[T, T], [F, F]], [[T, F], [F, F]]],
+                [[[F, F], [F, F]], [[T, T], [T, F]]]],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[[1, 2], []], [[5], []]], [[[], []], [[1, 3], [5]]]])),
+      dict(
+          descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=False',
+          data=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                [[[2, 4], [6, 8]], [[1, 3], [5, 7]]]],
+          mask=[[[[T, T], [F, F]], [[T, F], [F, F]]],
+                [[[F, F], [F, F]], [[T, T], [T, F]]]],
+          keepdims=False,
+          expected=[1, 2, 5, 1, 3, 5]),
+
+      #=========================================================================
+      # Ragged data and ragged mask.
+      #=========================================================================
+      dict(
+          descr='data.shape=[5, (D2)]; mask.shape=[5, (D2)]',
+          data=ragged.constant_value(
+              [[1, 2], [3, 4, 5, 6], [7, 8, 9], [], [1, 2, 3]]),
+          mask=ragged.constant_value(
+              [[F, F], [F, T, F, T], [F, F, F], [], [T, F, T]]),
+          keepdims=True,
+          expected=ragged.constant_value([[], [4, 6], [], [], [1, 3]])),
+      dict(
+          descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
+          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2]], [[5, 6], [7, 8]], []])),
+      dict(
+          descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
+          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          keepdims=False,
+          expected=ragged.constant_value([[1, 2], [5, 6], [7, 8]])),
+      dict(
+          descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8], [2, 4]], [[6, 8]]],
+              ragged_rank=1),
+          mask=ragged.constant_value([[T, F], [T, T, F], [F]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2]], [[5, 6], [7, 8]], []],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+              ragged_rank=1),
+          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          keepdims=False,
+          expected=[[1, 2], [5, 6], [7, 8]]),
+      dict(
+          descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2), (D3)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4]]]),
+          mask=ragged.constant_value(
+              [[[T, T], [F, T]], [[F, F], [F, F]], [[T, F]]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2], [4]], [[], []], [[2]]])),
+      dict(
+          descr=('data.shape=[3, (D2), (D3), (D4)]; '
+                 'mask.shape=[3, (D2), (D3), (D4)]'),
+          data=ragged.constant_value(
+              [[[[1, 2], [3, 4]], [[5, 6]]], [[[2, 4], [6, 8]]]]),
+          mask=ragged.constant_value(
+              [[[[T, T], [F, F]], [[T, F]]], [[[F, F], [T, T]]]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[[1, 2], []], [[5]]], [[[], [6, 8]]]])),
+
+      #=========================================================================
+      # Ragged mask and uniform data
+      #=========================================================================
+      dict(
+          descr='data.shape=[2, 3]; mask.shape=[2, (3)]',
+          data=[[1, 2, 3], [4, 5, 6]],
+          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          keepdims=True,
+          expected=ragged.constant_value([[1], [5, 6]])),
+      dict(
+          descr='data.shape=[2, 3, 2]; mask.shape=[2, (3)]',
+          data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
+          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2]], [[9, 0], [2, 4]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[2, 3, 2]; mask.shape=[2, (3), 2]',
+          data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
+          mask=ragged.constant_value(
+              [[[T, F], [F, F], [T, T]], [[T, F], [F, T], [F, F]]],
+              ragged_rank=1),
+          keepdims=True,
+          expected=ragged.constant_value([[[1], [], [5, 6]], [[7], [0], []]])),
+
+      #=========================================================================
+      # Ragged data and uniform mask.
+      #=========================================================================
+      dict(
+          descr='data.shape=[4, (D2)]; mask.shape=[4]',
+          data=ragged.constant_value([[1, 2, 3], [4], [], [5, 6]]),
+          mask=[T, F, T, F],
+          keepdims=False,
+          expected=ragged.constant_value([[1, 2, 3], []])),
+      dict(
+          descr='data.shape=[4, (D2), (D3)]; mask.shape=[4]',
+          data=ragged.constant_value(
+              [[[1, 2, 3]], [[4], []], [[5, 6]], []]),
+          mask=[T, F, T, T],
+          keepdims=False,
+          expected=ragged.constant_value([[[1, 2, 3]], [[5, 6]], []])),
+      dict(
+          descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1),
+          mask=[T, F, F, T],
+          keepdims=False,
+          expected=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1),
+          mask=[T, F, F, T],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[1, (2)]; mask.shape=[1, 2]',
+          data=ragged.constant_value([[1, 2]]),
+          mask=[[T, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[1]])),
+      dict(
+          descr='data.shape=[2, (2), (D3)]; mask.shape=[2, 2]',
+          data=ragged.constant_value([[[1], [2, 3]], [[], [4, 5, 6]]]),
+          mask=[[T, F], [T, T]],
+          keepdims=True,
+          expected=ragged.constant_value([[[1]], [[], [4, 5, 6]]])),
+      dict(
+          descr='data.shape=[2, (2), 3]; mask.shape=[2, 2]',
+          data=ragged.constant_value(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
+              ragged_rank=1),
+          mask=[[T, F], [T, T]],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2, 3]], [[7, 8, 9], [2, 4, 6]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[2, (2), 3]; mask.shape=[2, 2, 3]',
+          data=ragged.constant_value(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
+              ragged_rank=1),
+          mask=[[[T, F, F], [T, F, T]], [[T, F, T], [F, F, F]]],
+          keepdims=True,
+          expected=ragged.constant_value([[[1], [4, 6]], [[7, 9], []]])),
+  ])  # pyformat: disable
+  def testBooleanMask(self, descr, data, mask, keepdims, expected):
+    actual = ragged.boolean_mask(data, mask, keepdims=keepdims)
+    self.assertEqual(
+        getattr(actual, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    with self.test_session():
+      if isinstance(expected, ragged.RaggedTensorValue):
+        expected = expected.tolist()
+      self.assertEqual(actual.eval().tolist(), expected)
+
+  def testErrors(self):
+    self.assertRaisesRegexp(ValueError,
+                            r'mask\.shape\.ndims must be kown statically',
+                            ragged.boolean_mask, [[1, 2]],
+                            array_ops.placeholder(dtypes.bool))
+
+    self.assertRaisesRegexp(TypeError,
+                            "Expected bool, got 0 of type 'int' instead.",
+                            ragged.boolean_mask, [[1, 2]], [[0, 1]])
+    self.assertRaisesRegexp(
+        ValueError, 'Tensor conversion requested dtype bool for '
+        'RaggedTensor with dtype int32', ragged.boolean_mask,
+        ragged.constant([[1, 2]]), ragged.constant([[0, 0]]))
+
+    self.assertRaisesRegexp(
+        ValueError, r'Shapes \(1, 2\) and \(1, 3\) are incompatible',
+        ragged.boolean_mask, [[1, 2]], [[True, False, True]])
+
+    # self.assertRaisesRegexp(ValueError,
+    #                         r'data=.* is non-ragged but mask=.* is ragged',
+    #                         ragged.boolean_mask, [[1, 2]],
+    #                         ragged.constant([[True, False]]))
+
+    # self.assertRaisesRegexp(
+    #     ValueError, r'data=.* is ragged but mask=.* is non-ragged',
+    #     ragged.boolean_mask, ragged.constant([[1, 2]]), [[True, False]])
+
+    self.assertRaisesRegexp(errors.InvalidArgumentError,
+                            r'Inputs must have identical ragged splits',
+                            ragged.boolean_mask, ragged.constant([[1, 2]]),
+                            ragged.constant([[True, False, True]]))
+
+    self.assertRaisesRegexp(ValueError, 'mask cannot be scalar',
+                            ragged.boolean_mask, [[1, 2]], True)
+
+    self.assertRaisesRegexp(ValueError,
+                            'mask cannot be scalar', ragged.boolean_mask,
+                            ragged.constant([[1, 2]]), True)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bddc5d8580fab7cecdfa0f923ddfd772b05c9a8a
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -0,0 +1,313 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.concat."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  def _rt_inputs_to_tensors(self, rt_inputs, ragged_ranks=None):
+    if ragged_ranks is None:
+      ragged_ranks = [None] * len(rt_inputs)
+    return [
+        ragged.constant(rt_input, ragged_rank=rrank)
+        if rrank != 0 else constant_op.constant(rt_input)
+        for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
+    ]
+
+  @parameterized.parameters(
+      dict(
+          descr='Two rank-2 inputs with empty value axis=1',
+          rt_inputs=([[]], [[]]),
+          axis=1,
+          expected=[[]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=0,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21'], [b'b00'],
+                    [b'b10']]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=1,
+          expected=[
+              [b'a00', b'a01', b'b00'],
+              [b'b10', b'b11', b'b12'],
+              [b'a20', b'a21', b'a22', b'b20']]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=-2,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21'], [b'b00'],
+                    [b'b10']]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=-1,
+          expected=[
+              [b'a00', b'a01', b'b00'],
+              [b'b10', b'b11', b'b12'],
+              [b'a20', b'a21', b'a22', b'b20']],
+          expected_shape=[3, None]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10']],                            # shape=(2, None)
+              [['c00'], ['c10', 'c11'], ['c21']]),           # shape=(3, None)
+          axis=0,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21', b'a22'], [b'b00'],
+                    [b'b10'], [b'c00'], [b'c10', b'c11'], [b'c21']]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],     # shape=(3, None)
+              [[], ['c10', 'c11'], ['c20', 'c21']]),         # shape=(3, None)
+          axis=1,
+          expected=[
+              [b'a00', b'a01', b'b00'],
+              [b'b10', b'b11', b'b12', b'c10', b'c11'],
+              [b'a20', b'a21', b'a22', b'b20', b'c20', b'c21']]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=0',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [['c100', 'c101', 'c102', 'c103']], [[], ['c210', 'c211']]]),
+          axis=0,
+          expected=[
+              [[b'a000', b'a001'], [b'a010']],
+              [[b'a100', b'a101', b'a102'], [b'a110', b'a111']],
+              [[b'b000']],
+              [[b'b100', b'b101'], [b'b110']],
+              [],
+              [[b'c100', b'c101', b'c102', b'c103']],
+              [[], [b'c210', b'c211']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [[], ['c110', 'c111']]]),
+          axis=1,
+          expected=[
+              [[b'a000', b'a001'], [b'a010'], [b'b000']],
+              [[b'a100', b'a101', b'a102'], [b'a110', b'a111'],
+               [b'b100', b'b101'], [b'b110'], [], [b'c110', b'c111']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=2',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=2,
+          expected=[
+              [[b'a000', b'a001', b'c000'],
+               [b'a010', b'b010', b'b011', b'c010']],
+              [[b'a100', b'a101', b'a102', b'b100', b'b101'],
+               [b'a110', b'a111', b'b110', b'c110', b'c111']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=-1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=-1,
+          expected=[
+              [[b'a000', b'a001', b'c000'],
+               [b'a010', b'b010', b'b011', b'c010']],
+              [[b'a100', b'a101', b'a102', b'b100', b'b101'],
+               [b'a110', b'a111', b'b110', b'c110', b'c111']]]),
+      dict(
+          descr='ragged_concat([uniform, ragged, uniform], axis=1)',
+          ragged_ranks=[0, 1, 0],
+          rt_inputs=(
+              [['0('], ['1('], ['2(']],                   # shape=(3, 1)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],  # shape=(3, None)
+              [[')0'], [')1'], [')2']]),                  # shape=(3, 1)
+          axis=1,
+          expected=[
+              [b'0(', b'b00', b')0'],
+              [b'1(', b'b10', b'b11', b'b12', b')1'],
+              [b'2(', b'b20', b')2']]),
+      dict(
+          descr='ragged_concat([uniform, uniform], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21'],
+              [b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']],
+          expected_ragged_rank=1),
+      dict(
+          descr='ragged_concat([uniform, ragged], axis=0)',
+          ragged_ranks=[0, 1],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21'],
+              [b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']]),
+      dict(
+          descr='ragged_concat([uniform, ragged], axis=0) with rank-3 inputs',
+          ragged_ranks=[0, 2],
+          rt_inputs=(
+              [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],  # shape = (2, 2, 2)
+              [[[8], [8, 8]]]),                      # shape = (2, None, None)
+          axis=0,
+          expected=[[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[8], [8, 8]]]),
+      dict(
+          descr='Two rank-3 inputs with ragged_rank=1, axis=-1',
+          ragged_ranks=[1, 1],
+          rt_inputs=(
+              [[[0, 1], [2, 3], [4, 5]], [], [[6, 7], [8, 9]]],
+              [[[9, 8], [7, 6], [5, 4]], [], [[3, 2], [1, 0]]]),
+          axis=-1,
+          expected=[
+              [[0, 1, 9, 8], [2, 3, 7, 6], [4, 5, 5, 4]], [],
+              [[6, 7, 3, 2], [8, 9, 1, 0]]],
+          expected_ragged_rank=1),
+      dict(
+          descr='ragged_concat([vector, vector], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=([1, 2, 3], [4, 5, 6]),
+          axis=0,
+          expected=[1, 2, 3, 4, 5, 6]),
+      dict(
+          descr='One input (so ragged_conat is a noop)',
+          rt_inputs=([['a00', 'a01'], [], ['a20', 'a21']],),
+          axis=0,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21']]),
+  )   # pyformat: disable
+  def testRaggedConcat(self,
+                       descr,
+                       rt_inputs,
+                       axis,
+                       expected,
+                       ragged_ranks=None,
+                       expected_ragged_rank=None,
+                       expected_shape=None):
+    rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
+    concatenated = ragged.concat(rt_inputs, axis)
+    if expected_ragged_rank is not None:
+      self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
+    if expected_shape is not None:
+      self.assertEqual(concatenated.shape.as_list(), expected_shape)
+    with self.test_session():
+      self.assertEqual(concatenated.eval().tolist(), expected)
+
+  @parameterized.parameters(
+      dict(
+          rt_inputs=(),
+          axis=0,
+          error=ValueError,
+          message=r'rt_inputs may not be empty\.'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=r'foo',
+          error=TypeError,
+          message='axis must be an int'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=-3,
+          error=ValueError,
+          message='axis=-3 out of bounds: expected -2<=axis<2'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=2,
+          error=ValueError,
+          message='axis=2 out of bounds: expected -2<=axis<2'),
+      dict(
+          ragged_ranks=(0, 0),
+          rt_inputs=([[1, 2]], [[3, 4], [5, 6]]),
+          axis=1,
+          error=ValueError,
+          message='Dimension 0 in both shapes must be equal'),
+  )
+  def testStaticError(self, rt_inputs, axis, error, message, ragged_ranks=None):
+    rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
+    self.assertRaisesRegexp(error, message, ragged.concat, rt_inputs, axis)
+
+  @parameterized.parameters([
+      dict(
+          ragged_ranks=(1, 1),
+          rt_inputs=([[1, 2]], [[3, 4], [5, 6]]),
+          axis=1,
+          error=errors.InvalidArgumentError,
+          message='Input tensors have incompatible shapes'),
+  ])
+  def testRuntimeError(self, rt_inputs, axis, error, message,
+                       ragged_ranks=None):
+    rt_inputs = [
+        array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
+    ]
+    concatenated = ragged.concat(rt_inputs, axis)
+    with self.test_session():
+      self.assertRaisesRegexp(error, message, concatenated.eval)
+
+  def testNegativeAxisWithUnknownRankError(self):
+    rt_inputs = [
+        array_ops.placeholder(dtypes.int64),
+        array_ops.placeholder(dtypes.int64)
+    ]
+    self.assertRaisesRegexp(
+        ValueError, r'axis may only be negative if ndims is statically known.',
+        ragged.concat, rt_inputs, -1)
+
+  def testSingleTensorInput(self):
+    """Tests ragged_concat with a single tensor input.
+
+    Usually, we pass a list of values in for rt_inputs.  However, you can
+    also pass in a single value (as with tf.concat), in which case it simply
+    returns that tensor.  This test exercises that path.
+    """
+    rt_inputs = ragged.constant([[1, 2], [3, 4]])
+    concatenated = ragged.concat(rt_inputs, 0)
+    with self.test_session():
+      self.assertEqual(concatenated.eval().tolist(), [[1, 2], [3, 4]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_const_op_test.py b/tensorflow/python/ops/ragged/ragged_const_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66c39475fa6b0c525ad9a8e52f71a47ff2b87068
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_const_op_test.py
@@ -0,0 +1,370 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.constant."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import googletest
+
+
+class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      #=========================================================================
+      # 0-dimensional tensors.
+      dict(pylist=b'x', expected_shape=()),
+
+      #=========================================================================
+      # 1-dimensional tensors.
+      dict(pylist=[1, 2, 3], expected_shape=(3,)),
+
+      #=========================================================================
+      # 2-dimensional tensors.
+      dict(pylist=[[1, 2, 3], [4], [5, 6]], expected_shape=(3, None)),
+      dict(pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], expected_shape=(3, None)),
+
+      #=========================================================================
+      # 3-dimensional tensors.
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          expected_shape=(3, None, None)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      #=========================================================================
+      # 4-dimensional tensors.
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          expected_shape=(2, None, None, None)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          ragged_rank=1,
+          expected_shape=(2, None, 2, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2,),
+          expected_shape=(2, None, None, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2, 2),
+          expected_shape=(2, None, 2, 2)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ default ragged_rank and inner_shape
+      dict(pylist=[], expected_shape=(0,)),
+      dict(pylist=[[], [], []], expected_shape=(3, None)),
+      dict(
+          pylist=[[[], []], [], [[], [[]]]],
+          expected_shape=(3, None, None, None)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ explicit ragged_rank or inner_shape
+      dict(pylist=[], ragged_rank=1, expected_shape=(0, None)),
+      dict(pylist=[], ragged_rank=2, expected_shape=(0, None, None)),
+      dict(pylist=[], inner_shape=(0, 100, 20), expected_shape=(0, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=1,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=2,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, None, 100, 20)),
+      dict(pylist=[[], [], []], ragged_rank=2, expected_shape=(3, None, None)),
+      dict(pylist=[], inner_shape=(0,), expected_shape=(0,)),
+      dict(pylist=[[]], inner_shape=(1, 0), expected_shape=(1, 0)),
+
+      #=========================================================================
+      # default/inferred dtypes
+      dict(pylist=[], expected_dtype=dtypes.float32),
+      dict(pylist=[[[], [[[]], []]]], expected_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], expected_dtype=dtypes.int32),
+      dict(pylist=[[1., 2.], [], [4., 5., 6.]], expected_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3.], [4, 5, 6]], expected_dtype=dtypes.float32),
+      dict(pylist=[[b'a', b'b'], [b'c']], expected_dtype=dtypes.string),
+      dict(pylist=[[True]], expected_dtype=dtypes.bool),
+
+      #=========================================================================
+      # explicit dtypes
+      dict(pylist=[], dtype=dtypes.float32),
+      dict(pylist=[], dtype=dtypes.string),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.int64),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.int32),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.float32),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=dtypes.float16),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=dtypes.float32),
+      dict(pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
+           dtype=dtypes.string),
+  )
+  def testRaggedConst(self,
+                      pylist,
+                      dtype=None,
+                      ragged_rank=None,
+                      inner_shape=None,
+                      expected_shape=None,
+                      expected_dtype=None):
+    """Tests that `ragged_const(pylist).eval().tolist() == pylist`.
+
+    Args:
+      pylist: The `pylist` argument for `ragged_const()`.
+      dtype: The `dtype` argument for `ragged_const()`.  If not None, then also
+        test that the resulting ragged tensor has this `dtype`.
+      ragged_rank: The `ragged_rank` argument for `ragged_const()`.  If not
+        None, then also test that the resulting ragged tensor has this
+        `ragged_rank`.
+      inner_shape: The `inner_shape` argument for `ragged_const()`.  If not
+        None, then also test that the resulting ragged tensor has this
+        `inner_shape`.
+      expected_shape: The expected shape for the resulting ragged tensor.
+      expected_dtype: The expected dtype for the resulting ragged tensor (used
+        to test default/inferred types when dtype=None).
+    """
+    rt = ragged_factory_ops.constant(
+        pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
+
+    # If dtype was explicitly specified, check it.
+    if dtype is not None:
+      self.assertEqual(rt.dtype, dtype)
+    if expected_dtype is not None:
+      self.assertEqual(rt.dtype, expected_dtype)
+
+    # If ragged_rank was explicitly specified, check it.
+    if ragged_rank is not None:
+      if isinstance(rt, ragged_tensor.RaggedTensor):
+        self.assertEqual(rt.ragged_rank, ragged_rank)
+      else:
+        self.assertEqual(0, ragged_rank)
+
+    # If inner_shape was explicitly specified, check it.
+    if inner_shape is not None:
+      if isinstance(rt, ragged_tensor.RaggedTensor):
+        self.assertEqual(rt.inner_values.shape.as_list()[1:], list(inner_shape))
+      else:
+        self.assertEqual(rt.shape.as_list(), list(inner_shape))
+
+    if expected_shape is not None:
+      self.assertEqual(tuple(rt.shape.as_list()), expected_shape)
+
+    with self.test_session():
+      result = self.evaluate(rt)
+      if rt.shape.ndims > 0:
+        self.assertEqual(result.tolist(), pylist)
+        if expected_shape is not None:
+          self.assertEqual(result.shape, expected_shape)
+      else:
+        self.assertEqual(result, pylist)
+        if expected_shape is not None:
+          self.assertEqual((), expected_shape)
+
+  @parameterized.parameters(
+      dict(
+          pylist=12,
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with ragged_rank=1'),
+      dict(
+          pylist=12,
+          inner_shape=(1,),
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with '
+          'dim\\(inner_shape\\)=1'),
+      dict(
+          pylist=[[[1], [2]]],
+          ragged_rank=-1,
+          exception=ValueError,
+          message='Invalid ragged_rank=-1: must be nonnegative'),
+      dict(
+          pylist=[[1, [2]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[[1]], [[[2]]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[1], [[]]],
+          exception=ValueError,
+          message='Invalid pylist=.*: empty list nesting is greater '
+          'than scalar value nesting'),
+      dict(
+          pylist=[1, 2, 3],
+          ragged_rank=1,
+          exception=ValueError,
+          message='pylist has scalar values depth 1, but ragged_rank=1 '
+          'requires scalar value depth greater than 1'),
+      dict(
+          pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          ragged_rank=2,
+          exception=ValueError,
+          message='pylist has scalar values depth 2, but ragged_rank=2 '
+          'requires scalar value depth greater than 2'),
+      dict(
+          pylist=[1, 2, 3],
+          inner_shape=(1, 1),
+          exception=ValueError,
+          message='Too many elements provided.'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          inner_shape=(2, 2),
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=.*: incompatible with ragged_rank=1 and '
+          'dim\\(inner_shape\\)=2'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8, 9]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[], [[]]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+  )
+  def testRaggedConstError(self,
+                           pylist,
+                           dtype=None,
+                           ragged_rank=None,
+                           inner_shape=None,
+                           exception=None,
+                           message=None):
+    """Tests that `ragged_const()` raises an expected exception."""
+    self.assertRaisesRegexp(
+        exception,
+        message,
+        ragged_factory_ops.constant,
+        pylist,
+        dtype=dtype,
+        ragged_rank=ragged_rank,
+        inner_shape=inner_shape)
+
+  @parameterized.parameters([
+      dict(pylist=9, scalar_depth=0, max_depth=0),
+      dict(pylist=[9], scalar_depth=1, max_depth=1),
+      dict(pylist=[1, 2, 3], scalar_depth=1, max_depth=1),
+      dict(pylist=[[1], [2]], scalar_depth=2, max_depth=2),
+      dict(pylist=[[[1], [2]], [[3]]], scalar_depth=3, max_depth=3),
+      dict(pylist=[], scalar_depth=None, max_depth=1),
+      dict(pylist=[[]], scalar_depth=None, max_depth=2),
+      dict(pylist=[[], [], []], scalar_depth=None, max_depth=2),
+      dict(pylist=[[[], []], [[], [[[]]]], []], scalar_depth=None, max_depth=5),
+      dict(
+          pylist=[1, [2]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[1], 2],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[[[1]], []], [[2]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+  ])
+  def testScalarAndMaxDepthHelper(self,
+                                  pylist,
+                                  scalar_depth=None,
+                                  max_depth=None,
+                                  exception=None,
+                                  message=None):
+    """Tests for the _find_scalar_and_max_depth helper function."""
+    if exception is not None:
+      self.assertRaisesRegexp(
+          exception, message,
+          ragged_factory_ops._find_scalar_and_max_depth, pylist)
+    else:
+      self.assertEqual(
+          ragged_factory_ops._find_scalar_and_max_depth(pylist),
+          (scalar_depth, max_depth))
+
+  @parameterized.parameters([
+      dict(pylist=[[1], [2, 3]], ragged_rank=1, inner_shape=()),
+      dict(
+          pylist=[[[1], [2]], [[3], [4], [5]]], ragged_rank=1,
+          inner_shape=(1,)),
+      dict(pylist=[[[1], [2]], [[3], [4], [5]]], ragged_rank=2, inner_shape=()),
+      dict(
+          pylist=[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]]],
+          ragged_rank=1,
+          inner_shape=(2, 3)),
+      dict(
+          pylist=[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]]],
+          ragged_rank=2,
+          inner_shape=(3,)),
+      dict(
+          pylist=[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]]],
+          ragged_rank=3,
+          inner_shape=()),
+      dict(
+          pylist=[[[1], [2, 3]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[1], [[2]]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[[1]], [2]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+  ])
+  def testDefaultInnerShapeForPylistHelper(self,
+                                           pylist,
+                                           ragged_rank,
+                                           inner_shape=None,
+                                           exception=None,
+                                           message=None):
+    """Tests for the _default_inner_shape_for_pylist helper function."""
+    if exception is not None:
+      self.assertRaisesRegexp(
+          exception, message,
+          ragged_factory_ops._default_inner_shape_for_pylist, pylist,
+          ragged_rank)
+    else:
+      self.assertEqual(
+          ragged_factory_ops._default_inner_shape_for_pylist(
+              pylist, ragged_rank), inner_shape)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d80518930dbb74b5e044269df73002e68c0df2d2
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
@@ -0,0 +1,267 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.constant_value."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
+                                parameterized.TestCase):
+
+  @parameterized.parameters(
+      #=========================================================================
+      # 0-dimensional tensors.
+      dict(pylist='x', expected_shape=()),
+
+      #=========================================================================
+      # 1-dimensional tensors.
+      dict(pylist=[1, 2, 3], expected_shape=(3,)),
+
+      #=========================================================================
+      # 2-dimensional tensors.
+      dict(pylist=[[1, 2, 3], [4], [5, 6]], expected_shape=(3, None)),
+      dict(pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], expected_shape=(3, None)),
+
+      #=========================================================================
+      # 3-dimensional tensors.
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          expected_shape=(3, None, None)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      #=========================================================================
+      # 4-dimensional tensors.
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          expected_shape=(2, None, None, None)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          ragged_rank=1,
+          expected_shape=(2, None, 2, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2,),
+          expected_shape=(2, None, None, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2, 2),
+          expected_shape=(2, None, 2, 2)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ default ragged_rank and inner_shape
+      dict(pylist=[], expected_shape=(0,)),
+      dict(pylist=[[], [], []], expected_shape=(3, None)),
+      dict(
+          pylist=[[[], []], [], [[], [[]]]],
+          expected_shape=(3, None, None, None)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ explicit ragged_rank or inner_shape
+      dict(pylist=[], ragged_rank=1, expected_shape=(0, None)),
+      dict(pylist=[], ragged_rank=2, expected_shape=(0, None, None)),
+      dict(pylist=[], inner_shape=(0, 100, 20), expected_shape=(0, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=1,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=2,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, None, 100, 20)),
+      dict(pylist=[[], [], []], ragged_rank=2, expected_shape=(3, None, None)),
+      dict(pylist=[], inner_shape=(0,), expected_shape=(0,)),
+      dict(pylist=[[]], inner_shape=(1, 0), expected_shape=(1, 0)),
+
+      #=========================================================================
+      # default/inferred dtypes.
+      #
+      # Note: numpy has different default/inferred types than tensorflow.
+      # Since we are using values, not tensors, we get the default numpy types
+      # here.
+      dict(pylist=[], expected_dtype=np.float64),
+      dict(pylist=[[[], [[[]], []]]], expected_dtype=np.float64),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], expected_dtype=np.int64),
+      dict(pylist=[[1., 2.], [], [4., 5., 6.]], expected_dtype=np.float64),
+      dict(pylist=[[1, 2], [3.], [4, 5, 6]], expected_dtype=np.float64),
+      dict(pylist=[[b'a', b'b'], [b'c']], expected_dtype=np.dtype('S1')),
+      dict(pylist=[[True]], expected_dtype=np.bool),
+
+      #=========================================================================
+      # explicit dtypes
+      dict(pylist=[], dtype=np.float32),
+      dict(pylist=[], dtype=np.dtype('S1')),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=np.int64),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=np.int32),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=np.float32),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=np.float16),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=np.float32),
+      dict(
+          pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
+          dtype=np.dtype('S1')),
+  )
+  def testRaggedValues(self,
+                       pylist,
+                       dtype=None,
+                       ragged_rank=None,
+                       inner_shape=None,
+                       expected_shape=None,
+                       expected_dtype=None):
+    """Tests that `ragged_value(pylist).tolist() == pylist`."""
+    rt = ragged.constant_value(
+        pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
+
+    # If dtype was explicitly specified, check it.
+    if dtype is not None:
+      self.assertEqual(rt.dtype, dtype)
+    if expected_dtype is not None:
+      self.assertEqual(rt.dtype, expected_dtype)
+
+    # If ragged_rank was explicitly specified, check it.
+    if ragged_rank is not None:
+      if isinstance(rt, ragged.RaggedTensorValue):
+        self.assertEqual(rt.ragged_rank, ragged_rank)
+      else:
+        self.assertEqual(0, ragged_rank)
+
+    # If inner_shape was explicitly specified, check it.
+    if inner_shape is not None:
+      if isinstance(rt, ragged.RaggedTensorValue):
+        self.assertEqual(rt.inner_values.shape[1:], inner_shape)
+      else:
+        self.assertEqual(rt.shape, inner_shape)
+
+    if expected_shape is not None:
+      self.assertEqual(tuple(rt.shape), expected_shape)
+
+    if rt.shape:
+      self.assertEqual(rt.tolist(), pylist)
+      if expected_shape is not None:
+        self.assertEqual(rt.shape, expected_shape)
+    else:
+      self.assertEqual(rt, pylist)
+      if expected_shape is not None:
+        self.assertEqual((), expected_shape)
+
+  @parameterized.parameters(
+      dict(
+          pylist=12,
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with ragged_rank=1'),
+      dict(
+          pylist=12,
+          inner_shape=(1,),
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with '
+          'dim\\(inner_shape\\)=1'),
+      dict(
+          pylist=[[[1], [2]]],
+          ragged_rank=-1,
+          exception=ValueError,
+          message='Invalid ragged_rank=-1: must be nonnegative'),
+      dict(
+          pylist=[[1, [2]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[[1]], [[[2]]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[1], [[]]],
+          exception=ValueError,
+          message='Invalid pylist=.*: empty list nesting is greater '
+          'than scalar value nesting'),
+      dict(
+          pylist=[1, 2, 3],
+          ragged_rank=1,
+          exception=ValueError,
+          message='pylist has scalar values depth 1, but ragged_rank=1 '
+          'requires scalar value depth greater than 1'),
+      dict(
+          pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          ragged_rank=2,
+          exception=ValueError,
+          message='pylist has scalar values depth 2, but ragged_rank=2 '
+          'requires scalar value depth greater than 2'),
+      dict(
+          pylist=[1, 2, 3],
+          inner_shape=(1, 1),
+          exception=ValueError,
+          message='cannot reshape array'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          inner_shape=(2, 2),
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=.*: incompatible with ragged_rank=1 and '
+          'dim\\(inner_shape\\)=2'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8, 9]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[], [[]]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+  )
+  def testRaggedValuesError(self,
+                            pylist,
+                            dtype=None,
+                            ragged_rank=None,
+                            inner_shape=None,
+                            exception=None,
+                            message=None):
+    """Tests that `ragged.constant_value()` raises an expected exception."""
+    self.assertRaisesRegexp(
+        exception,
+        message,
+        ragged.constant_value,
+        pylist,
+        dtype=dtype,
+        ragged_rank=ragged_rank,
+        inner_shape=inner_shape)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_conversion_ops.py b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..83212e49cf71c245d85b8216792ac0cfc97741dd
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
@@ -0,0 +1,424 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops to convert between RaggedTensors and other tensor types."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_ragged_conversion_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+#===============================================================================
+# RaggedTensor <-> Tensor conversion
+#===============================================================================
+def from_tensor(tensor, lengths=None, padding=None, ragged_rank=1, name=None):
+  """Converts a `Tensor` into a `RaggedTensor`.
+
+  The set of absent/default values may be specified using a vector of lengths
+  or a padding value (but not both).  If `lengths` is specified, then the
+  output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
+  If `padding` is specified, then any row *suffix* consisting entirely of
+  `padding` will be excluded from the returned `RaggedTensor`.  If neither
+  `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
+  have no absent/default values.
+
+  Examples:
+
+  ```python
+  >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+  >>> ragged.from_tensor(dt).eval().tolist()
+  [[5, 7, 0], [0, 3, 0], [6, 0, 0]]
+  >>> ragged.from_tensor(dt, lengths=[2, 0, 3]).eval().tolist()
+  [[5, 7], [], [6, 0, 0]]
+  >>> ragged.from_tensor(dt, padding=0).eval().tolist()
+  [[5, 7], [0, 3], [6]]
+  ```
+
+  Args:
+    tensor: The `Tensor` to convert.  Must have rank `ragged_rank + 1` or
+      higher.
+    lengths: An optional set of row lengths, specified using a 1-D integer
+      `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows in
+      `tensor`).  If specified, then `output[row]` will contain
+      `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
+    padding: An optional padding value.  If specified, then any row suffix
+      consisting entirely of `padding` will be excluded from the returned
+      RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
+      and with `shape=tensor.shape[ragged_rank + 1:]`.
+    ragged_rank: Integer specifying the ragged rank for the returned
+      `RaggedTensor`.  Must be greater than zero.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
+    returned ragged tensor is compatible with the shape of `tensor`.
+  Raises:
+    ValueError: If both `lengths` and `padding` are specified.
+  """
+  if lengths is not None and padding is not None:
+    raise ValueError('Specify lengths or padding, but not both')
+  if not isinstance(ragged_rank, int):
+    raise TypeError('ragged_rank expected int, got %r' % ragged_rank)
+  if ragged_rank <= 0:
+    raise ValueError('ragged_rank must be greater than 0; got %s' % ragged_rank)
+
+  with ops.name_scope(name, 'RaggedFromTensor', [tensor, lengths, padding]):
+    tensor = ops.convert_to_tensor(tensor, name='tensor')
+    tensor.shape.with_rank_at_least(ragged_rank + 1)
+    input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
+    ncols = input_shape[1]
+
+    # Handle ragged_rank>1 via recursion:
+    # If the output should have multiple ragged dimensions, then first
+    # flatten the tensor to eliminate all but the last ragged dimension,
+    # and recursively convert that flattened tensor.  Then add on the splits
+    # for the dimensions that we flattened out.
+    if ragged_rank > 1:
+      # Flatten `tensor` to eliminate all but the last ragged dimension.
+      new_shape = array_ops.concat(
+          [constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]],
+          axis=0)
+      flattened = array_ops.reshape(tensor, new_shape)
+      # Recursively convert the flattened tensor.
+      values = from_tensor(flattened, lengths, padding)
+      # The total number of elements in each  dimension.  E.g., if
+      # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
+      dim_size = math_ops.cumprod(input_shape)
+      # Construct splits tensors for the dimensions that were flattened.
+      new_splits = [
+          math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
+          for dim in range(1, ragged_rank)
+      ]
+      return ragged_factory_ops.from_nested_row_splits(values, new_splits)
+
+    # If padding was specified, then use it to find row lengths.
+    if padding is not None:
+      padding = ops.convert_to_tensor(
+          padding, name='padding', dtype=tensor.dtype)
+      padding.shape.assert_is_compatible_with(tensor.shape[2:])
+
+      # Find places where the padding is equal to the tensor.  (This will
+      # broadcast `padding` across the outermost 2 dimensions of `tensor`,
+      # so `has_default_value.shape = tensor.shape`.)
+      has_default_value = math_ops.equal(padding, tensor)
+
+      # If the padding isn't a scalar, then require that all values in the
+      # padding match each item in the tensor.  After this block of code,
+      # `has_default.shape = tensor.shape[:2]`.  (Unfortunately, we can't just
+      # use reduce_all for both cases, becaue when you pass an empty `axis`
+      # list to reduce_all, it reduces all axes; but we want it to reduce no
+      # axes -- i.e., to be a no-op.)
+      tensor_rank = array_ops.rank(tensor)
+      reduce_axis = math_ops.range(2, tensor_rank)
+      has_default = control_flow_ops.cond(
+          tensor_rank > 2,
+          lambda: math_ops.reduce_all(has_default_value, axis=reduce_axis),
+          lambda: has_default_value)
+      has_default.set_shape(tensor_shape.TensorShape([None, None]))
+      has_default.set_shape(tensor.shape[:2])
+
+      # Use has_default it to find the length of each row: for each non-default
+      # item in a row, calculate the length that the row needs to have to
+      # include that item; and then take the max of those values (across each
+      # row).
+      has_nondefault = math_ops.logical_not(has_default)
+      has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
+      length_for_nondefault_value = (
+          has_nondefault * array_ops.expand_dims(
+              math_ops.range(1, ncols + 1), 0))
+      lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
+
+    # If we have lengths (either directly supplied, or computed from paddings),
+    # then use those to construct splits; and then use masking to get the
+    # corresponding values.
+    if lengths is not None:
+      lengths = ragged_util.convert_to_int_tensor(lengths, 'lengths',
+                                                  dtypes.int64)
+      lengths.shape.assert_has_rank(1)
+      lengths = math_ops.minimum(lengths, ncols)
+      lengths = math_ops.maximum(lengths, 0)
+      limits = math_ops.cumsum(lengths)
+      splits = array_ops.concat(
+          [array_ops.zeros([1], dtypes.int64), limits], axis=0)
+      mask = array_ops.sequence_mask(lengths, maxlen=ncols)
+      values = array_ops.boolean_mask(tensor, mask)
+      return ragged_factory_ops.from_row_splits(values, splits)
+
+    # If neither padding nor lengths were specified, then create a splits
+    # vector that contains no default values, and reshape the input tensor
+    # to form the values for the RaggedTensor.
+    nrows = input_shape[0]
+    nvals = nrows * ncols
+    splits = math_ops.range(nrows + 1) * ncols
+    values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
+    values = array_ops.reshape(tensor, values_shape)
+    return ragged_factory_ops.from_row_splits(values, splits)
+
+
+def to_tensor(rt_input, default_value=None, name=None):
+  """Converts a `RaggedTensor` into a `Tensor`.
+
+  Example:
+
+  ```python
+  >>> rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+  >>> print ragged.to_tensor(rt).eval()
+  [[9 8 7]
+   [0 0 0]
+   [6 5 0]
+   [4 0 0]]
+  ```
+
+  Args:
+    rt_input: The input `RaggedTensor`.
+    default_value: Value to set for indices not specified in `rt_input`.
+      Defaults to zero.  `default_value` must be broadcastable to
+      `rt_input.shape[rt_input.ragged_rank + 1:]`.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A `Tensor` with shape `ragged.bounding_shape(rt_input)` and the
+    values specified by the non-empty values in `rt_input`.  Empty values are
+    assigned `default_value`.
+  """
+  with ops.name_scope(name, 'RaggedToTensor', [rt_input, default_value]):
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+    if not ragged_tensor.is_ragged(rt_input):
+      return rt_input  # already dense
+    if default_value is not None:
+      default_value = ops.convert_to_tensor(
+          default_value, name='default_value', dtype=rt_input.dtype)
+
+    # If ragged_rank > 1, then recursively convert the ragged values into a
+    # `Tensor` before we proceed.
+    values = rt_input.values
+    if ragged_tensor.is_ragged(values):
+      values = to_tensor(values, default_value)
+
+    # Tile the default value, if necessary.
+    if default_value is not None:
+      if values.shape.ndims is not None:
+        default_value.shape.with_rank_at_most(values.shape.ndims - 1)
+      if (values.shape.ndims is None or default_value.shape.ndims is None or
+          values.shape.ndims != default_value.shape.ndims + 1):
+        value_shape = array_ops.shape(values)[1:]
+        default_value = array_ops.broadcast_to(default_value, value_shape)
+      default_value.shape.assert_is_compatible_with(values.shape[1:])
+
+    # Get the expected dense shape ([nrows, ncols] + value_shape).
+    rt_row_lengths = [rt_input.row_splits[1:] - rt_input.row_splits[:-1]]
+    nrows = array_ops.shape(rt_input.row_splits, out_type=dtypes.int64)[0] - 1
+    ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
+    values_shape = array_ops.shape(values, out_type=dtypes.int64)
+    value_shape = values_shape[1:]
+    nvals = values_shape[0]
+
+    # Build a default value if none was supplied.
+    if default_value is None:
+      default_value = array_ops.zeros(value_shape, dtype=values.dtype)
+    default_value.shape.assert_is_compatible_with(values.shape[1:])
+    default_value.set_shape(values.shape[1:])
+
+    # Get the row start indices, and expand to shape=[nrows, 1].
+    starts = array_ops.expand_dims(rt_input.row_splits[:-1], 1)
+
+    # Get the row limit indices, and expand to shape=[nrows, 1].
+    limits = array_ops.expand_dims(rt_input.row_splits[1:], 1)
+
+    # Get the column indices, and expand to shape=[1, ncols].
+    columns = array_ops.expand_dims(math_ops.range(0, ncols), 0)
+
+    # Build a list containing the values plus the default value.  We will use
+    # tf.gather to collect values from this list for the `Tensor` (using
+    # nvals as the index for the default value).
+    values_and_default = array_ops.concat(
+        [values, array_ops.stack([default_value])], axis=0)
+
+    # Construct a matrix "indices" pointing into values_and_default.  I.e.,
+    # output[r, c] = values_and_default[indices[r, c].
+    nondefault_index = starts + columns
+    has_value = nondefault_index < limits
+    default_index = array_ops.fill(array_ops.stack([nrows, ncols]), nvals)
+    indices = array_ops.where(has_value, nondefault_index, default_index)
+
+    # Gather the results into a `Tensor`.
+    return array_ops.gather(values_and_default, indices)
+
+
+#===============================================================================
+# RaggedTensor <-> SparseTensor conversion
+#===============================================================================
+def to_sparse(rt_input, name=None):
+  """Converts a `RaggedTensor` into a sparse tensor.
+
+  Example:
+
+  ```python
+  >>> rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+  >>> ragged.to_sparse(rt).eval()
+  SparseTensorValue(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]],
+                    values=[1, 2, 3, 4, 5, 6],
+                    dense_shape=[4, 3])
+  ```
+
+  Args:
+    rt_input: The input `RaggedTensor`.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A SparseTensor with the same values as `rt_input`.
+  """
+  if not ragged_tensor.is_ragged(rt_input):
+    raise TypeError('Expected RaggedTensor, got %s' % type(rt_input).__name__)
+  with ops.name_scope(name, 'RaggedToSparse', [rt_input]):
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+    result = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
+        rt_input.nested_row_splits, rt_input.inner_values, name=name)
+    return sparse_tensor.SparseTensor(
+        result.sparse_indices, result.sparse_values, result.sparse_dense_shape)
+
+
+@ops.RegisterGradient('RaggedTensorToSparse')
+def _ragged_tensor_to_sparse_gradient(op, unused_sparse_indices_grad,
+                                      sparse_values_grad,
+                                      unused_sparse_shape_grad):
+  """Gradient for ragged.to_sparse."""
+  op_inputs_nested_row_splits = op.inputs[:-1]
+  op_inputs_inner_values = op.inputs[-1]
+
+  # No gradient for the RaggedTensor's nested_row_splits.
+  nested_row_splits_gradient = [None] * len(op_inputs_nested_row_splits)
+
+  # Gradient for the RaggedTensor's inner_values is formed by reshaping
+  # the gradient for the SparseTensor's values.
+  inner_values_shape = array_ops.shape(op_inputs_inner_values)
+  inner_values_gradient = array_ops.reshape(sparse_values_grad,
+                                            inner_values_shape)
+
+  return nested_row_splits_gradient + [inner_values_gradient]
+
+
+def from_sparse(st_input, name=None):
+  """Converts a 2D `SparseTensor` to a `RaggedTensor`.
+
+  Each row of the `output` `RaggedTensor` will contain the explicit values from
+  the same row in `st_input`.  `st_input` must be ragged-right.  If not it is
+  not ragged-right, then an error will be generated.
+
+  Example:
+
+  ```python
+  >>> st = SparseTensor(indices=[[0, 1], [0, 2], [0, 3], [1, 0], [3, 0]],
+  ...                   values=[1, 2, 3, 4, 5],
+  ...                   dense_shape=[4, 3])
+  >>> ragged.from_sparse(st).eval().tolist()
+  [[1, 2, 3], [4], [], [5]]
+  ```
+
+  Currently, only two-dimensional `SparseTensors` are supported.
+
+  Args:
+    st_input: The sparse tensor to convert.  Must have rank 2.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A `RaggedTensor` with the same values as `st_input`.
+    `output.ragged_rank = rank(st_input) - 1`.
+    `output.shape = [st_input.dense_shape[0], None]`.
+  Raises:
+    ValueError: If the number of dimensions in `st_input` is not known
+      statically, or is not two.
+  """
+  if not sparse_tensor.is_sparse(st_input):
+    raise TypeError('Expected SparseTensor, got %s' % type(st_input).__name__)
+  with ops.name_scope(name, 'RaggedFromSparse', [st_input]):
+    st_input = sparse_tensor.convert_to_tensor_or_sparse_tensor(
+        st_input, name='rt_input')
+
+    static_rank_from_dense_shape = (
+        None if st_input.dense_shape.shape.ndims is None
+        else st_input.dense_shape.shape.dims[0].value)
+    static_rank_from_indices = (
+        None if st_input.indices.shape.ndims is None
+        else st_input.indices.shape.dims[1].value)
+
+    if static_rank_from_dense_shape != 2 and static_rank_from_indices != 2:
+      raise ValueError('rank(st_input) must be 2')
+
+    with ops.control_dependencies(
+        _assert_sparse_indices_are_ragged_right(st_input.indices)):
+      # Treat sparse row indices as segment ids to generate a splits tensor that
+      # we can pair with the sparse tensor values.  (Ignore sparse column
+      # indices.)
+      segment_ids = st_input.indices[:, 0]
+      num_segments = st_input.dense_shape[0]
+      return ragged_factory_ops.from_value_rowids(st_input.values, segment_ids,
+                                                  num_segments)
+
+
+def _assert_sparse_indices_are_ragged_right(indices):
+  """Checks that the given SparseTensor.indices tensor is ragged-right.
+
+  Example: `indices = [[0, 0], [0, 1], [2, 0], [3, 1]]` is not ragged right
+  because the entry `[3, 1]` skips a cell.
+
+  Args:
+    indices: The SparseTensor indices to check.
+
+  Returns:
+    A list of control dependency op tensors.
+  """
+  index_prefix = indices[:, :-1]
+  index_suffix = indices[:, -1]
+
+  # Check whether each index is starting a new row in the innermost dimension
+  # (prefix[i] != prefix[i-1]) or continuing a row (prefix[i] == prefix[i-1]).
+  # (Note: this skips the first index; we will check that separately below.)
+  index_prefix_changed = math_ops.reduce_any(
+      math_ops.not_equal(index_prefix[1:], index_prefix[:-1]), axis=1)
+
+  # Check two cases:
+  #   * For indices that start a new row: index_suffix[i] must be zero.
+  #   * For indices that continue a row: index_suffix[i] must be equal to
+  #     index_suffix[i-1]+1.
+  index_ok = array_ops.where(
+      index_prefix_changed, math_ops.equal(index_suffix[1:], 0),
+      math_ops.equal(index_suffix[1:], index_suffix[:-1] + 1))
+
+  # Also check that the very first index didn't skip any cells.  The first
+  # index starts a new row (by definition), so its suffix should be zero.
+  sparse_indices_are_ragged_right = math_ops.logical_and(
+      math_ops.reduce_all(math_ops.equal(index_suffix[:1], 0)),
+      math_ops.reduce_all(index_ok))
+
+  message = [
+      'SparseTensor is not right-ragged',
+      'SparseTensor.indices =', indices
+  ]
+  return [control_flow_ops.Assert(sparse_indices_are_ragged_right, message)]
diff --git a/tensorflow/python/ops/ragged/ragged_eager_test.py b/tensorflow/python/ops/ragged/ragged_eager_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..731ff742aa18bfa45c68813d5e19f4dbe2307cdb
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_eager_test.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged in eager execution mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters([
+      dict(pylist=[[b'a', b'b'], [b'c']]),
+      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]]),
+      dict(pylist=[[[1, 2], [3, 4]], [[5, 6], [], [7, 8]]], ragged_rank=1),
+  ])
+  def testRaggedTensorToList(self, pylist, ragged_rank=None):
+    rt = ragged.constant(pylist, ragged_rank)
+    self.assertEqual(rt.tolist(), pylist)
+
+  expected = "RaggedTensor([['a', 'b'], ['c']])"
+  if sys.version_info[0] == 3:
+    expected = "RaggedTensor([[b'a', b'b'], [b'c']])"
+
+  @parameterized.parameters([
+      dict(pylist=[['a', 'b'], ['c']],
+           expected=expected),
+      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]],
+           expected='RaggedTensor([[[1, 2], [3]], [[4, 5, 6], [], [7]]])'),
+  ])
+  def testRaggedTensorStr(self, pylist, expected):
+    rt = ragged.constant(pylist)
+    self.assertEqual(str(rt), expected)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_elementwise_ops.py b/tensorflow/python/ops/ragged/ragged_elementwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..23d0e8b5fc44d23ba4ada0ae69084b8547d42064
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_elementwise_ops.py
@@ -0,0 +1,367 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Elementwise operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# Information about an argument to an operation: The name of the argument, its
+# position in the argument list, and a boolean flag indicating whether it
+# expects a list of tensors.
+_ArgInfo = collections.namedtuple('ArgInfo', ['name', 'position', 'is_list'])
+
+
+def make_elementwise_op(op, *elementwise_args):
+  """Returns a ragged-tensor version of the elementwise operation `op`.
+
+  The returned operation will:
+
+  1. Broadcast the elementwise arguments to have a compatible shape.
+     An exception is raised if the tensors not broadcast-compatible.
+  2. Call `op`, substituting the dense values of the broadcasted tensor for
+     each elementwise argument.
+  3. Return a potentially ragged tensor constructed from the output of `op`
+     and the broadcasted tensors' nested row splits.
+
+  For example, you can construct a ragged-tensor version of the standard
+  operation `tf.add` by calling `make_elementwise_op(tf.add, 'x', 'y')`.
+
+  Args:
+    op: The operation to wrap.
+    *elementwise_args: The names of arguments to `op` that are treated as
+      elementwise.  Arguments that take a list of tensors should have their
+      names wrapped in square brackets (e.g. "[inputs]").
+
+  Raises:
+    ValueError: If any name specified in `elementwise_args` is not the name
+      of an argument to `op`.
+  """
+  elementwise_arg_infos = _get_arg_infos(op, elementwise_args)
+
+  def ragged_op(*args, **kwargs):
+    """Ragged version of `op`."""
+    args = list(args)
+
+    # Collect all of the elementwise arguments, and put them in a single
+    # dict whose values are the (potentially ragged) tensors that need to
+    # be broadcast to a common shape.  The keys of this dict are tuples
+    # (argkey, index), where argkey is an int for poitional args or a string
+    # for keyword args; and index is None for non-list args and the index of the
+    # tensor for list args.
+    elementwise_args = {}
+    for (name, position, is_list) in elementwise_arg_infos.values():
+      if position < len(args):
+        if is_list:
+          args[position] = list(args[position])
+          for (index, arg) in enumerate(args[position]):
+            elementwise_args[position, index] = arg
+        else:
+          elementwise_args[position, None] = args[position]
+      elif name in kwargs:
+        if is_list:
+          kwargs[name] = list(kwargs[name])
+          for (i, arg) in enumerate(kwargs[name]):
+            elementwise_args[name, i] = arg
+        else:
+          elementwise_args[name, None] = kwargs[name]
+
+    with ops.name_scope(None, op.__name__, elementwise_args.values()):
+      # Convert all inputs to tensors or ragged tensors.
+      for ((key, index), tensor) in elementwise_args.items():
+        argname = elementwise_arg_infos[key].name
+        converted = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+            tensor, name=argname)
+        elementwise_args[key, index] = converted
+
+      # Broadcast tensors to have compatible shapes.
+      broadcast_args, result_splits, broadcast_check_ops = \
+          _broadcast_elementwise_args(elementwise_args)
+
+      # Replace tensor arguments with their dense values.
+      for ((key, index), tensor) in broadcast_args.items():
+        if ragged_tensor.is_ragged(tensor):
+          if isinstance(key, int) and index is None:
+            args[key] = tensor.inner_values
+          elif isinstance(key, int) and index is not None:
+            args[key][index] = tensor.inner_values
+          elif isinstance(key, str) and index is None:
+            kwargs[key] = tensor.inner_values
+          else:
+            assert isinstance(key, str) and index is not None
+            kwargs[key][index] = tensor.inner_values
+
+      # Call the elementwise op on the broadcasted dense values.
+      with ops.control_dependencies(broadcast_check_ops):
+        result_values = op(*args, **kwargs)
+
+      # Restore any ragged dimensions that we stripped off, and return the
+      # result.
+      return ragged_factory_ops.from_nested_row_splits(result_values,
+                                                       result_splits)
+
+  # Construct the docstring.
+  op_name = tf_export.get_canonical_name_for_symbol(op)
+  assert op_name is not None, op
+  argnames = ', '.join('`%s`' % s.strip('[]') for s in elementwise_args)
+  docstring = _ELEMENTWISE_DOCSTRING % dict(op_name=op_name, argnames=argnames)
+
+  # Update name, docstring, signature, etc., for the wrapper, and return it.
+  return tf_decorator.make_decorator(op, ragged_op, decorator_doc=docstring)
+
+
+_ELEMENTWISE_DOCSTRING = """\
+Ragged version of the elementwise operation `tf.%(op_name)s`.
+
+  The following elementwise arguments may be ragged or dense:
+  %(argnames)s.
+  These arguments will be broadcast to a compatible shape if necessary.
+  """
+
+
+def _get_arg_infos(func, elementwise_args):
+  """Returns `_ArgInfo`s for each `func` arg specified by `elementwise_args`.
+
+  Args:
+    func: The function whose arguments should be described.
+    elementwise_args: The names of the arguments to get info for.
+
+  Returns:
+    A dictionary that maps both names and positions of arguments to
+    `_ArgInfo` tuples.
+  """
+  arg_infos = {}
+
+  # Inspect the func's argspec to find the position of each arg.
+  arg_spec = tf_inspect.getargspec(func)
+  for argname in elementwise_args:
+    assert isinstance(argname, str)
+    is_list = argname.startswith('[') and argname.endswith(']')
+    if is_list:
+      argname = argname[1:-1]
+    assert argname in arg_spec.args, (func, argname, arg_spec.args)
+    arg_info = _ArgInfo(argname, arg_spec.args.index(argname), is_list)
+    arg_infos[arg_info.name] = arg_info
+    arg_infos[arg_info.position] = arg_info
+  return arg_infos
+
+
+def _broadcast_elementwise_args(elementwise_args):
+  """Broadcasts the values of `elementwise_args` to have compatible shapes.
+
+  Args:
+    elementwise_args: A dictionary whose keys are potentially ragged tensors.
+
+  Returns:
+    A tuple `(broadcast_args, broadcast_splits, checks)` where:
+
+    * `broadcast_args` is a dictionary with the same keys as
+      `elementwise_args`, mapping to broadcasted tensors.
+    * `broadcast_splits` is the broadcasted nested row splits.
+    * `checks` is a possibly empty tuple of assertion operations that should
+      be added as control dependencies.
+
+  Raises:
+    ValueError: If broadcasting fails.
+  """
+  # No elementwise arguments were used: nothing to do!
+  if not elementwise_args:
+    return elementwise_args, (), ()
+
+  # A single elementwise argument was used: no broadcasting necessary.
+  if len(elementwise_args) == 1:
+    arg = list(elementwise_args.values())[0]
+    if ragged_tensor.is_ragged(arg):
+      return elementwise_args, arg.nested_row_splits, ()
+    else:
+      return elementwise_args, (), ()
+
+  # Multiple elementwise arguments.
+  else:
+    is_ragged = [ragged_tensor.is_ragged(t) for t in elementwise_args.values()]
+    if not any(is_ragged):
+      return elementwise_args, (), ()
+
+    # Support limited broadcasting (namely, scalar + ragged).  Full
+    # broadcasting support will be added later.
+    if all((ragged_tensor.is_ragged(t) or t.shape.ndims == 0)
+           for t in elementwise_args.values()):
+      nested_splits_lists = [
+          t.nested_row_splits
+          for t in elementwise_args.values()
+          if ragged_tensor.is_ragged(t)
+      ]
+      if len(nested_splits_lists) == 1:
+        checks = ()
+      else:
+        if any(t.shape.ndims is None for t in elementwise_args.values()):
+          raise ValueError('Ragged elementwise ops require that rank (number '
+                           'of dimensions) be statically known.')
+        if len(set(t.shape.ndims for t in elementwise_args.values())) != 1:
+          raise ValueError('Ragged elementwise ops do not support '
+                           'broadcasting yet')
+        checks = ragged_util.assert_splits_match(nested_splits_lists)
+      return (elementwise_args, nested_splits_lists[0], checks)
+    else:
+      raise ValueError('Ragged elementwise ops do not support broadcasting yet')
+
+
+# A list of symbols that should be exported in the "ragged" package.
+_symbols_to_export = []
+
+
+def _add_elementwise_ops_to_this_module(specs, verbose=False):
+  """Adds ragged versions of the given ops to this module.
+
+  Args:
+    specs: A list of tuples containing the arguments for `make_elementwise_op`.
+    verbose: If true, then display each op that gets added.
+  """
+  for spec in specs:
+    original_op = spec[0]
+    ragged_op = make_elementwise_op(*spec)
+    canonical_name = tf_export.get_canonical_name_for_symbol(original_op)
+    if '.' not in canonical_name:
+      op_name = canonical_name
+    else:
+      op_name = original_op.__name__
+    if verbose:
+      print('Adding ragged_elementwise_op: tf.ragged.%s (based on tf.%s)' %
+            (op_name, canonical_name))
+    globals()[op_name] = ragged_op
+    _symbols_to_export.append(op_name)
+
+
+# A list of tuples containing arguments for `make_elementwise_op`, for each
+# elementwise operation that should have a ragged version built.  Each tuple
+# contains a standard `Tensor` operation, and the names of any arguments
+# that are processed in elementwise fashion.
+_TF_ELEMENTWISE_OPS = [
+    # Unary math operations.
+    (clip_ops.clip_by_value, 't'),
+    (math_ops.abs, 'x'),
+    (math_ops.acos, 'x'),
+    (math_ops.acosh, 'x'),
+    (math_ops.angle, 'input'),
+    (math_ops.asin, 'x'),
+    (math_ops.asinh, 'x'),
+    (math_ops.atan, 'x'),
+    (math_ops.atanh, 'x'),
+    (math_ops.cast, 'x'),
+    (math_ops.ceil, 'x'),
+    (math_ops.conj, 'x'),
+    (math_ops.cos, 'x'),
+    (math_ops.cosh, 'x'),
+    (math_ops.digamma, 'x'),
+    (math_ops.erf, 'x'),
+    (math_ops.erfc, 'x'),
+    (math_ops.exp, 'x'),
+    (math_ops.expm1, 'x'),
+    (math_ops.floor, 'x'),
+    (math_ops.imag, 'input'),
+    (math_ops.is_finite, 'x'),
+    (math_ops.is_inf, 'x'),
+    (math_ops.is_nan, 'x'),
+    (math_ops.lgamma, 'x'),
+    (math_ops.log, 'x'),
+    (math_ops.log1p, 'x'),
+    (math_ops.log_sigmoid, 'x'),
+    (math_ops.logical_not, 'x'),
+    (math_ops.negative, 'x'),
+    (math_ops.real, 'input'),
+    (math_ops.reciprocal, 'x'),
+    (math_ops.rint, 'x'),
+    (math_ops.round, 'x'),
+    (math_ops.rsqrt, 'x'),
+    (math_ops.saturate_cast, 'value'),
+    (math_ops.sign, 'x'),
+    (math_ops.sin, 'x'),
+    (math_ops.sinh, 'x'),
+    (math_ops.sqrt, 'x'),
+    (math_ops.square, 'x'),
+    (math_ops.tan, 'x'),
+
+    # Binary math operations
+    (math_ops.add, 'x', 'y'),
+    (math_ops.atan2, 'y', 'x'),
+    (math_ops.complex, 'real', 'imag'),
+    (math_ops.div, 'x', 'y'),
+    (math_ops.div_no_nan, 'x', 'y'),
+    (math_ops.divide, 'x', 'y'),
+    (math_ops.equal, 'x', 'y'),
+    (math_ops.floordiv, 'x', 'y'),
+    (math_ops.floormod, 'x', 'y'),
+    (math_ops.greater, 'x', 'y'),
+    (math_ops.greater_equal, 'x', 'y'),
+    (math_ops.less, 'x', 'y'),
+    (math_ops.less_equal, 'x', 'y'),
+    (math_ops.logical_and, 'x', 'y'),
+    (math_ops.logical_or, 'x', 'y'),
+    (math_ops.logical_xor, 'x', 'y'),
+    (math_ops.maximum, 'x', 'y'),
+    (math_ops.minimum, 'x', 'y'),
+    (math_ops.multiply, 'x', 'y'),
+    (math_ops.not_equal, 'x', 'y'),
+    (math_ops.pow, 'x', 'y'),
+    (math_ops.realdiv, 'x', 'y'),
+    (math_ops.squared_difference, 'x', 'y'),
+    (math_ops.subtract, 'x', 'y'),
+    (math_ops.truediv, 'x', 'y'),
+    (math_ops.truncatediv, 'x', 'y'),
+    (math_ops.truncatemod, 'x', 'y'),
+
+    # N-ary math operations
+    (math_ops.add_n, '[inputs]'),
+
+    # String operations
+    (string_ops.as_string, 'input'),
+    (string_ops.decode_base64, 'input'),
+    (string_ops.encode_base64, 'input'),
+    (string_ops.regex_full_match, 'input'),
+    (string_ops.regex_replace, 'input'),
+    (string_ops.string_join, '[inputs]'),
+    (string_ops.string_strip, 'input'),
+    (string_ops.string_to_hash_bucket, 'string_tensor'),
+    (string_ops.string_to_hash_bucket_fast, 'input'),
+    (string_ops.string_to_hash_bucket_strong, 'input'),
+    (string_ops.substr, 'input'),
+    (string_ops.unicode_script, 'input'),
+
+    # Array ops
+    (array_ops.check_numerics, 'tensor'),
+    (array_ops.identity, 'input'),
+    (array_ops.ones_like, 'tensor'),
+    (array_ops.zeros_like, 'tensor'),
+
+    # Parsing ops
+    (parsing_ops.decode_compressed, 'bytes'),
+    (parsing_ops.string_to_number, 'string_tensor'),
+]
+_add_elementwise_ops_to_this_module(_TF_ELEMENTWISE_OPS)
diff --git a/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py b/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dfa5cff45d0022300d47bd7257552ddf315352c
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py
@@ -0,0 +1,449 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.elementwise_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+# Constants listing various op types to test.  Each elementwise operation
+# should be included in at least one list below, or tested separately if
+# necessary (e.g., because it expects additional arguments).
+UNARY_FLOAT_OPS = [
+    ragged.abs,
+    ragged.acos,
+    ragged.acosh,
+    ragged.angle,
+    ragged.asin,
+    ragged.asinh,
+    ragged.atan,
+    ragged.atanh,
+    ragged.ceil,
+    ragged.conj,
+    ragged.cos,
+    ragged.cosh,
+    ragged.digamma,
+    ragged.erf,
+    ragged.erfc,
+    ragged.exp,
+    ragged.expm1,
+    ragged.floor,
+    ragged.imag,
+    ragged.is_finite,
+    ragged.is_inf,
+    ragged.is_nan,
+    ragged.lgamma,
+    ragged.log,
+    ragged.log1p,
+    ragged.log_sigmoid,
+    ragged.negative,
+    ragged.real,
+    ragged.reciprocal,
+    ragged.rint,
+    ragged.round,
+    ragged.rsqrt,
+    ragged.sign,
+    ragged.sin,
+    ragged.sinh,
+    ragged.sqrt,
+    ragged.square,
+    ragged.tan,
+    ragged.as_string,
+    ragged.identity,
+    ragged.ones_like,
+    ragged.zeros_like,
+]
+UNARY_BOOL_OPS = [
+    ragged.logical_not,
+]
+UNARY_STRING_OPS = [
+    ragged.decode_base64,
+    ragged.encode_base64,
+    ragged.string_strip,
+    ragged.decode_compressed,
+]
+BINARY_FLOAT_OPS = [
+    ragged.add,
+    ragged.atan2,
+    ragged.complex,
+    ragged.div,
+    ragged.div_no_nan,
+    ragged.divide,
+    ragged.equal,
+    ragged.floordiv,
+    ragged.floormod,
+    ragged.greater,
+    ragged.greater_equal,
+    ragged.less,
+    ragged.less_equal,
+    ragged.maximum,
+    ragged.minimum,
+    ragged.multiply,
+    ragged.not_equal,
+    ragged.pow,
+    ragged.realdiv,
+    ragged.squared_difference,
+    ragged.subtract,
+    ragged.truediv,
+]
+BINARY_BOOL_OPS = [
+    ragged.logical_and,
+    ragged.logical_or,
+    ragged.logical_xor,
+]
+UNARY_INT_OPS = [
+    ragged.unicode_script,
+]
+BINARY_INT_OPS = [
+    ragged.truncatediv,
+    ragged.truncatemod,
+]
+
+
+class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
+                               parameterized.TestCase):
+
+  def assertSameShape(self, x, y):
+    """Checks that x and y have the same shape (including ragged shapes)."""
+    if isinstance(x, ragged.RaggedTensor):
+      self.assertIsInstance(y, ragged.RaggedTensor)
+      self.assertEqual(x.ragged_rank, y.ragged_rank)
+      for (x_splits, y_splits) in zip(x.nested_row_splits, y.nested_row_splits):
+        self.assertAllEqual(x_splits, y_splits)
+      self.assertAllEqual(
+          array_ops.shape(x.inner_values), array_ops.shape(y.inner_values))
+    else:
+      self.assertIsInstance(y, ops.Tensor)
+      self.assertAllEqual(array_ops.shape(x), array_ops.shape(y))
+
+  @parameterized.parameters(
+      #=========================================================================
+      # Test different input shapes.
+      #=========================================================================
+      [
+          # 0-dimensional input
+          {'x': 12},
+          # 1-dimensional input
+          {'x': [1, -2, 3]},
+          # 2-dimensional input
+          {'x': [[-2, 3], [-3, 4]]},
+          {'x': ragged.constant_value([[-2, 3], [-3]], ragged_rank=1)},
+          # 3-dimensional inputs
+          {'x': [[[-2, 3], [3, 4]], [[7, 6], [5, 4]]]},
+          {'x': ragged.constant_value([[[-2, 3], [3, 4]], [[7, 6]]],
+                                      ragged_rank=1)},
+          {'x': ragged.constant_value([[[-2, 3, 4], []], [[7, 6]], []],
+                                      ragged_rank=2)},
+          ] +
+      #=========================================================================
+      # Test each unary op.
+      #=========================================================================
+      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]), 'op': op}
+       for op in UNARY_FLOAT_OPS] +
+      [{'x': ragged.constant_value([[True, False], [True]]), 'op': op}
+       for op in UNARY_BOOL_OPS] +
+      [{'x': ragged.constant_value([[18, 512], [12412]], np.int32), 'op': op}
+       for op in UNARY_INT_OPS] +
+      [{'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]), 'op': op}
+       for op in UNARY_STRING_OPS] +
+      [
+          {'op': ragged.clip_by_value,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'clip_value_min': 0.1, 'clip_value_max': 4.0},
+          {'op': ragged.cast,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'dtype': dtypes.int32},
+          {'op': ragged.saturate_cast,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'dtype': dtypes.int32},
+          {'op': ragged.string_to_hash_bucket,
+           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000},
+          {'op': ragged.string_to_hash_bucket_fast,
+           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000},
+          {'op': ragged.string_to_hash_bucket_strong,
+           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000,
+           'key': [1231, 12512]},
+          {'op': ragged.string_to_number,
+           'x': ragged.constant_value([['-2.0', '3.0'], ['-3.0']])},
+          {'op': ragged.regex_full_match,
+           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'pattern': r'\w+'},
+          {'op': ragged.regex_replace,
+           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'pattern': r'\d',
+           'rewrite': '#'},
+          {'op': ragged.substr,
+           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'pos': 2, 'len': 3},
+          {'op': ragged.check_numerics,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'message': 'check-numerics'},
+      ]
+      )  # pyformat: disable
+  def testUnaryOp(self, x, op=ragged.abs, **extra_args):
+    x = ragged.convert_to_tensor_or_ragged_tensor(x)
+    result = op(x, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_x = x.inner_values if isinstance(x, ragged.RaggedTensor) else x
+    expected_flat_values = array_ops.reshape(
+        op.__wrapped__(dense_x, **extra_args), [-1])
+
+    with self.test_session():
+      # Check that the result has the expected shape.
+      self.assertSameShape(x, result)
+
+      # Check that the result has the expected (flattened) values.
+      if isinstance(result, ragged.RaggedTensor):
+        result_flat_values = array_ops.reshape(result.inner_values, [-1])
+      else:
+        result_flat_values = array_ops.reshape(result, [-1])
+      self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  @parameterized.parameters(
+      [
+          #=====================================================================
+          # Without broadcasting -- i.e., shapes match exactly.
+          #=====================================================================
+          # Shapes: x:(), y:()
+          {'x': 12,
+           'y': 8},
+          # Shapes: x:(3,), y:(3,)
+          {'x': [7, 8, 9],
+           'y': [1, -2, 3]},
+          # Shapes: x:(2, 2), y:(2, 2)
+          {'x': [[-2, 3], [-3, -4]],
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(2, None), y:(2, None)
+          {'x': ragged.constant_value([[-2, 3], [-3]]),
+           'y': ragged.constant_value([[5, 6], [7]])},
+          # Shapes: x:(2, 2, 2), y:(2, 2, 2)
+          {'x': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+           'y': [[[9, 3], [3, 4]], [[5, 2], [7, 6]]]},
+          # Shapes: x:(2, None, None), y: (2, None, None)
+          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]])},
+          # Shapes: x:(2, None, 2), y: (2, None, 2)
+          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+                                      ragged_rank=1),
+           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+                                      ragged_rank=1)},
+
+          #=====================================================================
+          # With broadcasting
+          #=====================================================================
+          # Shapes: x:(), y:(3,)
+          {'x': 12,                                 # Broadcast () -> (3,)
+           'y': [1, -2, 3]},
+          # Shapes: x:(1,), y:(3,)
+          {'x': [12],                               # Broadcast (1,) -> (3,)
+           'y': [1, -2, 3]},
+          # Shapes: x:(), y:(2, 2)
+          {'x': 12,                                 # Broadcast () -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(1,), y:(2, 2)
+          {'x': 12,                                 # Broadcast (1,) -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(2, 1), y:(2, 2)
+          {'x': [[10], [20]],                       # Broadcast (2, 1) -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(), y:(2, None)
+          {'x': 10,                                 # Broadcast () -> (2, None)
+           'y': ragged.constant_value([[1, 2], [3]], dtype=np.int32)},
+          # TODO(edloper): Add tests for more advanced broadcasting, once we add
+          # support for it.
+
+          #=====================================================================
+          # Keyword Args
+          #=====================================================================
+          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
+           'use_kwargs': True},
+          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+                                      ragged_rank=1),
+           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+                                      ragged_rank=1),
+           'use_kwargs': True},
+      ] +
+      #=========================================================================
+      # Test each unary op.
+      #=========================================================================
+      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+        'y': ragged.constant_value([[5.0, 1.0], [12.0]]),
+        'op': op}
+       for op in BINARY_FLOAT_OPS] +
+      [{'x': ragged.constant_value([[-2, 3], [-3]]),
+        'y': ragged.constant_value([[5, 1], [12]]),
+        'op': op}
+       for op in BINARY_INT_OPS] +
+      [{'x': ragged.constant_value([[True, True], [False]]),
+        'y': ragged.constant_value([[False, True], [False]]),
+        'op': op}
+       for op in BINARY_BOOL_OPS] +
+      [
+      ]
+      )  # pyformat: disable
+  def testBinaryOp(self, x, y, op=ragged.add, **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', False)
+    x = ragged.convert_to_tensor_or_ragged_tensor(x)
+    y = ragged.convert_to_tensor_or_ragged_tensor(y)
+    if use_kwargs:
+      result = op(x=x, y=y, **extra_args)
+    else:
+      result = op(x, y, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_x = x.inner_values if isinstance(x, ragged.RaggedTensor) else x
+    dense_y = y.inner_values if isinstance(y, ragged.RaggedTensor) else y
+    expected_flat_values = array_ops.reshape(
+        op.__wrapped__(dense_x, dense_y, **extra_args), [-1])
+
+    with self.test_session():
+      # Check that the result has the expected shape.
+      self.assertSameShape(y, result)
+
+      # Check that the result has the expected (flattened) values.
+      if isinstance(result, ragged.RaggedTensor):
+        result_flat_values = array_ops.reshape(result.inner_values, [-1])
+      else:
+        result_flat_values = array_ops.reshape(result, [-1])
+      self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  @parameterized.parameters(
+      [
+          {'inputs': (12, 8, 3)},
+          {'inputs': ([1, 2, 3], [7, 8, 9], [3, 6, 9])},
+          {'inputs': ([[1, 2]], [[3, 4]], [[5, 6]])},
+          {'inputs': (ragged.constant_value([[1, 3], [-3]]),
+                      ragged.constant_value([[4, 7], [88]]),
+                      ragged.constant_value([[2, 9], [12]]))},
+          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
+                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
+                      ragged.constant_value([[[2, 9], [12]], [[8]]]))},
+          {'inputs': (ragged.constant_value([[[1, 3], [3, 4]], [[1, 5]]],
+                                            ragged_rank=1),
+                      ragged.constant_value([[[4, 7], [1, 2]], [[2, 2]]],
+                                            ragged_rank=1),
+                      ragged.constant_value([[[2, 9], [5, 2]], [[8, 0]]],
+                                            ragged_rank=1))},
+          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
+                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
+                      ragged.constant_value([[[2, 9], [12]], [[8]]])),
+           'use_kwargs': True},
+      ] + [
+          {'op': ragged.add_n,
+           'inputs': (ragged.constant_value([[1, 3], [-3]]),
+                      ragged.constant_value([[4, 7], [88]]),
+                      ragged.constant_value([[2, 9], [12]]))},
+          {'op': ragged.string_join,
+           'inputs': (ragged.constant_value([['a', 'b'], ['c']]),
+                      ragged.constant_value([['foo', 'bar'], ['baz']]),
+                      ragged.constant_value([['2', '9'], ['12']]))},
+      ])  # pyformat: disable
+  def testListValuedOp(self, inputs, op=ragged.add_n, **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', False)
+    inputs = [ragged.convert_to_tensor_or_ragged_tensor(x) for x in inputs]
+    if use_kwargs:
+      result = op(inputs=inputs, **extra_args)
+    else:
+      result = op(inputs, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_inputs = [
+        x.inner_values if isinstance(x, ragged.RaggedTensor) else x
+        for x in inputs
+    ]
+    expected_flat_values = array_ops.reshape(
+        op.__wrapped__(dense_inputs, **extra_args), [-1])
+
+    with self.test_session():
+      # Check that the result has the expected shape.
+      self.assertSameShape(inputs[0], result)
+
+      # Check that the result has the expected (flattened) values.
+      if isinstance(result, ragged.RaggedTensor):
+        result_flat_values = array_ops.reshape(result.inner_values, [-1])
+      else:
+        result_flat_values = array_ops.reshape(result, [-1])
+      self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  def testUnknownRankError(self):
+    x = ragged.constant([[1, 2], [3]])
+    y = ragged.from_row_splits(
+        array_ops.placeholder_with_default([1, 2, 3], shape=None), x.row_splits)
+    with self.assertRaisesRegexp(
+        ValueError, r'Ragged elementwise ops require that rank \(number '
+        r'of dimensions\) be statically known.'):
+      ragged.add(x, y)
+
+  def testBroadcastError1(self):
+    x = ragged.constant([[1, 2], [3]])
+    y = [[12]]
+    with self.assertRaisesRegexp(
+        ValueError, 'Ragged elementwise ops do not support broadcasting yet'):
+      ragged.add(x, y)
+
+  def testBroadcastError2(self):
+    x = ragged.constant([[[1, 2], [3, 4]], [[5]]], ragged_rank=2)
+    y = ragged.constant([[[8], [3]], [[2]]], ragged_rank=1)
+    with self.assertRaisesRegexp(ValueError,
+                                 'Inputs must have identical ragged splits'):
+      ragged.add(x, y)
+
+  def testBroadcastError3(self):
+    x = ragged.constant([[[1, 2], [3]], [[4, 5], [6]]], ragged_rank=2)
+    y = ragged.constant([[7, 8], [9]], ragged_rank=1)
+    with self.assertRaisesRegexp(
+        ValueError, 'Ragged elementwise ops do not support broadcasting yet'):
+      ragged.add(x, y)
+
+  def testBroadcastError4(self):
+    x = ragged.constant([[[1]]])
+    y = ragged.constant([[1]])
+    with self.assertRaisesRegexp(
+        ValueError, 'Ragged elementwise ops do not support broadcasting yet'):
+      ragged.add(x, y)
+
+  def testShapeMismatch(self):
+    x = ragged.constant([[1, 2, 3], [4, 5]])
+    y = ragged.constant([[1, 2, 3], [4, 5, 6]])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'Inputs must have identical ragged splits'):
+      ragged.add(x, y)
+
+  def testDocstring(self):
+    self.assertRegexpMatches(
+        ragged.add.__doc__,
+        'Ragged version of the elementwise operation `tf.math.add`')
+    self.assertEqual(ragged.add.__name__, 'add')
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c4fd458c230a13ebf48d6a94028497a266ea1bf
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
@@ -0,0 +1,125 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.expand_dims."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedExpandDimsOpTest(test_util.TensorFlowTestCase,
+                             parameterized.TestCase):
+
+  # An example 4-d ragged tensor with shape [3, (D2), (D3), 2], and the
+  # expected result calling for expand_dims on each axis.  c.f. the table of
+  # expected result shapes in the ragged.expand_dims docstring.
+  EXAMPLE4D = [[[[1, 1], [2, 2]], [[3, 3]]],
+               [],
+               [[], [[4, 4], [5, 5], [6, 6]]]]  # pyformat: disable
+  EXAMPLE4D_EXPAND_AXIS = {
+      0: [EXAMPLE4D],
+      1: [[d0] for d0 in EXAMPLE4D],
+      2: [[[d1] for d1 in d0] for d0 in EXAMPLE4D],
+      3: [[[[d2] for d2 in d1] for d1 in d0] for d0 in EXAMPLE4D],
+      4: [[[[[d3] for d3 in d2] for d2 in d1] for d1 in d0] for d0 in EXAMPLE4D]
+  }
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring examples: 2D Ragged Inputs
+      dict(rt_input=[[1, 2], [3]],
+           axis=0,
+           expected=[[[1, 2], [3]]],
+           expected_shape=[1, None, None]),
+      dict(rt_input=[[1, 2], [3]],
+           axis=1,
+           expected=[[[1, 2]], [[3]]],
+           expected_shape=[2, None, None]),
+      dict(rt_input=[[1, 2], [3]],
+           axis=2,
+           expected=[[[1], [2]], [[3]]],
+           expected_shape=[2, None, 1]),
+
+      #=========================================================================
+      # 2D Tensor Inputs
+      dict(rt_input=[[1, 2], [3, 4], [5, 6]],
+           ragged_rank=0,
+           axis=0,
+           expected=[[[1, 2], [3, 4], [5, 6]]],
+           expected_shape=[1, 3, 2]),
+      dict(rt_input=[[1, 2], [3, 4], [5, 6]],
+           ragged_rank=0,
+           axis=1,
+           expected=[[[1, 2]], [[3, 4]], [[5, 6]]],
+           expected_shape=[3, 1, 2]),
+      dict(rt_input=[[1, 2], [3, 4], [5, 6]],
+           ragged_rank=0,
+           axis=2,
+           expected=[[[1], [2]], [[3], [4]], [[5], [6]]],
+           expected_shape=[3, 2, 1]),
+
+      #=========================================================================
+      # 4D Ragged Inputs: [3, (D2), (D3), 2]
+      # c.f. the table of expected result shapes in the expand_dims docstring.
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=0,
+           expected=EXAMPLE4D_EXPAND_AXIS[0],
+           expected_shape=[1, None, None, None, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=1,
+           expected=EXAMPLE4D_EXPAND_AXIS[1],
+           expected_shape=[3, None, None, None, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=2,
+           expected=EXAMPLE4D_EXPAND_AXIS[2],
+           expected_shape=[3, None, None, None, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=3,
+           expected=EXAMPLE4D_EXPAND_AXIS[3],
+           expected_shape=[3, None, None, 1, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=4,
+           expected=EXAMPLE4D_EXPAND_AXIS[4],
+           expected_shape=[3, None, None, 2, 1]),
+  ])  # pyformat: disable
+  def testRaggedExpandDims(self,
+                           rt_input,
+                           axis,
+                           expected,
+                           ragged_rank=None,
+                           expected_shape=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    expanded = ragged.expand_dims(rt, axis=axis)
+    self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1)
+    if expected_shape is not None:
+      self.assertEqual(expanded.shape.as_list(), expected_shape)
+
+    with self.test_session():
+      self.assertEqual(expanded.eval().tolist(), expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..de3a2d5b10be0e22c22f24ea8cb959c28cb741fd
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -0,0 +1,678 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for constructing RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
+
+
+#===============================================================================
+# Op to construct a constant RaggedTensor from a nested Python list.
+#===============================================================================
+def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
+  """Constructs a constant RaggedTensor from a nested Python list.
+
+  Example:
+
+  ```python
+  >>> ragged.constant([[1, 2], [3], [4, 5, 6]]).eval()
+  RaggedTensorValue(values=[1, 2, 3, 4, 5, 6], splits=[0, 2, 3, 6])
+  ```
+
+  All scalar values in `pylist` must have the same nesting depth `K`, and the
+  returned `RaggedTensor` will have rank `K`.  If `pylist` contains no scalar
+  values, then `K` is one greater than the maximum depth of empty lists in
+  `pylist`.  All scalar values in `pylist` must be compatible with `dtype`.
+
+  Args:
+    pylist: A nested `list` or `tuple`.  Any nested element that is not a `list`
+      or `tuple` must be a scalar value compatible with `dtype`.
+    dtype: The type of elements for the returned `RaggedTensor`.  If not
+      specified, then a default is chosen based on the scalar values in
+      `pylist`.
+    ragged_rank: An integer specifying the ragged rank of the returned
+      `RaggedTensor`.  Must be nonnegative and less than `K`. Defaults to
+      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to
+      `max(0, K - 1 - len(inner_shape))` if `inner_shape` is specified.
+    inner_shape: A tuple of integers specifying the shape for individual inner
+      values in the returned `RaggedTensor`.  Defaults to `()` if `ragged_rank`
+      is not specified.  If `ragged_rank` is specified, then a default is chosen
+      based on the contents of `pylist`.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A potentially ragged tensor with rank `K` and the specified `ragged_rank`,
+    containing the values from `pylist`.
+
+  Raises:
+    ValueError: If the scalar values in `pylist` have inconsistent nesting
+      depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
+  """
+  with ops.name_scope(name, 'RaggedConstant'):
+    return _constant_value(from_row_splits, constant_op.constant, pylist, dtype,
+                           ragged_rank, inner_shape)
+
+
+def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None):
+  """Constructs a RaggedTensorValue from a nested Python list.
+
+  > Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
+  > If you wish to construct a constant `RaggedTensor`, use
+  > [`ragged.constant(...)`](constant.md) instead.
+
+  Example:
+
+  ```python
+  >>> ragged.constant_value([[1, 2], [3], [4, 5, 6]])
+  RaggedTensorValue(values=[1, 2, 3, 4, 5, 6], splits=[0, 2, 3, 6])
+  ```
+
+  All scalar values in `pylist` must have the same nesting depth `K`, and the
+  returned `RaggedTensorValue` will have rank `K`.  If `pylist` contains no
+  scalar values, then `K` is one greater than the maximum depth of empty lists
+  in `pylist`.  All scalar values in `pylist` must be compatible with `dtype`.
+
+  Args:
+    pylist: A nested `list` or `tuple`.  Any nested element that is not a `list`
+      or `tuple` must be a scalar value compatible with `dtype`.
+    dtype: `numpy.dtype`.  The type of elements for the returned `RaggedTensor`.
+      If not specified, then a default is chosen based on the scalar values in
+      `pylist`.
+    ragged_rank: An integer specifying the ragged rank of the returned
+      `RaggedTensorValue`.  Must be nonnegative and less than `K`. Defaults to
+      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to `max(0, K
+      - 1 - len(inner_shape))` if `inner_shape` is specified.
+    inner_shape: A tuple of integers specifying the shape for individual inner
+      values in the returned `RaggedTensorValue`.  Defaults to `()` if
+      `ragged_rank` is not specified.  If `ragged_rank` is specified, then a
+      default is chosen based on the contents of `pylist`.
+
+  Returns:
+    A `RaggedTensorValue` or `numpy.array` with rank `K` and the specified
+    `ragged_rank`, containing the values from `pylist`.
+
+  Raises:
+    ValueError: If the scalar values in `pylist` have inconsistent nesting
+      depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
+  """
+
+  def _ragged_factory(values, row_splits):
+    row_splits = np.array(row_splits, dtype=np.int64)
+    return ragged_tensor_value.RaggedTensorValue(values, row_splits)
+
+  def _inner_factory(pylist, dtype, shape, name=None):  # pylint: disable=unused-argument
+    return np.reshape(np.array(pylist, dtype=dtype), shape)
+
+  return _constant_value(_ragged_factory, _inner_factory, pylist, dtype,
+                         ragged_rank, inner_shape)
+
+
+def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
+                    inner_shape):
+  """Constructs a constant RaggedTensor or RaggedTensorValue.
+
+  Args:
+    ragged_factory: A factory function with the signature:
+      `ragged_factory(values, row_splits)`
+    inner_factory: A factory function with the signature: `inner_factory(pylist,
+      dtype, shape, name)`
+    pylist: A nested `list` or `tuple`.
+    dtype: Data type for returned value.
+    ragged_rank: Ragged rank for returned value.
+    inner_shape: Inner value shape for returned value.
+
+  Returns:
+    A value returned by `ragged_factory` or `inner_factory`.
+
+  Raises:
+    ValueError: If the scalar values in `pylist` have inconsistent nesting
+      depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
+  """
+  if ragged_tensor.is_ragged(pylist):
+    raise TypeError('pylist may not be a RaggedTensor or RaggedTensorValue.')
+
+  if not isinstance(pylist, (list, tuple)):
+    # Scalar value
+    if ragged_rank is not None and ragged_rank != 0:
+      raise ValueError('Invalid pylist=%r: incompatible with ragged_rank=%d' %
+                       (pylist, ragged_rank))
+    if inner_shape is not None and inner_shape:
+      raise ValueError(
+          'Invalid pylist=%r: incompatible with dim(inner_shape)=%d' %
+          (pylist, len(inner_shape)))
+    return inner_factory(pylist, dtype, ())
+
+  if ragged_rank is not None and ragged_rank < 0:
+    raise ValueError(
+        'Invalid ragged_rank=%r: must be nonnegative' % ragged_rank)
+
+  # Find the depth of scalar values in `pylist`.
+  scalar_depth, max_depth = _find_scalar_and_max_depth(pylist)
+  if scalar_depth is not None:
+    if max_depth > scalar_depth:
+      raise ValueError('Invalid pylist=%r: empty list nesting is greater '
+                       'than scalar value nesting' % pylist)
+
+  # If both inner_shape and ragged_rank were specified, then check that
+  # they are compatible with pylist.
+  if inner_shape is not None and ragged_rank is not None:
+    expected_depth = ragged_rank + len(inner_shape) + 1
+    if ((scalar_depth is not None and expected_depth != scalar_depth) or
+        (scalar_depth is None and expected_depth < max_depth)):
+      raise ValueError(
+          'Invalid pylist=%r: incompatible with ragged_rank=%d '
+          'and dim(inner_shape)=%d' % (pylist, ragged_rank, len(inner_shape)))
+
+  # Check if the result is a `Tensor`.
+  if (ragged_rank == 0 or
+      (ragged_rank is None and
+       ((max_depth < 2) or
+        (inner_shape is not None and max_depth - len(inner_shape) < 2)))):
+    return inner_factory(pylist, dtype, inner_shape)
+
+  # Compute default value for inner_shape.
+  if inner_shape is None:
+    if ragged_rank is None:
+      inner_shape = ()
+    else:
+      inner_shape = _default_inner_shape_for_pylist(pylist, ragged_rank)
+
+  # Compute default value for ragged_rank.
+  if ragged_rank is None:
+    if scalar_depth is None:
+      ragged_rank = max(1, max_depth - 1)
+    else:
+      ragged_rank = max(1, scalar_depth - 1 - len(inner_shape))
+
+  # Build the splits for each ragged rank, and concatenate the inner values
+  # into a single list.
+  nested_splits = []
+  values = pylist
+  for dim in range(ragged_rank):
+    nested_splits.append([0])
+    concatenated_values = []
+    for row in values:
+      nested_splits[dim].append(nested_splits[dim][-1] + len(row))
+      concatenated_values.extend(row)
+    values = concatenated_values
+
+  values = inner_factory(
+      values, dtype=dtype, shape=(len(values),) + inner_shape, name='values')
+  for row_splits in reversed(nested_splits):
+    values = ragged_factory(values, row_splits)
+  return values
+
+
+def _find_scalar_and_max_depth(pylist):
+  """Finds nesting depth of scalar values in pylist.
+
+  Args:
+    pylist: A nested python `list` or `tuple`.
+
+  Returns:
+    A tuple `(scalar_depth, max_depth)`.  `scalar_depth` is the nesting
+    depth of scalar values in `pylist`, or `None` if `pylist` contains no
+    scalars.  `max_depth` is the maximum depth of `pylist` (including
+    empty lists).
+
+  Raises:
+    ValueError: If pylist has inconsistent nesting depths for scalars.
+  """
+  if isinstance(pylist, (list, tuple)):
+    scalar_depth = None
+    max_depth = 1
+    for child in pylist:
+      child_scalar_depth, child_max_depth = _find_scalar_and_max_depth(child)
+      if child_scalar_depth is not None:
+        if scalar_depth is not None and scalar_depth != child_scalar_depth + 1:
+          raise ValueError('all scalar values must have the same nesting depth')
+        scalar_depth = child_scalar_depth + 1
+      max_depth = max(max_depth, child_max_depth + 1)
+    return (scalar_depth, max_depth)
+  else:
+    return (0, 0)
+
+
+def _default_inner_shape_for_pylist(pylist, ragged_rank):
+  """Computes a default inner shape for the given python list."""
+
+  def get_inner_shape(item):
+    """Returns the inner shape for a python list `item`."""
+    if not isinstance(item, (list, tuple)):
+      return ()
+    elif item:
+      return (len(item),) + get_inner_shape(item[0])
+    else:
+      return (0,)
+
+  def check_inner_shape(item, shape):
+    """Checks that `item` has a consistent shape matching `shape`."""
+    is_nested = isinstance(item, (list, tuple))
+    if is_nested != bool(shape):
+      raise ValueError('inner values have inconsistent shape')
+    if is_nested:
+      if shape[0] != len(item):
+        raise ValueError('inner values have inconsistent shape')
+      for child in item:
+        check_inner_shape(child, shape[1:])
+
+  # Collapse the ragged layers to get the list of inner values.
+  inner_values = pylist
+  for dim in range(ragged_rank):
+    if not all(isinstance(v, (list, tuple)) for v in inner_values):
+      raise ValueError('pylist has scalar values depth %d, but ragged_rank=%d '
+                       'requires scalar value depth greater than %d' %
+                       (dim + 1, ragged_rank, ragged_rank))
+    inner_values = sum((list(v) for v in inner_values), [])
+
+  # Compute the inner shape looking only at the leftmost elements; and then
+  # use check_inner_shape to verify that other elements have the same shape.
+  inner_shape = get_inner_shape(inner_values)
+  check_inner_shape(inner_values, inner_shape)
+  return inner_shape[1:]
+
+
+#===============================================================================
+# Convert value -> tensor
+#===============================================================================
+def convert_to_tensor_or_ragged_tensor(value,
+                                       dtype=None,
+                                       preferred_dtype=None,
+                                       name=None):
+  """Converts value to a `RaggedTensor` or `Tensor`.
+
+  * If `value` is a `RaggedTensor`, then return it as-is.
+  * If `value` is a `RaggedTensorValue`, return a corresponding constant
+    `RaggedTensor`.
+  * Otherwise, use `convert_to_tensor` to convert `value` to a `Tensor`.
+
+  Args:
+    value: A `RaggedTensor`, a `RaggedTensorValue`, or an object whose type has
+      a registered `Tensor` conversion function.
+    dtype: Optional element type for the returned tensor.  If missing the type
+      is inferred from the type of `value`.
+    preferred_dtype: Optional element type for the returned tensor, used when
+      dtype is None.  This argument has no effect if `value` is already a
+      tensor, or when conversion is not possible.
+    name: Optional name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `RaggedTensor`.
+  """
+  if isinstance(value, ragged_tensor.RaggedTensor):
+    if dtype and not dtype.is_compatible_with(value.dtype):
+      raise ValueError('Tensor conversion requested dtype %s for '
+                       'RaggedTensor with dtype %s: %r' %
+                       (dtype.name, value.dtype.name, value))
+    return value
+  elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+    with ops.name_scope(name, 'ConvertToTensorOrRaggedTensor', []):
+      inner_values = ops.convert_to_tensor(
+          value=value.inner_values,
+          dtype=dtype,
+          preferred_dtype=preferred_dtype,
+          name='inner_values')
+      return from_nested_row_splits(inner_values, value.nested_row_splits)
+  else:
+    return ops.convert_to_tensor(
+        value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
+
+
+#===============================================================================
+# Ops to construct RaggedTensor from row-partitioned values.
+#===============================================================================
+
+
+def from_value_rowids(values, value_rowids, nrows=None, name=None):
+  """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
+
+  The returned `RaggedTensor` corresponds with the python list defined by:
+
+  ```python
+  result = [[values[i] for i in range(len(values)) if value_rowids[i] == row]
+            for row in range(nrows)]
+  ```
+
+  Warning: currently, this needs to cast value_rowids to int64 before
+  converting, since `tf.bincount` only supports `int32`.
+
+  Args:
+    values: A potentially ragged tensor with shape `[nvals, ...]`.
+    value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
+      one-to-one with `values`, and specifies each value's row index.  Must be
+      nonnegative, and must be sorted in ascending order.
+    nrows: An int64 scalar specifying the number of rows.  This should be
+      specified if the `RaggedTensor` may containing empty training rows.  Must
+      be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
+      Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor`.  `result.rank = values.rank + 1`.
+    `result.ragged_rank = values.ragged_rank + 1`.
+
+  Raises:
+    ValueError: If `nrows` is incompatible with `value_rowids`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.from_value_rowids(
+    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+    ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
+    ...     nrows=5)
+    >>> rt.eval().tolist()
+    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedFromValueRowIds',
+                      [values, value_rowids, nrows]):
+    values = convert_to_tensor_or_ragged_tensor(values, name='values')
+    value_rowids = ops.convert_to_tensor(
+        value_rowids, dtypes.int64, name='value_rowids')
+    if nrows is None:
+      const_rowids = tensor_util.constant_value(value_rowids)
+      if const_rowids is None:
+        nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
+        const_nrows = None
+      else:
+        const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
+        nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name='nrows')
+    else:
+      nrows = ops.convert_to_tensor(nrows, dtypes.int64, 'nrows')
+      const_nrows = tensor_util.constant_value(nrows)
+      if const_nrows is not None:
+        if const_nrows < 0:
+          raise ValueError('Expected nrows >= 0; got %d' % const_nrows)
+        const_rowids = tensor_util.constant_value(value_rowids)
+        if const_rowids is not None and const_rowids.size > 0:
+          if not const_nrows >= const_rowids[-1] + 1:
+            raise ValueError(
+                'Expected nrows >= value_rowids[-1] + 1; got nrows=%d, '
+                'value_rowids[-1]=%d' % (const_nrows, const_rowids[-1]))
+
+    value_rowids.shape.assert_has_rank(1)
+    nrows.shape.assert_has_rank(0)
+    values.shape[:1].assert_is_compatible_with(value_rowids.shape)
+
+    # Convert value_rowids & nrows to row_splits.
+    # Note: we don't use segment_ids_to_row_splits() here because we want
+    # to save the intermediate value `row_lengths`, so we can cache it.
+    # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the cast
+    # (Remove the warning in the docstring when we do.)
+    value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
+    nrows_int32 = math_ops.cast(nrows, dtypes.int32)
+    row_lengths = math_ops.bincount(
+        value_rowids_int32,
+        minlength=nrows_int32,
+        maxlength=nrows_int32,
+        dtype=dtypes.int64)
+    row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
+    if const_nrows is not None:
+      row_lengths.set_shape([const_nrows])
+      row_splits.set_shape([const_nrows + 1])
+
+    return ragged_tensor.RaggedTensor(
+        values,
+        row_splits,
+        cached_row_lengths=row_lengths,
+        cached_value_rowids=value_rowids,
+        cached_nrows=nrows,
+        internal=True)
+
+
+def from_row_splits(values, row_splits, name=None):
+  """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
+
+  The returned `RaggedTensor` corresponds with the python list defined by:
+
+  ```python
+  result = [values[row_splits[i]:row_splits[i + 1]]
+            for i in range(len(row_splits) - 1)]
+  ```
+
+  Args:
+    values: A potentially ragged tensor with shape `[nvals, ...]`.
+    row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
+      and must be sorted in ascending order.  `row_splits[0]` must be zero and
+      `row_splits[-1]` must be `nvals`.
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor`.  `result.rank = values.rank + 1`.
+    `result.ragged_rank = values.ragged_rank + 1`.
+
+  Raises:
+    ValueError: If `row_splits` is an empty list.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.from_row_splits(
+    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+    ...     row_splits=[0, 4, 4, 7, 8, 8])
+    >>> rt.eval().tolist()
+    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+    ```
+  """
+  if isinstance(row_splits, (list, tuple)) and not row_splits:
+    raise ValueError('row_splits tensor may not be empty.')
+  with ops.name_scope(name, 'RaggedFromRowSplits', [values, row_splits]):
+    values = convert_to_tensor_or_ragged_tensor(values, name='values')
+    row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, 'row_splits')
+    row_splits.shape.assert_has_rank(1)
+    return ragged_tensor.RaggedTensor(
+        values=values, row_splits=row_splits, internal=True)
+
+
+def from_row_lengths(values, row_lengths, name=None):
+  """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
+
+  The returned `RaggedTensor` corresponds with the python list defined by:
+
+  ```python
+  result = [[values.pop(0) for i in range(length)]
+            for length in row_lengths]
+  ```
+
+  Args:
+    values: A potentially ragged tensor with shape `[nvals, ...]`.
+    row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative.
+      `sum(row_lengths)` must be `nvals`.
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor`.  `result.rank = values.rank + 1`.
+    `result.ragged_rank = values.ragged_rank + 1`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.from_row_lengths(
+    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+    ...     row_lengths=[4, 0, 3, 1, 0])
+    >>> rt.eval().tolist()
+    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedFromRowLengths', [values, row_lengths]):
+    values = convert_to_tensor_or_ragged_tensor(values, name='values')
+    row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
+                                        'row_lengths')
+    row_lengths.shape.assert_has_rank(1)
+    row_limits = math_ops.cumsum(row_lengths)
+    row_splits = array_ops.concat([[0], row_limits], axis=0)
+    return ragged_tensor.RaggedTensor(
+        values=values,
+        row_splits=row_splits,
+        cached_row_lengths=row_lengths,
+        internal=True)
+
+
+def from_row_starts(values, row_starts, name=None):
+  """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
+
+  Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
+
+  Args:
+    values: A potentially ragged tensor with shape `[nvals, ...]`.
+    row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
+      and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must be
+      zero.
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor`.  `result.rank = values.rank + 1`.
+    `result.ragged_rank = values.ragged_rank + 1`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.from_row_starts(
+    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+    ...     row_starts=[0, 4, 4, 7, 8])
+    >>> rt.eval().tolist()
+    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedFromRowStarts', [values, row_starts]):
+    values = convert_to_tensor_or_ragged_tensor(values, name='values')
+    row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, 'row_starts')
+    row_starts.shape.assert_has_rank(1)
+    nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
+    row_splits = array_ops.concat([row_starts, nvals], axis=0)
+    return ragged_tensor.RaggedTensor(
+        values=values, row_splits=row_splits, internal=True)
+
+
+def from_row_limits(values, row_limits, name=None):
+  """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
+
+  Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
+
+  Args:
+    values: A potentially ragged tensor with shape `[nvals, ...]`.
+    row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
+      ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor`.  `result.rank = values.rank + 1`.
+    `result.ragged_rank = values.ragged_rank + 1`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.from_row_limits(
+    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+    ...     row_limits=[4, 4, 7, 8, 8])
+    >>> rt.eval().tolist()
+    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedFromRowLimits', [values, row_limits]):
+    values = convert_to_tensor_or_ragged_tensor(values, name='values')
+    row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, 'row_limits')
+    row_limits.shape.assert_has_rank(1)
+    zero = array_ops.zeros([1], dtypes.int64)
+    row_splits = array_ops.concat([zero, row_limits], axis=0)
+    return ragged_tensor.RaggedTensor(
+        values=values, row_splits=row_splits, internal=True)
+
+
+def from_nested_value_rowids(inner_values,
+                             nested_value_rowids,
+                             nested_nrows=None,
+                             name=None):
+  """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
+
+  Equivalent to:
+
+  ```python
+  result = inner_values
+  for (value_rowids, nrows) in reversed(zip(nested_value_rowids, nested_nrows)):
+    result = from_value_rowids(result, value_rowids, nrows)
+  ```
+
+  Args:
+    inner_values: A potentially ragged tensor.
+    nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is used
+      as the `value_rowids` for the `i`th ragged dimension.
+    nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
+      `nrows` for the `i`th ragged dimension.
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor` (or `inner_values` if `nested_value_rowids` is empty).
+
+  Raises:
+    ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
+  """
+  if isinstance(nested_value_rowids, ops.Tensor):
+    raise TypeError('nested_value_rowids must be a list of Tensors')
+  if nested_nrows is None:
+    nested_nrows = [None] * len(nested_value_rowids)
+  else:
+    if isinstance(nested_nrows, ops.Tensor):
+      raise TypeError('nested_nrows must be a list of Tensors')
+    if len(nested_nrows) != len(nested_value_rowids):
+      raise ValueError('nested_nrows must have the same length as '
+                       'nested_value_rowids')
+
+  with ops.name_scope(
+      name, 'RaggedFromNestedValueRowIds',
+      [inner_values] + list(nested_value_rowids) + list(nested_nrows)):
+    result = inner_values
+    for value_rowids, nrows in reversed(
+        list(zip(nested_value_rowids, nested_nrows))):
+      result = from_value_rowids(result, value_rowids, nrows)
+    return result
+
+
+def from_nested_row_splits(inner_values, nested_row_splits, name=None):
+  """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
+
+  Equivalent to:
+
+  ```python
+  result = inner_values
+  for row_splits in reversed(nested_row_splits):
+    result = from_row_splits(result, row_splits)
+  ```
+
+  Args:
+    inner_values: A potentially ragged tensor.
+    nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used as
+      the `row_splits` for the `i`th ragged dimension.
+    name: A name prefix for the RaggedTensor (optional).
+
+  Returns:
+    A `RaggedTensor` (or `inner_values` if `nested_row_splits` is empty).
+  """
+  if isinstance(nested_row_splits, ops.Tensor):
+    raise TypeError('nested_row_splits must be a list of Tensors')
+  with ops.name_scope(name, 'RaggedFromNestedRowSplits',
+                      [inner_values] + list(nested_row_splits)):
+    result = inner_values
+    for splits in reversed(nested_row_splits):
+      result = from_row_splits(result, splits)
+    return result
diff --git a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..77418ff20db851da6e38b66ce032ed9ba241bcfa
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
@@ -0,0 +1,109 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.from_sparse."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
+
+  def testDocStringExample(self):
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0]],
+        values=[1, 2, 3, 4, 5],
+        dense_shape=[4, 3])
+    rt = ragged.from_sparse(st)
+
+    with self.test_session():
+      self.assertEqual(rt.eval().tolist(), [[1, 2, 3], [4], [], [5]])
+
+  def testEmpty(self):
+    st = sparse_tensor.SparseTensor(
+        indices=array_ops.zeros([0, 2], dtype=dtypes.int64),
+        values=[],
+        dense_shape=[4, 3])
+    rt = ragged.from_sparse(st)
+
+    with self.test_session():
+      self.assertEqual(rt.eval().tolist(), [[], [], [], []])
+
+  def testBadSparseTensorRank(self):
+    st1 = sparse_tensor.SparseTensor(indices=[[0]], values=[0], dense_shape=[3])
+    st2 = sparse_tensor.SparseTensor(
+        indices=[[0, 0, 0]], values=[0], dense_shape=[3, 3, 3])
+    st3 = sparse_tensor.SparseTensor(
+        indices=array_ops.placeholder(dtypes.int64),
+        values=[0],
+        dense_shape=array_ops.placeholder(dtypes.int64))
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            ragged.from_sparse, st1)
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            ragged.from_sparse, st2)
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            ragged.from_sparse, st3)
+
+  def testGoodPartialSparseTensorRank(self):
+    st1 = sparse_tensor.SparseTensor(
+        indices=[[0, 0]],
+        values=[0],
+        dense_shape=array_ops.placeholder(dtypes.int64))
+    st2 = sparse_tensor.SparseTensor(
+        indices=array_ops.placeholder(dtypes.int64),
+        values=[0],
+        dense_shape=[4, 3])
+
+    # Shouldn't throw ValueError
+    ragged.from_sparse(st1)
+    ragged.from_sparse(st2)
+
+  def testNonRaggedSparseTensor(self):
+    # "index_suffix" means the value of the innermost dimension of the index
+    # (i.e., indices[i][-1]).
+    # See comments in _assert_sparse_indices_are_ragged_right() for more
+    # details/background.
+
+    # index_suffix of first index is not zero.
+    st1 = sparse_tensor.SparseTensor(
+        indices=[[0, 1], [0, 2], [2, 0]], values=[1, 2, 3], dense_shape=[3, 3])
+    # index_suffix of an index that starts a new row is not zero.
+    st2 = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [2, 1]], values=[1, 2, 3], dense_shape=[3, 3])
+    # index_suffix of an index that continues a row skips a cell.
+    st3 = sparse_tensor.SparseTensor(
+        indices=[[0, 1], [0, 1], [0, 3]], values=[1, 2, 3], dense_shape=[3, 3])
+    rt1 = ragged.from_sparse(st1)
+    rt2 = ragged.from_sparse(st2)
+    rt3 = ragged.from_sparse(st3)
+    with self.test_session():
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'.*SparseTensor is not right-ragged', rt1.eval)
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'.*SparseTensor is not right-ragged', rt2.eval)
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'.*SparseTensor is not right-ragged', rt3.eval)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c59cd0b77b47441ee25f66a3fcdc88db0ea2ec0
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
@@ -0,0 +1,462 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.from_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
+                             parameterized.TestCase):
+
+  def testDocStringExamples(self):
+    # The examples from ragged.from_tensor.__doc__.
+    dt = constant_op.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    with self.test_session():
+      self.assertEqual(
+          ragged.from_tensor(dt).eval().tolist(),
+          [[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+
+      self.assertEqual(
+          ragged.from_tensor(dt, lengths=[1, 0, 3]).eval().tolist(),
+          [[5], [], [6, 0, 0]])
+
+      self.assertEqual(
+          ragged.from_tensor(dt, padding=0).eval().tolist(),
+          [[5, 7], [0, 3], [6]])
+
+  @parameterized.parameters(
+      # 2D test cases, no length or padding.
+      {
+          'tensor': [[]],
+          'expected': [[]],
+      },
+      {
+          'tensor': [[1]],
+          'expected': [[1]],
+      },
+      {
+          'tensor': [[1, 2]],
+          'expected': [[1, 2]],
+      },
+      {
+          'tensor': [[1], [2], [3]],
+          'expected': [[1], [2], [3]],
+      },
+      {
+          'tensor': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'expected': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+      },
+      # 3D test cases, no length or padding
+      {
+          'tensor': [[[]]],
+          'expected': [[[]]],
+      },
+      {
+          'tensor': [[[]]],
+          'expected': [[[]]],
+          'ragged_rank': 1,
+      },
+      {
+          'tensor': [[[1]]],
+          'expected': [[[1]]],
+      },
+      {
+          'tensor': [[[1, 2]]],
+          'expected': [[[1, 2]]],
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]]],
+          'expected': [[[1, 2], [3, 4]]],
+      },
+      {
+          'tensor': [[[1, 2]], [[3, 4]], [[5, 6]], [[7, 8]]],
+          'expected': [[[1, 2]], [[3, 4]], [[5, 6]], [[7, 8]]],
+      },
+      {
+          'tensor': [[[1], [2]], [[3], [4]], [[5], [6]], [[7], [8]]],
+          'expected': [[[1], [2]], [[3], [4]], [[5], [6]], [[7], [8]]],
+      },
+      # 2D test cases, with length
+      {
+          'tensor': [[1]],
+          'lengths': [1],
+          'expected': [[1]]
+      },
+      {
+          'tensor': [[1]],
+          'lengths': [0],
+          'expected': [[]]
+      },
+      {
+          'tensor': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'lengths': [0, 1, 2],
+          'expected': [[], [4], [7, 8]]
+      },
+      {
+          'tensor': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'lengths': [0, 0, 0],
+          'expected': [[], [], []]
+      },
+      {
+          'tensor': [[1, 2], [3, 4]],
+          'lengths': [2, 2],
+          'expected': [[1, 2], [3, 4]]
+      },
+      {
+          'tensor': [[1, 2], [3, 4]],
+          'lengths': [7, 8],  # lengths > ncols: truncated to ncols
+          'expected': [[1, 2], [3, 4]]
+      },
+      {
+          'tensor': [[1, 2], [3, 4]],
+          'lengths': [-2, -1],  # lengths < 0: treated as zero
+          'expected': [[], []]
+      },
+      # 3D test cases, with length
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'lengths': [0, 0],
+          'expected': [[], []]
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'lengths': [1, 2],
+          'expected': [[[1, 2]], [[5, 6], [7, 8]]]
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'lengths': [2, 2],
+          'expected': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+      },
+      # 2D test cases, with padding
+      {
+          'tensor': [[1]],
+          'padding': 0,
+          'expected': [[1]]
+      },
+      {
+          'tensor': [[0]],
+          'padding': 0,
+          'expected': [[]]
+      },
+      {
+          'tensor': [[0, 1]],
+          'padding': 0,
+          'expected': [[0, 1]]
+      },
+      {
+          'tensor': [[1, 0]],
+          'padding': 0,
+          'expected': [[1]]
+      },
+      {
+          'tensor': [[1, 0, 1, 0, 0, 1, 0, 0]],
+          'padding': 0,
+          'expected': [[1, 0, 1, 0, 0, 1]]
+      },
+      {
+          'tensor': [[3, 7, 0, 0], [2, 0, 0, 0], [5, 0, 0, 0]],
+          'padding': 0,
+          'expected': [[3, 7], [2], [5]]
+      },
+      # 3D test cases, with padding
+      {
+          'tensor': [[[1]]],
+          'padding': [0],
+          'expected': [[[1]]]
+      },
+      {
+          'tensor': [[[0]]],
+          'padding': [0],
+          'expected': [[]]
+      },
+      {
+          'tensor': [[[0, 0], [1, 2]], [[3, 4], [0, 0]]],
+          'padding': [0, 0],
+          'expected': [[[0, 0], [1, 2]], [[3, 4]]]
+      },
+      # 4D test cases, with padding
+      {
+          'tensor': [
+              [[[1, 2], [3, 4]], [[0, 0], [0, 0]], [[0, 0], [0, 0]]],
+              [[[0, 0], [0, 0]], [[5, 6], [7, 8]], [[0, 0], [0, 0]]],
+              [[[0, 0], [0, 0]], [[0, 0], [0, 0]], [[0, 0], [0, 0]]]
+          ],
+          'padding': [[0, 0], [0, 0]],
+          'expected': [
+              [[[1, 2], [3, 4]]],
+              [[[0, 0], [0, 0]], [[5, 6], [7, 8]]],
+              []
+          ]
+      },
+      # 3D test cases, with ragged_rank=2.
+      {
+          'tensor': [[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+          'ragged_rank': 2,
+          'expected': [[[1, 0], [2, 3]], [[0, 0], [4, 0]]]
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'ragged_rank': 2,
+          'lengths': [2, 0, 2, 1],
+          'expected': [[[1, 2], []], [[5, 6], [7]]]
+      },
+      {
+          'tensor': [[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+          'ragged_rank': 2,
+          'padding': 0,
+          'expected': [[[1], [2, 3]], [[], [4]]]
+      },
+      # 4D test cases, with ragged_rank>1
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 2,
+          'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]]
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 3,
+          'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]]
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 2,
+          'padding': [0, 0],
+          'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8]]]]
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 3,
+          'padding': 0,
+          'expected': [[[[1], [2, 3]], [[], [4]]],
+                       [[[5, 6], [7]], [[0, 8], []]]]
+      },
+  )  # pyformat: disable
+  def testRaggedFromTensor(self,
+                           tensor,
+                           expected,
+                           lengths=None,
+                           padding=None,
+                           ragged_rank=1):
+    dt = constant_op.constant(tensor)
+    rt = ragged.from_tensor(dt, lengths, padding, ragged_rank)
+    self.assertEqual(type(rt), ragged.RaggedTensor)
+    self.assertEqual(rt.ragged_rank, ragged_rank)
+    self.assertTrue(
+        dt.shape.is_compatible_with(rt.shape),
+        '%s is incompatible with %s' % (dt.shape, rt.shape))
+    with self.test_session():
+      self.assertEqual(rt.eval().tolist(), expected)
+
+  def testHighDimensions(self):
+    # Use distinct prime numbers for all dimension shapes in this test, so
+    # we can see any errors that are caused by mixing up dimension sizes.
+    dt = array_ops.reshape(
+        math_ops.range(3 * 5 * 7 * 11 * 13 * 17), [3, 5, 7, 11, 13, 17])
+    for ragged_rank in range(1, 4):
+      rt = ragged.from_tensor(dt, ragged_rank=ragged_rank)
+      self.assertEqual(type(rt), ragged.RaggedTensor)
+      self.assertEqual(rt.ragged_rank, ragged_rank)
+      self.assertTrue(
+          dt.shape.is_compatible_with(rt.shape),
+          '%s is incompatible with %s' % (dt.shape, rt.shape))
+      with self.test_session():
+        self.assertEqual(rt.eval().tolist(), self.evaluate(dt).tolist())
+
+  @parameterized.parameters(
+      # With no padding or lengths
+      {
+          'dt_shape': [0, 0],
+          'expected': []
+      },
+      {
+          'dt_shape': [0, 3],
+          'expected': []
+      },
+      {
+          'dt_shape': [3, 0],
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [0, 2, 3],
+          'expected': []
+      },
+      {
+          'dt_shape': [2, 0, 3],
+          'expected': [[], []]
+      },
+      {
+          'dt_shape': [2, 3, 0],
+          'expected': [[[], [], []], [[], [], []]]
+      },
+      {
+          'dt_shape': [2, 3, 0, 1],
+          'expected': [[[], [], []], [[], [], []]]
+      },
+      {
+          'dt_shape': [2, 3, 1, 0],
+          'expected': [[[[]], [[]], [[]]], [[[]], [[]], [[]]]]
+      },
+      # With padding
+      {
+          'dt_shape': [0, 0],
+          'padding': 0,
+          'expected': []
+      },
+      {
+          'dt_shape': [0, 3],
+          'padding': 0,
+          'expected': []
+      },
+      {
+          'dt_shape': [3, 0],
+          'padding': 0,
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [0, 2, 3],
+          'padding': [0, 0, 0],
+          'expected': []
+      },
+      {
+          'dt_shape': [2, 0, 3],
+          'padding': [0, 0, 0],
+          'expected': [[], []]
+      },
+      {
+          'dt_shape': [2, 3, 0],
+          'padding': [],
+          'expected': [[], []]
+      },
+      # With lengths
+      {
+          'dt_shape': [0, 0],
+          'lengths': [],
+          'expected': []
+      },
+      {
+          'dt_shape': [0, 3],
+          'lengths': [],
+          'expected': []
+      },
+      {
+          'dt_shape': [3, 0],
+          'lengths': [0, 0, 0],
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [3, 0],
+          'lengths': [2, 3, 4],  # lengths > ncols: truncated to ncols
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [0, 2, 3],
+          'lengths': [],
+          'expected': []
+      },
+      {
+          'dt_shape': [2, 0, 3],
+          'lengths': [0, 0],
+          'expected': [[], []]
+      },
+      {
+          'dt_shape': [2, 3, 0],
+          'lengths': [0, 0],
+          'expected': [[], []]
+      },
+  )
+  def testEmpty(self, dt_shape, expected, lengths=None, padding=None):
+    dt = array_ops.zeros(dt_shape)
+    rt = ragged.from_tensor(dt, lengths, padding)
+    self.assertEqual(type(rt), ragged.RaggedTensor)
+    self.assertEqual(rt.ragged_rank, 1)
+    self.assertTrue(dt.shape.is_compatible_with(rt.shape))
+    with self.test_session():
+      self.assertEqual(rt.eval().tolist(), expected)
+
+  @parameterized.parameters(
+      {
+          'tensor': [[1]],
+          'lengths': [0],
+          'padding': 0,
+          'error': (ValueError, 'Specify lengths or padding, but not both')
+      },
+      {
+          'tensor': [[1]],
+          'lengths': [0.5],
+          'error': (TypeError, 'lengths must be an integer tensor')
+      },
+      {
+          'tensor': [[1]],
+          'padding': 'a',
+          'error': (TypeError, "Expected int32, got 'a'.*")
+      },
+      {
+          'tensor': [[1]],
+          'padding': [1],
+          'error': (ValueError, r'Shapes \(1,\) and \(\) are incompatible')
+      },
+      {
+          'tensor': [[[1]]],
+          'padding': 1,
+          'error': (ValueError, r'Shapes \(\) and \(1,\) are incompatible')
+      },
+      {
+          'tensor': [[1]],
+          'ragged_rank': 'bad',
+          'error': (TypeError, r'ragged_rank expected int, got \'bad\'')
+      },
+      {
+          'tensor': [[1]],
+          'ragged_rank': 0,
+          'error': (ValueError, r'ragged_rank must be greater than 0; got 0')
+      },
+      {
+          'tensor': [[1]],
+          'ragged_rank': -1,
+          'error': (ValueError, r'ragged_rank must be greater than 0; got -1')
+      },
+  )
+  def testErrors(self,
+                 tensor,
+                 lengths=None,
+                 padding=None,
+                 ragged_rank=1,
+                 error=None):
+    dt = constant_op.constant(tensor)
+    self.assertRaisesRegexp(error[0], error[1], ragged.from_tensor, dt, lengths,
+                            padding, ragged_rank)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b71d88435c91d1c130c1c24a033ebcf4a7959cb
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for ragged tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+def map_inner_values(op, *args, **kwargs):
+  """Applies `op` to the inner values of one or more RaggedTensors.
+
+  Replaces any `RaggedTensor` in `args` or `kwargs` with its `inner_values`
+  tensor, and then calls `op`.  Returns a `RaggedTensor` that is constructed
+  from the input `RaggedTensor`s' `splits` and the value returned by
+  the `op`.
+
+  If the input arguments contain multiple `RaggedTensor`s, then they must have
+  identical `splits`.
+
+  Examples:
+
+  ```python
+  >>> rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
+  >>> ragged.map_inner_values(tf.ones_like, rt).eval().tolist()
+  [[1, 1, 1], [], [1, 1], [1]]
+  >>> ragged.map_inner_values(tf.multiply, rt, rt).eval().tolist()
+  [[1, 4, 9], [], [16, 25], [36]]
+  >>> ragged.map_inner_values(tf.add, rt, 5).eval().tolist()
+  [[6, 7, 8], [], [9, 10], [11]]
+  ```
+
+  Args:
+    op: The operation that should be applied to the RaggedTensor `inner_values`.
+      `op` is typically an element-wise operation (such as math_ops.add), but
+      any operation that preserves the size of the outermost dimension can be
+      used.  I.e., `shape[0]` of the value returned by `op` must match
+      `shape[0]` of the `RaggedTensor`s' `inner_values` tensors.
+    *args: Arguments for `op`.
+    **kwargs: Keyword arguments for `op`.
+
+  Returns:
+    A `RaggedTensor` whose `ragged_rank` matches the `ragged_rank` of all
+    input `RaggedTensor`s.
+  Raises:
+    ValueError: If args contains no `RaggedTensors`, or if the `nested_splits`
+      of the input `RaggedTensor`s are not identical.
+  """
+  # Replace RaggedTensors with their values; and collect the splits tensors
+  # from each RaggedTensor.
+  nested_splits_lists = []
+  inner_args = _replace_ragged_with_inner_values(args, nested_splits_lists)
+  inner_kwargs = _replace_ragged_with_inner_values(kwargs, nested_splits_lists)
+  if not nested_splits_lists:
+    return op(*args, **kwargs)
+
+  with ops.control_dependencies(
+      ragged_util.assert_splits_match(nested_splits_lists)):
+    # Delegate to op, and then compose the result from the transformed values
+    # and the splits.
+    return ragged_factory_ops.from_nested_row_splits(
+        op(*inner_args, **inner_kwargs), nested_splits_lists[0])
+
+
+def _replace_ragged_with_inner_values(value, nested_splits_lists):
+  """Replace RaggedTensors with their inner_values, and record their splits.
+
+  Returns a copy of `value`, with any nested `RaggedTensor`s replaced by their
+  `inner_values` tensor.  Looks inside lists, tuples, and dicts.
+
+  Appends each `RaggedTensor`'s `nested_splits` to `nested_splits_lists`.
+
+  Args:
+    value: The value that should be transformed by replacing `RaggedTensors`.
+    nested_splits_lists: An output parameter used to record the `nested_splits`
+      for any `RaggedTensors` that were replaced.
+
+  Returns:
+    A copy of `value` with nested `RaggedTensors` replaced by their `values`.
+  """
+  # Base case
+  if ragged_tensor.is_ragged(value):
+    value = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(value)
+    nested_splits_lists.append(value.nested_row_splits)
+    return value.inner_values
+
+  # Recursion cases
+  def recurse(v):
+    return _replace_ragged_with_inner_values(v, nested_splits_lists)
+
+  if isinstance(value, list):
+    return [recurse(v) for v in value]
+  elif isinstance(value, tuple):
+    return tuple(recurse(v) for v in value)
+  elif isinstance(value, dict):
+    return dict((k, recurse(v)) for (k, v) in value.items())
+  else:
+    return value
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c52db9e2a1643fd85d593b0bd6a307ec810246ca
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -0,0 +1,232 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged.gather_nd."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
+                           parameterized.TestCase):
+
+  DOCSTRING_PARAMS = [[['000', '001'], ['010']],
+                      [['100'], ['110', '111', '112'], ['120']],
+                      [[], ['210']]]  # pyformat: disable
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Examples
+      #=========================================================================
+      dict(
+          descr='Docstring example 1',
+          params=ragged.constant_value(DOCSTRING_PARAMS),
+          indices=[[2], [0]],
+          expected=ragged.constant_value([[[], [b'210']],
+                                          [[b'000', b'001'], [b'010']]])),
+      dict(
+          descr='Docstring example 2',
+          params=ragged.constant_value(DOCSTRING_PARAMS),
+          indices=[[2, 1], [0, 0]],
+          expected=ragged.constant_value([[b'210'], [b'000', b'001']])),
+      dict(
+          descr='Docstring example 3',
+          params=ragged.constant_value(DOCSTRING_PARAMS),
+          indices=[[0, 0, 1], [1, 1, 2]],
+          expected=[b'001', b'112']),
+      #=========================================================================
+      # Indices with 0 values (selects the entire params)
+      #=========================================================================
+      dict(
+          descr='params: [B1, (B2)], indices: [0], result: [B1, (B2)]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=np.zeros([0], dtype=np.int32),
+          expected=ragged.constant_value([[b'a', b'b', b'c'], [b'd']])),
+      dict(
+          descr='params: [B1, (B2)], indices: [A1, 0], result: [A1, B1, (B2)]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=np.zeros([3, 0], dtype=np.int32),
+          expected=ragged.constant_value([[[b'a', b'b', b'c'], [b'd']],
+                                          [[b'a', b'b', b'c'], [b'd']],
+                                          [[b'a', b'b', b'c'], [b'd']]])),
+      dict(
+          descr=('params: [B1, (B2)], indices: [A1, A2, 0], '
+                 'result: [A1, A2, B1, (B2)]'),
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=np.zeros([1, 3, 0], dtype=np.int32),
+          expected=ragged.constant_value([[[[b'a', b'b', b'c'], [b'd']],
+                                           [[b'a', b'b', b'c'], [b'd']],
+                                           [[b'a', b'b', b'c'], [b'd']]]])),
+      dict(
+          descr='params: [B1], indices: [A1, (A2), 0], result: [A1, (A2), B1]',
+          params=['a'],
+          indices=ragged.constant_value([[[], []], [[]]],
+                                        ragged_rank=1,
+                                        dtype=np.int32),
+          expected=ragged.constant_value([[[b'a'], [b'a']], [[b'a']]],
+                                         ragged_rank=1)),
+      #=========================================================================
+      # Indices with 1 value (selects row from params)
+      #=========================================================================
+      dict(
+          descr='params: [B1, (B2)], indices: [A1, 1], result: [A1, (B2)]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=[[1], [0]],
+          expected=ragged.constant_value([[b'd'], [b'a', b'b', b'c']])),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, 1], '
+                 'result: [A1, (B2), (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[1], [1]],
+          expected=ragged.constant_value([[[b'e', b'f']], [[b'e', b'f']]])),
+      dict(
+          descr=('params: [B1, B2, B3], indices: [A1, (A2), 1], '
+                 'result: [A1, (A2), B2, B3]'),
+          params=[[['a']], [['b']]],
+          indices=ragged.constant_value([[[0]]], ragged_rank=1),
+          expected=ragged.constant_value([[[[b'a']]]], ragged_rank=1)),
+      #=========================================================================
+      # Indices with 2 values (selects row & col from params)
+      #=========================================================================
+      dict(
+          descr='params: [B1, (B2)], indices: [A1, 2], result: [A1]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=[[1, 0], [0, 0], [0, 2]],
+          expected=ragged.constant_value([b'd', b'a', b'c'])),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, 2], '
+                 'result: [A1, (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[1, 0], [0, 1], [0, 0]],
+          expected=ragged.constant_value([[b'e', b'f'], [b'd'],
+                                          [b'a', b'b', b'c']])),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, A2, 2], '
+                 'result: [A1, (A2), (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[[1, 0], [0, 1], [0, 0]]],
+          expected=ragged.constant_value([[[b'e', b'f'], [b'd'],
+                                           [b'a', b'b', b'c']]])),
+      dict(
+          descr=('params: [B1, (B2), B3], indices: [A1, A2, 2], '
+                 'result: [A1, A2, B3]'),
+          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
+                                        [['e', 'f']]],
+                                       ragged_rank=1),
+          indices=[[[1, 0], [0, 1], [0, 0]]],
+          expected=[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]),
+      dict(
+          descr=('params: [B1, (B2), B3], indices: [A1, A2, A3, 2], '
+                 'result: [A1, A2, A3, B3]'),
+          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
+                                        [['e', 'f']]],
+                                       ragged_rank=1),
+          indices=[[[[1, 0], [0, 1], [0, 0]]]],
+          expected=[[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]]),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, (A2), 2], '
+                 'result: [A1, (A2), (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=ragged.constant_value([[[1, 0], [0, 1]], [[0, 0]]],
+                                        ragged_rank=1),
+          expected=ragged.constant_value([[[b'e', b'f'], [b'd']],
+                                          [[b'a', b'b', b'c']]])),
+      #=========================================================================
+      # Indices with 3 values
+      #=========================================================================
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, 3], '
+                 'result: [A1]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[1, 0, 1], [0, 0, 0], [0, 1, 0]],
+          expected=[b'f', b'a', b'd']),
+      dict(
+          descr=('params: [B1, (B2), B3], indices: [A1, 3], '
+                 'result: [A1]'),
+          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
+                                        [['e', 'f']]],
+                                       ragged_rank=1),
+          indices=[[1, 0, 1], [0, 0, 0], [0, 1, 1]],
+          expected=[b'f', b'a', b'd']),
+      dict(
+          descr=('params: [B1, (B2), (B3), B4], indices: [A1, 3], '
+                 'result: [A1, B4]'),
+          params=ragged.constant_value([[[['a', 'b'], ['c', 'd']],
+                                         [['e', 'f']]]],
+                                       ragged_rank=2),
+          indices=[[0, 0, 1], [0, 0, 0], [0, 1, 0]],
+          expected=[[b'c', b'd'], [b'a', b'b'], [b'e', b'f']]),
+  ])  # pyformat: disable
+  def testRaggedGatherNd(self, descr, params, indices, expected):
+    result = ragged.gather_nd(params, indices)
+    self.assertEqual(
+        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    with self.test_session() as sess:
+      if hasattr(expected, 'tolist'):
+        expected = expected.tolist()
+      self.assertEqual(self.evaluate(result).tolist(), expected)
+
+  def testRaggedGatherNdUnknownRankError(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd']])
+    indices1 = array_ops.placeholder(dtypes.int32, shape=None)
+    indices2 = array_ops.placeholder(dtypes.int32, shape=[None])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'indices.rank be statically known.'):
+      ragged.gather_nd(params, indices1)
+    with self.assertRaisesRegexp(
+        ValueError, r'indices.shape\[-1\] must be statically known.'):
+      ragged.gather_nd(params, indices2)
+
+  @parameterized.parameters([
+      dict(
+          params=['a'],
+          indices=0,
+          message='Shape must be at least rank 1 but is rank 0'
+          " for 'GatherNd'"),
+      dict(
+          params=ragged.constant_value([['a']]),
+          indices=0,
+          message='indices.rank must be at least 1.'),
+      dict(
+          params=['a', 'b', 'c'],
+          indices=ragged.constant([[0]]),
+          message='The innermost dimension of indices may not be ragged'),
+  ])
+  def testRaggedGatherNdStaticError(self,
+                                    params,
+                                    indices,
+                                    message,
+                                    error=ValueError):
+    with self.assertRaisesRegexp(error, message):
+      ragged.gather_nd(params, indices)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb52d05c32ea2f4a47ade9cc84ae3415789e3b8b
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -0,0 +1,144 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.gather."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorOpsTest(test_util.TensorFlowTestCase):
+
+  def testDocStringExamples(self):
+    params = constant_op.constant(['a', 'b', 'c', 'd', 'e'])
+    indices = constant_op.constant([3, 1, 2, 1, 0])
+    ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, ragged_indices).eval().tolist(),
+          [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
+      self.assertEqual(
+          ragged.gather(ragged_params, indices).eval().tolist(),
+          [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
+      self.assertEqual(
+          ragged.gather(ragged_params, ragged_indices).eval().tolist(),
+          [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
+
+  def testTensorParamsAndTensorIndices(self):
+    params = ['a', 'b', 'c', 'd', 'e']
+    indices = [2, 0, 2, 1]
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(),
+          [b'c', b'a', b'c', b'b'])
+      self.assertEqual(type(ragged.gather(params, indices)), ops.Tensor)
+
+  def testRaggedParamsAndTensorIndices(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    indices = [2, 0, 2, 1]
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(),
+          [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
+
+  def testTensorParamsAndRaggedIndices(self):
+    params = ['a', 'b', 'c', 'd', 'e']
+    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(),
+          [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
+
+  def testRaggedParamsAndRaggedIndices(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(),
+          [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
+           [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
+           [[]]]                                        #  [p[3]            ]]
+      )  # pyformat: disable
+
+  def testRaggedParamsAndScalarIndices(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    indices = 1
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(), [b'c', b'd', b'e'])
+
+  def test3DRaggedParamsAnd2DTensorIndices(self):
+    params = ragged.constant([[['a', 'b'], []], [['c', 'd'], ['e'], ['f']],
+                              [['g']]])
+    indices = [[1, 2], [0, 1], [2, 2]]
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(),
+          [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
+           [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
+           [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
+      )  # pyformat: disable
+
+  def testTensorParamsAnd4DRaggedIndices(self):
+    indices = ragged.constant(
+        [[[[3, 4], [0, 6]], []], [[[2, 1], [1, 0]], [[2, 5]], [[2, 3]]],
+         [[[1, 0]]]],  # pyformat: disable
+        ragged_rank=2,
+        inner_shape=(2,))
+    params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
+    with self.test_session():
+      self.assertEqual(
+          ragged.gather(params, indices).eval().tolist(),
+          [[[[b'd', b'e'], [b'a', b'g']], []],
+           [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
+           [[[b'b', b'a']]]])  # pyformat: disable
+
+  def testOutOfBoundsError(self):
+    tensor_params = ['a', 'b', 'c']
+    tensor_indices = [0, 1, 2]
+    ragged_params = ragged.constant([['a', 'b'], ['c']])
+    ragged_indices = ragged.constant([[0, 3]])
+    with self.test_session():
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'indices\[1\] = 3 is not in \[0, 3\)',
+                              ragged.gather(tensor_params, ragged_indices).eval)
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'indices\[2\] = 2 is not in \[0, 2\)',
+                              ragged.gather(ragged_params, tensor_indices).eval)
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'indices\[1\] = 3 is not in \[0, 2\)',
+                              ragged.gather(ragged_params, ragged_indices).eval)
+
+  def testUnknownIndicesRankError(self):
+    params = ragged.constant([], ragged_rank=1)
+    indices = constant_op.constant([0], dtype=dtypes.int64)
+    indices = array_ops.placeholder_with_default(indices, None)
+    self.assertRaisesRegexp(ValueError,
+                            r'indices\.shape\.ndims must be known statically',
+                            ragged.gather, params, indices)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
new file mode 100644
index 0000000000000000000000000000000000000000..9821695046c577627298c413fcfc7716b71f8019
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -0,0 +1,388 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python-style indexing and slicing for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def ragged_tensor_getitem(self, key):
+  """Returns the specified piece of this RaggedTensor.
+
+  Supports multidimensional indexing and slicing, with one restriction:
+  indexing into a ragged inner dimension is not allowed.  This case is
+  problematic because the indicated value may exist in some rows but not
+  others.  In such cases, it's not obvious whether we should (1) report an
+  IndexError; (2) use a default value; or (3) skip that value and return a
+  tensor with fewer rows than we started with.  Following the guiding
+  principles of Python ("In the face of ambiguity, refuse the temptation to
+  guess" <go/pep20>), we simply disallow this operation.
+
+  Any dimensions added by `array_ops.newaxis` will be ragged if the following
+  dimension is ragged.
+
+  Args:
+    self: The RaggedTensor to slice.
+    key: Indicates which piece of the RaggedTensor to return, using standard
+      Python semantics (e.g., negative values index from the end).  `key`
+      may have any of the following types:
+
+      * `int` constant
+      * Scalar integer `Tensor`
+      * `slice` containing integer constants and/or scalar integer
+        `Tensor`s
+      * `Ellipsis`
+      * `tf.newaxis`
+      * `tuple` containing any of the above (for multidimentional indexing)
+
+  Returns:
+    A `Tensor` or `RaggedTensor` object.  Values that include at least one
+    ragged dimension are returned as `RaggedTensor`.  Values that include no
+    ragged dimensions are returned as `Tensor`.  See above for examples of
+    expressions that return `Tensor`s vs `RaggedTensor`s.
+
+  Raises:
+    ValueError: If `key` is out of bounds.
+    ValueError: If `key` is not supported.
+    TypeError: If the indices in `key` have an unsupported type.
+
+  Examples:
+
+    ```python
+    >>> # A 2-D ragged tensor with 1 ragged dimension.
+    >>> rt = ragged.constant([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g']])
+    >>> rt[0].eval().tolist()       # First row (1-D `Tensor`)
+    ['a', 'b', 'c']
+    >>> rt[:3].eval().tolist()      # First three rows (2-D RaggedTensor)
+    [['a', 'b', 'c'], ['d', 'e'], '[f'], [g']]
+    >>> rt[3, 0].eval().tolist()    # 1st element of 4th row (scalar)
+    'g'
+
+    >>> # A 3-D ragged tensor with 2 ragged dimensions.
+    >>> rt = ragged.constant([[[1, 2, 3], [4]],
+    ...                    [[5], [], [6]],
+    ...                    [[7]],
+    ...                    [[8, 9], [10]]])
+    >>> rt[1].eval().tolist()       # Second row (2-D RaggedTensor)
+    [[5], [], [6]]
+    >>> rt[3, 0].eval().tolist()    # First element of fourth row (1-D Tensor)
+    [8, 9]
+    >>> rt[:, 1:3].eval().tolist()  # Items 1-3 of each row (3-D RaggedTensor)
+    [[[4]], [[], [6]], [], [[10]]]
+    >>> rt[:, -1:].eval().tolist()  # Last item of each row (3-D RaggedTensor)
+    [[[4]], [[6]], [[7]], [[10]]]
+    ```
+  """
+  scope_tensors = [self] + list(_tensors_in_key_list(key))
+  if isinstance(key, (list, tuple)):
+    key = list(key)
+  else:
+    key = [key]
+  with ops.name_scope(None, "RaggedGetItem", scope_tensors):
+    return _ragged_getitem(self, key)
+
+
+def _ragged_getitem(rt_input, key_list):
+  """Helper for indexing and slicing ragged tensors with __getitem__().
+
+  Extracts the specified piece of the `rt_input`.  See
+  `RaggedTensor.__getitem__` for examples and restrictions.
+
+  Args:
+    rt_input: The `RaggedTensor` from which a piece should be returned.
+    key_list: The list of keys specifying which piece to return. Each key
+      corresponds with a separate dimension.
+
+  Returns:
+    The indicated piece of rt_input.
+
+  Raises:
+    ValueError: If `key_list` is not supported.
+    TypeError: If any keys in `key_list` have an unsupported type.
+  """
+  if not key_list:
+    return rt_input
+  row_key = key_list[0]
+  inner_keys = key_list[1:]
+
+  if row_key is Ellipsis:
+    expanded_key_list = _expand_ellipsis(key_list, rt_input.shape.ndims)
+    return _ragged_getitem(rt_input, expanded_key_list)
+
+  # Adding a new axis: Get rt_input[inner_keys], and wrap it in a RaggedTensor
+  # that puts all values in a single row.
+  if row_key is array_ops.newaxis:
+    inner_rt = _ragged_getitem(rt_input, inner_keys)
+    nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
+    return ragged_factory_ops.from_row_splits(inner_rt,
+                                              array_ops.stack([0, nsplits - 1]))
+
+  # Slicing a range of rows: first slice the outer dimension, and then
+  # call `_ragged_getitem_inner_dimensions` to handle the inner keys.
+  if isinstance(row_key, slice):
+    sliced_rt_input = _slice_ragged_row_dimension(rt_input, row_key)
+    return _ragged_getitem_inner_dimensions(sliced_rt_input, inner_keys)
+
+  # Indexing a single row: slice values to get the indicated row, and then
+  # use a recursive call to __getitem__ to handle the inner keys.
+  else:
+    starts = rt_input.row_splits[:-1]
+    limits = rt_input.row_splits[1:]
+    row = rt_input.values[starts[row_key]:limits[row_key]]
+    return row.__getitem__(inner_keys)
+
+
+def _slice_ragged_row_dimension(rt_input, row_key):
+  """Slice the outer dimension of `rt_input` according to the given `slice`.
+
+  Args:
+    rt_input: The `RaggedTensor` to slice.
+    row_key: The `slice` object that should be used to slice `rt_input`.
+
+  Returns:
+    A `RaggedTensor` containing the indicated slice of `rt_input`.
+  """
+  if row_key.start is None and row_key.stop is None and row_key.step is None:
+    return rt_input
+
+  # Use row_key to slice the starts & limits.
+  new_starts = rt_input.row_splits[:-1][row_key]
+  new_limits = rt_input.row_splits[1:][row_key]
+  zero_pad = array_ops.zeros([1], dtypes.int64)
+
+  # If there's no slice step, then we can just select a single continuous
+  # span of `ragged.values(rt_input)`.
+  if row_key.step is None or row_key.step == 1:
+    # Construct the new splits.  If new_starts and new_limits are empty,
+    # then this reduces to [0].  Otherwise, this reduces to:
+    #   concat([[new_starts[0]], new_limits])
+    new_splits = array_ops.concat(
+        [zero_pad[array_ops.size(new_starts):], new_starts[:1], new_limits],
+        axis=0)
+    values_start = new_splits[0]
+    values_limit = new_splits[-1]
+    return ragged_factory_ops.from_row_splits(
+        rt_input.values[values_start:values_limit], new_splits - values_start)
+
+  # If there is a slice step (aka a strided slice), then use ragged_gather to
+  # collect the necessary elements of `ragged.values(rt_input)`.
+  else:
+    return _build_ragged_tensor_from_value_ranges(new_starts, new_limits, 1,
+                                                  rt_input.values)
+
+
+def _ragged_getitem_inner_dimensions(rt_input, key_list):
+  """Retrieve inner dimensions, keeping outermost dimension unchanged.
+
+  Args:
+    rt_input: The `RaggedTensor` or `Tensor` from which a piece should be
+      extracted.
+    key_list: The __getitem__ keys for slicing the inner dimensions.
+
+  Returns:
+    A `RaggedTensor`.
+
+  Raises:
+    ValueError: If key_list is not supported.
+  """
+  if not key_list:
+    return rt_input
+
+  if isinstance(rt_input, ops.Tensor):
+    return rt_input.__getitem__([slice(None, None, None)] + key_list)
+
+  column_key = key_list[0]
+  if column_key is Ellipsis:
+    expanded_key_list = _expand_ellipsis(key_list, rt_input.values.shape.ndims)
+    return _ragged_getitem_inner_dimensions(rt_input, expanded_key_list)
+
+  # Adding a new axis to a ragged inner dimension: recursively get the inner
+  # dimensions of rt_input with key_list[1:], and then wrap the result in a
+  # RaggedTensor that puts each value in its own row.
+  if column_key is array_ops.newaxis:
+    inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:])
+    nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
+    return ragged_factory_ops.from_row_splits(inner_rt, math_ops.range(nsplits))
+
+  # Slicing a range of columns in a ragged inner dimension.  We use a
+  # recursive call to process the values, and then assemble a RaggedTensor
+  # with those values.
+  if isinstance(column_key, slice):
+    if (column_key.start is None and column_key.stop is None and
+        column_key.step is None):
+      # Trivial slice: recursively process all values, & splits is unchanged.
+      return rt_input.with_values(
+          _ragged_getitem_inner_dimensions(rt_input.values, key_list[1:]))
+    else:
+      # Nontrivial slice: use ragged_gather to extract the indicated slice as
+      # a new RaggedTensor (inner_rt), and then recursively process its values.
+      # The splits can be taken from ragged.row_splits(inner_rt).
+      inner_rt_starts = rt_input.row_splits[:-1]
+      inner_rt_limits = rt_input.row_splits[1:]
+      if column_key.start is not None and column_key.start != 0:
+        inner_rt_starts = _add_offset_to_ranges(
+            column_key.start, rt_input.row_splits[:-1], rt_input.row_splits[1:])
+      if column_key.stop is not None and column_key.stop != 0:
+        inner_rt_limits = _add_offset_to_ranges(
+            column_key.stop, rt_input.row_splits[:-1], rt_input.row_splits[1:])
+      inner_rt = _build_ragged_tensor_from_value_ranges(
+          inner_rt_starts, inner_rt_limits, column_key.step, rt_input.values)
+      return inner_rt.with_values(
+          _ragged_getitem_inner_dimensions(inner_rt.values, key_list[1:]))
+
+  # Indexing a single column in a ragged inner dimension: raise an Exception.
+  # See RaggedTensor.__getitem__.__doc__ for an explanation of why indexing
+  # into a ragged inner dimension is problematic.
+  else:
+    raise ValueError("Cannot index into an inner ragged dimension.")
+
+
+def _expand_ellipsis(key_list, num_remaining_dims):
+  """Expands the ellipsis at the start of `key_list`.
+
+  Assumes that the first element of `key_list` is Ellipsis.  This will either
+  remove the Ellipsis (if it corresponds to zero indices) or prepend a new
+  `slice(None, None, None)` (if it corresponds to more than zero indices).
+
+  Args:
+    key_list: The arguments to `__getitem__()`.
+    num_remaining_dims: The number of dimensions remaining.
+
+  Returns:
+    A copy of `key_list` with he ellipsis expanded.
+  Raises:
+    ValueError: If ragged_rank.shape.ndims is None
+    IndexError: If there are too many elements in `key_list`.
+  """
+  if num_remaining_dims is None:
+    raise ValueError("Ellipsis not supported for unknown shape RaggedTensors")
+  num_indices = sum(1 for idx in key_list if idx is not array_ops.newaxis)
+  if num_indices > num_remaining_dims + 1:
+    raise IndexError("Too many indices for RaggedTensor")
+  elif num_indices == num_remaining_dims + 1:
+    return key_list[1:]
+  else:
+    return [slice(None, None, None)] + key_list
+
+
+def _tensors_in_key_list(key_list):
+  """Generates all Tensors in the given slice spec."""
+  if isinstance(key_list, ops.Tensor):
+    yield key_list
+  if isinstance(key_list, (list, tuple)):
+    for v in key_list:
+      for tensor in _tensors_in_key_list(v):
+        yield tensor
+  if isinstance(key_list, slice):
+    for tensor in _tensors_in_key_list(key_list.start):
+      yield tensor
+    for tensor in _tensors_in_key_list(key_list.stop):
+      yield tensor
+    for tensor in _tensors_in_key_list(key_list.step):
+      yield tensor
+
+
+def _build_ragged_tensor_from_value_ranges(starts, limits, step, values):
+  """Returns a `RaggedTensor` containing the specified sequences of values.
+
+  Returns a RaggedTensor `output` where:
+
+  ```python
+  output.shape[0] = starts.shape[0]
+  output[i] = values[starts[i]:limits[i]:step]
+  ```
+
+  Requires that `starts.shape == limits.shape` and
+  `0 <= starts[i] <= limits[i] <= values.shape[0]`.
+
+  Args:
+    starts: 1D integer Tensor specifying the start indices for the sequences of
+      values to include.
+    limits: 1D integer Tensor specifying the limit indices for the sequences of
+      values to include.
+    step: Integer value specifying the step size for strided slices.
+    values: The set of values to select from.
+
+  Returns:
+    A `RaggedTensor`.
+
+  Raises:
+    ValueError: Until the prerequisite ops are checked in.
+  """
+  # Use `ragged_range` to get the index of each value we should include.
+  if step is None:
+    step = 1
+  step = ops.convert_to_tensor(step, name="step")
+  if step.dtype.is_integer:
+    step = math_ops.cast(step, dtypes.int64)
+  else:
+    raise TypeError("slice strides must be integers or None")
+  value_indices = ragged_math_ops.range(starts, limits, step)
+
+  # Use `ragged_gather` or `array_ops.gather` to collect the values.
+  if isinstance(values, ragged_tensor.RaggedTensor):
+    gathered_values = ragged_array_ops.gather(
+        params=values, indices=value_indices.values)
+  else:
+    gathered_values = array_ops.gather(
+        params=values, indices=value_indices.values)
+
+  # Assemble the RaggedTensor from splits & values.
+  return value_indices.with_values(gathered_values)
+
+
+def _add_offset_to_ranges(offset, starts, limits):
+  """Adds an indexing offset to each of the specified ranges.
+
+  If offset>=0, then return output[i]=min(starts[i]+offset, limits[i])
+  If offset<0, then return output[i]=max(limits[i]+offset, starts[i])
+
+  Args:
+    offset: The offset to add.  None, or an int, or a scalar Tensor.
+    starts: 1-D int64 tensor containing start indices.
+    limits: 1-D int64 tensor containing limit indices.
+
+  Returns:
+    A 1-D int64 tensor.
+  """
+
+  def map_positive_offset(offset):
+    return math_ops.minimum(starts + offset, limits)
+
+  def map_negative_offset(offset):
+    return math_ops.maximum(limits + offset, starts)
+
+  if isinstance(offset, ops.Tensor):
+    offset = math_ops.cast(offset, dtypes.int64)
+    return control_flow_ops.cond(offset >= 0,
+                                 lambda: map_positive_offset(offset),
+                                 lambda: map_negative_offset(offset))
+  elif isinstance(offset, int):
+    return (map_positive_offset(offset)
+            if offset > 0 else map_negative_offset(offset))
+
+  else:
+    raise TypeError("slice offsets must be integers or None")
diff --git a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dac86310b9f29b9b3075875bb8816f9700924fe6
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
@@ -0,0 +1,283 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.map_fn."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops as mo
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import googletest
+
+
+class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+  @parameterized.parameters([
+      # The following test sets map over a RaggedTensor and apply a
+      # transformation that returns with shape:
+      # [d1, (d2)] -> [d1]
+      dict(
+          fn=mo.reduce_mean,
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[2, 4, 6],
+      ),
+      dict(
+          fn=string_ops.reduce_join,
+          elems=[['foo', 'bar', 'baz'], ['a'], ['b', 'c']],
+          expected_output=[b'foobarbaz', b'a', b'bc'],
+          dtype=dtypes.string,
+      ),
+      # [d1, (d2)] -> [d1, 2]
+      dict(
+          fn=lambda x: array_ops.stack([mo.reduce_mean(x), mo.reduce_sum(x)]),
+          # fn=self.stack_mean_and_sum,
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[[2, 6], [4.5, 9], [6.5, 13]],
+          dtype=dtypes.float32,
+      ),
+      # [d1, (d2)] -> [d1, (d2)]
+      dict(
+          fn=lambda x: x+1,
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[[2, 3, 4], [5, 6], [7, 8]],
+          dtype=dtypes.int64,
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=1),
+      ),
+      # [d1, (d2), d3] -> [d1, (d2), d3]
+      dict(
+          fn=lambda x: x+1,
+          elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          elems_ragged_rank=1,
+          expected_ragged_rank=1,
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=1),
+          expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
+      ),
+      # [d1, (d2)] -> [d1, (d2), (d3)]
+      dict(
+          fn=lambda x: ragged.from_row_starts(x, [0]),
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=2),
+      ),
+      # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
+      dict(
+          fn=lambda x: ragged.map_inner_values(mo.add, x, 1),
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=2),
+      ),
+      # [d1, (d2), (d3)] -> [d1, (d2)]
+      dict(
+          fn=lambda x: ragged.reduce_sum(x, axis=1),
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[[6], [9, 13]],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=1),
+      ),
+      # [d1, (d2), (d3)] -> [d1, (d3)]
+      dict(
+          fn=lambda x: ragged.reduce_sum(x, axis=0),
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[[1, 2, 3], [10, 12]],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=1),
+      ),
+      # [d1, (d2), (d3)] -> [d1]
+      dict(
+          fn=ragged.reduce_sum,
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[6, 22],
+          result_dtype=dtypes.int64,
+      ),
+      # [d1] -> [d1, (d2)]
+      dict(
+          fn=mo.range,
+          elems=[4, 0, 2],
+          expected_output=[[0, 1, 2, 3], [], [0, 1]],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=1),
+      ),
+      # [d1] -> [d1, (d2), (d3)]
+      dict(
+          fn=lambda x: ragged.range(mo.range(x)),
+          elems=[5, 0, 3],
+          expected_output=[
+              [[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [], [[], [0], [0, 1]]
+          ],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=2),
+      ),
+      # [d1, (d2), (d3), (d4a), (d5)] ->  [d1, (d2), (d3), (d4b), (d5)]
+      dict(
+          fn=lambda x: ragged.add(x, 1),
+          elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
+          expected_output=[[[[[2, 3, 4]], [[5], [6]]]],
+                           [[[[7, 8]]], [[[9], []]]]],
+          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
+                                               ragged_rank=4),
+      ),
+  ])
+
+  def testRaggedMap(
+      self,
+      fn,
+      elems,
+      expected_output,
+      expected_ragged_rank=None,
+      result_ragged_rank=None,
+      elems_ragged_rank=None,
+      dtype=dtypes.int64,
+      result_dtype=None,
+      infer_shape=False,
+  ):
+    elems = ragged.constant(elems, dtype, elems_ragged_rank)
+    output = ragged.map_fn(
+        fn=fn, elems=elems, dtype=result_dtype, infer_shape=infer_shape)
+
+    expected_rt = ragged.constant(
+        expected_output, ragged_rank=expected_ragged_rank)
+    with self.test_session():
+      if ragged.is_ragged(expected_output):
+        self.assertEqual(output.ragged_rank, expected_rt.ragged_rank)
+      output_values = self.evaluate(output)
+      self.assertAllEqual(expected_output, output_values.tolist())
+
+  def testRaggedMapOnStructure(self):
+    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    # [[10, 20, 30], [40], [50, 60, 70]]
+    robin = ragged.map_inner_values(mo.multiply, batman, 10)
+
+    features = {'batman': batman, 'robin': robin}
+
+    def _reduce_sum_from_all(f):
+      return mo.reduce_sum(f['batman']) + mo.reduce_sum(f['robin'])
+
+    output = ragged.map_fn(
+        fn=_reduce_sum_from_all,
+        elems=features,
+        dtype=dtypes.int32,
+    )
+
+    with self.test_session():
+      self.assertAllEqual(output.eval().tolist(), [66, 44, 198])
+
+  # Test mapping over a dict of RTs can produce a dict of RTs.
+  def testRaggedMapOnStructure_RaggedOutputs(self):
+    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    # [[10, 20, 30], [40], [50, 60, 70]]
+    robin = ragged.map_inner_values(mo.multiply, batman, 10)
+
+    features = {'batman': batman, 'robin': robin}
+
+    def _increment(f):
+      return {
+          'batman': ragged.add(f['batman'], 1),
+          'robin': ragged.add(f['robin'], 1),
+      }
+
+    output = ragged.map_fn(
+        fn=_increment,
+        elems=features,
+        infer_shape=False,
+        dtype={
+            'batman':
+                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1),
+            'robin':
+                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1)
+        },
+    )
+
+    with self.test_session():
+      self.assertAllEqual(output['batman'].eval().tolist(),
+                          [[2, 3, 4], [5], [6, 7, 8]])
+      self.assertAllEqual(output['robin'].eval().tolist(),
+                          [[11, 21, 31], [41], [51, 61, 71]])
+
+  def testZip(self):
+    x = ragged.constant([[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]],
+                        dtypes.int64)
+    y = array_ops.expand_dims(
+        mo.range(ragged.nrows(x), dtype=dtypes.int64), axis=1)
+
+    def _zip(foo):
+      y_val, x_val = foo
+      bar = backend.tile(y_val, array_ops.shape(x_val))
+      return array_ops.stack([bar, x_val], axis=1)
+
+    output = ragged.map_fn(
+        _zip, (y, x),
+        dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
+        infer_shape=False)
+
+    with self.test_session():
+      result = self.evaluate(output).tolist()
+      self.assertAllEqual(
+          result, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
+                   [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
+
+  def testBatchGather(self):
+    tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
+                              ['bonjour', '.', 'ca va', '?']])
+    indices = ragged.constant([[0, 2], [0], [0, 2]])
+
+    def gather(x):
+      tokens_val, indices_val = x
+      return array_ops.gather(tokens_val, indices_val)
+
+    data = tokens, indices
+    out = ragged.map_fn(
+        gather,
+        data,
+        dtype=ragged.RaggedTensorType(dtype=dtypes.string, ragged_rank=1),
+        infer_shape=False)
+
+    with self.test_session():
+      self.assertAllEqual(
+          self.evaluate(out).tolist(),
+          [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
+
+  def testMismatchRaggedRank(self):
+    elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
+    fn = lambda x: ragged.reduce_sum(x, axis=0)
+    with self.assertRaisesWithLiteralMatch(
+        ValueError, r'The declared ragged rank (23) mismatches the result (1)'):
+      _ = ragged.map_fn(
+          fn,
+          elems,
+          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=23))
+
+  def testMismatchRaggedRank2(self):
+    elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
+    fn = lambda x: ragged.from_row_starts(x, [0])
+    with self.assertRaisesWithLiteralMatch(
+        ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
+      _ = ragged.map_fn(
+          fn,
+          elems,
+          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=10))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py b/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..798d7c3ce81e77d7134752757387d8da27fed411
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py
@@ -0,0 +1,210 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.map_inner_values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
+                                 parameterized.TestCase):
+
+  def assertRaggedMapInnerValuesReturns(self,
+                                        op,
+                                        expected,
+                                        args=(),
+                                        kwargs=None):
+    kwargs = kwargs or {}
+    result = ragged.map_inner_values(op, *args, **kwargs)
+    with self.test_session():
+      self.assertEqual(result.eval().tolist(), expected)
+
+  def testDocStringExamples(self):
+    """Test the examples in apply_op_to_ragged_values.__doc__."""
+    rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
+    v1 = ragged.map_inner_values(array_ops.ones_like, rt)
+    v2 = ragged.map_inner_values(math_ops.multiply, rt, rt)
+    v3 = ragged.map_inner_values(math_ops.add, rt, 5)
+    with self.test_session():
+      self.assertEqual(v1.eval().tolist(), [[1, 1, 1], [], [1, 1], [1]])
+      self.assertEqual(v2.eval().tolist(), [[1, 4, 9], [], [16, 25], [36]])
+      self.assertEqual(v3.eval().tolist(), [[6, 7, 8], [], [9, 10], [11]])
+
+  def testOpWithSingleRaggedTensorArg(self):
+    tensor = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=array_ops.zeros_like,
+        args=(tensor,),
+        expected=[[0, 0, 0], [], [0, 0]])
+
+  def testOpWithTwoRaggedTensorArgs(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply, args=(x, y), expected=[[3, 2, 12], [], [4, 25]])
+
+  def testOpWithRaggedTensorAndScalarArgs(self):
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply, args=(5, y), expected=[[5, 10, 15], [], [20, 25]])
+
+  def testOpWithThreeRaggedTensorArgs(self):
+    condition = ragged.constant(
+        [[True, True, False], [], [True, False]])  # pyformat: disable
+    x = ragged.constant([['a', 'b', 'c'], [], ['d', 'e']])
+    y = ragged.constant([['A', 'B', 'C'], [], ['D', 'E']])
+    self.assertRaggedMapInnerValuesReturns(
+        op=array_ops.where,
+        args=(condition, x, y),
+        expected=[[b'a', b'b', b'C'], [], [b'd', b'E']])
+
+  def testOpWithRaggedTensorListArg(self):
+    x = ragged.constant([[1, 2, 3], [], [4, 5]])
+    y = ragged.constant([[10, 20, 30], [], [40, 50]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.add_n,
+        args=([x, y, x],),
+        expected=[[12, 24, 36], [], [48, 60]])
+
+  def testOpWithKeywordArgs(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        kwargs=dict(x=x, y=y),
+        expected=[[3, 2, 12], [], [4, 25]])
+
+  def testOpWithMixedPositionalAndKeywordArgs(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x,),
+        kwargs=dict(y=y),
+        expected=[[3, 2, 12], [], [4, 25]])
+
+  def testNonElementWiseOp(self):
+    x = ragged.constant(
+        [[[3, 1, 4], [1, 5, 9], [2, 6, 5]], [], [[3, 5, 8], [9, 7, 9]]],
+        ragged_rank=1)
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.reduce_sum,
+        kwargs={
+            'input_tensor': x,
+            'axis': 1,
+        },
+        expected=[[8, 15, 13], [], [16, 25]])
+
+  def testOpWithRaggedRankGreaterThanOne(self):
+    # ragged_rank=0
+    x0 = [3, 1, 4, 1, 5, 9, 2, 6, 5]
+    y0 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    with self.test_session():
+      self.assertEqual(
+          math_ops.multiply(x0, y0).eval().tolist(),
+          [3, 2, 12, 4, 25, 54, 14, 48, 45])
+
+    # ragged_rank=1
+    x1 = ragged.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
+    y1 = ragged.constant([[1, 2, 3], [], [4, 5], [6, 7], [8, 9]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x1, y1),
+        expected=[[3, 2, 12], [], [4, 25], [54, 14], [48, 45]])
+
+    # ragged_rank=2
+    x2 = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]], [[9, 2], [6, 5]]])
+    y2 = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]], [[6, 7], [8, 9]]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x2, y2),
+        expected=[[[3, 2, 12]],          # row 0
+                  [],                    # row 1
+                  [[], [4, 25]],         # row 2
+                  [[54, 14], [48, 45]]   # row 3
+                 ])  # pyformat: disable
+
+    # ragged_rank=3
+    x3 = ragged.constant([[[[3, 1, 4]], []], [], [[[], [1, 5]]],
+                          [[[9, 2], [6, 5]]]])
+    y3 = ragged.constant([[[[1, 2, 3]], []], [], [[[], [4, 5]]],
+                          [[[6, 7], [8, 9]]]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x3, y3),
+        expected=[
+            [[[3, 2, 12]], []],       # row 0
+            [],                       # row 1
+            [[[], [4, 25]]],          # row 2
+            [[[54, 14], [48, 45]]]    # row 3
+        ])  # pyformat: disable
+
+  def testOpWithRaggedRankThree(self):
+    x = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]]])
+    y = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x, y),
+        expected=[[[3, 2, 12]], [], [[], [4, 25]]])
+
+  def testOpWithInnerValuesOnly(self):
+    x = constant_op.constant([[1, 2], [3, 4], [5, 6]])
+    y = constant_op.constant(2)
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply, args=(x, y), expected=[[2, 4], [6, 8], [10, 12]])
+
+  def testRaggedTensorSplitsRaggedRankMismatchError(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[[3, 1, 4], []], [], [[1, 5]]])
+    self.assertRaisesRegexp(ValueError,
+                            r'Inputs must have identical ragged splits.*',
+                            ragged.map_inner_values, math_ops.add, x, y)
+
+  def testRaggedTensorSplitsValueMismatchError(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1], [2, 3], [4, 5]])
+    self.assertRaisesRegexp(errors.InvalidArgumentError,
+                            r'Inputs must have identical ragged splits.*',
+                            ragged.map_inner_values, math_ops.add, x, y)
+
+  def testRaggedTensorSplitsMismatchErrorAtRuntime(self):
+    splits1 = array_ops.placeholder_with_default(
+        constant_op.constant([0, 3, 3, 5], dtypes.int64), None)
+    splits2 = array_ops.placeholder_with_default(
+        constant_op.constant([0, 1, 3, 5], dtypes.int64), None)
+    x = ragged.from_row_splits([3, 1, 4, 1, 5], splits1)
+    y = ragged.from_row_splits([1, 2, 3, 4, 5], splits2)
+    result = ragged.map_inner_values(math_ops.add, x, y)
+    with self.test_session():
+      self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[Inputs must have identical ragged splits\] '
+          r'\[Condition x == y did not hold element-wise:\].*', result.eval)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fafa23b8dcbbf128723c1b8e51611a958087fdeb
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -0,0 +1,446 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional operations.
+
+See the [Higher Order
+Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+
+def map_fn(fn,
+           elems,
+           dtype=None,
+           parallel_iterations=None,
+           back_prop=True,
+           swap_memory=False,
+           infer_shape=True,
+           name=None):
+  """map on the list of tensors unpacked from `elems` on dimension 0.
+
+  The simplest version of `map_fn` repeatedly applies the callable `fn` to a
+  sequence of elements from first to last. The elements are made of the
+  tensors unpacked from `elems`. `dtype` is the data type of the return
+  value of `fn`. Users must provide `dtype` if it is different from
+  the data type of `elems`.
+
+  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
+  of the result tensor is `[values.shape[0]] + fn(values[0]).shape`.
+
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
+  Furthermore, `fn` may emit a different structure than its input.  For example,
+  `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`.  In this case,
+  the `dtype` parameter is not optional: `dtype` must be a type or (possibly
+  nested) tuple of types matching the output of `fn`.
+
+  To apply a functional operation to the nonzero elements of a SparseTensor
+  one of the following methods is recommended. First, if the function is
+  expressible as TensorFlow ops, use
+
+  ```python
+    result = SparseTensor(input.indices, fn(input.values), input.dense_shape)
+  ```
+
+  If, however, the function is not expressible as a TensorFlow op, then use
+
+  ```python
+  result = SparseTensor(
+    input.indices, map_fn(fn, input.values), input.dense_shape)
+  ```
+
+  instead.
+
+  When executing eagerly, map_fn does not execute in parallel even if
+  `parallel_iterations` is set to a value > 1. You can still get the
+  performance benefits of running a function in parallel by using the
+  `tf.contrib.eager.defun` decorator,
+
+  ```python
+  # Assume the function being used in map_fn is fn.
+  # To ensure map_fn calls fn in parallel, use the defun decorator.
+  @tf.contrib.eager.defun
+  def func(tensor):
+    return tf.map_fn(fn, tensor)
+  ```
+
+  Note that if you use the defun decorator, any non-TensorFlow Python code
+  that you may have written in your function won't get executed. See
+  `tf.contrib.eager.defun` for more details. The recommendation would be to
+  debug without defun but switch to defun to get performance benefits of
+  running map_fn in parallel.
+
+  Args:
+    fn: The callable to be performed.  It accepts one argument, which will have
+      the same (possibly nested) structure as `elems`.  Its output must have the
+      same structure as `dtype` if one is provided, otherwise it must have the
+      same structure as `elems`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which will
+      be unpacked along their first dimension.  The nested sequence of the
+      resulting slices will be applied to `fn`.
+    dtype: (optional) The output type(s) of `fn`.  If `fn` returns a structure
+      of Tensors differing from the structure of `elems`, then `dtype` is not
+      optional and must have the same structure as the output of `fn`. Use
+      `RaggedTensorType` to declare an output of type `RaggedTensor`.
+    parallel_iterations: (optional) The number of iterations allowed to run in
+      parallel. When graph building, the default value is 10. While executing
+      eagerly, the default value is set to 1.
+    back_prop: (optional) True enables support for back propagation.
+    swap_memory: (optional) True enables GPU-CPU memory swapping.
+    infer_shape: (optional) False disables tests for consistent output shapes.
+    name: (optional) Name prefix for the returned tensors.
+
+  Returns:
+    A possibly nested sequence of potentially ragged tensors.  Each
+    tensor packs the results of applying `fn` to tensors unpacked from `elems`
+    along the first dimension, from first to last.
+
+  Raises:
+    TypeError: if `fn` is not callable or the structure of the output of
+      `fn` and `dtype` do not match, or if elems is a SparseTensor.
+    ValueError: if the lengths of the output of `fn` and `dtype` do not match.
+
+  #### Examples:
+
+    ```python
+    elems = np.array([1, 2, 3, 4, 5, 6])
+    squares = map_fn(lambda x: x * x, elems)
+    # squares == [1, 4, 9, 16, 25, 36]
+    ```
+
+    ```python
+    elems = (np.array([1, 2, 3]), np.array([-1, 1, -1]))
+    alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64)
+    # alternate == [-1, 2, -3]
+    ```
+
+    ```python
+    elems = np.array([1, 2, 3])
+    alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64))
+    # alternates[0] == [1, 2, 3]
+    # alternates[1] == [-1, -2, -3]
+    ```
+
+    ```python
+    elems=ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
+    mean = map_fn(tf.reduce_mean, elems)
+    # mean == [2, 4, 6]
+    ```
+
+    ```python
+    elems=ragged.constant([[1, 2, 3], [4, 5], [6, 7]], dtype=tf.int64)
+    out = map_fn(fn=lambda x: x+1, elems,
+      dtype=ragged.RaggedTensorType(type=tf.int64, ragged_rank=0))
+    # out = ragged.constant([[2, 3, 4], [5, 6], [7, 8]])
+    ```
+  """
+  if not callable(fn):
+    raise TypeError("fn must be callable.")
+
+  if isinstance(elems, sparse_tensor.SparseTensor):
+    raise TypeError(
+        "To perform a map on the values of a sparse tensor use either "
+        " SparseTensor(input.indices, fn(input.values), input.dense_shape) or "
+        " SparseTensor(input.indices, map_fn(fn, input.values), "
+        "input.dense_shape)")
+
+  in_graph_mode = not context.executing_eagerly()
+  # Set the default number of parallel_iterations depending on graph/eager mode.
+  if in_graph_mode and not parallel_iterations:
+    parallel_iterations = 10
+  elif not in_graph_mode and not parallel_iterations:
+    parallel_iterations = 1
+
+  if not in_graph_mode and parallel_iterations > 1:
+    logging.log_first_n(logging.WARN, "Setting parallel_iterations > 1 has no "
+                        "effect when executing eagerly. Consider calling map_fn"
+                        " with tf.contrib.eager.defun to execute fn in "
+                        "parallel.", 1)
+    parallel_iterations = 1
+
+  input_is_sequence = nest.is_sequence(elems)
+  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
+
+  def input_pack(x):
+    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
+
+  elems_flat = input_flatten(elems)
+
+  with ops.name_scope(name, "map", elems_flat):
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode:
+      # Any get_variable calls in fn will cache the first call locally
+      # and not issue repeated network I/O requests for each iteration.
+      varscope = vs.get_variable_scope()
+      varscope_caching_device_was_none = False
+      if varscope.caching_device is None:
+        # TODO(ebrevdo): Change to using colocate_with here and in other
+        # methods.
+        varscope.set_caching_device(lambda op: op.device)
+        varscope_caching_device_was_none = True
+
+    elems_flat = [
+        ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+            elem, name="elem") for elem in elems_flat
+    ]
+
+    # We can either infer the output, or we can assume that it will be the same
+    # as the input structure.
+    dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
+
+    # Find the number of iterations, n may be known statically.
+    if isinstance(elems_flat[0], ragged_tensor.RaggedTensor):
+      n = ragged_array_ops.nrows(elems_flat[0], out_type=dtypes.int32)
+    else:
+      static_shape = elems_flat[0].shape
+      if static_shape.ndims is not None and static_shape.ndims < 1:
+        if len(elems_flat) == 1:
+          raise ValueError(
+              "elems must be a 1+ dimensional Tensor, not a scalar")
+        else:
+          raise ValueError(
+              "elements in elems must be 1+ dimensional Tensors, not scalars")
+      n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    # Create a flat list of TAs.
+
+    # Flatten the dtype structure to a list.
+    dtype_flat = nest.flatten(dtype)
+
+    # decompose to components
+    dtype_components = [_maybe_decompose_dtype(d) for d in dtype_flat]
+    dtype_components_flat = nest.flatten(dtype_components)
+
+    # Create TensorArrays.
+    accs_ta = [
+        tensor_array_ops.TensorArray(
+            dtype=t, dynamic_size=False, infer_shape=infer_shape, size=n)
+        for t in dtype_components_flat
+    ]
+
+    i = constant_op.constant(0)
+
+    def compute(i, tas):
+      """The loop body of map_fn.
+
+      Args:
+        i: the loop counter
+        tas: the flat TensorArray accumulator list
+
+      Returns:
+        (i + 1, tas): the updated counter + updated TensorArrays
+
+      Raises:
+        TypeError: if dtype and packed_fn_values structure do not match
+        ValueType: if dtype and packed_fn_values lengths do not match
+      """
+      # Get Tensors or RaggedTensors sliced at i, then pack it back to the
+      # original structure.
+      packed_values = input_pack([elem_flat[i] for elem_flat in elems_flat])
+      packed_fn_values = fn(packed_values)
+
+      # Check that the structure of the output matches what was declared or
+      # inferred.
+      # nest.assert_same_structure(dtype or elems, packed_fn_values)
+
+      # Flatten and decompose to a list of Tensors
+      flat_fn_values = nest.flatten(packed_fn_values)
+
+      # If we declared that we are expecting a RaggedTensor output, but we get a
+      # Tensor output. We should try to convert it to a RaggedTensor.
+      flat_fn_composite_tensors = list(
+          _convert_declared(flat_fn_values, dtype_flat))
+
+      flat_fn_components = [
+          _maybe_decompose_tensor(t) for t in flat_fn_composite_tensors
+      ]
+      flat_fn_tensors = nest.flatten(flat_fn_components)
+
+      # Write to TAs.
+      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_tensors)]
+
+      return (i + 1, tas)
+
+    _, r_a = control_flow_ops.while_loop(
+        lambda i, _: i < n, compute, (i, accs_ta),
+        parallel_iterations=parallel_iterations,
+        back_prop=back_prop,
+        swap_memory=swap_memory,
+        maximum_iterations=n)
+
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode and varscope_caching_device_was_none:
+      varscope.set_caching_device(None)
+
+    # Pack back into a list of components
+    results_as_components = nest.pack_sequence_as(dtype_components, r_a)
+
+    # Stack TensorArrays for Tensor outputs, and concat RaggedTensor outputs.
+    def _stack_or_concat(e):
+      if isinstance(e, _RaggedTensorComponents):
+        return _concat_ragged_tensor_components(e)
+      else:
+        result = e.stack()
+        return result
+
+    results_flat_components = [
+        _stack_or_concat(e) for e in results_as_components
+    ]
+
+    results_packed = [
+        _maybe_recompose_tensor(c) for c in results_flat_components
+    ]
+    results_packed = nest.pack_sequence_as(dtype, results_packed)
+    return results_packed
+
+
+class _RaggedTensorComponents(
+    collections.namedtuple(
+        "_RaggedTensorComponents",
+        ["inner_values", "nested_row_lengths", "outer_row_length"])):
+  """A namedtuple of components which represent a `RaggedTensor`.
+
+  _RaggedTensorComponents is a list of components which can be used to create a
+  `RaggedTensor`. Use this class to represent a `RaggedTensor` in situations
+  where nest.flatten and nest.pack_sequence_as should decompose ragged tensors
+  into their components..
+
+  The following are a list of components for a `RaggedTensor`:
+
+  inner_values: The flat and inner values of a RaggedTensor. This could be
+    a `Tensor`, a `TensorArray`, or a data type.
+  nested_row_lengths: a tuple containing the row lengths of each rank. The
+    elements of the tuple could be `Tensor`s or `TensorArray`s.
+  outer_row_length: a `Tensor` or `TensorArray` containing the row length of the
+    `RaggedTensor`'s outermost dimension.
+
+  See `RaggedTensor` for more details of the use of each component.
+  """
+  __slots__ = ()
+
+
+def _concat_ragged_tensor_components(rt_ta):
+  inner_values = rt_ta.inner_values.concat()
+  nested_row_lengths = tuple(
+      row_lengths_ta.concat() for row_lengths_ta in rt_ta.nested_row_lengths)
+  outer_row_length = rt_ta.outer_row_length.concat()
+  return _RaggedTensorComponents(
+      inner_values=inner_values,
+      nested_row_lengths=nested_row_lengths,
+      outer_row_length=outer_row_length)
+
+
+def _maybe_decompose_tensor(rt):
+  """Decompose tensors to their composite tensors."""
+  if not isinstance(rt, ragged_tensor.RaggedTensor):
+    return rt
+
+  # The three component pieces we need:
+  # - inner values
+  inner_values = rt.inner_values
+
+  # - row_splits of the RT
+  splits = rt.nested_row_splits
+  nested_row_lengths = tuple(split[1:] - split[:-1] for split in splits)
+
+  # - outer row length
+  outer_row_length = array_ops.expand_dims(ragged_array_ops.nrows(rt), axis=0)
+
+  return _RaggedTensorComponents(
+      inner_values=inner_values,
+      nested_row_lengths=nested_row_lengths,
+      outer_row_length=outer_row_length,
+  )
+
+
+def _maybe_recompose_tensor(t):
+  """Reconstructs a _RaggedTensorComponents into a RaggedTensor."""
+  if not isinstance(t, _RaggedTensorComponents):
+    return t
+
+  values = t.inner_values
+  nested_row_lengths = tuple(t.nested_row_lengths)
+  for nested_row_length in reversed(nested_row_lengths):
+    values = ragged_factory_ops.from_row_lengths(values, nested_row_length)
+  return ragged_factory_ops.from_row_lengths(values, t.outer_row_length)
+
+
+def _maybe_decompose_dtype(d):
+  """Decompose dtypes into composite tensors (if necessary)."""
+  if not isinstance(d, ragged_tensor.RaggedTensorType):
+    return d
+
+  result = _RaggedTensorComponents(
+      inner_values=d.dtype,
+      nested_row_lengths=tuple(dtypes.int64 for i in range(d.ragged_rank - 1)),
+      outer_row_length=dtypes.int64,
+  )
+  return result
+
+
+def _convert_declared(fn_output_flat, output_declared):
+  """Convert outputs which are `Tensor`s into `_RaggedTensorComponents`."""
+  for current, declared in zip(fn_output_flat, output_declared):
+    if isinstance(declared, ragged_tensor.RaggedTensorType):
+      if isinstance(current, ragged_tensor.RaggedTensor):
+        # Check that the ragged ranks match up.
+        # + 1 to account for the rank of the outermost dimension.
+        if declared.ragged_rank != current.ragged_rank + 1:
+          raise ValueError(
+              "The declared ragged rank (%d) mismatches the result (%d)" %
+              (declared.ragged_rank, current.ragged_rank))
+        yield current
+      else:
+        # We the output is a Tensor, but the caller has declared that we are
+        # expecting an RaggedTensor output.
+        if declared.ragged_rank != 1:
+          raise ValueError(
+              "The declared ragged rank (%d) mismatches the result (1)" %
+              declared.ragged_rank)
+
+        row_length = array_ops.expand_dims(
+            ragged_array_ops.nrows(current), axis=0)
+        rt = _RaggedTensorComponents(
+            inner_values=current,
+            nested_row_lengths=(),
+            outer_row_length=row_length)
+        yield rt
+    else:
+      yield current
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..857b8dbfa361901108bf88949ac167a277991e36
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_math_ops.py
@@ -0,0 +1,566 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for ragged tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_ragged_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
+
+
+#===============================================================================
+# ragged.range
+#===============================================================================
+# pylint: disable=redefined-builtin
+def range(starts, limits=None, deltas=1, dtype=None, name=None):
+  """Returns a `RaggedTensor` containing the specified sequences of numbers.
+
+  Each row of the returned `RaggedTensor` contains a single sequence:
+
+  ```python
+  ragged.range(starts, limits, deltas)[i] ==
+      tf.range(starts[i], limits[i], deltas[i])
+  ```
+
+  If `start[i] < limits[i] and deltas[i] > 0`, then `output[i]` will be an
+  empty list.  Similarly, if `start[i] > limits[i] and deltas[i] < 0`, then
+  `output[i]` will be an empty list.  This behavior is consistent with the
+  Python `range` function, but differs from the `tf.range` op, which returns
+  an error for these cases.
+
+  Examples:
+
+  ```python
+  >>> ragged.range([3, 5, 2]).eval().tolist()
+  [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]]
+  >>> ragged.range([0, 5, 8], [3, 3, 12]).eval().tolist()
+  [[0, 1, 2], [], [8, 9, 10, 11]]
+  >>> ragged.range([0, 5, 8], [3, 3, 12], 2).eval().tolist()
+  [[0, 2], [], [8, 10]]
+  ```
+
+  The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+  The vector inputs must all have the same size.  Scalar inputs are broadcast
+  to match the size of the vector inputs.
+
+  Args:
+    starts: Vector or scalar `Tensor`.  Specifies the first entry for each range
+      if `limits` is not `None`; otherwise, specifies the range limits, and the
+      first entries default to `0`.
+    limits: Vector or scalar `Tensor`.  Specifies the exclusive upper limits for
+      each range.
+    deltas: Vector or scalar `Tensor`.  Specifies the increment for each range.
+      Defaults to `1`.
+    dtype: The type of the elements of the resulting tensor.  If not specified,
+      then a value is chosen based on the other args.
+    name: A name for the operation.
+
+  Returns:
+    A `RaggedTensor` of type `dtype` with `ragged_rank=1`.
+  """
+  if limits is None:
+    starts, limits = 0, starts
+
+  with ops.name_scope(name, 'RaggedRange', [starts, limits, deltas]) as name:
+    starts = ops.convert_to_tensor(starts, dtype=dtype, name='starts')
+    limits = ops.convert_to_tensor(limits, dtype=dtype, name='limits')
+    deltas = ops.convert_to_tensor(deltas, dtype=dtype, name='deltas')
+
+    # infer dtype if not explicitly provided
+    if dtype is None:
+      starts, limits, deltas = _infer_matching_dtype(
+          [starts, limits, deltas],
+          [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64])
+
+    result = gen_ragged_math_ops.ragged_range(starts, limits, deltas, name=name)
+    return ragged_factory_ops.from_row_splits(result.rt_dense_values,
+                                              result.rt_nested_splits)
+
+
+def _infer_matching_dtype(tensors, dtype_hierarchy):
+  """Infers a matching dtype for tensors, and casts them to that dtype."""
+  assert all(t.dtype in dtype_hierarchy for t in tensors)
+  inferred_dtype = max([t.dtype for t in tensors], key=dtype_hierarchy.index)
+  return [math_ops.cast(t, inferred_dtype) for t in tensors]
+
+
+#===============================================================================
+# ragged_segment_<AGGREGATE>
+#===============================================================================
+
+# Docstring template used for the raggged_segment_<AGGREGATE> ops.
+_RAGGED_SEGMENT_DOCSTRING = """\
+Computes the %(combination)s along segments of a RaggedTensor.
+
+  Returns a RaggedTensor `output` with `num_segments` rows, where the row
+  `output[i]` is formed by taking the %(combination)s of all rows of `data`
+  whose corresponding `segment_id` is `i`.
+
+  The length of the row `output[i]` will be the maximum of the lengths of
+  all rows of `data` whose corresponding `segment_id` is `i`.  If no `data`
+  rows correspond to a given segment ID, then the output row for that segment
+  ID will be empty.
+
+  Args:
+    data: A `RaggedTensor` containing the values to combine.
+    segment_ids: A `Tensor` or `RaggedTensor`.  Must have type `int64` or
+      `int32`.  `segment_ids.shape` must be a prefix of `data.shape`.
+      Must be greater than or equal to zero, and less than `num_segments`.
+      `segment_ids` is not required to be sorted.
+    num_segments: An `int32` or `int64` scalar specifying the number of
+      distinct segment ids.
+    name: A name prefix for the returned tensor (optional).
+  Returns:
+    A `RaggedTensor` containing the %(combined)s values.  The returned tensor
+    has the same dtype as `data`, and its shape is
+    `[num_segments] + data.shape[segment_ids.rank:]`.
+  Raises:
+    ValueError: If `segment_ids.shape` is not a prefix of `data.shape`.
+"""
+
+
+def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
+                              num_segments, name=None):
+  """Aggregates along segments of a RaggedTensor using `unsorted_segment_op`.
+
+  Returns a RaggedTensor `output` with `num_segments` rows, where the row
+  `output[i]` is formed by combining all rows of `data` whose corresponding
+  `segment_id` is `i`.  The values in each row are combined using
+  `unsorted_segment_op`.
+
+  The length of the row `output[i]` will be the maximum of the lengths of
+  all rows of `data` whose corresponding `segment_id` is `i`.  If no `data`
+  rows correspond to a given segment ID, then the output row for that segment
+  ID will be empty.
+
+  Args:
+    unsorted_segment_op: The tensorflow `op` that should be used to combine
+      values in each row.  Must have the same signature and basic behavior as
+      `unsorted_segment_sum`, `unsorted_segment_max`, etc.
+    data: A `RaggedTensor` containing the values to be combined.
+    segment_ids: A `Tensor` or `RaggedTensor`.  Must have type `int64` or
+      `int32`.  `segment_ids.shape` must be a prefix of `data.shape`.
+      `segment_ids` is not required to be sorted.
+    num_segments: An `int32` or `int64` scalar.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` containing the aggregated values.  The returned tensor
+    has the same dtype as `data`, and its shape is
+    `[num_segments] + data.shape[segment_ids.rank:]`.
+  Raises:
+    ValueError: If segment_ids.shape is not a prefix of data.shape.
+  """
+  if not (ragged_tensor.is_ragged(data) or
+          ragged_tensor.is_ragged(segment_ids)):
+    return unsorted_segment_op(data, segment_ids, num_segments, name)
+
+  with ops.name_scope(name, 'RaggedSegment',
+                      [data, segment_ids, num_segments]) as name:
+    data = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        data, name='data')
+    segment_ids = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        segment_ids, name='segment_ids')
+
+    if ragged_tensor.is_ragged(segment_ids):
+      if not ragged_tensor.is_ragged(data):
+        raise ValueError('segment_ids.shape must be a prefix of data.shape, '
+                         'but segment_ids is ragged and data is not.')
+      check_splits = check_ops.assert_equal(
+          segment_ids.row_splits,
+          data.row_splits,
+          message='segment_ids.shape must be a prefix of data.shape')
+      with ops.control_dependencies([check_splits]):
+        return _ragged_segment_aggregate(unsorted_segment_op, data.values,
+                                         segment_ids.values, num_segments, name)
+
+    segment_ids = math_ops.cast(segment_ids, dtypes.int64)
+
+    # Find the length of each row in data.  (dtype=int64, shape=[data_nrows])
+    data_row_lengths = data.row_splits[1:] - data.row_splits[:-1]
+
+    # Find the length that each output row will have.  The length of the row
+    # corresponding to segment `id` is `max(data_row_lengths[i])` where
+    # `segment_ids[i]=id`.  (dtype=int64, shape=[output_nrows])
+    output_row_lengths = math_ops.maximum(
+        math_ops.unsorted_segment_max(data_row_lengths, segment_ids,
+                                      num_segments), 0)
+    assert output_row_lengths.dtype == dtypes.int64
+
+    # Build the splits tensor for the output RaggedTensor.
+    output_splits = array_ops.concat(
+        [
+            array_ops.zeros([1], dtypes.int64),
+            math_ops.cumsum(output_row_lengths)
+        ],
+        axis=0)
+
+    # For each row in `data`, find the start & limit position where that row's
+    # values will be aggregated in output.values.
+    data_row_to_out_row_start = array_ops.gather(output_splits, segment_ids)
+    data_row_to_out_row_limit = data_row_to_out_row_start + data_row_lengths
+
+    # For each value in `data.values`, find the position where it will
+    # aggregated in `output.values`.
+    # Get the target output values index for each data values index.
+    data_val_to_out_val_index = range(data_row_to_out_row_start,
+                                      data_row_to_out_row_limit).values
+
+    # Recursively aggregate the values.
+    output_values = _ragged_segment_aggregate(unsorted_segment_op, data.values,
+                                              data_val_to_out_val_index,
+                                              output_splits[-1])
+    return ragged_factory_ops.from_row_splits(output_values, output_splits)
+
+
+def segment_sum(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_sum, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentSum')
+
+
+def segment_prod(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_prod, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentProd')
+
+
+def segment_min(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_min, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentMin')
+
+
+def segment_max(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_max, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentMax')
+
+
+def segment_mean(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  with ops.name_scope(name, 'RaggedSegmentMean',
+                      [data, segment_ids, num_segments]):
+    total = segment_sum(data, segment_ids, num_segments)
+    ones = ragged_factory_ops.from_nested_row_splits(
+        array_ops.ones_like(data.inner_values), data.nested_row_splits)
+    count = segment_sum(ones, segment_ids, num_segments)
+    return ragged_factory_ops.from_nested_row_splits(
+        total.inner_values / count.inner_values, total.nested_row_splits)
+
+
+def segment_sqrt_n(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  with ops.name_scope(name, 'RaggedSegmentSqrtN',
+                      [data, segment_ids, num_segments]):
+    total = segment_sum(data, segment_ids, num_segments)
+    ones = ragged_factory_ops.from_nested_row_splits(
+        array_ops.ones_like(data.inner_values), data.nested_row_splits)
+    count = segment_sum(ones, segment_ids, num_segments)
+    return ragged_factory_ops.from_nested_row_splits(
+        total.inner_values / math_ops.sqrt(count.inner_values),
+        total.nested_row_splits)
+
+
+def _set_ragged_segment_docstring(func, combination, combined):
+  func.__doc__ = _RAGGED_SEGMENT_DOCSTRING % dict(
+      combination=combination, combined=combined)
+
+
+_set_ragged_segment_docstring(segment_sum, 'sum', 'summed')
+_set_ragged_segment_docstring(segment_prod, 'product', 'multiplied')
+_set_ragged_segment_docstring(segment_min, 'minimum', 'minimized')
+_set_ragged_segment_docstring(segment_max, 'maximum', 'maximized')
+_set_ragged_segment_docstring(segment_mean, 'mean', 'averaged')
+_set_ragged_segment_docstring(segment_sqrt_n, 'sum divided by sqrt(N)',
+                              'summed')
+
+#===============================================================================
+# ragged_reduce_<AGGREGATE>
+#===============================================================================
+
+# Docstring template used for ragged_reduce_<AGGREGATE> ops.
+_RAGGED_REDUCE_DOCSTRING = """\
+Computes the %(combination)s of elements across dimensions of a `RaggedTensor`.
+
+  Reduces `rt_input` along the dimensions given in `axis` by taking the
+  %(combination)s of values.  If a reduced dimension has no elements for
+  some index, then the value for that index will be %(default)s.
+
+  The rank of the tensor is reduced by `1` for each entry in `axis`.  If
+  `axis` is not specified, then all dimensions are reduced, and a scalar
+  value is returned.
+  Args:
+    rt_input: A `RaggedTensor` containing the values to be %(combined)s.
+    axis: The dimensions to reduce.  May be `None` (to reduce all axes), an
+      `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce
+      a given set of axes), or a `Tensor` with a constant value.  Must be in
+      the range `[0, rt_input.rank]`.
+    name: A name prefix for the returned tensor (optional).
+  Returns:
+    A `RaggedTensor` containing the %(combined)s values.  The returned tensor
+    has the same dtype as `data`, and its shape is given by removing the
+    dimensions specified in `axis` from `rt_input.shape`.  The `ragged_rank`
+    of the returned tensor is given by substracting any ragged dimensions
+    specified in `axis` from `rt_input.ragged_rank`.
+  Raises:
+    ValueError: If `axis` contains a `Tensor` whose value is not constant.
+  ####Example:
+    ```python%(example)s    ```
+"""
+_RAGGED_REDUCE_SUM_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_sum(rt, axis=0).eval().tolist()
+    [15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
+    >>> ragged.reduce_sum(rt, axis=1).eval().tolist()
+    [8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
+"""
+_RAGGED_REDUCE_PROD_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_prod(rt, axis=0).eval().tolist()
+    [54, 30, 4]  # = [3*1*9*2, 1*5*6, 4]
+    >>> ragged.reduce_prod(rt, axis=1).eval().tolist()
+    [12, 5, 9, 12]  # = [3*1*4, 1*5, 9, 2*6]
+"""
+_RAGGED_REDUCE_MIN_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_min(rt, axis=0).eval().tolist()
+    [1, 1, 4]  # = [min(3, 1, 9, 2), min(1, 5, 6), 4]
+    >>> ragged.reduce_min(rt, axis=1).eval().tolist()
+    [1, 1, 9, 2]  # = [min(3, 1, 4), min(1, 5), 9, min(2, 6)]
+"""
+_RAGGED_REDUCE_MAX_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_max(rt, axis=0).eval().tolist()
+    [9, 6, 4]  # = [max(3, 1, 9, 2), max(1, 5, 6), 4]
+    >>> ragged.reduce_max(rt, axis=1).eval().tolist()
+    [4, 5, 9, 6]  # = [max(3, 1, 4), max(1, 5), 9, max(2, 6)]
+"""
+_RAGGED_REDUCE_MEAN_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_mean(rt, axis=0).eval().tolist()
+    [3.75, 4, 4]  # = [mean(3, 1, 9, 2), mean(1, 5, 6), 4]
+    >>> ragged.reduce_mean(rt, axis=1).eval().tolist()
+    [2.66666, 3, 9, 4]  # = [mean(3, 1, 4), mean(1, 5), 9, mean(2, 6)]
+"""
+_RAGGED_REDUCE_ALL_EXAMPLE = """
+    >>> rt = ragged.constant([[True, True], [True, True, False, True], [False, True]])
+    >>> ragged.reduce_all(rt, axis=0).eval().tolist()
+    [False, True, False, True]
+    >>> ragged.reduce_all(rt, axis=1).eval().tolist()
+    [True, False, False]
+"""
+_RAGGED_REDUCE_ANY_EXAMPLE = """
+    >>> rt = ragged.constant([[True, True], [True, True, False, True], [False, True]])
+    >>> ragged.reduce_any(rt, axis=0).eval().tolist()
+    [True, True, False, True]
+    >>> ragged.reduce_any(rt, axis=1).eval().tolist()
+    [True, True, True]
+"""
+
+
+def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
+                             name=None):
+  """Aggregates across axes of a RaggedTensor using the given `Tensor` ops.
+
+  Reduces `rt_input` along the dimensions given in `axis`.  The rank of the
+  tensor is reduced by 1 for each entry in `axis`.  If `axis` is not specified,
+  then all dimensions are reduced, and a scalar value is returned.
+
+  This op assumes that `reduce_op` and `unsorted_segment_op` are associative;
+  if not, then reducing multiple axes will return incorrect results.  (In
+  particular, reducing multiple axes is currently implemented by reducing the
+  axes one at a time.)
+
+  Args:
+    reduce_op: The tensorflow `op` that should be used to reduce values in
+      uniform dimensions.  Must have the same signature and basic behavior as
+      `reduce_sum`, `reduce_max`, etc.
+    unsorted_segment_op: The tensorflow `op` that should be used to combine
+      values in ragged dimensions.  Must have the same signature and basic
+      behavior as `unsorted_segment_sum`, `unsorted_segment_max`, etc.
+    rt_input: A `Tensor` or `RaggedTensor` containing the values to be reduced.
+    axis: The axis or axes to reduce.  May be `None` (to reduce all axes), an
+      `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce a
+      given set of axes), or a `Tensor` with a constant value.  Must be in the
+      range `[0, rt_input.rank)`.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` containing the reduced values.  The returned tensor
+    has the same dtype as `data`, and its shape is given by removing the
+    dimensions specified in `axis` from `rt_input.shape`.  The `ragged_rank`
+    of the returned tensor is given by substracting any ragged dimensions
+    specified in `axis` from `rt_input.ragged_rank`.
+  Raises:
+    ValueError: If `axis` contains a `Tensor` whose value is not constant.
+  """
+  if not ragged_tensor.is_ragged(rt_input):
+    return reduce_op(rt_input, axis, name=name)
+
+  if isinstance(axis, ops.Tensor):
+    axis = tensor_util.constant_value(axis)
+    if axis is None:
+      raise ValueError('axis must be known at graph construction time.')
+
+  # When reducing all axes, just ignore splits & reduce the inner values.
+  if axis is None:
+    return reduce_op(rt_input.inner_values, None, name=name)
+
+  with ops.name_scope(name, 'RaggedReduce', [rt_input, axis]):
+    if isinstance(axis, (tuple, list)):
+      if not axis:
+        return rt_input
+      elif len(axis) == 1:
+        axis = axis[0]
+      else:
+        # When reducing multiple axes, just reduce one at a time.  This is less
+        # efficient, and only works for associative ops.  (In particular, it
+        # does not work for reduce_mean.)  However, reducing multiple axes at
+        # once will probably require a nontrivial c++ op.
+        axis = sorted(axis)
+        inner_reduced = _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
+                                                 rt_input, axis[-1])
+        return _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
+                                        inner_reduced, axis[:-1])
+
+    axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
+
+    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+
+    if axis == 0:
+      # out[i_1, i_2, ..., i_N] = sum_{j} rt_input[j, i_1, i_2, ..., i_N]
+      row_lengths = rt_input.row_splits[1:] - rt_input.row_splits[:-1]
+      num_segments = math_ops.maximum(math_ops.reduce_max(row_lengths), 0)
+      segment_ids = range(row_lengths).values
+      return _ragged_segment_aggregate(unsorted_segment_op, rt_input.values,
+                                       segment_ids, num_segments)
+    elif axis == 1:
+      # out[i_0, i_1, i_2, ..., i_N] = sum_{j} rt_input[i_0, j, i_2, ..., i_N]
+      num_segments = array_ops.shape(rt_input.row_splits)[0] - 1
+      segment_ids = segment_id_ops.row_splits_to_segment_ids(
+          rt_input.row_splits)
+      return _ragged_segment_aggregate(unsorted_segment_op, rt_input.values,
+                                       segment_ids, num_segments)
+    else:
+      # out[i_0, ..., i_[axis-1], i_axis+1], ..., i_N] =
+      #     sum_{j} rt_input [i_0, ..., i_[axis-1], j, i_axis+1], ..., i_N]
+      return rt_input.with_values(
+          _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
+                                   rt_input.values, axis - 1))
+
+
+def reduce_sum(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_sum,
+                                  math_ops.unsorted_segment_sum, rt_input, axis,
+                                  name or 'RaggedReduceSum')
+
+
+def reduce_prod(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_prod,
+                                  math_ops.unsorted_segment_prod, rt_input,
+                                  axis, name or 'RaggedReduceProd')
+
+
+def reduce_min(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_min,
+                                  math_ops.unsorted_segment_min, rt_input, axis,
+                                  name or 'RaggedReduceMin')
+
+
+def reduce_max(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_max,
+                                  math_ops.unsorted_segment_max, rt_input, axis,
+                                  name or 'RaggedReduceMax')
+
+
+def reduce_mean(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  with ops.name_scope(name, 'RaggedReduceMean', [rt_input, axis]):
+    total = reduce_sum(rt_input, axis)
+    if ragged_tensor.is_ragged(rt_input):
+      ones = ragged_factory_ops.from_nested_row_splits(
+          array_ops.ones_like(rt_input.inner_values),
+          rt_input.nested_row_splits)
+    else:
+      ones = array_ops.ones_like(rt_input)
+    count = reduce_sum(ones, axis)
+    if ragged_tensor.is_ragged(total):
+      return ragged_factory_ops.from_nested_row_splits(
+          total.inner_values / count.inner_values, total.nested_row_splits)
+    else:
+      return total / count
+
+
+def _cast(rt_input, dtype):
+  return ragged_functional_ops.map_inner_values(math_ops.cast, rt_input, dtype)
+
+
+def reduce_all(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  with ops.name_scope(name, 'RaggedReduceAll', [rt_input, axis]):
+    return _cast(reduce_prod(_cast(rt_input, dtypes.int32), axis), dtypes.bool)
+
+
+def reduce_any(rt_input, axis=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  with ops.name_scope(name, 'RaggedReduceAny', [rt_input, axis]):
+    return _cast(reduce_sum(_cast(rt_input, dtypes.int32), axis), dtypes.bool)
+
+
+def _set_ragged_reduce_docstring(func, combination, combined, default, example):
+  func.__doc__ = _RAGGED_REDUCE_DOCSTRING % dict(
+      combination=combination,
+      combined=combined,
+      default=default,
+      example=example)
+
+
+_set_ragged_reduce_docstring(reduce_sum, 'sum', 'summed', '0',
+                             _RAGGED_REDUCE_SUM_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_prod, 'product', 'multiplied', '1',
+                             _RAGGED_REDUCE_PROD_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_min, 'minimum', 'minimized',
+                             '`rt_input.dtype.min`', _RAGGED_REDUCE_MIN_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_max, 'maximum', 'maximized',
+                             '`rt_input.dtype.max`', _RAGGED_REDUCE_MAX_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_mean, 'mean', 'averaged', 'NaN',
+                             _RAGGED_REDUCE_MEAN_EXAMPLE)
+
+_set_ragged_reduce_docstring(reduce_all, 'logical and', 'and-ed', 'True',
+                             _RAGGED_REDUCE_ALL_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_any, 'logical or', 'or-ed', 'False',
+                             _RAGGED_REDUCE_ANY_EXAMPLE)
diff --git a/tensorflow/python/ops/ragged/ragged_operators.py b/tensorflow/python/ops/ragged/ragged_operators.py
new file mode 100644
index 0000000000000000000000000000000000000000..223ba0d2e7f050650a0849fdb4987afb38cebd2e
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_operators.py
@@ -0,0 +1,79 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operator overloads for `RaggedTensor`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.ragged import ragged_elementwise_ops
+from tensorflow.python.ops.ragged import ragged_getitem
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.util import tf_decorator
+
+
+def _right(operator):
+  """Right-handed version of an operator: swap args x and y."""
+  return tf_decorator.make_decorator(operator, lambda y, x: operator(x, y))
+
+
+# Indexing
+ragged_tensor.RaggedTensor.__getitem__ = ragged_getitem.ragged_tensor_getitem
+
+# Ordering operators
+ragged_tensor.RaggedTensor.__ge__ = ragged_elementwise_ops.greater_equal
+ragged_tensor.RaggedTensor.__gt__ = ragged_elementwise_ops.greater
+ragged_tensor.RaggedTensor.__le__ = ragged_elementwise_ops.less_equal
+ragged_tensor.RaggedTensor.__lt__ = ragged_elementwise_ops.less
+
+# Logical operators
+ragged_tensor.RaggedTensor.__and__ = ragged_elementwise_ops.logical_and
+ragged_tensor.RaggedTensor.__rand__ = _right(ragged_elementwise_ops.logical_and)
+ragged_tensor.RaggedTensor.__invert__ = ragged_elementwise_ops.logical_not
+ragged_tensor.RaggedTensor.__ror__ = _right(ragged_elementwise_ops.logical_or)
+ragged_tensor.RaggedTensor.__or__ = ragged_elementwise_ops.logical_or
+ragged_tensor.RaggedTensor.__xor__ = ragged_elementwise_ops.logical_xor
+ragged_tensor.RaggedTensor.__rxor__ = _right(ragged_elementwise_ops.logical_xor)
+
+# Arithmetic operators
+ragged_tensor.RaggedTensor.__abs__ = ragged_elementwise_ops.abs
+ragged_tensor.RaggedTensor.__add__ = ragged_elementwise_ops.add
+ragged_tensor.RaggedTensor.__radd__ = _right(ragged_elementwise_ops.add)
+ragged_tensor.RaggedTensor.__div__ = ragged_elementwise_ops.div
+ragged_tensor.RaggedTensor.__rdiv__ = _right(ragged_elementwise_ops.div)
+ragged_tensor.RaggedTensor.__floordiv__ = ragged_elementwise_ops.floordiv
+ragged_tensor.RaggedTensor.__rfloordiv__ = _right(
+    ragged_elementwise_ops.floordiv)
+ragged_tensor.RaggedTensor.__mod__ = ragged_elementwise_ops.floormod
+ragged_tensor.RaggedTensor.__rmod__ = _right(ragged_elementwise_ops.floormod)
+ragged_tensor.RaggedTensor.__mul__ = ragged_elementwise_ops.multiply
+ragged_tensor.RaggedTensor.__rmul__ = _right(ragged_elementwise_ops.multiply)
+ragged_tensor.RaggedTensor.__neg__ = ragged_elementwise_ops.negative
+ragged_tensor.RaggedTensor.__pow__ = ragged_elementwise_ops.pow
+ragged_tensor.RaggedTensor.__rpow__ = _right(ragged_elementwise_ops.pow)
+ragged_tensor.RaggedTensor.__sub__ = ragged_elementwise_ops.subtract
+ragged_tensor.RaggedTensor.__rsub__ = _right(ragged_elementwise_ops.subtract)
+ragged_tensor.RaggedTensor.__truediv__ = ragged_elementwise_ops.truediv
+ragged_tensor.RaggedTensor.__rtruediv__ = _right(ragged_elementwise_ops.truediv)
+
+
+# Dummy methods
+def _dummy_bool(_):
+  """Dummy method to prevent a RaggedTensor from being used as a Python bool."""
+  raise TypeError("RaggedTensor may not be used as a boolean.")
+
+
+ragged_tensor.RaggedTensor.__bool__ = _dummy_bool
+ragged_tensor.RaggedTensor.__nonzero__ = _dummy_bool
diff --git a/tensorflow/python/ops/ragged/ragged_operators_test.py b/tensorflow/python/ops/ragged/ragged_operators_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a99d788ef79f5893eb09cad2b9f336c435704783
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_operators_test.py
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for overloaded RaggedTensor operators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase):
+  # @TODO(edloper): Test right-handed versions of operators once we add
+  # broadcasting support for elementwise ops.
+
+  def testOrderingOperators(self):
+    x = ragged.constant([[1, 5], [3]])
+    y = ragged.constant([[4, 5], [1]])
+    with self.test_session():
+      self.assertEqual((x > y).eval().tolist(), [[False, False], [True]])
+      self.assertEqual((x >= y).eval().tolist(), [[False, True], [True]])
+      self.assertEqual((x < y).eval().tolist(), [[True, False], [False]])
+      self.assertEqual((x <= y).eval().tolist(), [[True, True], [False]])
+
+  def assertEqual(self, a, b):
+    if a != b:
+      print('%30s %s' % (b, a))
+
+  def testArithmeticOperators(self):
+    x = ragged.constant([[1.0, -2.0], [8.0]])
+    y = ragged.constant([[4.0, 4.0], [2.0]])
+    with self.test_session():
+      self.assertEqual(abs(x).eval().tolist(), [[1.0, 2.0], [8.0]])
+
+      self.assertEqual((-x).eval().tolist(), [[-1.0, 2.0], [-8.0]])
+
+      self.assertEqual((x + y).eval().tolist(), [[5.0, 2.0], [10.0]])
+      self.assertEqual((3.0 + y).eval().tolist(), [[7.0, 7.0], [5.0]])
+      self.assertEqual((x + 3.0).eval().tolist(), [[4.0, 1.0], [11.0]])
+
+      self.assertEqual((x - y).eval().tolist(), [[-3.0, -6.0], [6.0]])
+      self.assertEqual((3.0 - y).eval().tolist(), [[-1.0, -1.0], [1.0]])
+      self.assertEqual((x + 3.0).eval().tolist(), [[4.0, 1.0], [11.0]])
+
+      self.assertEqual((x * y).eval().tolist(), [[4.0, -8.0], [16.0]])
+      self.assertEqual((3.0 * y).eval().tolist(), [[12.0, 12.0], [6.0]])
+      self.assertEqual((x * 3.0).eval().tolist(), [[3.0, -6.0], [24.0]])
+
+      self.assertEqual((x / y).eval().tolist(), [[0.25, -0.5], [4.0]])
+      self.assertEqual((y / x).eval().tolist(), [[4.0, -2.0], [0.25]])
+      self.assertEqual((2.0 / y).eval().tolist(), [[0.5, 0.5], [1.0]])
+      self.assertEqual((x / 2.0).eval().tolist(), [[0.5, -1.0], [4.0]])
+
+      self.assertEqual((x // y).eval().tolist(), [[0.0, -1.0], [4.0]])
+      self.assertEqual((y // x).eval().tolist(), [[4.0, -2.0], [0.0]])
+      self.assertEqual((2.0 // y).eval().tolist(), [[0.0, 0.0], [1.0]])
+      self.assertEqual((x // 2.0).eval().tolist(), [[0.0, -1.0], [4.0]])
+
+      self.assertEqual((x % y).eval().tolist(), [[1.0, 2.0], [0.0]])
+      self.assertEqual((y % x).eval().tolist(), [[0.0, -0.0], [2.0]])
+      self.assertEqual((2.0 % y).eval().tolist(), [[2.0, 2.0], [0.0]])
+      self.assertEqual((x % 2.0).eval().tolist(), [[1.0, 0.0], [0.0]])
+
+  def testLogicalOperators(self):
+    a = ragged.constant([[True, True], [False]])
+    b = ragged.constant([[True, False], [False]])
+    with self.test_session():
+      self.assertEqual((~a).eval().tolist(), [[False, False], [True]])
+
+      self.assertEqual((a & b).eval().tolist(), [[True, False], [False]])
+      self.assertEqual((a & True).eval().tolist(), [[True, True], [False]])
+      self.assertEqual((True & b).eval().tolist(), [[True, False], [False]])
+
+      self.assertEqual((a | b).eval().tolist(), [[True, True], [False]])
+      self.assertEqual((a | False).eval().tolist(), [[True, True], [False]])
+      self.assertEqual((False | b).eval().tolist(), [[True, False], [False]])
+
+      self.assertEqual((a ^ b).eval().tolist(), [[False, True], [False]])
+      self.assertEqual((a ^ True).eval().tolist(), [[False, False], [True]])
+      self.assertEqual((True ^ b).eval().tolist(), [[False, True], [True]])
+
+  def testDummyOperators(self):
+    a = ragged.constant([[True, True], [False]])
+    with self.assertRaisesRegexp(TypeError,
+                                 'RaggedTensor may not be used as a boolean.'):
+      bool(a)
+    with self.assertRaisesRegexp(TypeError,
+                                 'RaggedTensor may not be used as a boolean.'):
+      if a:
+        pass
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_range_op_test.py b/tensorflow/python/ops/ragged/ragged_range_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c6a6fb75c8a85f7d10f4f3e501f2f53f28a48e5
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_range_op_test.py
@@ -0,0 +1,124 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_range op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedRangeOpTest(test_util.TensorFlowTestCase):
+
+  def testDocStringExamples(self):
+    """Examples from ragged_range.__doc__."""
+    with self.test_session():
+      rt1 = ragged.range([3, 5, 2]).eval().tolist()
+      self.assertEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
+
+      rt2 = ragged.range([0, 5, 8], [3, 3, 12]).eval().tolist()
+      self.assertEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
+
+      rt3 = ragged.range([0, 5, 8], [3, 3, 12], 2).eval().tolist()
+      self.assertEqual(rt3, [[0, 2], [], [8, 10]])
+
+  def testBasicRanges(self):
+    with self.test_session():
+      # Specify limits only.
+      self.assertEqual(
+          ragged.range([0, 3, 5]).eval().tolist(),
+          [list(range(0)), list(range(3)), list(range(5))])
+
+      # Specify starts and limits.
+      self.assertEqual(
+          ragged.range([0, 3, 5], [2, 3, 10]).eval().tolist(),
+          [list(range(0, 2)), list(range(3, 3)), list(range(5, 10))])
+
+      # Specify starts, limits, and deltas.
+      self.assertEqual(
+          ragged.range([0, 3, 5], [4, 4, 15], [2, 3, 4]).eval().tolist(),
+          [list(range(0, 4, 2)), list(range(3, 4, 3)),
+           list(range(5, 15, 4))])
+
+  def testFloatRanges(self):
+    with self.test_session():
+      expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
+                  [5.0, 7.2, 9.4, 11.6, 13.8]]
+      actual = ragged.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0],
+                            [0.4, 1.5, 2.2]).eval().tolist()
+      self.assertEqual(expected, [[round(v, 5) for v in row] for row in actual])
+
+  def testNegativeDeltas(self):
+    with self.test_session():
+      self.assertEqual(
+          ragged.range([0, 3, 5], limits=0, deltas=-1).eval().tolist(),
+          [list(range(0, 0, -1)), list(range(3, 0, -1)),
+           list(range(5, 0, -1))])
+
+      self.assertEqual(
+          ragged.range([0, -3, 5], limits=0, deltas=[-1, 1,
+                                                     -2]).eval().tolist(),
+          [list(range(0, 0, -1)), list(range(-3, 0, 1)),
+           list(range(5, 0, -2))])
+
+  def testBroadcast(self):
+    with self.test_session():
+      # Specify starts and limits, broadcast deltas.
+      self.assertEqual(
+          ragged.range([0, 3, 5], [4, 4, 15], 3).eval().tolist(),
+          [list(range(0, 4, 3)), list(range(3, 4, 3)),
+           list(range(5, 15, 3))])
+
+      # Broadcast all arguments.
+      self.assertEqual(
+          ragged.range(0, 5, 1).eval().tolist(), [list(range(0, 5, 1))])
+
+  def testEmptyRanges(self):
+    rt1 = ragged.range([0, 5, 3], [0, 3, 5])
+    rt2 = ragged.range([0, 5, 5], [0, 3, 5], -1)
+    with self.test_session():
+      self.assertEqual(rt1.eval().tolist(), [[], [], [3, 4]])
+      self.assertEqual(rt2.eval().tolist(), [[], [5, 4], []])
+
+  def testShapeFnErrors(self):
+    with self.test_session():
+      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
+                              ragged.range, [[0]], 5)
+      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
+                              ragged.range, 0, [[5]])
+      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
+                              ragged.range, 0, 5, [[0]])
+      self.assertRaisesRegexp(ValueError, r'Dimensions must be equal.*',
+                              ragged.range, [0], [1, 2])
+
+  def testKernelErrors(self):
+    with self.test_session():
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              r'Requires delta != 0',
+                              ragged.range(0, 0, 0).eval)
+
+  def testShape(self):
+    self.assertEqual(ragged.range(0, 0, 0).shape.as_list(), [1, None])
+    self.assertEqual(ragged.range([1, 2, 3]).shape.as_list(), [3, None])
+    self.assertEqual(
+        ragged.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..93176c738df0d9ae2d6287838b97756e4eda2eb3
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
@@ -0,0 +1,343 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.reduce_<AGGREGATE> ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+_MAX_INT32 = dtypes.int32.max
+_MIN_INT32 = dtypes.int32.min
+_NAN = np.nan
+
+
+def mean(*values):
+  return 1.0 * sum(values) / len(values)
+
+
+class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      #=========================================================================
+      # Docstring examples.  RaggedTensor for testing is:
+      #   [[3, 1, 4],
+      #    [1, 5,  ],
+      #    [9,     ],
+      #    [2, 6   ]]
+      #=========================================================================
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=-2,
+          expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=-1,
+          expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[54, 30, 4]  # = [3*1*9*2, 1*5*6, 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[12, 5, 9, 12]  # = [3*1*4, 1*5, 9, 2*6]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[1, 1, 4]  # = [min(3, 1, 9, 2), min(1, 5, 6), 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[1, 1, 9, 2]  # = [min(3, 1, 4), min(1, 5), 9, min(2, 6)]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[9, 6, 4]  # = [max(3, 1, 9, 2), max(1, 5, 6), 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[4, 5, 9, 6]  # = [max(3, 1, 4), max(1, 5), 9, max(2, 6)]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[3.75, 4, 4]  # = [mean(3, 1, 9, 2), mean(1, 5, 6), 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_any,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=0,
+          expected=[True, True, False, True]),
+      dict(
+          ragged_reduce_op=ragged.reduce_any,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=1,
+          expected=[True, True, True]),
+      dict(
+          ragged_reduce_op=ragged.reduce_all,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=0,
+          expected=[False, True, False, True]),
+      dict(
+          ragged_reduce_op=ragged.reduce_all,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=1,
+          expected=[True, False, False]),
+
+      #=========================================================================
+      # Examples with the following RaggedTensor (ragged_rank=1):
+      #   [[0, 1, 2, 3],
+      #    [4         ],
+      #    [          ],
+      #    [5, 6      ],
+      #    [7         ],
+      #    [8, 9      ]]
+      #=========================================================================
+
+      # axis=None
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=0 * 1 * 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=min(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=max(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=mean(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
+      # axis=0
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[0 + 4 + 5 + 7 + 8, 1 + 6 + 9, 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[0 * 4 * 5 * 7 * 8, 1 * 6 * 9, 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[min(0, 4, 5, 7, 8), min(1, 6, 9), 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[max(0, 4, 5, 7, 8), max(1, 6, 9), 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[mean(0, 4, 5, 7, 8),
+                    mean(1, 6, 9), 2, 3]),
+      # axis=1
+      # Note: we don't test mean here because it gives a NaN, and this will
+      # cause assertEqual to fail (since NaN != NaN).  See testMeanNan().
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[0 * 1 * 2 * 3, 4, 1, 5 * 6, 7, 8 * 9]),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[min(0, 1, 2, 3), 4, _MAX_INT32,
+                    min(5, 6), 7,
+                    min(8, 9)]),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[max(0, 1, 2, 3), 4, _MIN_INT32,
+                    max(5, 6), 7,
+                    max(8, 9)]),
+
+      #=========================================================================
+      # Examples with ragged_rank=2:
+      # [[[1, 2], [ ], [3, 4, 5]],
+      #  [[6, 7], [ ], [8      ]],
+      #  [                      ],
+      #  [[9   ]                ]]
+      #=========================================================================
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[],
+          expected=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=None,
+          expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=0,
+          expected=[[1 + 6 + 9, 2 + 7], [], [3 + 8, 4, 5]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=1,
+          expected=[[1 + 3, 2 + 4, 5], [6 + 8, 7], [], [9]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=2,
+          expected=[[1 + 2, 0, 3 + 4 + 5], [6 + 7, 0, 8], [], [9]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[0, 1],
+          expected=[1 + 3 + 6 + 8 + 9, 2 + 4 + 7, 5]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[0, 2],
+          expected=[1 + 6 + 9 + 2 + 7, 0, 3 + 8 + 4 + 5]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[1, 2],
+          expected=[1 + 2 + 3 + 4 + 5, 6 + 7 + 8, 0, 9]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[0, 1, 2],
+          expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
+
+      #=========================================================================
+      # Examples for ragged_reduce_mean ragged_rank=2:
+      # [[[1, 2], [3, 4, 5]],
+      #  [[6, 7], [8      ]],
+      #  [[9   ]          ]]
+      #=========================================================================
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
+          axis=0,
+          expected=[[mean(1, 6, 9), mean(2, 7)], [mean(3, 8), 4, 5]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
+          axis=1,
+          expected=[[mean(1, 3), mean(2, 4), 5], [mean(6, 8), 7], [9]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
+          axis=2,
+          expected=[[mean(1, 2), mean(3, 4, 5)], [mean(6, 7), 8], [9]]),
+  )
+  def testReduce(self, ragged_reduce_op, rt_input, axis, expected):
+    rt_input = ragged.constant(rt_input)
+    reduced = ragged_reduce_op(rt_input, axis)
+    with self.test_session():
+      self.assertEqual(reduced.eval().tolist(), expected)
+
+  def assertEqualWithNan(self, actual, expected):
+    """Like assertEqual, but NaN==NaN."""
+    self.assertTrue(
+        ((actual == expected) | (np.isnan(actual) & np.isnan(expected))).all())
+
+  def testMeanNan(self):
+    rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
+    expected = (
+        np.array([0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]) / np.array(
+            [4, 1, 0, 2, 1, 2]))
+    rt_input = ragged.constant(rt_as_list)
+    reduced = ragged.reduce_mean(rt_input, axis=1)
+    with self.test_session():
+      self.assertEqualWithNan(reduced.eval(), expected)
+
+  def testMeanWithTensorInputs(self):
+    tensor = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
+    expected = [2.0, 20.0]
+    reduced = ragged.reduce_mean(tensor, axis=1)
+    with self.test_session():
+      self.assertAllEqual(reduced.eval(), expected)
+
+  def testErrors(self):
+    rt_input = ragged.constant([[1, 2, 3], [4, 5]])
+    axis = array_ops.placeholder_with_default(constant_op.constant([0]), None)
+    self.assertRaisesRegexp(ValueError,
+                            r'axis must be known at graph construction time.',
+                            ragged.reduce_sum, rt_input, axis)
+    self.assertRaisesRegexp(TypeError,
+                            r'axis must be an int; got str.*',
+                            ragged.reduce_sum, rt_input, ['x'])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5a0a5d11c92cccef54f27fdeaf36608a61980c
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
@@ -0,0 +1,184 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.row_lengths."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Docstring Example
+      dict(
+          rt_input=[[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []],
+          expected=[2, 0, 2, 1, 0]),
+      dict(
+          rt_input=[[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []],
+          axis=2,
+          expected=[[3, 1], [], [2, 1], [1], []]),
+
+      # 1D tensor
+      dict(
+          rt_input=[1, 2, 3, 4, 5],
+          ragged_rank=0,
+          axis=0,
+          expected=5),
+
+      # 2D Tensor (0 ragged dimensions)
+      dict(
+          rt_input=[[1, 2], [3, 4], [5, 6], [7, 8]],
+          ragged_rank=0,
+          expected=[2, 2, 2, 2]),
+      dict(
+          rt_input=[[1, 2], [3, 4], [5, 6], [7, 8]],
+          ragged_rank=0,
+          axis=0,
+          expected=4),
+
+      # 2D Tensor (1 ragged dimension)
+      dict(
+          rt_input=[['a'], ['b', 'c', 'd'], ['e'], [], ['f']],
+          expected=[1, 3, 1, 0, 1]),
+      dict(
+          rt_input=[['a'], ['b', 'c', 'd'], ['e'], [], ['f']],
+          axis=0,
+          expected=5),
+      dict(
+          rt_input=[['a', 'b', 'c', 'd', 'e', 'f', 'g']],
+          expected=[7]),
+      dict(
+          rt_input=[[], ['a', 'b', 'c', 'd', 'e', 'f', 'g'], []],
+          expected=[0, 7, 0]),
+      dict(
+          rt_input=[],
+          ragged_rank=1,
+          expected=[]),
+      dict(
+          rt_input=[],
+          ragged_rank=1,
+          axis=0,
+          expected=0),
+
+      # 3D Tensor (0 ragged dimensions)
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
+          ragged_rank=0,
+          axis=0,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
+          ragged_rank=0,
+          axis=1,
+          expected=[3, 3]),
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
+          ragged_rank=0,
+          axis=2,
+          expected=[[2, 2, 2], [2, 2, 2]],
+          expected_ragged_rank=0),
+
+      # 3D Tensor (1 ragged dimension)
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
+          ragged_rank=1,
+          axis=0,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
+          ragged_rank=1,
+          axis=1,
+          expected=[3, 2]),
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
+          ragged_rank=1,
+          axis=2,
+          expected=[[2, 2, 2], [2, 2]],
+          expected_ragged_rank=1),
+
+      # 3D Tensor (2 ragged dimensions)
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=0,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=-3,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=1,
+          expected=[3, 2]),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=-2,
+          expected=[3, 2]),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=2,
+          expected=[[2, 3, 0], [4, 1]],
+          expected_ragged_rank=1),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=-1,
+          expected=[[2, 3, 0], [4, 1]],
+          expected_ragged_rank=1),
+  ])  # pyformat: disable
+  def testRowLengths(self,
+                     rt_input,
+                     expected,
+                     axis=1,
+                     ragged_rank=None,
+                     expected_ragged_rank=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    lengths = ragged.row_lengths(rt, axis)
+    with self.test_session():
+      self.assertEqual(lengths.eval().tolist(), expected)
+      if expected_ragged_rank is not None:
+        if isinstance(lengths, ragged.RaggedTensor):
+          self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
+        else:
+          self.assertEqual(0, expected_ragged_rank)
+
+  @parameterized.parameters([
+      dict(
+          rt_input=10,
+          exception=ValueError,
+          message='rt_input may not be a scalar.'),
+      dict(
+          rt_input=[10, 20],
+          axis=1,
+          exception=ValueError,
+          message='axis=1 out of bounds: expected -1<=axis<1.'),
+      dict(
+          rt_input=[[2, 3, 0], [4, 1, 2]],
+          axis=-3,
+          exception=ValueError,
+          message='axis=-3 out of bounds: expected -2<=axis<2.'),
+  ])
+  def testErrors(self, rt_input, exception, message, axis=1):
+    with self.assertRaisesRegexp(exception, message):
+      ragged.row_lengths(rt_input, axis)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f246bf35524084c958f66caecceae3547012ee9a
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
@@ -0,0 +1,56 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ragged.row_splits_to_segment_ids() op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
+
+  def testDocStringExample(self):
+    splits = [0, 3, 3, 5, 6, 9]
+    expected = [0, 0, 0, 2, 2, 3, 4, 4, 4]
+    segment_ids = ragged.row_splits_to_segment_ids(splits)
+    with self.test_session():
+      self.assertEqual(segment_ids.eval().tolist(), expected)
+
+  def testEmptySplits(self):
+    # Note: the splits for an empty ragged tensor contains a single zero.
+    segment_ids = ragged.row_splits_to_segment_ids([0])
+    with self.test_session():
+      self.assertEqual(segment_ids.eval().tolist(), [])
+
+  def testErrors(self):
+    self.assertRaisesRegexp(ValueError, r'Invalid row_splits: \[\]',
+                            ragged.row_splits_to_segment_ids, [])
+    self.assertRaisesRegexp(
+        ValueError, r'Tensor conversion requested dtype int64 for '
+        'Tensor with dtype float32', ragged.row_splits_to_segment_ids,
+        constant_op.constant([0.5]))
+    self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
+                            ragged.row_splits_to_segment_ids, 0)
+    self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
+                            ragged.row_splits_to_segment_ids, [[0]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa7adf66b0bbd5ae8091c7b7f47bfaae56d9d266
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
@@ -0,0 +1,74 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ragged.segment_ids_to_row_splits() op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
+
+  def testDocStringExample(self):
+    segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
+    expected = [0, 3, 3, 5, 6, 9]
+    splits = ragged.segment_ids_to_row_splits(segment_ids)
+    with self.test_session():
+      self.assertEqual(splits.eval().tolist(), expected)
+
+  def testEmptySegmentIds(self):
+    # Note: the splits for an empty ragged tensor contains a single zero.
+    segment_ids = ragged.segment_ids_to_row_splits([])
+    with self.test_session():
+      self.assertEqual(segment_ids.eval().tolist(), [0])
+
+  def testErrors(self):
+    self.assertRaisesRegexp(TypeError,
+                            r'segment_ids must be an integer tensor.*',
+                            ragged.segment_ids_to_row_splits,
+                            constant_op.constant([0.5]))
+    self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
+                            ragged.segment_ids_to_row_splits, 0)
+    self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
+                            ragged.segment_ids_to_row_splits, [[0]])
+
+  def testNumSegments(self):
+    segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
+    num_segments = 7
+    expected = [0, 3, 3, 5, 6, 9, 9, 9]
+    splits = ragged.segment_ids_to_row_splits(segment_ids, num_segments)
+    with self.test_session():
+      self.assertEqual(splits.eval().tolist(), expected)
+
+  def testUnsortedSegmentIds(self):
+    # Segment ids are not required to be sorted.
+    segment_ids = [0, 4, 3, 2, 4, 4, 2, 0, 0]
+    splits1 = ragged.segment_ids_to_row_splits(segment_ids)
+    expected1 = [0, 3, 3, 5, 6, 9]
+
+    splits2 = ragged.segment_ids_to_row_splits(segment_ids, 7)
+    expected2 = [0, 3, 3, 5, 6, 9, 9, 9]
+    with self.test_session():
+      self.assertEqual(splits1.eval().tolist(), expected1)
+      self.assertEqual(splits2.eval().tolist(), expected2)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_segment_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..373a332f135a63b4ec2b4c738497ba2f322287b4
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
@@ -0,0 +1,236 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_range op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+def prod(values):
+  val = 1
+  for v in values:
+    val *= v
+  return val
+  # return reduce(lambda x, y: x * y, values, 1)
+
+
+def mean(values):
+  return 1.0 * sum(values) / len(values)
+
+
+def sqrt_n(values):
+  return 1.0 * sum(values) / math.sqrt(len(values))
+
+
+class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
+                           parameterized.TestCase):
+
+  def assertNestedListAmostEqual(self, lhs, rhs, places=7, context='value'):
+    self.assertEqual(type(lhs), type(rhs))
+    if isinstance(lhs, (list, tuple)):
+      self.assertEqual(len(lhs), len(rhs), 'Length differs for %s' % context)
+      for i in range(len(lhs)):
+        self.assertNestedListAmostEqual(lhs[i], rhs[i], places,
+                                        '%s[%s]' % (context, i))
+    else:
+      self.assertAlmostEqual(
+          lhs, rhs, places,
+          '%s != %s within %s places at %s' % (lhs, rhs, places, context))
+
+  def expected_value(self, data, segment_ids, num_segments, combiner):
+    """Find the expected value for a call to ragged_segment_<aggregate>.
+
+    Args:
+      data: The input RaggedTensor, expressed as a nested python list.
+      segment_ids: The segment ids, as a python list of ints.
+      num_segments: The number of segments, as a python int.
+      combiner: The Python function used to combine values.
+    Returns:
+      The expected value, as a nested Python list.
+    """
+    self.assertEqual(len(data), len(segment_ids))
+
+    # Build an empty (num_segments x ncols) "grouped" matrix
+    ncols = max(len(row) for row in data)
+    grouped = [[[] for _ in range(ncols)] for row in range(num_segments)]
+
+    # Append values from data[row] to grouped[segment_ids[row]]
+    for row in range(len(data)):
+      for col in range(len(data[row])):
+        grouped[segment_ids[row]][col].append(data[row][col])
+
+    # Combine the values.
+    return [[combiner(values)
+             for values in grouped_row
+             if values]
+            for grouped_row in grouped]
+
+  @parameterized.parameters(
+      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+  )
+  def testRaggedSegment_Int(self, segment_op, combiner, segment_ids):
+    rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
+    rt = ragged.constant(rt_as_list)
+    num_segments = max(segment_ids) + 1
+    expected = self.expected_value(rt_as_list, segment_ids, num_segments,
+                                   combiner)
+
+    segmented = segment_op(rt, segment_ids, num_segments)
+    with self.test_session():
+      self.assertListEqual(segmented.eval().tolist(), expected)
+
+  @parameterized.parameters(
+      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_sqrt_n, sqrt_n, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 10, 10, 10]),
+  )
+  def testRaggedSegment_Float(self, segment_op, combiner, segment_ids):
+    rt_as_list = [[0., 1., 2., 3.], [4.], [], [5., 6.], [7.], [8., 9.]]
+    rt = ragged.constant(rt_as_list)
+    num_segments = max(segment_ids) + 1
+    expected = self.expected_value(rt_as_list, segment_ids, num_segments,
+                                   combiner)
+
+    segmented = segment_op(rt, segment_ids, num_segments)
+    with self.test_session():
+      self.assertNestedListAmostEqual(
+          self.evaluate(segmented).tolist(), expected, places=5)
+
+  def testRaggedRankTwo(self):
+    rt = ragged.constant([
+        [[111, 112, 113, 114], [121],],  # row 0
+        [],                              # row 1
+        [[], [321, 322], [331]],         # row 2
+        [[411, 412]]                     # row 3
+    ])  # pyformat: disable
+    segment_ids1 = [0, 2, 2, 2]
+    segmented1 = ragged.segment_sum(rt, segment_ids1, 3)
+    expected1 = [[[111, 112, 113, 114], [121]],     # row 0
+                 [],                                # row 1
+                 [[411, 412], [321, 322], [331]]    # row 2
+                ]  # pyformat: disable
+    with self.test_session():
+      self.assertEqual(segmented1.eval().tolist(), expected1)
+
+    segment_ids2 = [1, 2, 1, 1]
+    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
+    expected2 = [[],
+                 [[111+411, 112+412, 113, 114], [121+321, 322], [331]],
+                 []]  # pyformat: disable
+    with self.test_session():
+      self.assertEqual(segmented2.eval().tolist(), expected2)
+
+  def testRaggedSegmentIds(self):
+    rt = ragged.constant([
+        [[111, 112, 113, 114], [121],],  # row 0
+        [],                              # row 1
+        [[], [321, 322], [331]],         # row 2
+        [[411, 412]]                     # row 3
+    ])  # pyformat: disable
+    segment_ids = ragged.constant([[1, 2], [], [1, 1, 2], [2]])
+    segmented = ragged.segment_sum(rt, segment_ids, 3)
+    expected = [[],
+                [111+321, 112+322, 113, 114],
+                [121+331+411, 412]]  # pyformat: disable
+    with self.test_session():
+      self.assertEqual(segmented.eval().tolist(), expected)
+
+  def testShapeMismatchError1(self):
+    dt = constant_op.constant([1, 2, 3, 4, 5, 6])
+    segment_ids = ragged.constant([[1, 2], []])
+    self.assertRaisesRegexp(
+        ValueError, 'segment_ids.shape must be a prefix of data.shape, '
+        'but segment_ids is ragged and data is not.', ragged.segment_sum, dt,
+        segment_ids, 3)
+
+  def testShapeMismatchError2(self):
+    rt = ragged.constant([
+        [[111, 112, 113, 114], [121]],  # row 0
+        [],                             # row 1
+        [[], [321, 322], [331]],        # row 2
+        [[411, 412]]                    # row 3
+    ])  # pyformat: disable
+    segment_ids = ragged.constant([[1, 2], [1], [1, 1, 2], [2]])
+
+    # Error is raised at graph-building time if we can detect it then.
+    self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        'segment_ids.shape must be a prefix of data.shape.*',
+        ragged.segment_sum, rt, segment_ids, 3)
+
+    # Otherwise, error is raised when we run the graph.
+    segment_ids2 = ragged.from_row_splits(
+        array_ops.placeholder_with_default(segment_ids.values, None),
+        array_ops.placeholder_with_default(segment_ids.row_splits, None))
+    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
+    with self.test_session():
+      self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          'segment_ids.shape must be a prefix of data.shape.*', segmented2.eval)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d474a749f049b24543e6d0406479fead5f44a908
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -0,0 +1,330 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.stack."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']], [[b'b00'],
+                                                               [b'b10']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=1,
+          expected=[
+              [[b'a00', b'a01'], [b'b00']],
+              [[], [b'b10', b'b11', b'b12']],
+              [[b'a20', b'a21', b'a22'], [b'b20']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00', 'b01'], [], ['b20', 'b21', 'b22']]),  # shape=(3, None)
+          axis=2,
+          expected=[
+              [[b'a00', b'b00'], [b'a01', b'b01']], [],
+              [[b'a20', b'b20'], [b'a21', b'b21'], [b'a22', b'b22']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-3',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=-3,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']], [[b'b00'],
+                                                               [b'b10']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=-2,
+          expected=[
+              [[b'a00', b'a01'], [b'b00']],
+              [[], [b'b10', b'b11', b'b12']],
+              [[b'a20', b'a21', b'a22'], [b'b20']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00', 'b01'], [], ['b20', 'b21', 'b22']]),  # shape=(3, None)
+          axis=-1,
+          expected=[
+              [[b'a00', b'b00'], [b'a01', b'b01']], [],
+              [[b'a20', b'b20'], [b'a21', b'b21'], [b'a22', b'b22']]]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10']],                            # shape=(2, None)
+              [['c00'], ['c10', 'c11'], ['c21']]),           # shape=(3, None)
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21', b'a22']],
+                    [[b'b00'], [b'b10']],
+                    [[b'c00'], [b'c10', b'c11'], [b'c21']]]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],     # shape=(3, None)
+              [[], ['c10', 'c11'], ['c20', 'c21']]),         # shape=(3, None)
+          axis=1,
+          expected=[
+              [[b'a00', b'a01'], [b'b00'], []],
+              [[], [b'b10', b'b11', b'b12'], [b'c10', b'c11']],
+              [[b'a20', b'a21', b'a22'], [b'b20'], [b'c20', b'c21']]],
+          expected_shape=[3, None, None]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00', 'b01'], [], ['b20', 'b21', 'b22']],   # shape=(3, None)
+              [['c00', 'c01'], [], ['c20', 'c21', 'c22']]),  # shape=(3, None)
+          axis=2,
+          expected=[
+              [[b'a00', b'b00', b'c00'], [b'a01', b'b01', b'c01']], [],
+              [[b'a20', b'b20', b'c20'], [b'a21', b'b21', b'c21'],
+               [b'a22', b'b22', b'c22']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=0',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [['c100', 'c101', 'c102', 'c103']], [[], ['c210', 'c211']]]),
+          axis=0,
+          expected=[
+              [[[b'a000', b'a001'], [b'a010']],
+               [[b'a100', b'a101', b'a102'], [b'a110', b'a111']]],
+              [[[b'b000']],
+               [[b'b100', b'b101'], [b'b110']]],
+              [[],
+               [[b'c100', b'c101', b'c102', b'c103']],
+               [[], [b'c210', b'c211']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [[], ['c110', 'c111']]]),
+          axis=1,
+          expected=[
+              [[[b'a000', b'a001'], [b'a010']], [[b'b000']], []],
+              [[[b'a100', b'a101', b'a102'], [b'a110', b'a111']],
+               [[b'b100', b'b101'], [b'b110']],
+               [[], [b'c110', b'c111']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=2',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=2,
+          expected=[
+              [[[b'a000', b'a001'], [], [b'c000']],
+               [[b'a010'], [b'b010', b'b011'], [b'c010']]],
+              [[[b'a100', b'a101', b'a102'], [b'b100', b'b101'], []],
+               [[b'a110', b'a111'], [b'b110'], [b'c110', b'c111']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=3',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']]],
+              [[['b000', 'b001'], ['b010']]],
+              [[['c000', 'c001'], ['c010']]]),
+          axis=3,
+          expected=[[
+              [[b'a000', b'b000', b'c000'], [b'a001', b'b001', b'c001']],
+              [[b'a010', b'b010', b'c010']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=-2',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=-2,
+          expected=[
+              [[[b'a000', b'a001'], [], [b'c000']],
+               [[b'a010'], [b'b010', b'b011'], [b'c010']]],
+              [[[b'a100', b'a101', b'a102'], [b'b100', b'b101'], []],
+               [[b'a110', b'a111'], [b'b110'], [b'c110', b'c111']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=-1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']]],
+              [[['b000', 'b001'], ['b010']]],
+              [[['c000', 'c001'], ['c010']]]),
+          axis=-1,
+          expected=[[
+              [[b'a000', b'b000', b'c000'], [b'a001', b'b001', b'c001']],
+              [[b'a010', b'b010', b'c010']]]]),
+      dict(
+          descr='ragged_stack([uniform, ragged, uniform], axis=1)',
+          ragged_ranks=[0, 1, 0],
+          rt_inputs=(
+              [['0('], ['1('], ['2(']],                   # shape=(3, 1)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],  # shape=(3, None)
+              [[')0'], [')1'], [')2']]),                  # shape=(3, 1)
+          axis=1,
+          expected=[
+              [[b'0('], [b'b00'], [b')0']],
+              [[b'1('], [b'b10', b'b11', b'b12'], [b')1']],
+              [[b'2('], [b'b20'], [b')2']]]),
+      dict(
+          descr='ragged_stack([uniform, uniform], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [[b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21']],
+              [[b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']]]),
+      dict(
+          descr='ragged_stack([uniform, ragged], axis=0)',
+          ragged_ranks=[0, 1],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [[b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21']],
+              [[b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']]]),
+      dict(
+          descr='ragged_stack([uniform, ragged], axis=0) with rank-3 inputs',
+          ragged_ranks=[0, 2],
+          rt_inputs=(
+              [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],  # shape = (2, 2, 2)
+              [[[8], [8, 8]]]),                      # shape = (2, None, None)
+          axis=0,
+          expected=[[[[0, 1], [2, 3]], [[4, 5], [6, 7]]], [[[8], [8, 8]]]]),
+      dict(
+          descr='Two rank-3 inputs with ragged_rank=1, axis=-1',
+          ragged_ranks=[1, 1],
+          rt_inputs=(
+              [[[0, 1], [2, 3], [4, 5]], [], [[6, 7], [8, 9]]],
+              [[[9, 8], [7, 6], [5, 4]], [], [[3, 2], [1, 0]]]),
+          axis=-1,
+          expected=[
+              [[[0, 9], [1, 8]], [[2, 7], [3, 6]], [[4, 5], [5, 4]]],
+              [],
+              [[[6, 3], [7, 2]], [[8, 1], [9, 0]]]],
+          expected_shape=[3, None, 2, 2]),
+      dict(
+          descr='Two rank-3 inputs with ragged_rank=1, axis=-2',
+          ragged_ranks=[1, 1],
+          rt_inputs=(
+              [[[0, 1], [2, 3], [4, 5]], [], [[6, 7], [8, 9]]],
+              [[[9, 8], [7, 6], [5, 4]], [], [[3, 2], [1, 0]]]),
+          axis=-2,
+          expected=[
+              [[[0, 1], [9, 8]], [[2, 3], [7, 6]], [[4, 5], [5, 4]]], [],
+              [[[6, 7], [3, 2]], [[8, 9], [1, 0]]]]),
+      dict(
+          descr='ragged_stack([vector, vector], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=([1, 2, 3], [4, 5, 6]),
+          axis=0,
+          expected=[[1, 2, 3], [4, 5, 6]]),
+      dict(
+          descr='One input (so just adds an outer dimension)',
+          rt_inputs=([['a00', 'a01'], [], ['a20', 'a21']],),
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
+  )   # pyformat: disable
+  def testRaggedStack(self,
+                      descr,
+                      rt_inputs,
+                      axis,
+                      expected,
+                      ragged_ranks=None,
+                      expected_ragged_rank=None,
+                      expected_shape=None):
+    if ragged_ranks is None:
+      ragged_ranks = [None] * len(rt_inputs)
+    rt_inputs = [
+        ragged.constant(rt_input, ragged_rank=rrank)
+        if rrank != 0 else constant_op.constant(rt_input)
+        for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
+    ]
+    stacked = ragged.stack(rt_inputs, axis)
+    if expected_ragged_rank is not None:
+      self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
+    if expected_shape is not None:
+      self.assertEqual(stacked.shape.as_list(), expected_shape)
+    with self.test_session():
+      self.assertEqual(stacked.eval().tolist(), expected)
+
+  @parameterized.parameters(
+      dict(
+          rt_inputs=(),
+          axis=0,
+          error=ValueError,
+          message=r'rt_inputs may not be empty\.'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=r'foo',
+          error=TypeError,
+          message='axis must be an int'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=-4,
+          error=ValueError,
+          message='axis=-4 out of bounds: expected -3<=axis<3'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=3,
+          error=ValueError,
+          message='axis=3 out of bounds: expected -3<=axis<3'),
+  )
+  def testError(self, rt_inputs, axis, error, message):
+    self.assertRaisesRegexp(error, message, ragged.stack, rt_inputs, axis)
+
+  def testSingleTensorInput(self):
+    """Tests ragged_stack with a single tensor input.
+
+    Usually, we pass a list of values in for rt_inputs.  However, you can
+    also pass in a single value (as with tf.stack), in which case it is
+    equivalent to expand_dims(axis=0).  This test exercises that path.
+    """
+    rt_inputs = ragged.constant([[1, 2], [3, 4]])
+    stacked = ragged.stack(rt_inputs, 0)
+    with self.test_session():
+      self.assertEqual(stacked.eval().tolist(), [[[1, 2], [3, 4]]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..abb27fc3c0812fc5eec0bcd078c916c23e815d19
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -0,0 +1,664 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for storing ragged tensors and their values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.ragged import ragged_tensor_value
+
+# pylint: disable=protected-access
+_eval_using_default_session = ops._eval_using_default_session
+
+# pylint: enable=protected-access
+
+#===============================================================================
+# RaggedTensor
+#===============================================================================
+
+
+class RaggedTensor(object):
+  """Represents a ragged tensor (go/ragged).
+
+  A `RaggedTensor` is a tensor with one or more *ragged dimensions*, which are
+  dimensions whose slices may have different lengths.  For example, the inner
+  (column) dimension of `rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged,
+  since the column slices (`rt[0, :]`, ..., `rt[4, :]`) have different lengths.
+  Dimensions whose slices all have the same length are called *uniform
+  dimensions*.  The outermost dimension of a `RaggedTensor` is always uniform,
+  since it consists of a single slice (and so there is no possibility for
+  differing slice lengths).
+
+  The total number of dimensions in a `RaggedTensor` is called its *rank*,
+  and the number of ragged dimensions in a `RaggedTensor` is called its
+  *ragged-rank*.  A `RaggedTensor`'s ragged-rank is fixed at graph creation
+  time: it can't depend on the runtime values of `Tensor`s, and can't vary
+  dynamically for different session runs.
+
+  ### Potentially Ragged Tensors
+
+  Many ops support both `Tensor`s and `RaggedTensor`s.  The term "potentially
+  ragged tensor" may be used to refer to a tensor that might be either a
+  `Tensor` or a `RaggedTensor`.  The ragged-rank of a `Tensor` is zero.
+
+  ### Documenting RaggedTensor Shapes
+
+  When documenting the shape of a RaggedTensor, ragged dimensions can be
+  indicated by enclosing them in parentheses.  For example, the shape of
+  a 3-D `RaggedTensor` that stores the fixed-size word embedding for each
+  word in a sentence, for each sentence in a batch, could be written as
+  `[num_sentences, (num_words), embedding_size]`.  The parentheses around
+  `(num_words)` indicate that that dimension is ragged, and that the length
+  of each element list in that dimension may vary for each item.
+
+  ### Component Tensors
+
+  Internally, a `RaggedTensor` consists of a concatenated list of values that
+  are partitioned into variable-length rows.  In particular, each `RaggedTensor`
+  consists of:
+
+    * A `values` tensor, which concatenates the variable-length rows into a
+      flattened list.  For example, the `values` tensor for
+      `[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is `[3, 1, 4, 1, 5, 9, 2, 6]`.
+
+    * A `row_splits` vector, which indicates how those flattened values are
+      divided into rows.  In particular, the values for row `rt[i]` are stored
+      in the slice `rt.values[rt.row_splits[i]:rt.row_splits[i+1]]`.
+
+  Example:
+
+  ```python
+  >>> rt = ragged.from_row_splits(values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...                             row_splits=[0, 4, 4, 7, 8, 8])
+  >>> rt.tolist()
+  [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+  ```
+
+  ### Alternative Row-Partitioning Schemes
+
+  In addition to `row_splits`, ragged tensors provide support for four other
+  row-partitioning schemes:
+
+    * `row_lengths`: a vector with shape `[nrows]`, which specifies the length
+      of each row.
+
+    * `value_rowids` and `nrows`: `value_rowids` is a vector with shape
+      `[nvals]`, corresponding one-to-one with `values`, which specifies
+      each value's row index.  In particular, the row `rt[row]` consists of the
+      values `rt.values[j]` where `value_rowids[j]==row`.  `nrows` is an
+      int64 scalar that specifies the number of rows in the `RaggedTensor`.
+      (`nrows` is used to indicate trailing empty rows.)
+
+    * `row_starts`: a vector with shape `[nrows]`, which specifies the start
+      offset of each row.  Equivalent to `row_splits[:-1]`.
+
+    * `row_limits`: a vector with shape `[nrows]`, which specifies the stop
+      offset of each row.  Equivalent to `row_splits[1:]`.
+
+  Example: The following ragged tensors are equivalent, and all represent the
+  nested list `[[3, 1, 4, 1], [], [5, 9, 2], [6], []]`.
+
+  ```python
+  >>> values = [3, 1, 4, 1, 5, 9, 2, 6]
+  >>> rt1 = ragged.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+  >>> rt2 = ragged.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+  >>> rt3 = ragged.from_value_rowids(values,
+  ...                                value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
+  ...                                nrows=5)
+  >>> rt4 = ragged.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+  >>> rt5 = ragged.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+  ```
+
+  ### Multiple Ragged Dimensions
+
+  `RaggedTensor`s with multiple ragged dimensions can be defined by using
+  a nested `RaggedTensor` for the `values` tensor.  Each nested `RaggedTensor`
+  adds a single ragged dimension.
+
+  ```python
+  >>> inner_rt = ragged.from_row_splits(  # =rt1 from above
+  ...     values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
+  >>> outer_rt = ragged.from_row_splits(
+  ...     values=inner_rt, row_splits=[0, 3, 3, 5])
+  >>> print outer_rt.tolist()
+  [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
+  >>> print outer_rt.ragged_rank
+  2
+  ```
+
+  The factory function `ragged.from_nested_row_splits` may be used to
+  construct a `RaggedTensor` with multiple ragged dimensions directly, by
+  providing a list of `row_splits` tensors:
+
+  ```python
+  >>> ragged.from_nested_row_splits(
+  ...     inner_values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8])).tolist()
+  [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
+  ```
+
+  ### Uniform Inner Dimensions
+
+  `RaggedTensor`s with uniform inner dimensions can be defined
+  by using a multidimensional `Tensor` for `values`.
+
+  ```python
+  >>> rt = ragged.from_row_splits(values=tf.ones([5, 3]), row_splits=[0, 2, 5])
+  >>> print rt.tolist()
+  [[[1, 1, 1], [1, 1, 1]],
+   [[1, 1, 1], [1, 1, 1], [1, 1, 1]]]
+   >>> print rt.shape.as_list()
+   [2, None, 3]
+  ```
+
+  ### RaggedTensor Shape Restrictions
+
+  The shape of a RaggedTensor is currently restricted to have the following
+  form:
+
+    * A single uniform dimension
+    * Followed by one or more ragged dimensions
+    * Followed by zero or more uniform dimensions.
+
+  This restriction follows from the fact that each nested `RaggedTensor`
+  replaces the uniform outermost dimension of its `values` with a uniform
+  dimension followed by a ragged dimension.
+  """
+
+  #=============================================================================
+  # Implementation notes
+  #=============================================================================
+  # Currently, the RaggedTensor class uses a single row-partitioning scheme
+  # (row_splits).
+  #
+  # We are considering adding value_rowids+nvals as a secondary
+  # row-partitioning scheme.  This change would not impact the functional
+  # interface of the RaggedTensor class, but it would impact the efficiency
+  # of several operations.  In particular:
+  #
+  #   * The functions `ragged.value_rowids` and `ragged.nrows` would always
+  #     return pre-existing tensors; they would not need to add any ops to
+  #     the graph.
+  #
+  #   * The `RaggedTensor` constructor would construct all row-partitioning
+  #     tensors (row_splits, value_rowids, and nvals).  In eager mode, this
+  #     would mean that conversion operations would occur whenever a
+  #     `RaggedTensor` is constructed.  But in graph mode, the converted
+  #     row-partitioning tensors would only be evaluated if they are used.
+  #
+  # Since this change impacts efficiency but not functionality, we would like
+  # to perform additional profiling with real-world use cases before we
+  # decide whether to make this change.
+
+  #=============================================================================
+  # Constructor (private)
+  #=============================================================================
+  def __init__(self,
+               values,
+               row_splits,
+               cached_row_lengths=None,
+               cached_value_rowids=None,
+               cached_nrows=None,
+               internal=False):
+    """Creates a `RaggedTensor` with a specified partitioning for `values`.
+
+    This constructor is private -- please use one of the following ops to
+    build `RaggedTensor`s:
+
+      * [`ragged.from_row_lengths()`](from_row_lengths.md)
+      * [`ragged.from_value_rowids()`](from_value_rowids.md)
+      * [`ragged.from_row_splits()`](from_row_splits.md)
+      * [`ragged.from_row_starts()`](from_row_starts.md)
+      * [`ragged.from_row_limits()`](from_row_limits.md)
+      * [`ragged.from_nested_row_splits()`](from_nested_row_splits.md)
+      * [`ragged.from_nested_value_rowids()`](from_nested_value_rowids.md)
+
+    Args:
+      values: A potentially ragged tensor of any dtype and shape `[nvals, ...]`.
+      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.
+      cached_row_lengths: A 1-D int64 tensor with shape `[nrows]`
+      cached_value_rowids: A 1-D int64 tensor with shape `[nvals]`.
+      cached_nrows: A 1-D int64 scalar tensor.
+      internal: True if the constructor is being called by one of the factory
+        methods.  If false, an exception will be raised.
+
+    Raises:
+      TypeError: If a row partitioning tensor has an inappropriate dtype.
+      TypeError: If exactly one row partitioning argument was not specified.
+      ValueError: If a row partitioning tensor has an inappropriate shape.
+      ValueError: If multiple partitioning arguments are specified.
+      ValueError: If nrows is specified but value_rowids is not None.
+    """
+    if not internal:
+      raise ValueError("RaggedTensor constructor is private; please use one "
+                       "of the factory methods instead (e.g., "
+                       "ragged.from_row_lengths())")
+
+    # Validate the arguments.
+    if not isinstance(values, (RaggedTensor, ops.Tensor)):
+      raise TypeError("values must be a Tensor or RaggedTensor.")
+    if not isinstance(row_splits, ops.Tensor):
+      raise TypeError("Row-partitioning argument must be a Tensor.")
+    values.shape.with_rank_at_least(1)
+    row_splits.shape.assert_has_rank(1)
+
+    self._values = values
+    self._row_splits = row_splits
+
+    # Store any cached tensors.  These are used to avoid unnecessary
+    # round-trip conversions when a RaggedTensor is constructed from
+    # lengths or rowids, and we later want those lengths/rowids back.
+    for tensor in [cached_row_lengths, cached_value_rowids, cached_nrows]:
+      if tensor is not None and not isinstance(tensor, ops.Tensor):
+        raise TypeError("Cached value must be a Tensor or None.")
+    self._cached_row_lengths = cached_row_lengths
+    self._cached_value_rowids = cached_value_rowids
+    self._cached_nrows = cached_nrows
+
+  #=============================================================================
+  # Accessors
+  #=============================================================================
+
+  @property
+  def dtype(self):
+    """The `DType` of values in this tensor."""
+    return self._values.dtype
+
+  @property
+  def shape(self):
+    """The statically known shape of this ragged tensor.
+
+    Returns:
+      A `TensorShape` containing the statically known shape of this ragged
+      tensor.  Ragged dimensions have a size of `None`.
+
+    Examples:
+
+      ```python
+      >>> ragged.constant([[0], [1, 2]]).shape
+      TensorShape([Dimension(2), Dimension(None)])
+
+      >>> ragged.constant([[[0, 1]], [[1, 2], [3, 4]]], ragged_rank=1).shape
+      TensorShape([Dimension(2), Dimension(None), Dimension(2)
+      ```
+    """
+    nrows = tensor_shape.dimension_at_index(self._row_splits.shape, 0) - 1
+
+    values_shape = self._values.shape
+    value_shape = values_shape[1:]
+    return tensor_shape.TensorShape([nrows, None]).concatenate(value_shape)
+
+  @property
+  def ragged_rank(self):
+    """The number of ragged dimensions in this ragged tensor.
+
+    Returns:
+      A Python `int` indicating the number of ragged dimensions in this ragged
+      tensor.  The outermost dimension is not considered ragged.
+    """
+    values_is_ragged = isinstance(self._values, RaggedTensor)
+    return self._values.ragged_rank + 1 if values_is_ragged else 1
+
+  @property
+  def values(self):
+    """The concatenated rows for this ragged tensor.
+
+    `rt.values` is a potentially ragged tensor formed by flattening the two
+    outermost dimensions of `rt` into a single dimension.
+
+    `rt.values.shape = [nvals] + rt.shape[2:]` (where `nvals` is the
+    number of items in the outer two dimensions of `rt`).
+
+    `rt.ragged_rank = self.ragged_rank - 1`
+
+    Returns:
+      A potentially ragged tensor.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values.eval()
+      [3, 1, 4, 1, 5, 9, 2, 6]
+      ```
+    """
+    return self._values
+
+  @property
+  def row_splits(self):
+    """The row-split indices for this ragged tensor's `values`.
+
+    `rt.row_splits` specifies where the values for each row begin and end in
+    `rt.values`.  In particular, the values for row `rt[i]` are stored in
+    the slice `rt.values[rt.row_splits[i]:rt.row_splits[i+1]]`.
+
+    Returns:
+      A 1-D `int64` `Tensor` with shape `[self.nrows+1]`.
+      The returned tensor is non-empty, and is sorted in ascending order.
+      `self.row_splits[0]` is zero, and `self.row_splits[-1]` is equal to
+      `self.values.shape[0]`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values.eval()
+      [3, 1, 4, 1, 5, 9, 2, 6]
+      >>> rt.row_splits.eval()  # indices of row splits in ragged.values
+      [0, 4, 4, 7, 8, 8]
+      ```
+    """
+    return self._row_splits
+
+  @property
+  def inner_values(self):
+    """The innermost `values` tensor for this ragged tensor.
+
+    Concretely, if `rt.values` is a `Tensor`, then `rt.inner_values` is
+    `rt.values`; otherwise, `rt.inner_values` is `rt.values.inner_values`.
+
+    Conceptually, `inner_values` is the tensor formed by flattening the
+    outermost dimension and all of the ragged dimensions into a single
+    dimension.
+
+    `rt.inner_values.shape = [nvals] + rt.shape[rt.ragged_rank + 1:]`
+    (where `nvals` is the number of items in the flattened dimensions).
+
+    Returns:
+      A `Tensor`.
+
+    #### Example:
+
+      ```python
+      >>> rt = ragged.constant([[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+      >>> ragged.inner_values(rt).eval()
+      [3, 1, 4, 1, 5, 9, 2, 6]
+      ```
+    """
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensor):
+      rt_values = rt_values.values
+    return rt_values
+
+  @property
+  def nested_row_splits(self):
+    """A tuple containing the row_splits for all ragged dimensions.
+
+    `rt.nested_row_splits` is a tuple containing the `row_splits` tensors for
+    all ragged dimensions in `rt`, ordered from outermost to innermost.  In
+    particular, `rt.nested_row_splits = (rt.row_splits,) + value_splits` where:
+
+        * `value_splits = ()` if `rt.values` is a `Tensor`.
+        * `value_splits = rt.values.nested_row_splits` otherwise.
+
+    Returns:
+      A `tuple` of 1-D `int64` `Tensor`s.
+
+    #### Example:
+
+      ```python
+      >>> rt = ragged.constant([[[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]])
+      >>> for i, splits in enumerate(ragged.nested_row_splits(rt)):
+      ...   print('Splits for dimension %d: %s' % (i+1, splits.eval()))
+      Splits for dimension 1: [0, 1]
+      Splits for dimension 2: [0, 3, 3, 5]
+      Splits for dimension 3: [0, 4, 4, 7, 8, 8]
+      ```
+
+    """
+    rt_nested_splits = [self.row_splits]
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensor):
+      rt_nested_splits.append(rt_values.row_splits)
+      rt_values = rt_values.values
+    return tuple(rt_nested_splits)
+
+  @property
+  def cached_value_rowids(self):
+    """The row lengths for this `RaggedTensor`, or `None`.
+
+    Returns:
+      The `value_rowids` tensor that was used to construct this `RaggedTensor`
+      if it was constructed using
+      [`ragged.from_value_rowids`](from_value_rowids.md); or `None` otherwise.
+    """
+    return self._cached_value_rowids
+
+  @property
+  def cached_nrows(self):
+    """The row lengths for this `RaggedTensor`, or `None`.
+
+    Returns:
+      The `nrows` tensor that was used to construct this `RaggedTensor`
+      if it was constructed using
+      [`ragged.from_value_rowids`](from_value_rowids.md); or `None` otherwise.
+    """
+    return self._cached_nrows
+
+  @property
+  def cached_row_lengths(self):
+    """The row lengths for this `RaggedTensor`, or `None`.
+
+    Returns:
+      The `row_lengths` tensor that was used to construct this `RaggedTensor`
+      if it was constructed using
+      [`ragged.from_row_lengths`](from_row_lengths.md); or `None` otherwise.
+    """
+    return self._cached_row_lengths
+
+  #=============================================================================
+  # Transformation
+  #=============================================================================
+
+  def with_values(self, new_values):
+    """Returns a copy of `self` with `values` replaced by `new_value`.
+
+    Preserves cached row-partitioning tensors such as `self.cached_nrows` and
+    `self.cached_value_rowids` if they have values.
+
+    Args:
+      new_values: Potentially ragged tensor to use as the `values` for the
+        returned `RaggedTensor`.  Must have `rank > 0`, and must have the same
+        number of rows as `self.values`.
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = 1 + new_values.rank`.
+      `result.ragged_rank = 1 + new_values.ragged_rank`
+    """
+    new_values.shape.with_rank_at_least(1)
+    self.values.shape[0].assert_is_compatible_with(new_values.shape[0])
+    return RaggedTensor(
+        new_values,
+        self._row_splits,
+        self._cached_row_lengths,
+        self._cached_value_rowids,
+        self._cached_nrows,
+        internal=True)
+
+  def with_inner_values(self, new_values):
+    """Returns a copy of `self` with `inner_values` replaced by `new_value`.
+
+    Preserves cached row-partitioning tensors such as `self.cached_nrows` and
+    `self.cached_value_rowids` if they have values.
+
+    Args:
+      new_values: Potentially ragged tensor that should replace
+      `self.inner_values`.  Must have `rank > 0`, and must have the same
+      number of rows as `self.inner_values`.
+
+    Returns:
+      A `RaggedTensor`.
+      `result.rank = self.ragged_rank + new_values.rank`.
+      `result.ragged_rank = self.ragged_rank + new_values.ragged_rank`.
+    """
+    if isinstance(self._values, ops.Tensor):
+      return self.with_values(new_values)
+    else:
+      return self.with_values(self.values.with_inner_values(new_values))
+
+  #=============================================================================
+  # String Encoding
+  #=============================================================================
+  def __str__(self):
+    if self._is_eager():
+      return "RaggedTensor(%s)" % self.tolist()
+    else:
+      return self.__repr__()
+
+  def __repr__(self):
+    return "RaggedTensor(values=%s, row_splits=%s)" % (self._values,
+                                                       self._row_splits)
+
+  #=============================================================================
+  # Eager Execution Mode
+  #=============================================================================
+
+  def tolist(self):
+    """Returns a nested Python `list` with the values for this `RaggedTensor`.
+
+    If a `RaggedTensor` `rt` was constructed in graph execution mode, then
+    `rt.tolist()` is equivalent to `rt.eval().tolist()`.
+
+    If a `RaggedTensor` `rt` was constructed in eager execution mode, then
+    `rt.tolist()` builds the Python list based on `rt`'s `EagerTensor`
+    components.
+
+    Returns:
+      A nested Python `list`.
+    """
+    if self._is_eager():
+      return self._eager_value().tolist()
+    else:
+      return self.eval().tolist()
+
+  def _eager_value(self):
+    """Returns a RaggedTensorValue for self.  Requires self._is_eager()=true."""
+    value = self.inner_values.numpy()
+    for row_splits in reversed(self.nested_row_splits):
+      value = ragged_tensor_value.RaggedTensorValue(value, row_splits.numpy())
+    return value
+
+  def _is_eager(self):
+    """Returns True if values & row_splits Tensors are all `EagerTensor`s."""
+    rt = self
+    while isinstance(rt, RaggedTensor):
+      if not isinstance(rt.row_splits, ops.EagerTensor):
+        return False
+      rt = rt.values
+    return isinstance(rt, ops.EagerTensor)
+
+  #=============================================================================
+  # Evaluation
+  #=============================================================================
+  def eval(self, feed_dict=None, session=None):  # pylint: disable=redefined-outer-name
+    """Evaluates this ragged tensor in a `Session`.
+
+    Args:
+      feed_dict: A dictionary that maps `Tensor` objects to feed values. See
+        `tf.Session.run` for a description of the valid feed values.
+      session: The `Session` to be used to evaluate this ragged tensor. If none,
+        the default session will be used.
+
+    Returns:
+      A `RaggedTensorValue` object.
+    """
+    return _eval_using_default_session(self, feed_dict,
+                                       self._as_graph_element().graph, session)
+
+  #=============================================================================
+  # Indexing & Slicing
+  #=============================================================================
+  def __getitem__(self, key):
+    """Returns the specified piece of this RaggedTensor."""
+    # See ragged_getitem.py for the documentation and implementation of this
+    # method.
+    #
+    # Note: the imports in ragged/__init__.py ensure that this method always
+    # gets overridden before it is called.
+
+  #=============================================================================
+  # Name Scope
+  #=============================================================================
+
+  # This private function is used by ops.name_scope to ensure that all of the
+  # input tensors for the scope belong to the same graph.  Defining this means
+  # that you may include `RaggedTensor` objects in the name_scope `values`
+  # list.
+  def _as_graph_element(self):
+    """Convert `self` to a graph element."""
+    values = self.values
+    while isinstance(values, RaggedTensor):
+      values = values.values
+    return values
+
+
+def is_ragged(value):
+  """Returns true if `value` is a ragged tensor or ragged tensor value."""
+  return isinstance(value,
+                    (RaggedTensor, ragged_tensor_value.RaggedTensorValue))
+
+
+#===============================================================================
+# Register RaggedTensor for use with session.run.
+#===============================================================================
+def _ragged_tensor_value_from_components(components):
+  components = list(components)
+  value = components.pop()
+  while components:
+    value = ragged_tensor_value.RaggedTensorValue(value, components.pop())
+  return value
+
+
+def _ragged_tensor_session_fetch(rt):
+  components = rt.nested_row_splits + (rt.inner_values,)
+  return (components, _ragged_tensor_value_from_components)
+
+
+def _ragged_tensor_session_feed(feed_key, feed_val):
+  key_components = feed_key.nested_row_splits + (feed_key.inner_values,)
+  val_components = feed_val.nested_row_splits + (feed_val.inner_values,)
+  return zip(key_components, val_components)
+
+
+def _ragged_tensor_session_feed_for_partial_run(feed_key):
+  return feed_key.nested_row_splits + (feed_key.inner_values,)
+
+
+session.register_session_run_conversion_functions(
+    RaggedTensor, _ragged_tensor_session_fetch, _ragged_tensor_session_feed,
+    _ragged_tensor_session_feed_for_partial_run)
+
+
+class RaggedTensorType(object):
+  """Encoding of a static type for a `RaggedTensor`.
+
+  Use this type to express/declare that an output must have the type of
+  `RaggedTensor`.
+  """
+
+  def __init__(self, dtype, ragged_rank):
+    """Initializes a RaggedTensorType object.
+
+    Args:
+      dtype: data type of the `RaggedTensor`'s inner values.
+      ragged_rank: ragged_rank of the declared `RaggedTensor`.
+    """
+    self._dtype = dtype
+    self._ragged_rank = ragged_rank
+
+  dtype = property(lambda self: self._dtype)
+  ragged_rank = property(lambda self: self._ragged_rank)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c10aff9de5c961962cc2227442789cc9f7e9b0
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.bounding_shape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorBoundingShapeOp(test_util.TensorFlowTestCase):
+
+  def testDocStringExample(self):
+    # This is the example from ragged.bounding_shape.__doc__.
+    rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+    with self.test_session():
+      self.assertEqual(ragged.bounding_shape(rt).eval().tolist(), [5, 4])
+
+  def test2DRaggedTensorWithOneRaggedDimension(self):
+    values = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
+    rt1 = ragged.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged.from_row_splits(values, [0, 7])
+    rt3 = ragged.from_row_splits(values, [0, 0, 7, 7])
+    with self.test_session():
+      self.assertEqual(ragged.bounding_shape(rt1).eval().tolist(), [5, 3])
+      self.assertEqual(ragged.bounding_shape(rt2).eval().tolist(), [1, 7])
+      self.assertEqual(ragged.bounding_shape(rt3).eval().tolist(), [3, 7])
+
+  def test3DRaggedTensorWithOneRaggedDimension(self):
+    values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
+    rt1 = ragged.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged.from_row_splits(values, [0, 7])
+    rt3 = ragged.from_row_splits(values, [0, 0, 7, 7])
+    with self.test_session():
+      self.assertEqual(ragged.bounding_shape(rt1).eval().tolist(), [5, 3, 2])
+      self.assertEqual(ragged.bounding_shape(rt2).eval().tolist(), [1, 7, 2])
+      self.assertEqual(ragged.bounding_shape(rt3).eval().tolist(), [3, 7, 2])
+
+  def testNonRaggedTensor(self):
+    dt = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]
+    with self.test_session():
+      self.assertEqual(ragged.bounding_shape(dt).eval().tolist(), [4, 3])
+
+  def testExplicitAxisOptimizations(self):
+    rt = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
+    with self.test_session():
+      self.assertEqual(ragged.bounding_shape(rt, 0).eval().tolist(), 5)
+      self.assertEqual(ragged.bounding_shape(rt, 1).eval().tolist(), 3)
+      self.assertEqual(
+          ragged.bounding_shape(rt, [1, 0]).eval().tolist(), [3, 5])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f66ca102ef91af0ac5d359b9dc1a02d5d9b0d682
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -0,0 +1,1206 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for third_party.tensorflow.python.ops.ragged_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import sys
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class _SliceBuilder(object):
+  """Helper to construct arguments for __getitem__.
+
+  Usage: _SliceBuilder()[<expr>] slice_spec Python generates for <expr>.
+  """
+
+  def __getitem__(self, slice_spec):
+    return slice_spec
+
+
+SLICE_BUILDER = _SliceBuilder()
+
+
+def _make_tensor_slice_spec(slice_spec, use_constant=True):
+  """Wraps all integers in an extended slice spec w/ a tensor.
+
+  This function is used to help test slicing when the slice spec contains
+  tensors, rather than integers.
+
+  Args:
+    slice_spec: The extended slice spec.
+    use_constant: If true, then wrap each integer with a tf.constant.  If false,
+      then wrap each integer with a tf.placeholder.
+
+  Returns:
+    A copy of slice_spec, but with each integer i replaced with tf.constant(i).
+  """
+
+  def make_piece_scalar(piece):
+    if isinstance(piece, int):
+      scalar = constant_op.constant(piece)
+      if use_constant:
+        return scalar
+      else:
+        return array_ops.placeholder_with_default(scalar, [])
+    elif isinstance(piece, slice):
+      return slice(
+          make_piece_scalar(piece.start), make_piece_scalar(piece.stop),
+          make_piece_scalar(piece.step))
+    else:
+      return piece
+
+  if isinstance(slice_spec, tuple):
+    return tuple(make_piece_scalar(piece) for piece in slice_spec)
+  else:
+    return make_piece_scalar(slice_spec)
+
+
+# Example 2D ragged tensor value with one ragged dimension and with scalar
+# values, expressed as nested python lists and as splits+values.
+EXAMPLE_RAGGED_TENSOR_2D = [[b'a', b'b'], [b'c', b'd', b'e'], [b'f'], [],
+                            [b'g']]
+EXAMPLE_RAGGED_TENSOR_2D_SPLITS = [0, 2, 5, 6, 6, 7]
+EXAMPLE_RAGGED_TENSOR_2D_VALUES = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
+
+# Example 4D ragged tensor value, with two ragged dimensions and with values
+# whose shape is [2], expressed as nested python lists and as splits+values.
+EXAMPLE_RAGGED_TENSOR_4D = [
+    [                                       # rt[0]
+        [[1, 2], [3, 4], [5, 6]],           # rt[0][0]
+        [[7, 8], [9, 10], [11, 12]]],       # rt[0][1]
+    [],                                     # rt[1]
+    [                                       # rt[2]
+        [[13, 14], [15, 16], [17, 18]]],    # rt[2][0]
+    [                                       # rt[3]
+        [[19, 20]]]                         # rt[3][0]
+]  # pyformat: disable
+EXAMPLE_RAGGED_TENSOR_4D_SPLITS1 = [0, 2, 2, 3, 4]
+EXAMPLE_RAGGED_TENSOR_4D_SPLITS2 = [0, 3, 6, 9, 10]
+EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
+                                   [11, 12], [13, 14], [15, 16], [17,
+                                                                  18], [19, 20]]
+
+
+class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+  longMessage = True  # Property in unittest.Testcase. pylint: disable=invalid-name
+
+  #=============================================================================
+  # RaggedTensor class docstring examples
+  #=============================================================================
+
+  def testClassDocStringExamples(self):
+    # From section: "Component Tensors"
+    rt = ragged.from_row_splits(
+        values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
+    with self.test_session():
+      self.assertEqual(rt.tolist(),
+                       [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    del rt
+
+    # From section: "Alternative Row-Partitioning Schemes"
+    values = [3, 1, 4, 1, 5, 9, 2, 6]
+    rt1 = ragged.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+    rt2 = ragged.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+    rt3 = ragged.from_value_rowids(
+        values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
+    rt4 = ragged.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+    rt5 = ragged.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+    for rt in (rt1, rt2, rt3, rt4, rt5):
+      with self.test_session():
+        self.assertEqual(rt.tolist(),
+                         [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    del rt1, rt2, rt3, rt4, rt5
+
+    # From section: "Multiple Ragged Dimensions"
+    inner_rt = ragged.from_row_splits(
+        values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
+    outer_rt = ragged.from_row_splits(values=inner_rt, row_splits=[0, 3, 3, 5])
+    self.assertEqual(outer_rt.ragged_rank, 2)
+    with self.test_session():
+      self.assertEqual(outer_rt.tolist(),
+                       [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+    del inner_rt, outer_rt
+
+    # From section: "Multiple Ragged Dimensions"
+    rt = ragged.from_nested_row_splits(
+        inner_values=[3, 1, 4, 1, 5, 9, 2, 6],
+        nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8]))
+    with self.test_session():
+      self.assertEqual(rt.tolist(),
+                       [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+    del rt
+
+    # From section: "Uniform Inner Dimensions"
+    rt = ragged.from_row_splits(
+        values=array_ops.ones([5, 3]), row_splits=[0, 2, 5])
+    with self.test_session():
+      self.assertEqual(
+          rt.tolist(),
+          [[[1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]])
+      self.assertEqual(rt.shape.as_list(), [2, None, 3])
+    del rt
+
+  #=============================================================================
+  # RaggedTensorValue Constructor
+  #=============================================================================
+
+  def testRaggedTensorValueConstruction(self):
+    values = np.array(b'a b c d e f g'.split())
+    splits = np.array([0, 2, 5, 6, 6, 7], dtype=np.int64)
+    splits2 = np.array([0, 3, 5], dtype=np.int64)
+
+    # Test construction of a RaggedTensorValue with ragged_rank=1.
+    rt_value = ragged.RaggedTensorValue(values, splits)
+    self.assertEqual(rt_value.row_splits.dtype, np.int64)
+    self.assertEqual(rt_value.shape, (5, None))
+    self.assertEqual(len(rt_value.nested_row_splits), 1)
+    self.assertAllEqual(splits, rt_value.row_splits)
+    self.assertAllEqual(values, rt_value.values)
+    self.assertAllEqual(splits, rt_value.nested_row_splits[0])
+    self.assertAllEqual(values, rt_value.inner_values)
+
+    # Test construction of a RaggedTensorValue with ragged_rank=2.
+    rt_value = ragged.RaggedTensorValue(
+        values=ragged.RaggedTensorValue(values, splits), row_splits=splits2)
+    self.assertEqual(rt_value.row_splits.dtype, np.int64)
+    self.assertEqual(rt_value.shape, (2, None, None))
+    self.assertEqual(len(rt_value.nested_row_splits), 2)
+    self.assertAllEqual(splits2, rt_value.row_splits)
+    self.assertAllEqual(splits, rt_value.values.row_splits)
+    self.assertAllEqual(splits2, rt_value.nested_row_splits[0])
+    self.assertAllEqual(splits, rt_value.nested_row_splits[1])
+    self.assertAllEqual(values, rt_value.values.values)
+    self.assertAllEqual(values, rt_value.inner_values)
+
+  #=============================================================================
+  # RaggedTensor Constructor (private)
+  #=============================================================================
+
+  def testRaggedTensorConstruction(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    rt = ragged.RaggedTensor(
+        values=values, row_splits=row_splits, internal=True)
+
+    with self.test_session():
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testRaggedTensorConstructionErrors(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'RaggedTensor constructor is private'):
+      ragged.RaggedTensor(values=values, row_splits=row_splits)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 'values must be a Tensor or RaggedTensor'):
+      ragged.RaggedTensor(values=range(7), row_splits=row_splits, internal=True)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 'Row-partitioning argument must be a Tensor'):
+      ragged.RaggedTensor(
+          values=values, row_splits=[0, 2, 2, 5, 6, 7], internal=True)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Shape \(6, 1\) must have rank 1'):
+      ragged.RaggedTensor(
+          values=values,
+          row_splits=array_ops.expand_dims(row_splits, 1),
+          internal=True)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 'Cached value must be a Tensor or None.'):
+      ragged.RaggedTensor(values=values, row_splits=row_splits,
+                          cached_row_lengths=[2, 3, 4], internal=True)
+
+
+#=============================================================================
+# RaggedTensor Factory Ops
+#=============================================================================
+
+  def testFromValueRowIdsWithDerivedNRows(self):
+    # nrows is known at graph creation time.
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+
+    rt = ragged.from_value_rowids(values, value_rowids)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = ragged.value_rowids(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    with self.test_session():
+      self.assertAllEqual(rt_value_rowids, value_rowids)
+      self.assertEqual(rt_nrows.eval(), 5)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromValueRowIdsWithDerivedNRowsDynamic(self):
+    # nrows is not known at graph creation time.
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    value_rowids = array_ops.placeholder_with_default(value_rowids, shape=None)
+
+    rt = ragged.from_value_rowids(values, value_rowids)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [None, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = ragged.value_rowids(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    with self.test_session():
+      self.assertAllEqual(rt_value_rowids, value_rowids)
+      self.assertEqual(rt_nrows.eval(), 5)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromValueRowIdsWithExplicitNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    nrows = constant_op.constant(7, dtypes.int64)
+
+    rt = ragged.from_value_rowids(values, value_rowids, nrows)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [7, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = ragged.value_rowids(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    self.assertIs(rt_nrows, nrows)  # cached_nrows
+    with self.test_session():
+      self.assertEqual(
+          rt.tolist(),
+          [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g'], [], []])
+
+  def testFromValueRowIdsWithExplicitNRowsEqualToDefault(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    nrows = constant_op.constant(5, dtypes.int64)
+
+    rt = ragged.from_value_rowids(values, value_rowids, nrows)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = ragged.value_rowids(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    self.assertIs(rt_nrows, nrows)  # cached_nrows
+    with self.test_session():
+      self.assertAllEqual(rt_value_rowids, value_rowids)
+      self.assertAllEqual(rt_nrows, nrows)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromValueRowIdsWithEmptyValues(self):
+    rt = ragged.from_value_rowids([], [])
+    rt_nrows = ragged.nrows(rt)
+    self.assertEqual(rt.dtype, dtypes.float32)
+    self.assertEqual(rt.shape.as_list(), [0, None])
+    self.assertEqual(rt.ragged_rank, 1)
+    self.assertEqual(rt.values.shape.as_list(), [0])
+    self.assertEqual(ragged.value_rowids(rt).shape.as_list(), [0])
+    with self.test_session():
+      self.assertEqual(rt_nrows.eval().tolist(), 0)
+      self.assertEqual(rt.tolist(), [])
+
+  def testFromRowSplits(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+
+    rt = ragged.from_row_splits(values, row_splits)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_splits = rt.row_splits
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_row_splits, row_splits)
+    with self.test_session():
+      self.assertEqual(rt_nrows.eval(), 5)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromRowSplitsWithEmptySplits(self):
+    err_msg = 'row_splits tensor may not be empty'
+    with self.assertRaisesRegexp(ValueError, err_msg):
+      ragged.from_row_splits([], [])
+
+  def testFromRowStarts(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_starts = constant_op.constant([0, 2, 2, 5, 6], dtypes.int64)
+
+    rt = ragged.from_row_starts(values, row_starts)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_starts = ragged.row_starts(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    with self.test_session():
+      self.assertEqual(rt_nrows.eval(), 5)
+      self.assertAllEqual(rt_row_starts, row_starts)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromRowLimits(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_limits = constant_op.constant([2, 2, 5, 6, 7], dtypes.int64)
+
+    rt = ragged.from_row_limits(values, row_limits)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_limits = ragged.row_limits(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    with self.test_session():
+      self.assertEqual(rt_nrows.eval(), 5)
+      self.assertAllEqual(rt_row_limits, row_limits)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromRowLengths(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_lengths = constant_op.constant([2, 0, 3, 1, 1], dtypes.int64)
+
+    rt = ragged.from_row_lengths(values, row_lengths)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_lengths = ragged.row_lengths(rt)
+    rt_nrows = ragged.nrows(rt)
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_row_lengths, row_lengths)  # cached_nrows
+    with self.test_session():
+      self.assertEqual(rt_nrows.eval(), 5)
+      self.assertAllEqual(rt_row_lengths, row_lengths)
+      self.assertEqual(rt.tolist(),
+                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromNestedValueRowIdsWithDerivedNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+
+    rt = ragged.from_nested_value_rowids(values, nested_value_rowids)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [4, None, None])
+    self.assertEqual(rt.ragged_rank, 2)
+
+    rt_values = rt.values
+    rt_value_rowids = ragged.value_rowids(rt)
+    rt_values_values = rt_values.values
+    rt_values_value_rowids = ragged.value_rowids(rt_values)
+
+    self.assertIs(rt_values_values, values)
+    with self.test_session():
+      self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
+      self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
+      self.assertEqual(
+          rt.tolist(),
+          [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+
+  def testFromNestedValueRowIdsWithExplicitNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+    nrows = [
+        constant_op.constant(6, dtypes.int64),
+        constant_op.constant(6, dtypes.int64)
+    ]
+
+    rt = ragged.from_nested_value_rowids(values, nested_value_rowids, nrows)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [6, None, None])
+    self.assertEqual(rt.ragged_rank, 2)
+
+    rt_values = rt.values
+    rt_value_rowids = ragged.value_rowids(rt)
+    rt_nrows = ragged.nrows(rt)
+    rt_values_values = rt_values.values
+    rt_values_value_rowids = ragged.value_rowids(rt_values)
+    rt_values_nrows = ragged.nrows(rt_values)
+
+    self.assertIs(rt_values_values, values)
+    with self.test_session():
+      self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
+      self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
+      self.assertAllEqual(rt_nrows, nrows[0])
+      self.assertAllEqual(rt_values_nrows, nrows[1])
+      self.assertEqual(rt.tolist(),
+                       [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [],
+                        [[b'f'], [b'g'], []], [], []])
+
+  def testFromNestedValueRowIdsWithExplicitNRowsMismatch(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+    nrows = [constant_op.constant(6, dtypes.int64)]
+    with self.assertRaisesRegexp(
+        ValueError, 'nested_nrows must have the same '
+        'length as nested_value_rowids'):
+      ragged.from_nested_value_rowids(values, nested_value_rowids, nrows)
+
+  def testFromNestedValueRowIdsWithNonListInput(self):
+    with self.assertRaisesRegexp(
+        TypeError, 'nested_value_rowids must be a list of Tensors'):
+      ragged.from_nested_value_rowids([1, 2, 3],
+                                      constant_op.constant(
+                                          [[0, 1, 2], [0, 1, 2]], dtypes.int64))
+    with self.assertRaisesRegexp(TypeError,
+                                 'nested_nrows must be a list of Tensors'):
+      ragged.from_nested_value_rowids([1, 2, 3], [[0, 1, 2], [0, 1, 2]],
+                                      constant_op.constant([3, 3]))
+
+  def testFromNestedRowSplits(self):
+    inner_values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_row_splits = [
+        constant_op.constant([0, 2, 3, 3, 5], dtypes.int64),
+        constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    ]
+
+    rt = ragged.from_nested_row_splits(inner_values, nested_row_splits)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [4, None, None])
+    self.assertEqual(rt.ragged_rank, 2)
+
+    rt_values = rt.values
+    rt_row_splits = rt.row_splits
+    rt_values_values = rt_values.values
+    rt_values_row_splits = rt_values.row_splits
+
+    self.assertIs(rt_values_values, inner_values)
+    self.assertIs(rt_row_splits, nested_row_splits[0])
+    self.assertIs(rt_values_row_splits, nested_row_splits[1])
+    with self.test_session():
+      self.assertEqual(
+          rt.tolist(),
+          [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+
+  def testFromNestedRowSplitsWithNonListInput(self):
+    with self.assertRaisesRegexp(TypeError,
+                                 'nested_row_splits must be a list of Tensors'):
+      ragged.from_nested_row_splits([1, 2],
+                                    constant_op.constant([[0, 1, 2], [0, 1, 2]],
+                                                         dtypes.int64))
+
+  def testFromValueRowIdsWithBadNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    nrows = constant_op.constant(5, dtypes.int64)
+
+    with self.assertRaisesRegexp(ValueError, r'Expected nrows >= 0; got -2'):
+      ragged.from_value_rowids(
+          values=values,
+          value_rowids=array_ops.placeholder_with_default(value_rowids, None),
+          nrows=-2)
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=2, '
+        r'value_rowids\[-1\]=4'):
+      ragged.from_value_rowids(
+          values=values, value_rowids=value_rowids, nrows=2)
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=4, '
+        r'value_rowids\[-1\]=4'):
+      ragged.from_value_rowids(
+          values=values, value_rowids=value_rowids, nrows=4)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Shape \(7, 1\) must have rank 1'):
+      ragged.from_value_rowids(
+          values=values,
+          value_rowids=array_ops.expand_dims(value_rowids, 1),
+          nrows=nrows)
+
+    with self.assertRaisesRegexp(ValueError, r'Shape \(1,\) must have rank 0'):
+      ragged.from_value_rowids(
+          values=values,
+          value_rowids=value_rowids,
+          nrows=array_ops.expand_dims(nrows, 0))
+
+  def testGraphMismatch(self):
+    with ops.Graph().as_default():
+      values = constant_op.constant([1, 2, 3])
+    with ops.Graph().as_default():
+      splits = constant_op.constant([0, 2, 3])
+    self.assertRaisesRegexp(ValueError, '.* must be from the same graph as .*',
+                            ragged.from_row_splits, values, splits)
+
+  #=============================================================================
+  # Ragged Value & Row-Partitioning Tensor Accessors
+  #=============================================================================
+
+  def testRaggedTensorAccessors_2d(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    rt1 = ragged.from_row_splits(values, row_splits)
+    rt2 = ragged.from_value_rowids(values, value_rowids)
+
+    for rt in [rt1, rt2]:
+      with self.test_session():
+        self.assertEqual(rt.tolist(),
+                         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+        self.assertEqual(rt.values.eval().tolist(),
+                         [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+        self.assertEqual(rt.values.shape.dims[0].value, 7)
+        self.assertEqual(
+            ragged.value_rowids(rt).eval().tolist(), [0, 0, 2, 2, 2, 3, 4])
+        self.assertEqual(ragged.nrows(rt).eval().tolist(), 5)
+        self.assertEqual(rt.row_splits.eval().tolist(), [0, 2, 2, 5, 6, 7])
+        self.assertEqual(ragged.row_starts(rt).eval().tolist(), [0, 2, 2, 5, 6])
+        self.assertEqual(ragged.row_limits(rt).eval().tolist(), [2, 2, 5, 6, 7])
+        self.assertEqual(
+            ragged.row_lengths(rt).eval().tolist(), [2, 0, 3, 1, 1])
+        self.assertEqual(rt.inner_values.eval().tolist(),
+                         [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+        self.assertEqual([s.eval().tolist() for s in rt.nested_row_splits],
+                         [[0, 2, 2, 5, 6, 7]])
+
+  def testRaggedTensorAccessors_3d_with_ragged_rank_1(self):
+    values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    rt1 = ragged.from_row_splits(values, row_splits)
+    rt2 = ragged.from_value_rowids(values, value_rowids)
+
+    for rt in [rt1, rt2]:
+      with self.test_session():
+        self.assertEqual(rt.tolist(),
+                         [[[0, 1], [2, 3]], [], [[4, 5], [6, 7], [8, 9]],
+                          [[10, 11]], [[12, 13]]])
+        self.assertEqual(
+            rt.values.eval().tolist(),
+            [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
+        self.assertEqual(rt.values.shape.dims[0].value, 7)
+        self.assertEqual(
+            ragged.value_rowids(rt).eval().tolist(), [0, 0, 2, 2, 2, 3, 4])
+        self.assertEqual(ragged.nrows(rt).eval().tolist(), 5)
+        self.assertEqual(rt.row_splits.eval().tolist(), [0, 2, 2, 5, 6, 7])
+        self.assertEqual(ragged.row_starts(rt).eval().tolist(), [0, 2, 2, 5, 6])
+        self.assertEqual(ragged.row_limits(rt).eval().tolist(), [2, 2, 5, 6, 7])
+        self.assertEqual(
+            ragged.row_lengths(rt).eval().tolist(), [2, 0, 3, 1, 1])
+        self.assertEqual(
+            rt.inner_values.eval().tolist(),
+            [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
+        self.assertEqual([s.eval().tolist() for s in rt.nested_row_splits],
+                         [[0, 2, 2, 5, 6, 7]])
+
+  def testRaggedTensorAccessors_3d_with_ragged_rank_2(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_row_splits = [
+        constant_op.constant([0, 2, 3, 3, 5], dtypes.int64),
+        constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    ]
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+    rt1 = ragged.from_nested_row_splits(values, nested_row_splits)
+    rt2 = ragged.from_nested_value_rowids(values, nested_value_rowids)
+
+    for rt in [rt1, rt2]:
+      with self.test_session():
+        self.assertEqual(
+            rt.tolist(),
+            [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+        self.assertEqual(rt.values.eval().tolist(),
+                         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+        self.assertEqual(rt.values.shape.dims[0].value, 5)
+        self.assertEqual(
+            ragged.value_rowids(rt).eval().tolist(), [0, 0, 1, 3, 3])
+        self.assertEqual(ragged.nrows(rt).eval().tolist(), 4)
+        self.assertEqual(rt.row_splits.eval().tolist(), [0, 2, 3, 3, 5])
+        self.assertEqual(ragged.row_starts(rt).eval().tolist(), [0, 2, 3, 3])
+        self.assertEqual(ragged.row_limits(rt).eval().tolist(), [2, 3, 3, 5])
+        self.assertEqual(ragged.row_lengths(rt).eval().tolist(), [2, 1, 0, 2])
+        self.assertEqual(rt.inner_values.eval().tolist(),
+                         [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+        self.assertEqual([s.eval().tolist() for s in rt.nested_row_splits],
+                         [[0, 2, 3, 3, 5], [0, 2, 2, 5, 6, 7]])
+
+  def testNRowsWithTensorInput(self):
+    dt = constant_op.constant([[1, 2, 3], [4, 5, 6]])
+    nrows = ragged.nrows(dt)
+    with self.test_session():
+      self.assertEqual(nrows.eval(), 2)
+
+  def testRowLengthsWithTensorInput(self):
+    dt = constant_op.constant([[1, 2, 3], [4, 5, 6]])
+    row_lengths = ragged.row_lengths(dt)
+    with self.test_session():
+      self.assertEqual(row_lengths.eval().tolist(), [3, 3])
+
+  #=============================================================================
+  # RaggedTensor.shape
+  #=============================================================================
+
+  def testShape(self):
+    """Tests for RaggedTensor.shape."""
+    rt1 = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
+    self.assertEqual(rt1.shape.as_list(), [5, None])
+
+    rt2 = ragged.from_row_splits(
+        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]],
+        [0, 2, 5, 6, 6, 7])
+    self.assertEqual(rt2.shape.as_list(), [5, None, 2])
+
+    rt3 = ragged.from_row_splits(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], [0, 2, 2, 3])
+    self.assertEqual(rt3.shape.as_list(), [3, None, 2, 2])
+
+    rt4 = ragged.from_row_splits(rt3, [0, 1, 3, 3])
+    self.assertEqual(rt4.shape.as_list(), [3, None, None, 2, 2])
+
+    rt5 = ragged.from_row_splits(
+        array_ops.placeholder(dtype=dtypes.string), [0, 2, 3, 5])
+    self.assertEqual(rt5.shape.ndims, None)
+
+    rt6 = ragged.from_row_splits([1, 2, 3],
+                                 array_ops.placeholder(dtype=dtypes.int64))
+    self.assertEqual(rt6.shape.as_list(), [None, None])
+
+  #=============================================================================
+  # RaggedTensor.__getitem__
+  #=============================================================================
+
+  def _TestGetItem(self, rt, slice_spec, expected):
+    """Helper function for testing RaggedTensor.__getitem__.
+
+    Checks that calling `rt.__getitem__(slice_spec) returns the expected value.
+    Checks three different configurations for each slice spec:
+
+      * Call __getitem__ with the slice spec as-is (with int values)
+      * Call __getitem__ with int values in the slice spec wrapped in
+        `tf.constant()`.
+      * Call __getitem__ with int values in the slice spec wrapped in
+        `tf.placeholder()` (so value is not known at graph construction time).
+
+    Args:
+      rt: The RaggedTensor to test.
+      slice_spec: The slice spec.
+      expected: The expected value of rt.__getitem__(slice_spec), as a python
+        list; or an exception class.
+    """
+    with self.test_session():
+      tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
+      tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False)
+      value1 = rt.__getitem__(slice_spec).eval()
+      value2 = rt.__getitem__(tensor_slice_spec1).eval()
+      value3 = rt.__getitem__(tensor_slice_spec2).eval()
+      if hasattr(value1, 'tolist'):
+        value1 = value1.tolist()
+      if hasattr(value2, 'tolist'):
+        value2 = value2.tolist()
+      if hasattr(value3, 'tolist'):
+        value3 = value3.tolist()
+      self.assertEqual(value1, expected, 'slice_spec=%s' % (slice_spec,))
+      self.assertEqual(value2, expected, 'slice_spec=%s' % (slice_spec,))
+      self.assertEqual(value3, expected, 'slice_spec=%s' % (slice_spec,))
+
+  def _TestGetItemException(self, rt, slice_spec, expected, message):
+    """Helper function for testing RaggedTensor.__getitem__ exceptions."""
+    with self.test_session():
+      tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
+      self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec)
+      self.assertRaisesRegexp(expected, message, rt.__getitem__,
+                              tensor_slice_spec1)
+
+  @parameterized.parameters(
+      # Tests for rt[i]
+      (SLICE_BUILDER[-5], EXAMPLE_RAGGED_TENSOR_2D[-5]),
+      (SLICE_BUILDER[-4], EXAMPLE_RAGGED_TENSOR_2D[-4]),
+      (SLICE_BUILDER[-1], EXAMPLE_RAGGED_TENSOR_2D[-1]),
+      (SLICE_BUILDER[0], EXAMPLE_RAGGED_TENSOR_2D[0]),
+      (SLICE_BUILDER[1], EXAMPLE_RAGGED_TENSOR_2D[1]),
+      (SLICE_BUILDER[4], EXAMPLE_RAGGED_TENSOR_2D[4]),
+
+      # Tests for rt[i:]
+      (SLICE_BUILDER[-6:], EXAMPLE_RAGGED_TENSOR_2D[-6:]),
+      (SLICE_BUILDER[-3:], EXAMPLE_RAGGED_TENSOR_2D[-3:]),
+      (SLICE_BUILDER[-1:], EXAMPLE_RAGGED_TENSOR_2D[-1:]),
+      (SLICE_BUILDER[0:], EXAMPLE_RAGGED_TENSOR_2D[0:]),
+      (SLICE_BUILDER[3:], EXAMPLE_RAGGED_TENSOR_2D[3:]),
+      (SLICE_BUILDER[5:], EXAMPLE_RAGGED_TENSOR_2D[5:]),
+
+      # Tests for rt[:j]
+      (SLICE_BUILDER[:-6], EXAMPLE_RAGGED_TENSOR_2D[:-6]),
+      (SLICE_BUILDER[:-3], EXAMPLE_RAGGED_TENSOR_2D[:-3]),
+      (SLICE_BUILDER[:-1], EXAMPLE_RAGGED_TENSOR_2D[:-1]),
+      (SLICE_BUILDER[:0], EXAMPLE_RAGGED_TENSOR_2D[:0]),
+      (SLICE_BUILDER[:3], EXAMPLE_RAGGED_TENSOR_2D[:3]),
+      (SLICE_BUILDER[:5], EXAMPLE_RAGGED_TENSOR_2D[:5]),
+
+      # Tests for rt[i:j]
+      (SLICE_BUILDER[0:3], EXAMPLE_RAGGED_TENSOR_2D[0:3]),
+      (SLICE_BUILDER[3:5], EXAMPLE_RAGGED_TENSOR_2D[3:5]),
+      (SLICE_BUILDER[-5:3], EXAMPLE_RAGGED_TENSOR_2D[-5:3]),
+      (SLICE_BUILDER[3:1], EXAMPLE_RAGGED_TENSOR_2D[3:1]),
+      (SLICE_BUILDER[-1:1], EXAMPLE_RAGGED_TENSOR_2D[-1:1]),
+      (SLICE_BUILDER[1:-1], EXAMPLE_RAGGED_TENSOR_2D[1:-1]),
+
+      # Tests for rt[i, j]
+      (SLICE_BUILDER[0, 1], EXAMPLE_RAGGED_TENSOR_2D[0][1]),
+      (SLICE_BUILDER[1, 2], EXAMPLE_RAGGED_TENSOR_2D[1][2]),
+      (SLICE_BUILDER[-1, 0], EXAMPLE_RAGGED_TENSOR_2D[-1][0]),
+      (SLICE_BUILDER[-3, 0], EXAMPLE_RAGGED_TENSOR_2D[-3][0]),
+      (SLICE_BUILDER[:], EXAMPLE_RAGGED_TENSOR_2D),
+      (SLICE_BUILDER[:, :], EXAMPLE_RAGGED_TENSOR_2D),
+
+      # Empty slice spec.
+      ([], EXAMPLE_RAGGED_TENSOR_2D),
+
+      # Test for ellipsis
+      (SLICE_BUILDER[...], EXAMPLE_RAGGED_TENSOR_2D),
+      (SLICE_BUILDER[2, ...], EXAMPLE_RAGGED_TENSOR_2D[2]),
+      (SLICE_BUILDER[..., :], EXAMPLE_RAGGED_TENSOR_2D),
+      (SLICE_BUILDER[..., 2, 0], EXAMPLE_RAGGED_TENSOR_2D[2][0]),
+      (SLICE_BUILDER[2, ..., 0], EXAMPLE_RAGGED_TENSOR_2D[2][0]),
+      (SLICE_BUILDER[2, 0, ...], EXAMPLE_RAGGED_TENSOR_2D[2][0]),
+
+      # Test for array_ops.newaxis
+      (SLICE_BUILDER[array_ops.newaxis, :], [EXAMPLE_RAGGED_TENSOR_2D]),
+      (SLICE_BUILDER[:, array_ops.newaxis],
+       [[row] for row in EXAMPLE_RAGGED_TENSOR_2D]),
+
+      # Slicing inner ragged dimensions.
+      (SLICE_BUILDER[-1:, 1:4],
+       [row[1:4] for row in EXAMPLE_RAGGED_TENSOR_2D[-1:]]),
+      (SLICE_BUILDER[:, 1:4], [row[1:4] for row in EXAMPLE_RAGGED_TENSOR_2D]),
+      (SLICE_BUILDER[:, -2:], [row[-2:] for row in EXAMPLE_RAGGED_TENSOR_2D]),
+      # TODO(edloper): Add tests for strided slices, once support is added.
+  )
+  def testRaggedTensorGetItemWithRaggedRank1(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Ragged tensor
+    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
+
+    with self.test_session():
+      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self._TestGetItem(rt, slice_spec, expected)
+
+  # pylint: disable=invalid-slice-index
+  @parameterized.parameters(
+      # Tests for out-of-bound errors
+      (SLICE_BUILDER[5], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[-6], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0], ValueError, '.*out of bounds.*'),
+
+      # Indexing into an inner ragged dimension
+      (SLICE_BUILDER[:, 3], ValueError,
+       'Cannot index into an inner ragged dimension'),
+      (SLICE_BUILDER[:1, 3], ValueError,
+       'Cannot index into an inner ragged dimension'),
+      (SLICE_BUILDER[..., 3], ValueError,
+       'Cannot index into an inner ragged dimension'),
+
+      # Tests for type errors
+      (SLICE_BUILDER[0.5], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[1:3:0.5], TypeError,
+       re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[:, 1:3:0.5], TypeError,
+       'slice strides must be integers or None'),
+      (SLICE_BUILDER[:, 0.5:1.5], TypeError,
+       'slice offsets must be integers or None'),
+      (SLICE_BUILDER['foo'], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[:, 'foo':'foo'], TypeError,
+       'slice offsets must be integers or None'),
+
+      # Tests for other errors
+      (SLICE_BUILDER[..., 0, 0, 0], IndexError,
+       'Too many indices for RaggedTensor'),
+  )
+  def testRaggedTensorGetItemErrorsWithRaggedRank1(self, slice_spec, expected,
+                                                   message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Ragged tensor
+    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
+    # if sys.version_info[0] == 3:
+    #   message = 'must be str, not int'
+
+    with self.test_session():
+      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  @parameterized.parameters(
+      # Tests for rt[index, index, ...]
+      (SLICE_BUILDER[2, 0], EXAMPLE_RAGGED_TENSOR_4D[2][0]),
+      (SLICE_BUILDER[2, 0, 1], EXAMPLE_RAGGED_TENSOR_4D[2][0][1]),
+      (SLICE_BUILDER[2, 0, 1, 1], EXAMPLE_RAGGED_TENSOR_4D[2][0][1][1]),
+      (SLICE_BUILDER[2, 0, 1:], EXAMPLE_RAGGED_TENSOR_4D[2][0][1:]),
+      (SLICE_BUILDER[2, 0, 1:, 1:], [[16], [18]]),
+      (SLICE_BUILDER[2, 0, :, 1], [14, 16, 18]),
+      (SLICE_BUILDER[2, 0, 1, :], EXAMPLE_RAGGED_TENSOR_4D[2][0][1]),
+
+      # Tests for rt[index, slice, ...]
+      (SLICE_BUILDER[0, :], EXAMPLE_RAGGED_TENSOR_4D[0]),
+      (SLICE_BUILDER[1, :], EXAMPLE_RAGGED_TENSOR_4D[1]),
+      (SLICE_BUILDER[0, :, :, 1], [[2, 4, 6], [8, 10, 12]]),
+      (SLICE_BUILDER[1, :, :, 1], []),
+      (SLICE_BUILDER[2, :, :, 1], [[14, 16, 18]]),
+      (SLICE_BUILDER[3, :, :, 1], [[20]]),
+
+      # Tests for rt[slice, slice, ...]
+      (SLICE_BUILDER[:, :], EXAMPLE_RAGGED_TENSOR_4D),
+      (SLICE_BUILDER[:, :, :, 1], [[[2, 4, 6], [8, 10, 12]], [], [[14, 16, 18]],
+                                   [[20]]]),
+      (SLICE_BUILDER[1:, :, :, 1], [[], [[14, 16, 18]], [[20]]]),
+      (SLICE_BUILDER[-3:, :, :, 1], [[], [[14, 16, 18]], [[20]]]),
+
+      # Test for ellipsis
+      (SLICE_BUILDER[...], EXAMPLE_RAGGED_TENSOR_4D),
+      (SLICE_BUILDER[2, ...], EXAMPLE_RAGGED_TENSOR_4D[2]),
+      (SLICE_BUILDER[2, 0, ...], EXAMPLE_RAGGED_TENSOR_4D[2][0]),
+      (SLICE_BUILDER[..., 0], [[[1, 3, 5], [7, 9, 11]], [], [[13, 15, 17]],
+                               [[19]]]),
+      (SLICE_BUILDER[2, ..., 0], [[13, 15, 17]]),
+      (SLICE_BUILDER[2, 0, ..., 0], [13, 15, 17]),
+
+      # Test for array_ops.newaxis
+      (SLICE_BUILDER[array_ops.newaxis, :], [EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, array_ops.newaxis],
+       [[row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+
+      # Empty slice spec.
+      ([], EXAMPLE_RAGGED_TENSOR_4D),
+
+      # Slicing inner ragged dimensions.
+      (SLICE_BUILDER[:, 1:4], [row[1:4] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, -2:], [row[-2:] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, :-1],
+       [[v[:-1] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, 1:2],
+       [[v[1:2] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[1:, 1:3, 1:2],
+       [[v[1:2] for v in row[1:3]] for row in EXAMPLE_RAGGED_TENSOR_4D[1:]]),
+
+      # Strided slices
+      (SLICE_BUILDER[::2], EXAMPLE_RAGGED_TENSOR_4D[::2]),
+      (SLICE_BUILDER[1::2], EXAMPLE_RAGGED_TENSOR_4D[1::2]),
+      (SLICE_BUILDER[:, ::2], [row[::2] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, 1::2], [row[1::2] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, ::2],
+       [[v[::2] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, 1::2],
+       [[v[1::2] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+
+      # TODO(edloper): Add tests for strided slices, once support is added.
+      # TODO(edloper): Add tests slicing inner ragged dimensions, one support
+      # is added.
+  )
+  def testRaggedTensorGetItemWithRaggedRank2(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = ragged.from_nested_row_splits(
+        EXAMPLE_RAGGED_TENSOR_4D_VALUES,
+        [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
+    with self.test_session():
+      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_4D)
+    self._TestGetItem(rt, slice_spec, expected)
+
+  @parameterized.parameters(
+      # Test for errors in unsupported cases
+      (SLICE_BUILDER[:, 0], ValueError,
+       'Cannot index into an inner ragged dimension.'),
+      (SLICE_BUILDER[:, :, 0], ValueError,
+       'Cannot index into an inner ragged dimension.'),
+
+      # Test for out-of-bounds errors.
+      (SLICE_BUILDER[1, 0], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 0, 3], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[5], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5], ValueError, '.*out of bounds.*'),
+  )
+  def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
+                                                   message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = ragged.from_nested_row_splits(
+        EXAMPLE_RAGGED_TENSOR_4D_VALUES,
+        [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
+    with self.test_session():
+      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_4D)
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[:], []),
+      (SLICE_BUILDER[2:], []),
+      (SLICE_BUILDER[:-3], []),
+  )
+  def testRaggedTensorGetItemWithEmptyTensor(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = ragged.from_row_splits([], [0])
+    self._TestGetItem(rt, slice_spec, expected)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[0], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[-1], ValueError, '.*out of bounds.*'),
+  )
+  def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
+                                                   message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = ragged.from_row_splits([], [0])
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[-4], EXAMPLE_RAGGED_TENSOR_2D[-4]),
+      (SLICE_BUILDER[0], EXAMPLE_RAGGED_TENSOR_2D[0]),
+      (SLICE_BUILDER[-3:], EXAMPLE_RAGGED_TENSOR_2D[-3:]),
+      (SLICE_BUILDER[:3], EXAMPLE_RAGGED_TENSOR_2D[:3]),
+      (SLICE_BUILDER[3:5], EXAMPLE_RAGGED_TENSOR_2D[3:5]),
+      (SLICE_BUILDER[0, 1], EXAMPLE_RAGGED_TENSOR_2D[0][1]),
+      (SLICE_BUILDER[-3, 0], EXAMPLE_RAGGED_TENSOR_2D[-3][0]),
+  )
+  def testRaggedTensorGetItemWithPlaceholderShapes(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Intentionally use an unknown shape for `splits`, to force the code path
+    # that deals with having nrows unknown at graph construction time.
+    splits = constant_op.constant(
+        EXAMPLE_RAGGED_TENSOR_2D_SPLITS, dtype=dtypes.int64)
+    splits = array_ops.placeholder_with_default(splits, None)
+    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES, splits)
+    with self.test_session():
+      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self._TestGetItem(rt, slice_spec, expected)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[..., 2], ValueError,
+       'Ellipsis not supported for unknown shape RaggedTensors'),)
+  def testRaggedTensorGetItemErrorsWithPlaceholderShapes(
+      self, slice_spec, expected, message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Intentionally use an unknown shape for `values`.
+    values = array_ops.placeholder_with_default([0], None)
+    rt = ragged.from_row_splits(values, [0, 1])
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  def testGetItemNewAxis(self):
+    # rt: [[[['a', 'b'], ['c', 'd']], [], [['e', 'f']]], []]
+    splits1 = [0, 3, 3]
+    splits2 = [0, 2, 2, 3]
+    values = constant_op.constant([['a', 'b'], ['c', 'd'], ['e', 'f']])
+    rt = ragged.from_nested_row_splits(values, [splits1, splits2])
+    with self.test_session():
+      rt_newaxis0 = rt[array_ops.newaxis]
+      rt_newaxis1 = rt[:, array_ops.newaxis]
+      rt_newaxis2 = rt[:, :, array_ops.newaxis]
+      rt_newaxis3 = rt[:, :, :, array_ops.newaxis]
+      rt_newaxis4 = rt[:, :, :, :, array_ops.newaxis]
+
+      self.assertEqual(rt.tolist(),
+                       [[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []])
+      self.assertEqual(
+          rt_newaxis0.tolist(),
+          [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []]])
+      self.assertEqual(
+          rt_newaxis1.tolist(),
+          [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]]], [[]]])
+      self.assertEqual(
+          rt_newaxis2.tolist(),
+          [[[[[b'a', b'b'], [b'c', b'd']]], [[]], [[[b'e', b'f']]]], []])
+      self.assertEqual(
+          rt_newaxis3.tolist(),
+          [[[[[b'a', b'b']], [[b'c', b'd']]], [], [[[b'e', b'f']]]], []])
+      self.assertEqual(
+          rt_newaxis4.tolist(),
+          [[[[[b'a'], [b'b']], [[b'c'], [b'd']]], [], [[[b'e'], [b'f']]]], []])
+
+      self.assertEqual(rt.ragged_rank, 2)
+      self.assertEqual(rt_newaxis0.ragged_rank, 3)
+      self.assertEqual(rt_newaxis1.ragged_rank, 3)
+      self.assertEqual(rt_newaxis2.ragged_rank, 3)
+      self.assertEqual(rt_newaxis3.ragged_rank, 2)
+      self.assertEqual(rt_newaxis4.ragged_rank, 2)
+
+      self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2])
+      self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2])
+      self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2])
+      self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2])
+      self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1])
+
+  #=============================================================================
+  # RaggedTensor.__str__
+  #=============================================================================
+  def testRaggedTensorStr(self):
+    rt1 = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
+    expected1 = ('RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
+                 'shape=(7,), dtype=string), row_splits='
+                 'Tensor("RaggedFromRowSplits/row_splits:0", '
+                 'shape=(6,), dtype=int64))')
+    self.assertEqual(str(rt1), expected1)
+    self.assertEqual(repr(rt1), expected1)
+
+  def testRaggedTensorValueStr(self):
+    rt = ragged.RaggedTensorValue(
+        values=np.array(b'a b c d e f g'.split()),
+        row_splits=np.array([0, 2, 5, 6, 6, 7], dtype=np.int64))
+    if sys.version_info[0] == 2:
+      self.assertEqual(' '.join(str(rt).split()),
+                       (r"<RaggedTensorValue [['a', 'b'], ['c', 'd', 'e'], "
+                        "['f'], [], ['g']]>"))
+      self.assertEqual(
+          ' '.join(repr(rt).split()),
+          (r"RaggedTensorValue(values=array(['a', 'b', 'c', 'd', "
+           "'e', 'f', 'g'], dtype='|S1'), row_splits=array([0, 2, 5,"
+           ' 6, 6, 7]))'))
+    else:
+      self.assertEqual(
+          ' '.join(str(rt).split()),
+          (r"<RaggedTensorValue [[b'a', b'b'], [b'c', b'd', b'e'], "
+           "[b'f'], [], [b'g']]>"))
+      self.assertEqual(
+          ' '.join(repr(rt).split()),
+          (r"RaggedTensorValue(values=array([b'a', b'b', b'c', b'd', "
+           "b'e', b'f', b'g'], dtype='|S1'), row_splits=array([0, 2, 5,"
+           ' 6, 6, 7]))'))
+
+  #=============================================================================
+  # RaggedTensor.with_values() and RaggedTensor.with_inner_values().
+  #=============================================================================
+
+  def testWithValues(self):
+    rt1 = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    rt2 = ragged.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[], [7]]])
+
+    rt1_plus_10 = rt1.with_values(rt1.values + 10)
+    rt2_times_10 = rt2.with_inner_values(rt2.inner_values * 10)
+    rt1_expanded = rt1.with_values(array_ops.expand_dims(rt1.values, axis=1))
+
+    with self.test_session():
+      self.assertEqual(rt1_plus_10.tolist(),
+                       [[11, 12], [13, 14, 15], [16], [], [17]])
+      self.assertEqual(rt2_times_10.tolist(),
+                       [[[10, 20], [30, 40, 50]], [[60]], [], [[], [70]]])
+      self.assertEqual(rt1_expanded.tolist(),
+                       [[[1], [2]], [[3], [4], [5]], [[6]], [], [[7]]])
+
+  #=============================================================================
+  # Session.run
+  #=============================================================================
+  def testSessionRun(self):
+    rt1 = ragged.constant([[1, 2, 3], [4]])
+    rt2 = ragged.constant([[[], [1, 2]], [[3]]])
+    with self.test_session() as session:
+      result = session.run({'rt1': rt1, 'rt2': rt2})
+      self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
+      self.assertEqual(result['rt1'].tolist(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].tolist(), [[[], [1, 2]], [[3]]])
+
+  def testSessionRunFeed(self):
+    rt1 = ragged.from_row_splits(
+        array_ops.placeholder(dtypes.int32),
+        array_ops.placeholder(dtypes.int64))
+    rt2 = ragged.from_nested_row_splits(
+        array_ops.placeholder(dtypes.int32),
+        [array_ops.placeholder(dtypes.int64),
+         array_ops.placeholder(dtypes.int64)])
+
+    rt1_feed_val = ragged.constant_value([[1, 2, 3], [4]])
+    rt2_feed_val = ragged.constant_value([[[], [1, 2]], [[3]]])
+
+    with self.test_session() as session:
+      result = session.run({'rt1': rt1, 'rt2': rt2},
+                           feed_dict={rt1: rt1_feed_val,
+                                      rt2: rt2_feed_val})
+      self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
+      self.assertEqual(result['rt1'].tolist(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].tolist(), [[[], [1, 2]], [[3]]])
+
+  def testSessionPartialRunFeed(self):
+    # Placeholder inputs.
+    a = ragged.from_row_splits(
+        array_ops.placeholder(dtypes.int32, shape=[None], name='a.values'),
+        array_ops.placeholder(dtypes.int64, name='a.row_splits'))
+    b = ragged.from_row_splits(
+        array_ops.placeholder(dtypes.int32, shape=[None], name='b.values'),
+        array_ops.placeholder(dtypes.int64, name='b.row_splits'))
+    c = array_ops.placeholder(dtypes.int32, shape=[], name='c')
+
+    # Feed values for placeholder inputs.
+    a_val = ragged.constant_value([[1, 2, 3], [4]])
+    b_val = ragged.constant_value([[5, 4, 3], [2]])
+    c_val = 3
+
+    # Compute some values.
+    r1 = ragged.reduce_sum(a * b, axis=1)
+    r2 = ragged.reduce_sum(a + c, axis=1)
+
+    with self.test_session() as session:
+      handle = session.partial_run_setup([r1, r2], [a, b, c])
+
+      res1 = session.partial_run(handle, r1, feed_dict={a: a_val, b: b_val})
+      self.assertEqual(res1.tolist(), [22, 8])
+
+      res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
+      self.assertEqual(res2.tolist(), [15, 7])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_value.py b/tensorflow/python/ops/ragged/ragged_tensor_value.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d3249c991674a090d2dab4da8fb385b7463f13
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_value.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Value for RaggedTensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+class RaggedTensorValue(object):
+  """Represents the value of a `RaggedTensor`.
+
+  See `RaggedTensor` for a description of ragged tensors.
+  """
+
+  def __init__(self, values, row_splits):
+    """Creates a `RaggedTensorValue`.
+
+    Args:
+      values: A numpy array of any type and shape; or a RaggedTensorValue.
+      row_splits: A 1-D int64 numpy array.
+    """
+    if not (isinstance(row_splits, (np.ndarray, np.generic)) and
+            row_splits.dtype == np.int64 and row_splits.ndim == 1):
+      raise TypeError("row_splits must be a 1D int64 numpy array")
+    if not isinstance(values, (np.ndarray, np.generic, RaggedTensorValue)):
+      raise TypeError("values must be a numpy array or a RaggedTensorValue")
+    self._values = values
+    self._row_splits = row_splits
+
+  row_splits = property(
+      lambda self: self._row_splits,
+      doc="""The split indices for the ragged tensor value.""")
+  values = property(
+      lambda self: self._values,
+      doc="""The concatenated values for all rows in this tensor.""")
+  dtype = property(
+      lambda self: self._values.dtype,
+      doc="""The numpy dtype of values in this tensor.""")
+
+  @property
+  def inner_values(self):
+    """The innermost `values` array for this ragged tensor value."""
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensorValue):
+      rt_values = rt_values.values
+    return rt_values
+
+  @property
+  def nested_row_splits(self):
+    """The row_splits for all ragged dimensions in this ragged tensor value."""
+    rt_nested_splits = [self.row_splits]
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensorValue):
+      rt_nested_splits.append(rt_values.row_splits)
+      rt_values = rt_values.values
+    return tuple(rt_nested_splits)
+
+  @property
+  def ragged_rank(self):
+    """The number of ragged dimensions in this ragged tensor value."""
+    values_is_ragged = isinstance(self._values, RaggedTensorValue)
+    return self._values.ragged_rank + 1 if values_is_ragged else 1
+
+  @property
+  def shape(self):
+    """A tuple indicating the shape of this RaggedTensorValue."""
+    return (self._row_splits.shape[0] - 1,) + (None,) + self._values.shape[1:]
+
+  def __str__(self):
+    return "<RaggedTensorValue %s>" % self.tolist()
+
+  def __repr__(self):
+    return "RaggedTensorValue(values=%r, row_splits=%r)" % (self._values,
+                                                            self._row_splits)
+
+  def tolist(self):
+    """Returns this ragged tensor value as a nested Python list."""
+    values_as_list = self._values.tolist()
+    return [
+        values_as_list[self._row_splits[i]:self._row_splits[i + 1]]
+        for i in range(len(self._row_splits) - 1)
+    ]
diff --git a/tensorflow/python/ops/ragged/ragged_tile_op_test.py b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf62d96e7a9d2371ebb808548dfbb5b73677caa6
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
@@ -0,0 +1,215 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.tile."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.platform import googletest
+
+
+class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Example
+      #=========================================================================
+      dict(
+          descr='docstring example: ragged_rank=1, repeat axes 0 and 1',
+          rt_input=[[1, 2], [3]],
+          multiples=[3, 2],
+          expected=[
+              [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]],
+      ),
+
+      #=========================================================================
+      # rank=3, ragged_rank=2
+      #=========================================================================
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axis 0',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[2, 1, 1],
+          expected=[[[1, 2], [3]], [], [[4]],
+                    [[1, 2], [3]], [], [[4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axis 1',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[1, 2, 1],
+          expected=[[[1, 2], [3], [1, 2], [3]], [], [[4], [4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axis 2',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[1, 1, 2],
+          expected=[[[1, 2, 1, 2], [3, 3]], [], [[4, 4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axes 0 and 1',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[2, 2, 1],
+          expected=[[[1, 2], [3], [1, 2], [3]], [], [[4], [4]],
+                    [[1, 2], [3], [1, 2], [3]], [], [[4], [4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axes 0 and 2',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[2, 1, 2],
+          expected=[[[1, 2, 1, 2], [3, 3]], [], [[4, 4]],
+                    [[1, 2, 1, 2], [3, 3]], [], [[4, 4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axes 1 and 2',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[1, 2, 2],
+          expected=[[[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]],
+                    [], [[4, 4], [4, 4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat all axes',
+          rt_input=[[['a', 'b'], ['c']], [], [['d']]],
+          multiples=[4, 3, 2],
+          expected=[[[b'a', b'b']*2, [b'c']*2]*3, []*3, [[b'd']*2]*3]*4),
+      #=========================================================================
+      # rank=3, ragged_rank=1
+      #=========================================================================
+      dict(
+          descr='rank=3, ragged_rank=1, repeat axis 0',
+          ragged_rank=1,
+          rt_input=[[[1, 2], [3, 4]], [], [[5, 6]]],
+          multiples=[2, 1, 1],
+          expected=[[[1, 2], [3, 4]], [], [[5, 6]],
+                    [[1, 2], [3, 4]], [], [[5, 6]]]),
+      dict(
+          descr='rank=3, ragged_rank=1, repeat axis 1',
+          ragged_rank=1,
+          rt_input=[[[1, 2], [3, 4]], [], [[5, 6]]],
+          multiples=[1, 2, 1],
+          expected=[[[1, 2], [3, 4], [1, 2], [3, 4]], [], [[5, 6], [5, 6]]]),
+      dict(
+          descr='rank=3, ragged_rank=1, repeat axis 2',
+          ragged_rank=1,
+          rt_input=[[[1, 2], [3, 4]], [], [[5, 6]]],
+          multiples=[1, 1, 2],
+          expected=[[[1, 2, 1, 2], [3, 4, 3, 4]], [], [[5, 6, 5, 6]]]),
+      #=========================================================================
+      # rank=4, ragged_rank=3
+      #=========================================================================
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 0',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[2, 1, 1, 1],
+          expected=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]],
+                    [[[1], [2]], [[3]]], [[]], [[[4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 1',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 2, 1, 1],
+          expected=[[[[1], [2]], [[3]], [[1], [2]], [[3]]],
+                    [[], []],
+                    [[[4, 5]], [[4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 2',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 2, 1],
+          expected=[[[[1], [2], [1], [2]], [[3], [3]]],
+                    [[]],
+                    [[[4, 5], [4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 3',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 1, 2],
+          expected=[[[[1, 1], [2, 2]], [[3, 3]]], [[]], [[[4, 5, 4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat all axes',
+          rt_input=[[[['a'], ['b']], [['c']]], [[]], [[['d', 'e']]]],
+          multiples=[5, 4, 3, 2],
+          expected=[[[[b'a']*2, [b'b']*2]*3, [[b'c']*2]*3]*4,
+                    [[]*3]*4,
+                    [[[b'd', b'e']*2]*3]*4]*5),
+      dict(
+          descr='rank=5, ragged_rank=4, repeat all axes',
+          rt_input=[[[[['a']]]]],
+          multiples=[6, 5, 4, 3, 2],
+          expected=[[[[[b'a']*2]*3]*4]*5]*6),
+      #=========================================================================
+      # multiple=0
+      #=========================================================================
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 0 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[0, 1, 1, 1],
+          expected=[]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 1 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 0, 1, 1],
+          expected=[[], [], []]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 2 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 0, 1],
+          expected=[[[], []], [[]], [[]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 3 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 1, 0],
+          expected=[[[[], []], [[]]], [[]], [[[]]]]),
+
+  ])  # pyformat: disable
+  def testRaggedTile(self,
+                     descr,
+                     rt_input,
+                     multiples,
+                     expected,
+                     ragged_rank=None):
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank)
+
+    expected_shape = [
+        None if dim is None else dim * multiple
+        for (dim, multiple) in zip(rt.shape.as_list(), multiples)
+    ]
+
+    # Test with both const & non-const multiples: ragged_tile has a few code
+    # paths that optimize the case where multiples[d] is known to be 1.
+    const_multiples = constant_op.constant(multiples, dtypes.int64)
+    non_const_multiples = array_ops.placeholder_with_default(
+        const_multiples, shape=[len(multiples)])
+
+    for multiples_tensor in (const_multiples, non_const_multiples):
+      tiled = ragged_array_ops.tile(rt, multiples_tensor)
+      self.assertEqual(tiled.ragged_rank, rt.ragged_rank)
+      self.assertEqual(tiled.shape.ndims, rt.shape.ndims)
+      if multiples_tensor is const_multiples:
+        self.assertEqual(tiled.shape.as_list(), expected_shape)
+      with self.test_session():
+        self.assertEqual(tiled.eval().tolist(), expected)
+
+  def testRaggedTileWithTensorInput(self):
+    # When the input is a `Tensor`, ragged_tile just delegates to tf.tile.
+    dt = constant_op.constant([[1, 2], [3, 4]])
+    tiled = ragged_array_ops.tile(dt, [3, 2])
+    expected = [[1, 2, 1, 2], [3, 4, 3, 4],
+                [1, 2, 1, 2], [3, 4, 3, 4],
+                [1, 2, 1, 2], [3, 4, 3, 4]]  # pyformat: disable
+    with self.test_session():
+      self.assertEqual(tiled.eval().tolist(), expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fd31837c62de43a1ecb1162f2c1818094d34633
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
@@ -0,0 +1,193 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.to_sparse op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
+
+  def testDocStringExample(self):
+    rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+    st = ragged.to_sparse(rt)
+    expected = ('SparseTensorValue(indices='
+                'array([[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]]), '
+                'values=array([1, 2, 3, 4, 5, 6], dtype=int32), '
+                'dense_shape=array([4, 3]))')
+    with self.test_session():
+      self.assertEqual(' '.join(repr(st.eval()).split()), expected)
+
+  def test2DRaggedTensorWithOneRaggedDimension(self):
+    rt = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    with self.test_session():
+      st = ragged.to_sparse(rt).eval()
+      self.assertAllEqual(
+          st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
+      self.assertAllEqual(st.values, b'a b c d e f g'.split())
+      self.assertAllEqual(st.dense_shape, [5, 3])
+
+  def test3DRaggedTensorWithOneRaggedDimension(self):
+    rt = ragged.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]],
+                          [[11, 12]], [], [[13, 14]]],
+                         ragged_rank=1)
+    with self.test_session():
+      st = ragged.to_sparse(rt).eval()
+      self.assertAllEqual(
+          st.indices, [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
+                       [1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 2, 0], [1, 2, 1],
+                       [2, 0, 0], [2, 0, 1], [4, 0, 0], [4, 0, 1]])
+      self.assertAllEqual(st.values,
+                          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+      self.assertAllEqual(st.dense_shape, [5, 3, 2])
+
+  def test4DRaggedTensorWithOneRaggedDimension(self):
+    rt = ragged.constant(
+        [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], [], [[[9, 10], [11, 12]]]],
+        ragged_rank=1)
+    with self.test_session():
+      st = ragged.to_sparse(rt).eval()
+      self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
+      self.assertAllEqual(
+          st.indices,
+          [
+              [0, 0, 0, 0],  # index for value=1
+              [0, 0, 0, 1],  # index for value=2
+              [0, 0, 1, 0],  # index for value=3
+              [0, 0, 1, 1],  # index for value=4
+              [0, 1, 0, 0],  # index for value=5
+              [0, 1, 0, 1],  # index for value=6
+              [0, 1, 1, 0],  # index for value=7
+              [0, 1, 1, 1],  # index for value=8
+              [2, 0, 0, 0],  # index for value=9
+              [2, 0, 0, 1],  # index for value=10
+              [2, 0, 1, 0],  # index for value=11
+              [2, 0, 1, 1],  # index for value=12
+          ])
+      self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
+
+  def test4DRaggedTensorWithTwoRaggedDimensions(self):
+    rt = ragged.constant([[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
+                          [[[11, 12]], [], [[13, 14]]], []],
+                         ragged_rank=2)
+    with self.test_session():
+      st = ragged.to_sparse(rt).eval()
+      self.assertAllEqual(
+          st.indices,
+          [
+              [0, 0, 0, 0],  # index for value=1
+              [0, 0, 0, 1],  # index for value=2
+              [0, 0, 1, 0],  # index for value=3
+              [0, 0, 1, 1],  # index for value=4
+              [0, 1, 0, 0],  # index for value=5
+              [0, 1, 0, 1],  # index for value=6
+              [0, 1, 1, 0],  # index for value=7
+              [0, 1, 1, 1],  # index for value=8
+              [0, 1, 2, 0],  # index for value=9
+              [0, 1, 2, 1],  # index for value=10
+              [1, 0, 0, 0],  # index for value=11
+              [1, 0, 0, 1],  # index for value=12
+              [1, 2, 0, 0],  # index for value=13
+              [1, 2, 0, 1],  # index for value=14
+          ])
+      self.assertAllEqual(st.values,
+                          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+      self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
+
+  def testShape(self):
+    rt = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    st = ragged.to_sparse(rt)
+    self.assertEqual(st.indices.shape.as_list(), [7, 2])
+    self.assertEqual(st.values.shape.as_list(), [7])
+    self.assertEqual(st.dense_shape.shape.as_list(), [2])
+
+    rt = ragged.constant([[[1, 2]], [], [[3, 4]], []], ragged_rank=1)
+    st = ragged.to_sparse(rt)
+    self.assertEqual(st.indices.shape.as_list(), [4, 3])
+    self.assertEqual(st.values.shape.as_list(), [4])
+    self.assertEqual(st.dense_shape.shape.as_list(), [3])
+
+    rt = ragged.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
+    st = ragged.to_sparse(rt)
+    self.assertEqual(st.indices.shape.as_list(), [7, 3])
+    self.assertEqual(st.values.shape.as_list(), [7])
+    self.assertEqual(st.dense_shape.shape.as_list(), [3])
+
+  def testKernelErrors(self):
+    # An empty vector, defined using a placeholder to ensure that we can't
+    # determine that it's invalid at graph-construction time.
+    empty_vector = array_ops.placeholder_with_default(
+        array_ops.zeros([0], dtypes.int64), shape=None)
+
+    bad_rt1 = ragged.from_row_splits(row_splits=[2, 3], values=[1, 2, 3])
+    with self.test_session():
+      bad_split0_error = r'First value of ragged splits must be 0.*'
+      self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0_error,
+                              ragged.to_sparse(bad_rt1).eval)
+
+    bad_rt2 = ragged.from_row_splits(row_splits=[0, 5], values=empty_vector)
+    bad_rt3 = ragged.from_row_splits(
+        row_splits=[0, 1],
+        values=ragged.from_row_splits(row_splits=[0, 5], values=empty_vector))
+    with self.test_session():
+      split_mismatch1_error = r'Final value of ragged splits must match.*'
+      for rt in [bad_rt2, bad_rt3]:
+        self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                split_mismatch1_error,
+                                ragged.to_sparse(rt).eval)
+
+    bad_rt4 = ragged.from_row_splits(
+        row_splits=[0, 5],
+        values=ragged.from_row_splits(row_splits=[0], values=empty_vector))
+    with self.test_session():
+      split_mismatch2_error = r'Final value of ragged splits must match.*'
+      self.assertRaisesRegexp(errors.InvalidArgumentError,
+                              split_mismatch2_error,
+                              ragged.to_sparse(bad_rt4).eval)
+
+    bad_rt5 = ragged.from_row_splits(row_splits=empty_vector, values=[])
+    with self.test_session():
+      empty_splits_error = (r'ragged splits may not be empty.*')
+      self.assertRaisesRegexp(errors.InvalidArgumentError, empty_splits_error,
+                              ragged.to_sparse(bad_rt5).eval)
+
+  def testGradient(self):
+    # rt1.shape == rt2.shape == [2, (D2), (D3), 2].
+    rt1 = ragged.constant([[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]],
+                          ragged_rank=2)
+    rt2 = ragged.constant([[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]],
+                          ragged_rank=2)
+    rt = rt1 + rt2 * 2.0
+    st = ragged.to_sparse(rt)
+
+    g1, g2 = gradients_impl.gradients(st.values, [rt1.inner_values,
+                                                  rt2.inner_values])
+    print(g1, g2)
+    with self.test_session():
+      self.assertEqual(g1.eval().tolist(), [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
+      self.assertEqual(g2.eval().tolist(), [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..688676e46c699d8e86da487043ad6d484c5fdc64
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
@@ -0,0 +1,142 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.to_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
+                                 parameterized.TestCase):
+
+  def testDocStringExamples(self):
+    """Example from ragged_to_tensor.__doc__."""
+    rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+    dt = ragged.to_tensor(rt)
+    with self.test_session():
+      self.assertEqual(str(dt.eval()),
+                       '[[9 8 7]\n'
+                       ' [0 0 0]\n'
+                       ' [6 5 0]\n'
+                       ' [4 0 0]]')  # pyformat: disable
+
+  @parameterized.parameters(
+      {
+          'rt_input': [],
+          'ragged_rank': 1,
+          'expected': [],
+          'expected_shape': [0, 0],
+      },
+      {
+          'rt_input': [[1, 2, 3], [], [4], [5, 6]],
+          'expected': [[1, 2, 3], [0, 0, 0], [4, 0, 0], [5, 6, 0]]
+      },
+      {
+          'rt_input': [[1, 2, 3], [], [4], [5, 6]],
+          'default': 9,
+          'expected': [[1, 2, 3], [9, 9, 9], [4, 9, 9], [5, 6, 9]]
+      },
+      {
+          'rt_input': [[[1], [2], [3]], [], [[4]], [[5], [6]]],
+          'ragged_rank':
+              1,
+          'default': [9],
+          'expected': [[[1], [2], [3]], [[9], [9], [9]], [[4], [9], [9]],
+                       [[5], [6], [9]]]
+      },
+      {
+          'rt_input': [[[1, 2], [], [3, 4]], [], [[5]], [[6, 7], [8]]],
+          'expected': [
+              [[1, 2], [0, 0], [3, 4]],  #
+              [[0, 0], [0, 0], [0, 0]],  #
+              [[5, 0], [0, 0], [0, 0]],  #
+              [[6, 7], [8, 0], [0, 0]],  #
+          ]
+      },
+      {
+          'rt_input': [[[1, 2], [], [3, 4]], [], [[5]], [[6, 7], [8]]],
+          'default':
+              9,
+          'expected': [
+              [[1, 2], [9, 9], [3, 4]],  #
+              [[9, 9], [9, 9], [9, 9]],  #
+              [[5, 9], [9, 9], [9, 9]],  #
+              [[6, 7], [8, 9], [9, 9]],  #
+          ]
+      },
+      {
+          'rt_input': [[[1], [2], [3]]],
+          'ragged_rank': 1,
+          'default': 0,
+          'expected': [[[1], [2], [3]]],
+      },
+      {
+          'rt_input': [[[[1], [2]], [], [[3]]]],
+          'default': 9,
+          'expected': [[[[1], [2]], [[9], [9]], [[3], [9]]]],
+      },
+  )
+  def testRaggedTensorToTensor(self,
+                               rt_input,
+                               expected,
+                               ragged_rank=None,
+                               default=None,
+                               expected_shape=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    dt = ragged.to_tensor(rt, default)
+    self.assertEqual(type(dt), ops.Tensor)
+    self.assertEqual(rt.dtype, dt.dtype)
+    self.assertTrue(dt.shape.is_compatible_with(rt.shape))
+    with self.test_session():
+      self.assertEqual(dt.eval().tolist(), expected)
+      if expected_shape is not None:
+        dt_shape = array_ops.shape(dt)
+        self.assertEqual(dt_shape.eval().tolist(), expected_shape)
+
+  @parameterized.parameters(
+      {
+          'rt_input': [[1, 2, 3]],
+          'default': [0],
+          'error': (ValueError, r'Shape \(1,\) must have rank at most 0'),
+      },
+      {
+          'rt_input': [[[1, 2], [3, 4]], [[5, 6]]],
+          'ragged_rank': 1,
+          'default': [7, 8, 9],
+          'error': (ValueError, r'Shapes \(3,\) and \(2,\) are incompatible'),
+      },
+      {
+          'rt_input': [[1, 2, 3]],
+          'default': 'a',
+          'error': (TypeError, "Expected int32, got 'a' of type 'str' instead"),
+      },
+  )
+  def testError(self, rt_input, default, error, ragged_rank=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    with self.assertRaisesRegexp(error[0], error[1]):
+      ragged.to_tensor(rt, default)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_util.py b/tensorflow/python/ops/ragged/ragged_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f050de514e7f13de34ba7df23629d56b8ec453
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_util.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Private convenience functions for RaggedTensors.
+
+None of these methods are exposed in the main "ragged" package.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+
+
+def convert_to_int_tensor(tensor, name, dtype=dtypes.int32):
+  """Converts the given value to an integer Tensor."""
+  tensor = ops.convert_to_tensor(tensor, name=name, preferred_dtype=dtype)
+  if tensor.dtype.is_integer:
+    tensor = math_ops.cast(tensor, dtype)
+  else:
+    raise TypeError(
+        "%s must be an integer tensor; dtype=%s" % (name, tensor.dtype))
+  return tensor
+
+
+def get_positive_axis(axis, ndims):
+  """Validate an `axis` parameter, and normalize it to be positive.
+
+  If `ndims` is known (i.e., not `None`), then check that `axis` is in the
+  range `-ndims <= axis < ndims`, and return `axis` (if `axis >= 0`) or
+  `axis + ndims` (otherwise).
+  If `ndims` is not known, and `axis` is positive, then return it as-is.
+  If `ndims` is not known, and `axis` is negative, then report an error.
+
+  Args:
+    axis: An integer constant
+    ndims: An integer constant, or `None`
+
+  Returns:
+    The normalized `axis` value.
+
+  Raises:
+    ValueError: If `axis` is out-of-bounds, or if `axis` is negative and
+      `ndims is None`.
+  """
+  if not isinstance(axis, int):
+    raise TypeError("axis must be an int; got %s" % type(axis).__name__)
+  if ndims is not None:
+    if 0 <= axis < ndims:
+      return axis
+    elif -ndims <= axis < 0:
+      return axis + ndims
+    else:
+      raise ValueError(
+          "axis=%s out of bounds: expected %s<=axis<%s" % (axis, -ndims, ndims))
+  elif axis < 0:
+    raise ValueError("axis may only be negative if ndims is statically known.")
+  return axis
+
+
+def assert_splits_match(nested_splits_lists):
+  """Checks that the given splits lists are identical.
+
+  Performs static tests to ensure that the given splits lists are identical,
+  and returns a list of control dependency op tensors that check that they are
+  fully identical.
+
+  Args:
+    nested_splits_lists: A list of nested_splits_lists, where each split_list is
+      a list of `splits` tensors from a `RaggedTensor`, ordered from outermost
+      ragged dimension to innermost ragged dimension.
+
+  Returns:
+    A list of control dependency op tensors.
+  Raises:
+    ValueError: If the splits are not identical.
+  """
+  error_msg = "Inputs must have identical ragged splits"
+  for splits_list in nested_splits_lists:
+    if len(splits_list) != len(nested_splits_lists[0]):
+      raise ValueError(error_msg)
+  return [
+      check_ops.assert_equal(s1, s2, message=error_msg)
+      for splits_list in nested_splits_lists[1:]
+      for (s1, s2) in zip(nested_splits_lists[0], splits_list)
+  ]
+
+
+# This op is intended to exactly match the semantics of numpy.repeat, with
+# one exception: numpy.repeat has special (and somewhat non-intuitive) behavior
+# when axis is not specified.  Rather than implement that special behavior, we
+# simply make `axis` be a required argument.
+#
+# External (OSS) `tf.repeat` feature request:
+# https://github.com/tensorflow/tensorflow/issues/8246
+def repeat(data, repeats, axis, name=None):
+  """Repeats elements of `data`.
+
+  Args:
+    data: An `N`-dimensional tensor.
+    repeats: A 1-D integer tensor specifying how many times each element in
+      `axis` should be repeated.  `len(repeats)` must equal `data.shape[axis]`.
+      Supports broadcasting from a scalar value.
+    axis: `int`.  The axis along which to repeat values.  Must be less than
+      `max(N, 1)`.
+    name: A name for the operation.
+
+  Returns:
+    A tensor with `max(N, 1)` dimensions.  Has the same shape as `data`,
+    except that dimension `axis` has size `sum(repeats)`.
+
+  #### Examples:
+    ```python
+    >>> repeat(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
+    ['a', 'a', 'a', 'c', 'c']
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
+    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
+    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
+    ```
+  """
+  if not isinstance(axis, int):
+    raise TypeError("axis must be an int; got %s" % type(axis).__name__)
+
+  with ops.name_scope(name, "Repeat", [data, repeats]):
+    data = ops.convert_to_tensor(data, name="data")
+    repeats = convert_to_int_tensor(repeats, name="repeats")
+    repeats.shape.with_rank_at_most(1)
+
+    # If `data` is a scalar, then upgrade it to a vector.
+    data = _with_nonzero_rank(data)
+    data_shape = array_ops.shape(data)
+
+    # If `axis` is negative, then convert it to a positive value.
+    axis = get_positive_axis(axis, data.shape.ndims)
+
+    # Check data Tensor shapes.
+    if repeats.shape.ndims == 1:
+      data.shape.dims[axis].assert_is_compatible_with(repeats.shape[0])
+
+    # If we know that `repeats` is a scalar, then we can just tile & reshape.
+    if repeats.shape.ndims == 0:
+      expanded = array_ops.expand_dims(data, axis + 1)
+      tiled = tile_one_dimension(expanded, axis + 1, repeats)
+      result_shape = array_ops.concat(
+          [data_shape[:axis], [-1], data_shape[axis + 1:]], axis=0)
+      return array_ops.reshape(tiled, result_shape)
+
+    # Broadcast the `repeats` tensor so rank(repeats) == axis + 1.
+    if repeats.shape.ndims != axis + 1:
+      repeats_shape = array_ops.shape(repeats)
+      repeats_ndims = array_ops.rank(repeats)
+      broadcast_shape = array_ops.concat(
+          [data_shape[:axis + 1 - repeats_ndims], repeats_shape], axis=0)
+      repeats = array_ops.broadcast_to(repeats, broadcast_shape)
+      repeats.set_shape([None] * (axis + 1))
+
+    # Create a "sequence mask" based on `repeats`, where slices across `axis`
+    # contain one `True` value for each repetition.  E.g., if
+    # `repeats = [3, 1, 2]`, then `mask = [[1, 1, 1], [1, 0, 0], [1, 1, 0]]`.
+    max_repeat = math_ops.maximum(0, math_ops.reduce_max(repeats))
+    mask = array_ops.sequence_mask(repeats, max_repeat)
+
+    # Add a new dimension around each value that needs to be repeated, and
+    # then tile that new dimension to match the maximum number of repetitions.
+    expanded = array_ops.expand_dims(data, axis + 1)
+    tiled = tile_one_dimension(expanded, axis + 1, max_repeat)
+
+    # Use `boolean_mask` to discard the extra repeated values.  This also
+    # flattens all dimensions up through `axis`.
+    masked = array_ops.boolean_mask(tiled, mask)
+
+    # Reshape the output tensor to add the outer dimensions back.
+    if axis == 0:
+      result = masked
+    else:
+      result_shape = array_ops.concat(
+          [data_shape[:axis], [-1], data_shape[axis + 1:]], axis=0)
+      result = array_ops.reshape(masked, result_shape)
+
+    # Preserve shape information.
+    if data.shape.ndims is not None:
+      new_axis_size = 0 if repeats.shape[0] == 0 else None
+      result.set_shape(data.shape[:axis].concatenate(
+          [new_axis_size]).concatenate(data.shape[axis + 1:]))
+
+    return result
+
+
+def tile_one_dimension(data, axis, multiple):
+  """Tiles a single dimension of a tensor."""
+  # Assumes axis is a nonnegative int.
+  if data.shape.ndims is not None:
+    multiples = [1] * data.shape.ndims
+    multiples[axis] = multiple
+  else:
+    ones = array_ops.ones(array_ops.rank(data), dtypes.int32)
+    multiples = array_ops.concat([ones[:axis], [multiple], ones[axis + 1:]],
+                                 axis=0)
+  return array_ops.tile(data, multiples)
+
+
+def _with_nonzero_rank(data):
+  """If `data` is scalar, then add a dimension; otherwise return as-is."""
+  if data.shape.ndims is not None:
+    if data.shape.ndims == 0:
+      return array_ops.stack([data])
+    else:
+      return data
+  else:
+    data_shape = array_ops.shape(data)
+    data_ndims = array_ops.rank(data)
+    return array_ops.reshape(
+        data,
+        array_ops.concat([[1], data_shape], axis=0)[-data_ndims:])
diff --git a/tensorflow/python/ops/ragged/ragged_util_test.py b/tensorflow/python/ops/ragged/ragged_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c24ea65353104f78f9f4e3e90b0c73edb923c7e2
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_util_test.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.platform import googletest
+
+# Example 3d tensor for test cases.  Has shape [4, 2, 3].
+TENSOR_3D = [[[('%d%d%d' % (i, j, k)).encode('utf-8')
+               for k in range(3)]
+              for j in range(2)]
+             for i in range(4)]
+
+# Example 4d tensor for test cases.  Has shape [4, 2, 3, 5].
+TENSOR_4D = [[[[('%d%d%d%d' % (i, j, k, l)).encode('utf-8')
+                for l in range(5)]
+               for k in range(3)]
+              for j in range(2)]
+             for i in range(4)]
+
+
+class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Docstring examples
+      dict(
+          data=['a', 'b', 'c'],
+          repeats=[3, 0, 2],
+          axis=0,
+          expected=[b'a', b'a', b'a', b'c', b'c']),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=[2, 3],
+          axis=0,
+          expected=[[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=[2, 3],
+          axis=1,
+          expected=[[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]),
+
+      # Scalar repeats value
+      dict(
+          data=['a', 'b', 'c'],
+          repeats=2,
+          axis=0,
+          expected=[b'a', b'a', b'b', b'b', b'c', b'c']),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=2,
+          axis=0,
+          expected=[[1, 2], [1, 2], [3, 4], [3, 4]]),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=2,
+          axis=1,
+          expected=[[1, 1, 2, 2], [3, 3, 4, 4]]),
+
+      # data & repeats are broadcast to have at least one dimension,
+      # so these are all equivalent:
+      dict(data=3, repeats=4, axis=0, expected=[3, 3, 3, 3]),
+      dict(data=[3], repeats=4, axis=0, expected=[3, 3, 3, 3]),
+      dict(data=3, repeats=[4], axis=0, expected=[3, 3, 3, 3]),
+      dict(data=[3], repeats=[4], axis=0, expected=[3, 3, 3, 3]),
+      # Empty tensor
+      dict(data=[], repeats=[], axis=0, expected=[]),
+  ])
+  def testRepeat(self, data, repeats, expected, axis=None):
+    result = ragged_util.repeat(data, repeats, axis)
+    with self.test_session():
+      self.assertEqual(result.eval().tolist(), expected)
+
+  @parameterized.parameters([
+      dict(mode=mode, **args)
+      for mode in ['constant', 'dynamic', 'unknown_shape']
+      for args in [
+          # data & repeats are broadcast to have at least one dimension,
+          # so these are all equivalent:
+          dict(data=3, repeats=4, axis=0),
+          dict(data=[3], repeats=4, axis=0),
+          dict(data=3, repeats=[4], axis=0),
+          dict(data=[3], repeats=[4], axis=0),
+
+          # 1-dimensional data tensor.
+          dict(data=[], repeats=5, axis=0),
+          dict(data=[1, 2, 3], repeats=5, axis=0),
+          dict(data=[1, 2, 3], repeats=[3, 0, 2], axis=0),
+          dict(data=[1, 2, 3], repeats=[3, 0, 2], axis=-1),
+          dict(data=[b'a', b'b', b'c'], repeats=[3, 0, 2], axis=0),
+
+          # 2-dimensional data tensor.
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=3, axis=0),
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=3, axis=1),
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=[3, 5], axis=0),
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=[3, 5, 7], axis=1),
+
+          # 3-dimensional data tensor: shape=[4, 2, 3].
+          dict(data=TENSOR_3D, repeats=2, axis=0),
+          dict(data=TENSOR_3D, repeats=2, axis=1),
+          dict(data=TENSOR_3D, repeats=2, axis=2),
+          dict(data=TENSOR_3D, repeats=[2, 0, 4, 1], axis=0),
+          dict(data=TENSOR_3D, repeats=[3, 2], axis=1),
+          dict(data=TENSOR_3D, repeats=[1, 3, 1], axis=2),
+
+          # 4-dimensional data tensor: shape=[4, 2, 3, 5].
+          dict(data=TENSOR_4D, repeats=2, axis=0),
+          dict(data=TENSOR_4D, repeats=2, axis=1),
+          dict(data=TENSOR_4D, repeats=2, axis=2),
+          dict(data=TENSOR_4D, repeats=2, axis=3),
+          dict(data=TENSOR_4D, repeats=[2, 0, 4, 1], axis=0),
+          dict(data=TENSOR_4D, repeats=[3, 2], axis=1),
+          dict(data=TENSOR_4D, repeats=[1, 3, 1], axis=2),
+          dict(data=TENSOR_4D, repeats=[1, 3, 0, 0, 2], axis=3),
+      ]
+  ])
+  def testValuesMatchesNumpy(self, mode, data, repeats, axis):
+    # Exception: we can't handle negative axis if data.ndims is unknown.
+    if axis < 0 and mode == 'unknown_shape':
+      return
+
+    expected = np.repeat(data, repeats, axis)
+
+    if mode == 'constant':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+    elif mode == 'dynamic':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+      data = array_ops.placeholder_with_default(data, data.shape)
+      repeats = array_ops.placeholder_with_default(repeats, repeats.shape)
+    elif mode == 'unknown_shape':
+      data = array_ops.placeholder_with_default(data, None)
+      repeats = array_ops.placeholder_with_default(repeats, None)
+
+    result = ragged_util.repeat(data, repeats, axis)
+    with self.test_session():
+      self.assertEqual(result.eval().tolist(), expected.tolist())
+
+  @parameterized.parameters([
+      dict(
+          descr='axis >= rank(data)',
+          mode='dynamic',
+          data=[1, 2, 3],
+          repeats=[3, 0, 2],
+          axis=1,
+          error='axis=1 out of bounds: expected -1<=axis<1'),
+      dict(
+          descr='axis < -rank(data)',
+          mode='dynamic',
+          data=[1, 2, 3],
+          repeats=[3, 0, 2],
+          axis=-2,
+          error='axis=-2 out of bounds: expected -1<=axis<1'),
+      dict(
+          descr='len(repeats) != data.shape[axis]',
+          mode='dynamic',
+          data=[[1, 2, 3], [4, 5, 6]],
+          repeats=[2, 3],
+          axis=1,
+          error='Dimensions 3 and 2 are not compatible'),
+      dict(
+          descr='rank(repeats) > 1',
+          mode='dynamic',
+          data=[[1, 2, 3], [4, 5, 6]],
+          repeats=[[3], [5]],
+          axis=1,
+          error=r'Shape \(2, 1\) must have rank at most 1'),
+      dict(
+          descr='non-integer axis',
+          mode='constant',
+          data=[1, 2, 3],
+          repeats=2,
+          axis='foo',
+          exception=TypeError,
+          error='axis must be an int'),
+  ])
+  def testError(self,
+                descr,
+                mode,
+                data,
+                repeats,
+                axis,
+                exception=ValueError,
+                error=None):
+    # Make sure that this is also an error case for numpy.
+    with self.assertRaises(exception):
+      np.repeat(data, repeats, axis)
+
+    if mode == 'constant':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+    elif mode == 'dynamic':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+      data = array_ops.placeholder_with_default(data, data.shape)
+      repeats = array_ops.placeholder_with_default(repeats, repeats.shape)
+    elif mode == 'unknown_shape':
+      data = array_ops.placeholder_with_default(data, None)
+      repeats = array_ops.placeholder_with_default(repeats, None)
+
+    with self.assertRaisesRegexp(exception, error):
+      ragged_util.repeat(data, repeats, axis)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..03672e4521be1f72e891f27cd5c2925c2cdd18d1
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -0,0 +1,199 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.where."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.platform import googletest
+
+
+class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Examples
+      #=========================================================================
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          expected=[[0, 0], [0, 2], [1, 1]]),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([True, False]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'd', b'e']])),
+      #=========================================================================
+      # Coordinate-retrieval mode
+      #=========================================================================
+      dict(  # shape=[D1]
+          condition=[True, False, True, False, True],
+          expected=[[0], [2], [4]]),
+      dict(  # shape=[D1, D2]
+          condition=[[True, False], [False, True]],
+          expected=[[0, 0], [1, 1]]),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          expected=[[0, 0], [0, 2], [1, 1]]),
+      dict(  # shape=[D1, (D2), (D3)]
+          condition=ragged.constant_value([
+              [[True, False, True], [False, True]],
+              [[True], [], [False], [False, True, False]]
+          ]),
+          expected=[[0, 0, 0], [0, 0, 2], [0, 1, 1],
+                    [1, 0, 0], [1, 3, 1]]),
+      dict(  # shape=[D1, (D2), D3]
+          condition=ragged.constant_value([
+              [[True, False], [False, True]],
+              [[True, False], [False, False], [True, False], [False, True]]
+          ], ragged_rank=1),
+          expected=[[0, 0, 0], [0, 1, 1],
+                    [1, 0, 0], [1, 2, 0], [1, 3, 1]]),
+      dict(  # shape=[D1, (D2), (D3), (D4)]
+          condition=ragged.constant_value([
+              [[[], [True]]],
+              [[[True, False, True], [False, True]],
+               [[True], [], [False], [False, True, False]]]
+          ]),
+          expected=[[0, 0, 1, 0],
+                    [1, 0, 0, 0], [1, 0, 0, 2], [1, 0, 1, 1],
+                    [1, 1, 0, 0], [1, 1, 3, 1]]),
+
+      #=========================================================================
+      # Elementwise value-selection mode
+      #=========================================================================
+      dict(  # shape=[]
+          condition=True, x='A', y='a', expected=b'A'),
+      dict(  # shape=[]
+          condition=False, x='A', y='a', expected=b'a'),
+      dict(  # shape=[D1]
+          condition=[True, False, True],
+          x=['A', 'B', 'C'],
+          y=['a', 'b', 'c'],
+          expected=[b'A', b'b', b'C']),
+      dict(  # shape=[D1, D2]
+          condition=[[True, False], [False, True]],
+          x=[['A', 'B'], ['D', 'E']],
+          y=[['a', 'b'], ['d', 'e']],
+          expected=[[b'A', b'b'], [b'd', b'E']]),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+      dict(  # shape=[D1, (D2), D3]
+          condition=ragged.constant_value([
+              [[True, False], [False, True]],
+              [[True, False], [False, False], [True, False], [False, True]]
+          ], ragged_rank=1),
+          x=ragged.constant_value([
+              [['A', 'B'], ['C', 'D']],
+              [['E', 'F'], ['G', 'H'], ['I', 'J'], ['K', 'L']]
+          ], ragged_rank=1),
+          y=ragged.constant_value([
+              [['a', 'b'], ['c', 'd']],
+              [['e', 'f'], ['g', 'h'], ['i', 'j'], ['k', 'l']]
+          ], ragged_rank=1),
+          expected=ragged.constant_value([
+              [[b'A', b'b'], [b'c', b'D']],
+              [[b'E', b'f'], [b'g', b'h'], [b'I', b'j'], [b'k', b'L']]
+          ], ragged_rank=1)),
+      dict(  # shape=[D1, (D2), (D3), (D4)]
+          condition=ragged.constant_value([
+              [[[], [True]]],
+              [[[True, False, True], [False, True]],
+               [[True], [], [False], [False, True, False]]]
+          ]),
+          x=ragged.constant_value([
+              [[[], ['A']]],
+              [[['B', 'C', 'D'], ['E', 'F']],
+               [['G'], [], ['H'], ['I', 'J', 'K']]]
+          ]),
+          y=ragged.constant_value([
+              [[[], ['a']]],
+              [[['b', 'c', 'd'], ['e', 'f']],
+               [['g'], [], ['h'], ['i', 'j', 'k']]]
+          ]),
+          expected=ragged.constant_value([
+              [[[], [b'A']]],
+              [[[b'B', b'c', b'D'], [b'e', b'F']],
+               [[b'G'], [], [b'h'], [b'i', b'J', b'k']]]
+          ])),
+
+      #=========================================================================
+      # Elementwise row-selection mode
+      #=========================================================================
+      dict(  # shape=[D1, D2]
+          condition=[True, False, True],
+          x=[['A', 'B'], ['C', 'D'], ['E', 'F']],
+          y=[['a', 'b'], ['c', 'd'], ['e', 'f']],
+          expected=[[b'A', b'B'], [b'c', b'd'], [b'E', b'F']]),
+      dict(  # shape=[D1, (D2)]
+          condition=[True, False, True],
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
+          y=ragged.constant_value([['a', 'b'], ['c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'c'],
+                                          [b'F', b'G']])),
+      dict(  # shape=[D1, (D2), (D3), (D4)]
+          condition=ragged.constant_value([True, False]),
+          x=ragged.constant_value([
+              [[[], ['A']]],
+              [[['B', 'C', 'D'], ['E', 'F']],
+               [['G'], [], ['H'], ['I', 'J', 'K']]]
+          ]),
+          y=ragged.constant_value([[[['a']]], [[['b']]]]),
+          expected=ragged.constant_value([[[[], [b'A']]], [[[b'b']]]])),
+  ])   # pyformat: disable
+  def testRaggedWhere(self, condition, expected, x=None, y=None):
+    result = ragged.where(condition, x, y)
+    self.assertEqual(
+        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    with self.test_session():
+      result_value = self.evaluate(result)
+      if hasattr(result_value, 'tolist'):
+        result_value = result_value.tolist()
+      if hasattr(expected, 'tolist'):
+        expected = expected.tolist()
+      self.assertEqual(result_value, expected)
+
+  @parameterized.parameters([
+      dict(
+          condition=[True, False],
+          x=[1, 2],
+          error=ValueError,
+          message='x and y must be either both None or both non-None'),
+      dict(
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=[['a', 'b'], ['d', 'e']],
+          error=ValueError,
+          message='Input shapes do not match.'),
+  ])
+  def testRaggedWhereErrors(self, condition, error, message, x=None, y=None):
+    with self.assertRaisesRegexp(error, message):
+      ragged.where(condition, x, y)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa2970c3e75af36d3f042ab23ab70c8d2cdb36ca
--- /dev/null
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -0,0 +1,107 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for converting between row_splits and segment_ids."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_util
+
+
+# For background on "segments" and "segment ids", see:
+# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+def row_splits_to_segment_ids(splits, name=None):
+  """Generates the segmentation corresponding to a RaggedTensor `splits` vector.
+
+  Returns an integer vector `segment_ids`, where `segment_ids[i] == j` if
+  `splits[j] <= i < splits[j+1]`.  Example:
+
+  ```python
+  >>> ragged.row_splits_to_segment_ids([0, 3, 3, 5, 6, 9]).eval()
+  [ 0 0 0 2 2 3 4 4 4 ]
+  ```
+
+  Args:
+    splits: A sorted 1-D int64 Tensor.  `splits[0]` must be zero.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A sorted 1-D int64 Tensor, with `shape=[splits[-1]]`
+
+  Raises:
+    ValueError: If `splits` is invalid.
+  """
+  with ops.name_scope(name, "RaggedSplitsToSegmentIds", [splits]) as name:
+    splits = ops.convert_to_tensor(splits, dtype=dtypes.int64, name="splits")
+    splits.shape.assert_has_rank(1)
+    if tensor_shape.dimension_value(splits.shape[0]) == 0:
+      raise ValueError("Invalid row_splits: []")
+    row_lengths = splits[1:] - splits[:-1]
+    nrows = array_ops.shape(splits, out_type=dtypes.int64)[-1] - 1
+    indices = math_ops.range(nrows)
+    return ragged_util.repeat(indices, repeats=row_lengths, axis=0)
+
+
+# For background on "segments" and "segment ids", see:
+# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
+  """Generates the RaggedTensor `splits` vector corresponding to a segmentation.
+
+  Returns an integer vector `splits`, where `splits[0] = 0` and
+  `splits[i] = splits[i-1] + count(segment_ids==i)`.  Example:
+
+  ```python
+  >>> ragged.segment_ids_to_row_splits([0, 0, 0, 2, 2, 3, 4, 4, 4]).eval()
+  [ 0 3 3 5 6 9 ]
+  ```
+
+  Args:
+    segment_ids: A 1-D integer Tensor.
+    num_segments: A scalar integer indicating the number of segments.  Defaults
+      to `max(segment_ids) + 1` (or zero if `segment_ids` is empty).
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A sorted 1-D int64 Tensor, with `shape=[num_segments + 1]`.
+  """
+  with ops.name_scope(name, "SegmentIdsToRaggedSplits", [segment_ids]) as name:
+    segment_ids = ragged_util.convert_to_int_tensor(segment_ids, "segment_ids")
+    segment_ids.shape.assert_has_rank(1)
+    if num_segments is not None:
+      num_segments = ragged_util.convert_to_int_tensor(num_segments,
+                                                       "num_segments")
+      num_segments.shape.assert_has_rank(0)
+
+    row_lengths = math_ops.bincount(
+        segment_ids,
+        minlength=num_segments,
+        maxlength=num_segments,
+        dtype=dtypes.int64)
+    splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
+
+    # Update shape information, if possible.
+    if num_segments is not None:
+      const_num_segments = tensor_util.constant_value(num_segments)
+      if const_num_segments is not None:
+        splits.set_shape(tensor_shape.TensorShape([const_num_segments + 1]))
+
+    return splits
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index f827a20ff8ff9afb99ba2298ec02d0849e67195a..c893ef011be1e55e8057a7e343bf2c8ad0bc0f4b 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -44,7 +44,8 @@ def _ShapeTensor(shape):
   return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
 
 
-@tf_export("random.normal", "random_normal")
+@tf_export("random.normal", v1=["random.normal", "random_normal"])
+@deprecation.deprecated_endpoints("random_normal")
 def random_normal(shape,
                   mean=0.0,
                   stddev=1.0,
@@ -182,7 +183,8 @@ ops.NotDifferentiable("ParameterizedTruncatedNormal")
 ops.NotDifferentiable("TruncatedNormal")
 
 
-@tf_export("random.uniform", "random_uniform")
+@tf_export("random.uniform", v1=["random.uniform", "random_uniform"])
+@deprecation.deprecated_endpoints("random_uniform")
 def random_uniform(shape,
                    minval=0,
                    maxval=None,
@@ -247,7 +249,8 @@ def random_uniform(shape,
 ops.NotDifferentiable("RandomUniform")
 
 
-@tf_export("random.shuffle", "random_shuffle")
+@tf_export("random.shuffle", v1=["random.shuffle", "random_shuffle"])
+@deprecation.deprecated_endpoints("random_shuffle")
 def random_shuffle(value, seed=None, name=None):
   """Randomly shuffles a tensor along its first dimension.
 
@@ -278,7 +281,8 @@ def random_shuffle(value, seed=None, name=None):
       value, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export("image.random_crop", "random_crop")
+@tf_export("image.random_crop", v1=["image.random_crop", "random_crop"])
+@deprecation.deprecated_endpoints("random_crop")
 def random_crop(value, size, seed=None, name=None):
   """Randomly crops a tensor to a given size.
 
@@ -321,7 +325,9 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-@tf_export("random.multinomial", "multinomial")
+@tf_export(v1=["random.multinomial", "multinomial"])
+@deprecation.deprecated(
+    date=None, instructions="Use tf.random.categorical instead.")
 def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
@@ -338,9 +344,7 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
       `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See
-      `tf.set_random_seed`
-      for behavior.
+      See `tf.set_random_seed` for behavior.
     name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
 
@@ -348,10 +352,43 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
     The drawn samples of shape `[batch_size, num_samples]`.
   """
   with ops.name_scope(name, "multinomial", [logits]):
-    logits = ops.convert_to_tensor(logits, name="logits")
-    seed1, seed2 = random_seed.get_seed(seed)
-    return gen_random_ops.multinomial(
-        logits, num_samples, seed=seed1, seed2=seed2, output_dtype=output_dtype)
+    return multinomial_categorical_impl(logits, num_samples, output_dtype, seed)
+
+
+@tf_export("random.categorical", v1=[])
+def categorical(logits, num_samples, dtype=None, seed=None, name=None):
+  """Draws samples from a multinomial distribution.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.categorical(tf.log([[10., 10.]]), 5)
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    dtype: integer type to use for the output. Defaults to int64.
+    seed: A Python integer. Used to create a random seed for the distribution.
+      See `tf.set_random_seed` for behavior.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "categorical", [logits]):
+    return multinomial_categorical_impl(logits, num_samples, dtype, seed)
+
+
+def multinomial_categorical_impl(logits, num_samples, dtype, seed):
+  """Implementation for random.multinomial (v1) and random.categorical (v2)."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  seed1, seed2 = random_seed.get_seed(seed)
+  return gen_random_ops.multinomial(
+      logits, num_samples, seed=seed1, seed2=seed2, output_dtype=dtype)
 
 
 ops.NotDifferentiable("Multinomial")
@@ -441,7 +478,7 @@ def random_gamma(shape,
             shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
 
 
-@tf_export("random.poisson", v1=["random.poisson", "random_poisson"])
+@tf_export(v1=["random.poisson", "random_poisson"])
 @deprecation.deprecated_endpoints("random_poisson")
 def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
   """Draws `shape` samples from each of the given Poisson distribution(s).
@@ -474,6 +511,45 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
       for behavior.
     name: Optional name for the operation.
 
+  Returns:
+    samples: a `Tensor` of shape `tf.concat([shape, tf.shape(lam)], axis=0)`
+      with values of type `dtype`.
+  """
+  return random_poisson_v2(shape, lam, dtype, seed, name)
+
+
+@tf_export("random.poisson", v1=[])
+def random_poisson_v2(shape, lam, dtype=dtypes.float32, seed=None, name=None):
+  """Draws `shape` samples from each of the given Poisson distribution(s).
+
+  `lam` is the rate parameter describing the distribution(s).
+
+  Example:
+
+  ```python
+  samples = tf.random_poisson([10], [0.5, 1.5])
+  # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
+  # the samples drawn from each distribution
+
+  samples = tf.random_poisson([7, 5], [12.2, 3.3])
+  # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
+  # represents the 7x5 samples drawn from each of the two distributions
+  ```
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output samples
+      to be drawn per "rate"-parameterized distribution.
+    lam: A Tensor or Python value or N-D array of type `dtype`.
+      `lam` provides the rate parameter(s) describing the poisson
+      distribution(s) to sample.
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32` or
+      `int64`.
+    seed: A Python integer. Used to create a random seed for the distributions.
+      See
+      `tf.set_random_seed`
+      for behavior.
+    name: Optional name for the operation.
+
   Returns:
     samples: a `Tensor` of shape `tf.concat([shape, tf.shape(lam)], axis=0)`
       with values of type `dtype`.
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 488b6fcbcdb2fb5158b6d6a08b90f79aa4630047..c20f8fb9389e3cec1bd512959af066eed02a39ce 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -519,7 +519,10 @@ class ResourceVariable(variables.RefVariable):
       snapshot = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
-      self._cached_value = snapshot
+      if snapshot.op.type != "ReadVariableOp":
+        self._cached_value = snapshot
+      else:
+        self._cached_value = None
       while snapshot.op.type != "ReadVariableOp":
         snapshot = snapshot.op.inputs[0]
       self._graph_element = snapshot
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 393e269abf5925ca1820fd757157a0c3599e99d3..c23b85847cb263662bcd2071a4d968e7ec97b7bf 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -347,7 +348,10 @@ def _reverse_seq(input_seq, lengths):
   return results
 
 
-@tf_export("nn.bidirectional_dynamic_rnn")
+@deprecation.deprecated(None, "Please use `keras.layers.Bidirectional("
+                        "keras.layers.RNN(cell))`, which is equivalent to "
+                        "this API")
+@tf_export(v1=["nn.bidirectional_dynamic_rnn"])
 def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                               initial_state_fw=None, initial_state_bw=None,
                               dtype=None, parallel_iterations=None,
@@ -480,7 +484,10 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
   return (outputs, output_states)
 
 
-@tf_export("nn.dynamic_rnn")
+@deprecation.deprecated(
+    None,
+    "Please use `keras.layers.RNN(cell)`, which is equivalent to this API")
+@tf_export(v1=["nn.dynamic_rnn"])
 def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                 dtype=None, parallel_iterations=None, swap_memory=False,
                 time_major=False, scope=None):
@@ -891,7 +898,7 @@ def _dynamic_rnn_loop(cell,
   return (final_outputs, final_state)
 
 
-@tf_export("nn.raw_rnn")
+@tf_export(v1=["nn.raw_rnn"])
 def raw_rnn(cell, loop_fn,
             parallel_iterations=None, swap_memory=False, scope=None):
   """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`.
@@ -1210,7 +1217,10 @@ def raw_rnn(cell, loop_fn,
     return (emit_ta, final_state, final_loop_state)
 
 
-@tf_export("nn.static_rnn")
+@deprecation.deprecated(
+    None, "Please use `keras.layers.RNN(cell, unroll=True)`, "
+    "which is equivalent to this API")
+@tf_export(v1=["nn.static_rnn"])
 def static_rnn(cell,
                inputs,
                initial_state=None,
@@ -1483,7 +1493,10 @@ def static_state_saving_rnn(cell,
   return (outputs, state)
 
 
-@tf_export("nn.static_bidirectional_rnn")
+@deprecation.deprecated(None, "Please use `keras.layers.Bidirectional("
+                        "keras.layers.RNN(cell, unroll=True))`, which is "
+                        "equivalent to this API")
+@tf_export(v1=["nn.static_bidirectional_rnn"])
 def static_bidirectional_rnn(cell_fw,
                              cell_bw,
                              inputs,
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 5a2dd9dec84c1c448d1ae37203dee039ba69cd84..85efd6a4f75471f66866e66c5a452e2593e34490 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
@@ -410,7 +411,7 @@ class BasicRNNCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     if activation:
@@ -462,7 +463,7 @@ class BasicRNNCell(LayerRNNCell):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export("nn.rnn_cell.GRUCell")
+@tf_export(v1=["nn.rnn_cell.GRUCell"])
 class GRUCell(LayerRNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
 
@@ -488,6 +489,8 @@ class GRUCell(LayerRNNCell):
       `trainable` etc when constructing the cell from configs of get_config().
   """
 
+  @deprecated(None, "This class is equivalent as tf.keras.layers.GRUCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                activation=None,
@@ -505,7 +508,7 @@ class GRUCell(LayerRNNCell):
                    "Please use tf.contrib.cudnn_rnn.CudnnGRU for better "
                    "performance on GPU.", self)
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     if activation:
@@ -610,8 +613,7 @@ class LSTMStateTuple(_LSTMStateTuple):
     return c.dtype
 
 
-# TODO(scottzhu): Stop exporting this class in TF 2.0.
-@tf_export("nn.rnn_cell.BasicLSTMCell")
+@tf_export(v1=["nn.rnn_cell.BasicLSTMCell"])
 class BasicLSTMCell(LayerRNNCell):
   """DEPRECATED: Please use `tf.nn.rnn_cell.LSTMCell` instead.
 
@@ -634,10 +636,8 @@ class BasicLSTMCell(LayerRNNCell):
   better performance on CPU.
   """
 
-  @deprecated(None, "This class is deprecated, please use "
-                    "tf.nn.rnn_cell.LSTMCell, which supports all the feature "
-                    "this cell currently has. Please replace the existing code "
-                    "with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').")
+  @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                forget_bias=1.0,
@@ -684,7 +684,7 @@ class BasicLSTMCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._forget_bias = forget_bias
@@ -779,7 +779,7 @@ class BasicLSTMCell(LayerRNNCell):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export("nn.rnn_cell.LSTMCell")
+@tf_export(v1=["nn.rnn_cell.LSTMCell"])
 class LSTMCell(LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
@@ -807,6 +807,8 @@ class LSTMCell(LayerRNNCell):
   better performance on CPU.
   """
 
+  @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self, num_units,
                use_peepholes=False, cell_clip=None,
                initializer=None, num_proj=None, proj_clip=None,
@@ -870,7 +872,7 @@ class LSTMCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._use_peepholes = use_peepholes
@@ -1393,7 +1395,7 @@ class DeviceWrapper(RNNCell):
       return self._cell(inputs, state, scope=scope)
 
 
-@tf_export("nn.rnn_cell.MultiRNNCell")
+@tf_export(v1=["nn.rnn_cell.MultiRNNCell"])
 class MultiRNNCell(RNNCell):
   """RNN cell composed sequentially of multiple simple cells.
 
@@ -1406,6 +1408,9 @@ class MultiRNNCell(RNNCell):
   ```
   """
 
+  @deprecated(None, "This class is equivalent as "
+                    "tf.keras.layers.StackedRNNCells, and will be replaced by "
+                    "that in Tensorflow 2.0.")
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
 
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 2ec4b540fb612f776da83d55d011db66ad4e3bbe..a5b31aff91660a6ac79c980dffb543e87fd40dfa 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_script_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -305,13 +306,14 @@ def _EagerPyFuncGrad(op, *dy):
         is_grad_func=True)
 
 
+@tf_export("py_function")
 def eager_py_func(func, inp, Tout, name=None):
   """Wraps a python function into a TensorFlow op that executes it eagerly.
 
   This function allows expressing computations in a TensorFlow graph as
   Python functions. In particular, it wraps a Python function `func`
   in a once-differentiable TensorFlow operation that executes it with eager
-  exeuction enabled. As a consequence, `tf.contrib.eager.py_func` makes it
+  execution enabled. As a consequence, `tf.contrib.eager.py_func` makes it
   possible to express control flow using Python constructs (`if`, `while`,
   `for`, etc.), instead of TensorFlow control flow constructs (`tf.cond`,
   `tf.while_loop`). For example, you might use `tf.contrib.eager.py_func` to
@@ -387,7 +389,16 @@ def eager_py_func(func, inp, Tout, name=None):
   return _internal_py_func(func=func, inp=inp, Tout=Tout, eager=True, name=name)
 
 
-@tf_export("py_func")
+@deprecation.deprecated(
+    date=None,
+    instructions="""tf.py_func is deprecated in TF V2. Instead, use
+    tf.py_function, which takes a python function which manipulates tf eager
+    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
+    an ndarray (just call tensor.numpy()) but having access to eager tensors
+    means `tf.py_function`s can use accelerators such as GPUs as well as
+    being differentiable using a gradient tape.
+    """)
+@tf_export(v1=["py_func"])
 def py_func(func, inp, Tout, stateful=True, name=None):
   """Wraps a python function and uses it as a TensorFlow op.
 
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index 720be098c25a878df06632734de37de2cb00c866..c6cf2fe9adf58bef84ec677466f01bb16dd61f8b 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -13,11 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor Handle Operations.
-
-See the [Session Ops](https://tensorflow.org/api_guides/python/session_ops)
-guide.
-"""
+"""Tensor Handle Operations."""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index 21e08d03d213c173d12dfc6676fe7f009811e93f..ee9c9b6bc0b36a374957178653eaae4c91ad733c 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -31,7 +31,7 @@ _VALID_DTYPES = set([
     dtypes.uint8, dtypes.uint16, dtypes.string])
 
 
-@tf_export("sets.set_size")
+@tf_export("sets.size", v1=["sets.size", "sets.set_size"])
 def set_size(a, validate_indices=True):
   """Compute number of unique elements along last dimension of `a`.
 
@@ -133,7 +133,8 @@ def _set_operation(a, b, set_operation, validate_indices=True):
   return sparse_tensor.SparseTensor(indices, values, shape)
 
 
-@tf_export("sets.set_intersection")
+@tf_export(
+    "sets.intersection", v1=["sets.intersection", "sets.set_intersection"])
 def set_intersection(a, b, validate_indices=True):
   """Compute set intersection of elements in last dimension of `a` and `b`.
 
@@ -200,7 +201,8 @@ def set_intersection(a, b, validate_indices=True):
   return _set_operation(a, b, "intersection", validate_indices)
 
 
-@tf_export("sets.set_difference")
+@tf_export(
+	   "sets.difference", v1=["sets.difference", "sets.set_difference"])
 def set_difference(a, b, aminusb=True, validate_indices=True):
   """Compute set difference of elements in last dimension of `a` and `b`.
 
@@ -271,7 +273,8 @@ def set_difference(a, b, aminusb=True, validate_indices=True):
   return _set_operation(a, b, "a-b" if aminusb else "b-a", validate_indices)
 
 
-@tf_export("sets.set_union")
+@tf_export(
+	   "sets.union", v1=["sets.union", "sets.set_union"])
 def set_union(a, b, validate_indices=True):
   """Compute set union of elements in last dimension of `a` and `b`.
 
diff --git a/tensorflow/python/ops/signal/BUILD b/tensorflow/python/ops/signal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..da2bf9c1d2d73aeae8dd2d61c4e690bb1ab93b70
--- /dev/null
+++ b/tensorflow/python/ops/signal/BUILD
@@ -0,0 +1,36 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "signal",
+    srcs = [
+        "dct_ops.py",
+        "fft_ops.py",
+        "mel_ops.py",
+        "mfcc_ops.py",
+        "reconstruction_ops.py",
+        "shape_ops.py",
+        "signal.py",
+        "spectral_ops.py",
+        "util_ops.py",
+        "window_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:spectral_ops_gen",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d042c95c049538354836ef83f0b21d8babccedc8
--- /dev/null
+++ b/tensorflow/python/ops/signal/dct_ops.py
@@ -0,0 +1,192 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Discrete Cosine Transform ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math as _math
+
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.ops import math_ops as _math_ops
+from tensorflow.python.ops.signal import fft_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm):
+  """Checks that DCT/IDCT arguments are compatible and well formed."""
+  if n is not None:
+    raise NotImplementedError("The DCT length argument is not implemented.")
+  if axis != -1:
+    raise NotImplementedError("axis must be -1. Got: %s" % axis)
+  if dct_type not in (1, 2, 3):
+    raise ValueError("Only Types I, II and III (I)DCT are supported.")
+  if dct_type == 1:
+    if norm == "ortho":
+      raise ValueError("Normalization is not supported for the Type-I DCT.")
+    if input_tensor.shape[-1] is not None and input_tensor.shape[-1] < 2:
+      raise ValueError(
+          "Type-I DCT requires the dimension to be greater than one.")
+
+  if norm not in (None, "ortho"):
+    raise ValueError(
+        "Unknown normalization. Expected None or 'ortho', got: %s" % norm)
+
+
+# TODO(rjryan): Implement `n` and `axis` parameters.
+@tf_export("signal.dct", v1=["signal.dct", "spectral.dct"])
+def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
+  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
+
+  Currently only Types I, II and III are supported.
+  Type I is implemented using a length `2N` padded `tf.spectral.rfft`.
+  Type II is implemented using a length `2N` padded `tf.spectral.rfft`, as
+  described here:
+  https://dsp.stackexchange.com/a/10606.
+  Type III is a fairly straightforward inverse of Type II
+  (i.e. using a length `2N` padded `tf.spectral.irfft`).
+
+  @compatibility(scipy)
+  Equivalent to scipy.fftpack.dct for Type-I, Type-II and Type-III DCT.
+  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
+  @end_compatibility
+
+  Args:
+    input: A `[..., samples]` `float32` `Tensor` containing the signals to
+      take the DCT of.
+    type: The DCT type to perform. Must be 1, 2 or 3.
+    n: For future expansion. The length of the transform. Must be `None`.
+    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
+    norm: The normalization to apply. `None` for no normalization or `'ortho'`
+      for orthonormal normalization.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.
+
+  Raises:
+    ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is
+      not `-1`, or `norm` is not `None` or `'ortho'`.
+    ValueError: If `type` is `1` and `norm` is `ortho`.
+
+  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
+  """
+  _validate_dct_arguments(input, type, n, axis, norm)
+  with _ops.name_scope(name, "dct", [input]):
+    # We use the RFFT to compute the DCT and TensorFlow only supports float32
+    # for FFTs at the moment.
+    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)
+
+    axis_dim = (tensor_shape.dimension_value(input.shape[-1])
+                or _array_ops.shape(input)[-1])
+    axis_dim_float = _math_ops.to_float(axis_dim)
+
+    if type == 1:
+      dct1_input = _array_ops.concat([input, input[..., -2:0:-1]], axis=-1)
+      dct1 = _math_ops.real(fft_ops.rfft(dct1_input))
+      return dct1
+
+    if type == 2:
+      scale = 2.0 * _math_ops.exp(
+          _math_ops.complex(
+              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
+              axis_dim_float))
+
+      # TODO(rjryan): Benchmark performance and memory usage of the various
+      # approaches to computing a DCT via the RFFT.
+      dct2 = _math_ops.real(
+          fft_ops.rfft(
+              input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)
+
+      if norm == "ortho":
+        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
+        n2 = n1 * _math_ops.sqrt(2.0)
+        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
+        weights = _array_ops.pad(
+            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
+            constant_values=n2)
+        dct2 *= weights
+
+      return dct2
+
+    elif type == 3:
+      if norm == "ortho":
+        n1 = _math_ops.sqrt(axis_dim_float)
+        n2 = n1 * _math_ops.sqrt(0.5)
+        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
+        weights = _array_ops.pad(
+            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
+            constant_values=n2)
+        input *= weights
+      else:
+        input *= axis_dim_float
+      scale = 2.0 * _math_ops.exp(
+          _math_ops.complex(
+              0.0,
+              _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
+              axis_dim_float))
+      dct3 = _math_ops.real(
+          fft_ops.irfft(
+              scale * _math_ops.complex(input, 0.0),
+              fft_length=[2 * axis_dim]))[..., :axis_dim]
+
+      return dct3
+
+
+# TODO(rjryan): Implement `n` and `axis` parameters.
+@tf_export("signal.idct", v1=["signal.idct", "spectral.idct"])
+def idct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
+  """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`.
+
+  Currently only Types I, II and III are supported. Type III is the inverse of
+  Type II, and vice versa.
+
+  Note that you must re-normalize by 1/(2n) to obtain an inverse if `norm` is
+  not `'ortho'`. That is:
+  `signal == idct(dct(signal)) * 0.5 / signal.shape[-1]`.
+  When `norm='ortho'`, we have:
+  `signal == idct(dct(signal, norm='ortho'), norm='ortho')`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.fftpack.idct for Type-I, Type-II and Type-III DCT.
+  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.idct.html
+  @end_compatibility
+
+  Args:
+    input: A `[..., samples]` `float32` `Tensor` containing the signals to take
+      the DCT of.
+    type: The IDCT type to perform. Must be 1, 2 or 3.
+    n: For future expansion. The length of the transform. Must be `None`.
+    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
+    norm: The normalization to apply. `None` for no normalization or `'ortho'`
+      for orthonormal normalization.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the IDCT of `input`.
+
+  Raises:
+    ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is
+      not `-1`, or `norm` is not `None` or `'ortho'`.
+
+  [idct]:
+  https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms
+  """
+  _validate_dct_arguments(input, type, n, axis, norm)
+  inverse_type = {1: 1, 2: 3, 3: 2}[type]
+  return dct(input, type=inverse_type, n=n, axis=axis, norm=norm, name=name)
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/signal/fft_ops.py
similarity index 51%
rename from tensorflow/python/ops/spectral_ops.py
rename to tensorflow/python/ops/signal/fft_ops.py
index 4dcc90aefa978b89856ee6f8d77b73c3e7edb550..2d14b2bbd75864b6477bccc5cef562b617674c08 100644
--- a/tensorflow/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/signal/fft_ops.py
@@ -12,16 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Spectral operators (e.g. DCT, FFT, RFFT)."""
+"""Fast-Fourier Transform ops."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math as _math
+import numpy as np
 
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import ops as _ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util as _tensor_util
 from tensorflow.python.ops import array_ops as _array_ops
 from tensorflow.python.ops import gen_spectral_ops
@@ -112,6 +111,7 @@ def _rfft_wrapper(fft_fn, fft_rank, default_name):
   """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
 
   def _rfft(input_tensor, fft_length=None, name=None):
+    """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
     with _ops.name_scope(name, default_name,
                          [input_tensor, fft_length]) as name:
       input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.float32)
@@ -130,6 +130,7 @@ def _irfft_wrapper(ifft_fn, fft_rank, default_name):
   """Wrapper around gen_spectral_ops.irfft* that infers fft_length argument."""
 
   def _irfft(input_tensor, fft_length=None, name=None):
+    """Wrapper irfft* that infers fft_length argument."""
     with _ops.name_scope(name, default_name,
                          [input_tensor, fft_length]) as name:
       input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.complex64)
@@ -145,6 +146,8 @@ def _irfft_wrapper(ifft_fn, fft_rank, default_name):
   return _irfft
 
 
+# FFT/IFFT 1/2/3D are exported via
+# third_party/tensorflow/core/api_def/python_api/
 fft = gen_spectral_ops.fft
 ifft = gen_spectral_ops.ifft
 fft2d = gen_spectral_ops.fft2d
@@ -152,159 +155,176 @@ ifft2d = gen_spectral_ops.ifft2d
 fft3d = gen_spectral_ops.fft3d
 ifft3d = gen_spectral_ops.ifft3d
 rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft")
-tf_export("spectral.rfft")(rfft)
+tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])(rfft)
 irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft")
-tf_export("spectral.irfft")(irfft)
+tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])(irfft)
 rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d")
-tf_export("spectral.rfft2d")(rfft2d)
+tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])(rfft2d)
 irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d")
-tf_export("spectral.irfft2d")(irfft2d)
+tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])(irfft2d)
 rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d")
-tf_export("spectral.rfft3d")(rfft3d)
+tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])(rfft3d)
 irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d")
-tf_export("spectral.irfft3d")(irfft3d)
-
-
-def _validate_dct_arguments(dct_type, n, axis, norm):
-  if n is not None:
-    raise NotImplementedError("The DCT length argument is not implemented.")
-  if axis != -1:
-    raise NotImplementedError("axis must be -1. Got: %s" % axis)
-  if dct_type not in (2, 3):
-    raise ValueError("Only Types II and III (I)DCT are supported.")
-  if norm not in (None, "ortho"):
-    raise ValueError(
-        "Unknown normalization. Expected None or 'ortho', got: %s" % norm)
-
-
-# TODO(rjryan): Implement `type`, `n` and `axis` parameters.
-@tf_export("spectral.dct")
-def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
-  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
-
-  Currently only Types II and III are supported. Type II is implemented using a
-  length `2N` padded `tf.spectral.rfft`, as described here:
-  https://dsp.stackexchange.com/a/10606. Type III is a fairly straightforward
-  inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`).
-
-  @compatibility(scipy)
-  Equivalent to scipy.fftpack.dct for Type-II and Type-III DCT.
-  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
-  @end_compatibility
-
-  Args:
-    input: A `[..., samples]` `float32` `Tensor` containing the signals to
-      take the DCT of.
-    type: The DCT type to perform. Must be 2 or 3.
-    n: For future expansion. The length of the transform. Must be `None`.
-    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
-    norm: The normalization to apply. `None` for no normalization or `'ortho'`
-      for orthonormal normalization.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.
-
-  Raises:
-    ValueError: If `type` is not `2` or `3`, `n` is not `None, `axis` is not
-      `-1`, or `norm` is not `None` or `'ortho'`.
-
-  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
-  """
-  _validate_dct_arguments(type, n, axis, norm)
-  with _ops.name_scope(name, "dct", [input]):
-    # We use the RFFT to compute the DCT and TensorFlow only supports float32
-    # for FFTs at the moment.
-    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)
-
-    axis_dim = (tensor_shape.dimension_value(input.shape[-1])
-                or _array_ops.shape(input)[-1])
-    axis_dim_float = _math_ops.to_float(axis_dim)
-    if type == 2:
-      scale = 2.0 * _math_ops.exp(
-          _math_ops.complex(
-              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
-              axis_dim_float))
-
-      # TODO(rjryan): Benchmark performance and memory usage of the various
-      # approaches to computing a DCT via the RFFT.
-      dct2 = _math_ops.real(
-          rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)
-
-      if norm == "ortho":
-        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
-        n2 = n1 * _math_ops.sqrt(2.0)
-        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
-        weights = _array_ops.pad(
-            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
-            constant_values=n2)
-        dct2 *= weights
-
-      return dct2
-
-    elif type == 3:
-      if norm == "ortho":
-        n1 = _math_ops.sqrt(axis_dim_float)
-        n2 = n1 * _math_ops.sqrt(0.5)
-        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
-        weights = _array_ops.pad(
-            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
-            constant_values=n2)
-        input *= weights
-      else:
-        input *= axis_dim_float
-      scale = 2.0 * _math_ops.exp(
-          _math_ops.complex(
-              0.0,
-              _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
-              axis_dim_float))
-      dct3 = _math_ops.real(
-          irfft(
-              scale * _math_ops.complex(input, 0.0),
-              fft_length=[2 * axis_dim]))[..., :axis_dim]
-
-      return dct3
-
-
-# TODO(rjryan): Implement `type`, `n` and `axis` parameters.
-@tf_export("spectral.idct")
-def idct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
-  """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`.
-
-  Currently only Types II and III are supported. Type III is the inverse of
-  Type II, and vice versa.
-
-  Note that you must re-normalize by 1/(2n) to obtain an inverse if `norm` is
-  not `'ortho'`. That is:
-  `signal == idct(dct(signal)) * 0.5 / signal.shape[-1]`.
-  When `norm='ortho'`, we have:
-  `signal == idct(dct(signal, norm='ortho'), norm='ortho')`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.fftpack.idct for Type-II and Type-III DCT.
-  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.idct.html
-  @end_compatibility
-
-  Args:
-    input: A `[..., samples]` `float32` `Tensor` containing the signals to take
-      the DCT of.
-    type: The IDCT type to perform. Must be 2 or 3.
-    n: For future expansion. The length of the transform. Must be `None`.
-    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
-    norm: The normalization to apply. `None` for no normalization or `'ortho'`
-      for orthonormal normalization.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `float32` `Tensor` containing the IDCT of `input`.
-
-  Raises:
-    ValueError: If `type` is not `2` or `3`, `n` is not `None, `axis` is not
-      `-1`, or `norm` is not `None` or `'ortho'`.
-
-  [idct]:
-  https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms
-  """
-  _validate_dct_arguments(type, n, axis, norm)
-  inverse_type = {2: 3, 3: 2}[type]
-  return dct(input, type=inverse_type, n=n, axis=axis, norm=norm, name=name)
+tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])(irfft3d)
+
+
+def _fft_size_for_grad(grad, rank):
+  return _math_ops.reduce_prod(_array_ops.shape(grad)[-rank:])
+
+
+@_ops.RegisterGradient("FFT")
+def _fft_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 1), grad.dtype)
+  return ifft(grad) * size
+
+
+@_ops.RegisterGradient("IFFT")
+def _ifft_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 1), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft(grad) * rsize
+
+
+@_ops.RegisterGradient("FFT2D")
+def _fft2d_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 2), grad.dtype)
+  return ifft2d(grad) * size
+
+
+@_ops.RegisterGradient("IFFT2D")
+def _ifft2d_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 2), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft2d(grad) * rsize
+
+
+@_ops.RegisterGradient("FFT3D")
+def _fft3d_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 3), grad.dtype)
+  return ifft3d(grad) * size
+
+
+@_ops.RegisterGradient("IFFT3D")
+def _ifft3d_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 3), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft3d(grad) * rsize
+
+
+def _rfft_grad_helper(rank, irfft_fn):
+  """Returns a gradient function for an RFFT of the provided rank."""
+  # Can't happen because we don't register a gradient for RFFT3D.
+  assert rank in (1, 2), "Gradient for RFFT3D is not implemented."
+
+  def _grad(op, grad):
+    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
+    fft_length = op.inputs[1]
+    input_shape = _array_ops.shape(op.inputs[0])
+    is_even = _math_ops.cast(1 - (fft_length[-1] % 2), _dtypes.complex64)
+
+    def _tile_for_broadcasting(matrix, t):
+      expanded = _array_ops.reshape(
+          matrix,
+          _array_ops.concat([
+              _array_ops.ones([_array_ops.rank(t) - 2], _dtypes.int32),
+              _array_ops.shape(matrix)
+          ], 0))
+      return _array_ops.tile(
+          expanded, _array_ops.concat([_array_ops.shape(t)[:-2], [1, 1]], 0))
+
+    def _mask_matrix(length):
+      """Computes t_n = exp(sqrt(-1) * pi * n^2 / line_len)."""
+      # TODO(rjryan): Speed up computation of twiddle factors using the
+      # following recurrence relation and cache them across invocations of RFFT.
+      #
+      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
+      # for n = 0, 1,..., line_len-1.
+      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
+      a = _array_ops.tile(
+          _array_ops.expand_dims(_math_ops.range(length), 0), (length, 1))
+      b = _array_ops.transpose(a, [1, 0])
+      return _math_ops.exp(
+          -2j * np.pi * _math_ops.cast(a * b, _dtypes.complex64) /
+          _math_ops.cast(length, _dtypes.complex64))
+
+    def _ymask(length):
+      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
+      return _math_ops.cast(1 - 2 * (_math_ops.range(length) % 2),
+                            _dtypes.complex64)
+
+    y0 = grad[..., 0:1]
+    if rank == 1:
+      ym = grad[..., -1:]
+      extra_terms = y0 + is_even * ym * _ymask(input_shape[-1])
+    elif rank == 2:
+      # Create a mask matrix for y0 and ym.
+      base_mask = _mask_matrix(input_shape[-2])
+
+      # Tile base_mask to match y0 in shape so that we can batch-matmul the
+      # inner 2 dimensions.
+      tiled_mask = _tile_for_broadcasting(base_mask, y0)
+
+      y0_term = _math_ops.matmul(tiled_mask, _math_ops.conj(y0))
+      extra_terms = y0_term
+
+      ym = grad[..., -1:]
+      ym_term = _math_ops.matmul(tiled_mask, _math_ops.conj(ym))
+
+      inner_dim = input_shape[-1]
+      ym_term = _array_ops.tile(
+          ym_term,
+          _array_ops.concat([
+              _array_ops.ones([_array_ops.rank(grad) - 1], _dtypes.int32),
+              [inner_dim]
+          ], 0)) * _ymask(inner_dim)
+
+      extra_terms += is_even * ym_term
+
+    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
+    # factor, plus some additional terms to make up for the components dropped
+    # due to Hermitian symmetry.
+    input_size = _math_ops.to_float(_fft_size_for_grad(op.inputs[0], rank))
+    the_irfft = irfft_fn(grad, fft_length)
+    return 0.5 * (the_irfft * input_size + _math_ops.real(extra_terms)), None
+
+  return _grad
+
+
+def _irfft_grad_helper(rank, rfft_fn):
+  """Returns a gradient function for an IRFFT of the provided rank."""
+  # Can't happen because we don't register a gradient for IRFFT3D.
+  assert rank in (1, 2), "Gradient for IRFFT3D is not implemented."
+
+  def _grad(op, grad):
+    """A gradient function for IRFFT with the provided `rank` and `rfft_fn`."""
+    # Generate a simple mask like [1.0, 2.0, ..., 2.0, 1.0] for even-length FFTs
+    # and [1.0, 2.0, ..., 2.0] for odd-length FFTs. To reduce extra ops in the
+    # graph we special-case the situation where the FFT length and last
+    # dimension of the input are known at graph construction time.
+    fft_length = op.inputs[1]
+    is_odd = _math_ops.mod(fft_length[-1], 2)
+    input_last_dimension = _array_ops.shape(op.inputs[0])[-1]
+    mask = _array_ops.concat(
+        [[1.0], 2.0 * _array_ops.ones([input_last_dimension - 2 + is_odd]),
+         _array_ops.ones([1 - is_odd])], 0)
+
+    rsize = _math_ops.reciprocal(_math_ops.to_float(
+        _fft_size_for_grad(grad, rank)))
+
+    # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
+    # factor and a mask. The mask scales the gradient for the Hermitian
+    # symmetric components of the RFFT by a factor of two, since these
+    # components are de-duplicated in the RFFT.
+    the_rfft = rfft_fn(grad, fft_length)
+    return the_rfft * _math_ops.cast(rsize * mask, _dtypes.complex64), None
+
+  return _grad
+
+
+_ops.RegisterGradient("RFFT")(_rfft_grad_helper(1, irfft))
+_ops.RegisterGradient("IRFFT")(_irfft_grad_helper(1, rfft))
+_ops.RegisterGradient("RFFT2D")(_rfft_grad_helper(2, irfft2d))
+_ops.RegisterGradient("IRFFT2D")(_irfft_grad_helper(2, rfft2d))
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py
similarity index 98%
rename from tensorflow/contrib/signal/python/ops/mel_ops.py
rename to tensorflow/python/ops/signal/mel_ops.py
index ecc2fedb9f82151511bab3f3c0496bc4e290903f..6488e1df59b4a0bd801ebb23dc3b5ea5b31e00c2 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/python/ops/signal/mel_ops.py
@@ -18,11 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops import shape_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import shape_ops
+from tensorflow.python.util.tf_export import tf_export
+
 
 # mel spectrum constants.
 _MEL_BREAK_FREQUENCY_HERTZ = 700.0
@@ -85,6 +87,7 @@ def _validate_arguments(num_mel_bins, sample_rate,
     raise ValueError('dtype must be a floating point type. Got: %s' % dtype)
 
 
+@tf_export('signal.linear_to_mel_weight_matrix')
 def linear_to_mel_weight_matrix(num_mel_bins=20,
                                 num_spectrogram_bins=129,
                                 sample_rate=8000,
diff --git a/tensorflow/contrib/signal/python/ops/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py
similarity index 90%
rename from tensorflow/contrib/signal/python/ops/mfcc_ops.py
rename to tensorflow/python/ops/signal/mfcc_ops.py
index b379db55daedabc21e7f14f23b0d8efc1ed37ba2..601409dea901f34cca02861971850c3238378163 100644
--- a/tensorflow/contrib/signal/python/ops/mfcc_ops.py
+++ b/tensorflow/python/ops/signal/mfcc_ops.py
@@ -22,9 +22,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import dct_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('signal.mfccs_from_log_mel_spectrograms')
 def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.
 
@@ -48,14 +50,14 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   pcm = tf.placeholder(tf.float32, [None, None])
 
   # A 1024-point STFT with frames of 64 ms and 75% overlap.
-  stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
-                                 fft_length=1024)
+  stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
+                         fft_length=1024)
   spectrograms = tf.abs(stfts)
 
   # Warp the linear scale spectrograms into the mel-scale.
   num_spectrogram_bins = stfts.shape[-1].value
   lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
-  linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
+  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
     num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
     upper_edge_hertz)
   mel_spectrograms = tf.tensordot(
@@ -67,7 +69,7 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
 
   # Compute MFCCs from log_mel_spectrograms and take the first 13.
-  mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
+  mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
     log_mel_spectrograms)[..., :13]
   ```
 
@@ -104,5 +106,5 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
     else:
       num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
 
-    dct2 = spectral_ops.dct(log_mel_spectrograms)
+    dct2 = dct_ops.dct(log_mel_spectrograms, type=2)
     return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
diff --git a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py b/tensorflow/python/ops/signal/reconstruction_ops.py
similarity index 97%
rename from tensorflow/contrib/signal/python/ops/reconstruction_ops.py
rename to tensorflow/python/ops/signal/reconstruction_ops.py
index 503b33a54df3d6a17142940a110a25940bbdc21c..0fc7fec23933d600c89513fb39d3a45856a8618b 100644
--- a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
+++ b/tensorflow/python/ops/signal/reconstruction_ops.py
@@ -18,12 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops import shape_ops
-from tensorflow.contrib.signal.python.ops import util_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import shape_ops
+from tensorflow.python.ops.signal import util_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _shuffle_to_front(input_tensor, k):
@@ -57,6 +58,7 @@ def _shuffle_to_front(input_tensor, k):
   return array_ops.transpose(input_tensor, perm=permutation)
 
 
+@tf_export("signal.overlap_and_add")
 def overlap_and_add(signal, frame_step, name=None):
   """Reconstructs a signal from a framed representation.
 
diff --git a/tensorflow/contrib/signal/python/ops/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py
similarity index 97%
rename from tensorflow/contrib/signal/python/ops/shape_ops.py
rename to tensorflow/python/ops/signal/shape_ops.py
index 91862f0cc0ba53c6b3bc31d7f5e93cbbbd7ae494..ae9c2ef28e4f1c857519838f22a4844ac2c9e7b4 100644
--- a/tensorflow/contrib/signal/python/ops/shape_ops.py
+++ b/tensorflow/python/ops/signal/shape_ops.py
@@ -18,13 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-from tensorflow.contrib.signal.python.ops import util_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import util_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis):
@@ -53,6 +52,7 @@ def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis):
   return outer_dimensions + [num_frames, frame_length] + inner_dimensions
 
 
+@tf_export("signal.frame")
 def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1,
           name=None):
   """Expands `signal`'s `axis` dimension into frames of `frame_length`.
@@ -70,8 +70,8 @@ def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1,
 
   ```python
   pcm = tf.placeholder(tf.float32, [None, 9152])
-  frames = tf.contrib.signal.frame(pcm, 512, 180)
-  magspec = tf.abs(tf.spectral.rfft(frames, [512]))
+  frames = tf.signal.frame(pcm, 512, 180)
+  magspec = tf.abs(tf.signal.rfft(frames, [512]))
   image = tf.expand_dims(magspec, 3)
   ```
 
diff --git a/tensorflow/python/ops/signal/signal.py b/tensorflow/python/ops/signal/signal.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc4d1c1911a8570ba28a0b42bd6da5d83fd40e1
--- /dev/null
+++ b/tensorflow/python/ops/signal/signal.py
@@ -0,0 +1,65 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal processing operations.
+
+See the [tf.signal](https://tensorflow.org/api_guides/python/contrib.signal)
+guide.
+
+@@frame
+@@hamming_window
+@@hann_window
+@@inverse_stft
+@@inverse_stft_window_fn
+@@mfccs_from_log_mel_spectrograms
+@@linear_to_mel_weight_matrix
+@@overlap_and_add
+@@stft
+
+[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+[mel]: https://en.wikipedia.org/wiki/Mel_scale
+[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.ops.signal.dct_ops import dct
+from tensorflow.python.ops.signal.dct_ops import idct
+from tensorflow.python.ops.signal.fft_ops import fft
+from tensorflow.python.ops.signal.fft_ops import fft2d
+from tensorflow.python.ops.signal.fft_ops import fft3d
+from tensorflow.python.ops.signal.fft_ops import ifft
+from tensorflow.python.ops.signal.fft_ops import ifft2d
+from tensorflow.python.ops.signal.fft_ops import ifft3d
+from tensorflow.python.ops.signal.fft_ops import irfft
+from tensorflow.python.ops.signal.fft_ops import irfft2d
+from tensorflow.python.ops.signal.fft_ops import irfft3d
+from tensorflow.python.ops.signal.fft_ops import rfft
+from tensorflow.python.ops.signal.fft_ops import rfft2d
+from tensorflow.python.ops.signal.fft_ops import rfft3d
+from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix
+from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms
+from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add
+from tensorflow.python.ops.signal.shape_ops import frame
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn
+from tensorflow.python.ops.signal.spectral_ops import stft
+from tensorflow.python.ops.signal.window_ops import hamming_window
+from tensorflow.python.ops.signal.window_ops import hann_window
+# pylint: enable=unused-import
diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py
similarity index 90%
rename from tensorflow/contrib/signal/python/ops/spectral_ops.py
rename to tensorflow/python/ops/signal/spectral_ops.py
index a8b5deff6ca3a4a756d31b904e577f08f6155fd7..f029e0a8b59777b50e38ab4d8f801e811467c561 100644
--- a/tensorflow/contrib/signal/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/signal/spectral_ops.py
@@ -18,23 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-
 import numpy as np
 
-from tensorflow.contrib.signal.python.ops import reconstruction_ops
-from tensorflow.contrib.signal.python.ops import shape_ops
-from tensorflow.contrib.signal.python.ops import window_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import fft_ops
+from tensorflow.python.ops.signal import reconstruction_ops
+from tensorflow.python.ops.signal import shape_ops
+from tensorflow.python.ops.signal import window_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('signal.stft')
 def stft(signals, frame_length, frame_step, fft_length=None,
-         window_fn=functools.partial(window_ops.hann_window, periodic=True),
+         window_fn=window_ops.hann_window,
          pad_end=False, name=None):
   """Computes the [Short-time Fourier Transform][stft] of `signals`.
 
@@ -86,14 +86,14 @@ def stft(signals, frame_length, frame_step, fft_length=None,
       window = window_fn(frame_length, dtype=framed_signals.dtype)
       framed_signals *= window
 
-    # spectral_ops.rfft produces the (fft_length/2 + 1) unique components of the
+    # fft_ops.rfft produces the (fft_length/2 + 1) unique components of the
     # FFT of the real windowed signals in framed_signals.
-    return spectral_ops.rfft(framed_signals, [fft_length])
+    return fft_ops.rfft(framed_signals, [fft_length])
 
 
+@tf_export('signal.inverse_stft_window_fn')
 def inverse_stft_window_fn(frame_step,
-                           forward_window_fn=functools.partial(
-                               window_ops.hann_window, periodic=True),
+                           forward_window_fn=window_ops.hann_window,
                            name=None):
   """Generates a window function that can be used in `inverse_stft`.
 
@@ -152,18 +152,18 @@ def inverse_stft_window_fn(frame_step,
   return inverse_stft_window_fn_inner
 
 
+@tf_export('signal.inverse_stft')
 def inverse_stft(stfts,
                  frame_length,
                  frame_step,
                  fft_length=None,
-                 window_fn=functools.partial(window_ops.hann_window,
-                                             periodic=True),
+                 window_fn=window_ops.hann_window,
                  name=None):
   """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`.
 
   To reconstruct an original waveform, a complimentary window function should
   be used in inverse_stft. Such a window function can be constructed with
-  tf.contrib.signal.inverse_stft_window_fn.
+  tf.signal.inverse_stft_window_fn.
 
   Example:
 
@@ -171,10 +171,10 @@ def inverse_stft(stfts,
   frame_length = 400
   frame_step = 160
   waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
-  stft = tf.contrib.signal.stft(waveform, frame_length, frame_step)
-  inverse_stft = tf.contrib.signal.inverse_stft(
+  stft = tf.signal.stft(waveform, frame_length, frame_step)
+  inverse_stft = tf.signal.inverse_stft(
       stft, frame_length, frame_step,
-      window_fn=tf.contrib.signal.inverse_stft_window_fn(frame_step))
+      window_fn=tf.signal.inverse_stft_window_fn(frame_step))
   ```
 
   if a custom window_fn is used in stft, it must be passed to
@@ -185,11 +185,11 @@ def inverse_stft(stfts,
   frame_step = 160
   window_fn = functools.partial(window_ops.hamming_window, periodic=True),
   waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
-  stft = tf.contrib.signal.stft(
+  stft = tf.signal.stft(
       waveform, frame_length, frame_step, window_fn=window_fn)
-  inverse_stft = tf.contrib.signal.inverse_stft(
+  inverse_stft = tf.signal.inverse_stft(
       stft, frame_length, frame_step,
-      window_fn=tf.contrib.signal.inverse_stft_window_fn(
+      window_fn=tf.signal.inverse_stft_window_fn(
          frame_step, forward_window_fn=window_fn))
   ```
 
@@ -232,7 +232,7 @@ def inverse_stft(stfts,
       fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
       fft_length.shape.assert_has_rank(0)
 
-    real_frames = spectral_ops.irfft(stfts, [fft_length])
+    real_frames = fft_ops.irfft(stfts, [fft_length])
 
     # frame_length may be larger or smaller than fft_length, so we pad or
     # truncate real_frames to frame_length.
diff --git a/tensorflow/contrib/signal/python/ops/util_ops.py b/tensorflow/python/ops/signal/util_ops.py
similarity index 100%
rename from tensorflow/contrib/signal/python/ops/util_ops.py
rename to tensorflow/python/ops/signal/util_ops.py
diff --git a/tensorflow/contrib/signal/python/ops/window_ops.py b/tensorflow/python/ops/signal/window_ops.py
similarity index 97%
rename from tensorflow/contrib/signal/python/ops/window_ops.py
rename to tensorflow/python/ops/signal/window_ops.py
index 59e67e8ba414df1f9c777d1f5a3f3dba975648a2..730c989cfe9866f6e0a22d6e5eeda46dab0ab94b 100644
--- a/tensorflow/contrib/signal/python/ops/window_ops.py
+++ b/tensorflow/python/ops/signal/window_ops.py
@@ -27,8 +27,10 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('signal.hann_window')
 def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
   """Generate a [Hann window][hann].
 
@@ -53,6 +55,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
                                dtype, 0.5, 0.5)
 
 
+@tf_export('signal.hamming_window')
 def hamming_window(window_length, periodic=True, dtype=dtypes.float32,
                    name=None):
   """Generate a [Hamming][hamming] window.
diff --git a/tensorflow/python/ops/sort_ops.py b/tensorflow/python/ops/sort_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e23d701ed546ca76e2dd08e999ff869e87c816
--- /dev/null
+++ b/tensorflow/python/ops/sort_ops.py
@@ -0,0 +1,197 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for sorting tensors.
+
+@@argsort
+@@sort
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('sort')
+def sort(values, axis=-1, direction='ASCENDING', name=None):
+  """Sorts a tensor.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same dtype and shape as `values`, with the elements
+        sorted along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  with framework_ops.name_scope(name, 'sort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=False)
+
+
+@tf_export('argsort')
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices of a tensor that give its sorted order along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`, but along the given axis, values represent the index of the sorted
+  element in that slice of the tensor at the given position.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    stable: If True, equal elements in the original tensor will not be
+      re-ordered in the returned order. Unstable sort is not yet implemented,
+      but will eventually be the default for performance reasons. If you require
+      a stable order, pass `stable=True` for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=True)
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Internal sort/argsort implementation.
+
+  Args:
+    values: The input values.
+    axis: The axis along which to sort.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: Whether to return the argsort result.
+
+  Returns:
+    Either the sorted values, or the indices of the sorted values in the
+        original tensor. See the `sort` and `argsort` docstrings.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    raise ValueError('%s should be one of %s' % (direction, ', '.join(
+        sorted(_SORT_IMPL.keys()))))
+  # Axis must be an integer, not a Tensor.
+  axis = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_static = tensor_util.constant_value(axis)
+  if axis.shape.ndims != 0 or axis_static is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_static = int(axis_static)  # Avoids NumPy casting error
+
+  values = framework_ops.convert_to_tensor(values, name='values')
+
+  return _SORT_IMPL[direction](values, axis_static, return_argsort)
+
+
+def _descending_sort(values, axis, return_argsort=False):
+  """Sorts values in reverse using `top_k`.
+
+  Args:
+    values: Tensor of numeric values.
+    axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+      indices that would sort the values.
+
+  Returns:
+    The sorted values.
+  """
+  k = array_ops.shape(values)[axis]
+  rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
+  # Fast path: sorting the last axis.
+  if axis == -1 or axis + 1 == values.get_shape().ndims:
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
+  # Negate the values to get the ascending order from descending sort.
+  values_or_indices = _descending_sort(-values, axis, return_argsort)
+  # If not argsort, negate the values again.
+  return values_or_indices if return_argsort else -values_or_indices
+
+
+_SORT_IMPL = {
+    'ASCENDING': _ascending_sort,
+    'DESCENDING': _descending_sort,
+}
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/python/ops/sort_ops_test.py
similarity index 96%
rename from tensorflow/contrib/framework/python/ops/sort_ops_test.py
rename to tensorflow/python/ops/sort_ops_test.py
index 791b32cd1e2eea9f466a14585a8b15d085bd450f..8a92f4926646865a02e2edde57ad6b71081f1573 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops_test.py
+++ b/tensorflow/python/ops/sort_ops_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import sort_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -28,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sort_ops
 from tensorflow.python.platform import test
 
 
@@ -88,9 +88,7 @@ class SortTest(test.TestCase):
       self.assertAllEqual(
           np.sort(arr, axis=0)[::-1],
           sort_ops.sort(
-              constant_op.constant(arr),
-              axis=0,
-              direction='DESCENDING').eval())
+              constant_op.constant(arr), axis=0, direction='DESCENDING').eval())
 
   def testSort_staticallyKnownRank_constantTransposition(self):
     # The transposition array should be a constant if the rank of "values" is
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 077e4558b7cc0f46133d685f1878d3abcc9053d6..58cd8291e136df8ca189f576a77dd865ecd322a2 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -16,7 +16,7 @@
 # pylint: disable=g-short-docstring-punctuation
 """Sparse Tensor Representation.
 
-See the [Sparse Ops](https://tensorflow.org/api_guides/python/sparse_ops) guide.
+See also `tf.SparseTensor`.
 """
 
 from __future__ import absolute_import
@@ -185,8 +185,14 @@ def sparse_eye(num_rows,
         dense_shape=[num_rows, num_columns])
 
 
+@tf_export("sparse.concat", "sparse_concat", v1=[])
+def sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim=False,
+                     concat_dim=None, name=None):
+  return sparse_concat(axis, sp_inputs, name, expand_nonconcat_dim, concat_dim)
+
+
 # pylint: disable=protected-access
-@tf_export("sparse.concat", "sparse_concat")
+@tf_export(v1=["sparse.concat", "sparse_concat"])
 @deprecation.deprecated_endpoints("sparse_concat")
 @deprecation.deprecated_args(
     None, "concat_dim is deprecated, use axis instead", "concat_dim")
@@ -319,7 +325,7 @@ def sparse_concat(axis,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.add", v1=["sparse.add", "sparse_add"])
+@tf_export(v1=["sparse.add", "sparse_add"])
 @deprecation.deprecated_endpoints("sparse_add")
 def sparse_add(a, b, thresh=0):
   """Adds two tensors, at least one of each is a `SparseTensor`.
@@ -402,6 +408,65 @@ def sparse_add(a, b, thresh=0):
                                                   a.dense_shape, b)
 
 
+@tf_export("sparse.add", v1=[])
+def sparse_add_v2(a, b, threshold=0):
+  """Adds two tensors, at least one of each is a `SparseTensor`.
+
+  If one `SparseTensor` and one `Tensor` are passed in, returns a `Tensor`.  If
+  both arguments are `SparseTensor`s, this returns a `SparseTensor`.  The order
+  of arguments does not matter.  Use vanilla `tf.add()` for adding two dense
+  `Tensor`s.
+
+  The shapes of the two operands must match: broadcasting is not supported.
+
+  The indices of any input `SparseTensor` are assumed ordered in standard
+  lexicographic order.  If this is not the case, before this step run
+  `SparseReorder` to restore index ordering.
+
+  If both arguments are sparse, we perform "clipping" as follows.  By default,
+  if two values sum to zero at some index, the output `SparseTensor` would still
+  include that particular location in its index, storing a zero in the
+  corresponding value slot.  To override this, callers can specify `threshold`,
+  indicating that if the sum has a magnitude strictly smaller than `threshold`,
+  its corresponding value and index would then not be included.  In particular,
+  `threshold == 0.0` (default) means everything is kept and actual thresholding
+  happens only for a positive value.
+
+  For example, suppose the logical sum of two sparse operands is (densified):
+
+      [       2]
+      [.1     0]
+      [ 6   -.2]
+
+  Then,
+
+      * `threshold == 0` (the default): all 5 index/value pairs will be
+          returned.
+      * `threshold == 0.11`: only .1 and 0 will vanish, and the remaining three
+          index/value pairs will be returned.
+      * `threshold == 0.21`: .1, 0, and -.2 will vanish.
+
+  Args:
+    a: The first operand; `SparseTensor` or `Tensor`.
+    b: The second operand; `SparseTensor` or `Tensor`.  At least one operand
+      must be sparse.
+    threshold: A 0-D `Tensor`.  The magnitude threshold that determines if an
+    output value/index pair takes space.  Its dtype should match that of the
+    values if they are real; if the latter are complex64/complex128, then the
+    dtype should be float32/float64, correspondingly.
+
+  Returns:
+    A `SparseTensor` or a `Tensor`, representing the sum.
+
+  Raises:
+    TypeError: If both `a` and `b` are `Tensor`s.  Use `tf.add()` instead.
+  """
+  return sparse_add(
+      a=a,
+      b=b,
+      thresh=threshold)
+
+
 @tf_export("sparse.cross")
 def sparse_cross(inputs, name=None):
   """Generates sparse cross from a list of sparse and dense tensors.
@@ -705,7 +770,7 @@ class KeywordRequired(object):
     return "KeywordRequired()"
 
 
-@tf_export("sparse.split", "sparse_split")
+@tf_export(v1=["sparse.split", "sparse_split"])
 @deprecation.deprecated_endpoints("sparse_split")
 @deprecation.deprecated_args(
     None, "split_dim is deprecated, use axis instead", "split_dim")
@@ -779,6 +844,51 @@ def sparse_split(keyword_required=KeywordRequired(),
   return sparse_tensors
 
 
+@tf_export("sparse.split", v1=[])
+def sparse_split_v2(sp_input=None,
+                    num_split=None,
+                    axis=None,
+                    name=None):
+  """Split a `SparseTensor` into `num_split` tensors along `axis`.
+
+  If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split`
+  each slice starting from 0:`shape[axis] % num_split` gets extra one
+  dimension. For example, if `axis = 1` and `num_split = 2` and the
+  input is:
+
+      input_tensor = shape = [2, 7]
+      [    a   d e  ]
+      [b c          ]
+
+  Graphically the output tensors are:
+
+      output_tensor[0] =
+      [    a ]
+      [b c   ]
+
+      output_tensor[1] =
+      [ d e  ]
+      [      ]
+
+  Args:
+    sp_input: The `SparseTensor` to split.
+    num_split: A Python integer. The number of ways to split.
+    axis: A 0-D `int32` `Tensor`. The dimension along which to split.
+    name: A name for the operation (optional).
+
+  Returns:
+    `num_split` `SparseTensor` objects resulting from splitting `value`.
+
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return sparse_split(sp_input=sp_input,
+                      num_split=num_split,
+                      axis=axis,
+                      name=name,
+                      split_dim=None)
+
+
 @tf_export("sparse.slice", v1=["sparse.slice", "sparse_slice"])
 @deprecation.deprecated_endpoints("sparse_slice")
 def sparse_slice(sp_input, start, size, name=None):
@@ -888,7 +998,86 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
-@tf_export("sparse.reduce_max", "sparse_reduce_max")
+@tf_export("sparse.reduce_max", v1=[])
+def sparse_reduce_max_v2(
+    sp_input, axis=None, keepdims=None, output_is_sparse=False, name=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+  if `output_is_sparse` is `False`, or a `SparseTensor` if `output_is_sparse`
+  is `True`.
+
+  Note: A gradient is not defined for this function, so it can't be used
+  in training models that need gradient descent.
+
+  Reduces `sp_input` along the dimensions given in `axis`.  Unless
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `axis`. If `keepdims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  The values not defined in `sp_input` don't participate in the reduce max,
+  as opposed to be implicitly assumed 0 -- hence it can return negative values
+  for sparse `axis`. But, in case there are no values in
+  `axis`, it will reduce to 0. See second example below.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 2]
+  #                 [?, 3, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse.reduce_max(x) ==> 3
+  tf.sparse.reduce_max(x, 0) ==> [1, 3, 2]
+  tf.sparse.reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
+  tf.sparse.reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
+  tf.sparse.reduce_max(x, [0, 1]) ==> 3
+
+  # 'y' represents [[-7, ?]
+  #                 [ 4, 3]
+  #                 [ ?, ?]
+  tf.sparse.reduce_max(x, 1) ==> [-7, 4, 0]
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keepdims: If true, retain reduced dimensions with length 1.
+    output_is_sparse: If true, returns a `SparseTensor` instead of a dense
+      `Tensor` (the default).
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced Tensor or the reduced SparseTensor if `output_is_sparse` is
+    True.
+  """
+  if keepdims is None:
+    keepdims = False
+
+  # reduction_axes is the deprecated name for axis.
+  reduction_axes = None
+
+  if output_is_sparse:
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops.sparse_reduce_max_sparse(
+            sp_input.indices, sp_input.values, sp_input.dense_shape,
+            math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+            name=name))
+
+    return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+  return gen_sparse_ops.sparse_reduce_max(
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+      name=name)
+
+
+@tf_export(v1=["sparse.reduce_max", "sparse_reduce_max"])
 @deprecation.deprecated_endpoints("sparse_reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -956,7 +1145,7 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse.reduce_max_sparse", "sparse_reduce_max_sparse")
+@tf_export(v1=["sparse.reduce_max_sparse", "sparse_reduce_max_sparse"])
 @deprecation.deprecated_endpoints("sparse_reduce_max_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1007,7 +1196,74 @@ def sparse_reduce_max_sparse(sp_input,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.reduce_sum", "sparse_reduce_sum")
+@tf_export("sparse.reduce_sum", v1=[])
+def sparse_reduce_sum_v2(
+    sp_input, axis=None, keepdims=None, output_is_sparse=False, name=None):
+  """Computes the sum of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+  if `output_is_sparse` is `False`, or a `SparseTensor` if `output_is_sparse`
+  is `True`.
+
+  Note: if `output_is_sparse` is True, a gradient is not defined for this
+  function, so it can't be used in training models that need gradient descent.
+
+  Reduces `sp_input` along the dimensions given in `axis`.  Unless `keepdims` is
+  true, the rank of the tensor is reduced by 1 for each entry in `axis`. If
+  `keepdims` is true, the reduced dimensions are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 1]
+  #                 [?, 1, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse.reduce_sum(x) ==> 3
+  tf.sparse.reduce_sum(x, 0) ==> [1, 1, 1]
+  tf.sparse.reduce_sum(x, 1) ==> [2, 1]  # Can also use -1 as the axis.
+  tf.sparse.reduce_sum(x, 1, keepdims=True) ==> [[2], [1]]
+  tf.sparse.reduce_sum(x, [0, 1]) ==> 3
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keepdims: If true, retain reduced dimensions with length 1.
+    output_is_sparse: If true, returns a `SparseTensor` instead of a dense
+      `Tensor` (the default).
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced Tensor or the reduced SparseTensor if `output_is_sparse` is
+    True.
+  """
+  if keepdims is None:
+    keepdims = False
+
+  # reduction_axes is the deprecated name for axis.
+  reduction_axes = None
+
+  if output_is_sparse:
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops.sparse_reduce_sum_sparse(
+            sp_input.indices, sp_input.values, sp_input.dense_shape,
+            math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+            name=name))
+    return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+  return gen_sparse_ops.sparse_reduce_sum(
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+      name=name)
+
+
+@tf_export(v1=["sparse.reduce_sum", "sparse_reduce_sum"])
 @deprecation.deprecated_endpoints("sparse_reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1062,7 +1318,7 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse")
+@tf_export(v1=["sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse"])
 @deprecation.deprecated_endpoints("sparse_reduce_sum_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1231,8 +1487,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
         sp_new, default_value=False, validate_indices=False, name=name)
 
 
-@tf_export("sparse.merge", v1=["sparse.merge", "sparse_merge"])
-@deprecation.deprecated_endpoints("sparse_merge")
+@tf_export(v1=["sparse.merge", "sparse_merge"])
+@deprecation.deprecated(None, "No similar op available at this time.")
 def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
                  already_sorted=False):
   """Combines a batch of feature ids and values into a single `SparseTensor`.
@@ -1798,7 +2054,9 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
-@tf_export("sparse.matmul", v1=["sparse.matmul", "sparse_tensor_dense_matmul"])
+@tf_export("sparse.sparse_dense_matmul",
+           v1=["sparse.sparse_dense_matmul", "sparse.matmul",
+               "sparse_tensor_dense_matmul"])
 @deprecation.deprecated_endpoints("sparse_tensor_dense_matmul")
 def sparse_tensor_dense_matmul(sp_a,
                                b,
diff --git a/tensorflow/python/ops/spectral_grad.py b/tensorflow/python/ops/spectral_grad.py
deleted file mode 100644
index 0af24114acbe5fa6283191f9d71e32805eba3f29..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/spectral_grad.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Gradients for operators defined in spectral_ops.py."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
-
-
-def _FFTSizeForGrad(grad, rank):
-  return math_ops.reduce_prod(array_ops.shape(grad)[-rank:])
-
-
-@ops.RegisterGradient("FFT")
-def _FFTGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 1), grad.dtype)
-  return spectral_ops.ifft(grad) * size
-
-
-@ops.RegisterGradient("IFFT")
-def _IFFTGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 1), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft(grad) * rsize
-
-
-@ops.RegisterGradient("FFT2D")
-def _FFT2DGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 2), grad.dtype)
-  return spectral_ops.ifft2d(grad) * size
-
-
-@ops.RegisterGradient("IFFT2D")
-def _IFFT2DGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 2), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft2d(grad) * rsize
-
-
-@ops.RegisterGradient("FFT3D")
-def _FFT3DGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 3), grad.dtype)
-  return spectral_ops.ifft3d(grad) * size
-
-
-@ops.RegisterGradient("IFFT3D")
-def _IFFT3DGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 3), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft3d(grad) * rsize
-
-
-def _RFFTGradHelper(rank, irfft_fn):
-  """Returns a gradient function for an RFFT of the provided rank."""
-  # Can't happen because we don't register a gradient for RFFT3D.
-  assert rank in (1, 2), "Gradient for RFFT3D is not implemented."
-
-  def _Grad(op, grad):
-    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
-    fft_length = op.inputs[1]
-    input_shape = array_ops.shape(op.inputs[0])
-    is_even = math_ops.cast(1 - (fft_length[-1] % 2), dtypes.complex64)
-
-    def _TileForBroadcasting(matrix, t):
-      expanded = array_ops.reshape(
-          matrix,
-          array_ops.concat([
-              array_ops.ones([array_ops.rank(t) - 2], dtypes.int32),
-              array_ops.shape(matrix)
-          ], 0))
-      return array_ops.tile(
-          expanded, array_ops.concat([array_ops.shape(t)[:-2], [1, 1]], 0))
-
-    def _MaskMatrix(length):
-      # TODO(rjryan): Speed up computation of twiddle factors using the
-      # following recurrence relation and cache them across invocations of RFFT.
-      #
-      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
-      # for n = 0, 1,..., line_len-1.
-      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-      a = array_ops.tile(
-          array_ops.expand_dims(math_ops.range(length), 0), (length, 1))
-      b = array_ops.transpose(a, [1, 0])
-      return math_ops.exp(-2j * np.pi * math_ops.cast(a * b, dtypes.complex64) /
-                          math_ops.cast(length, dtypes.complex64))
-
-    def _YMMask(length):
-      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
-      return math_ops.cast(1 - 2 * (math_ops.range(length) % 2),
-                           dtypes.complex64)
-
-    y0 = grad[..., 0:1]
-    if rank == 1:
-      ym = grad[..., -1:]
-      extra_terms = y0 + is_even * ym * _YMMask(input_shape[-1])
-    elif rank == 2:
-      # Create a mask matrix for y0 and ym.
-      base_mask = _MaskMatrix(input_shape[-2])
-
-      # Tile base_mask to match y0 in shape so that we can batch-matmul the
-      # inner 2 dimensions.
-      tiled_mask = _TileForBroadcasting(base_mask, y0)
-
-      y0_term = math_ops.matmul(tiled_mask, math_ops.conj(y0))
-      extra_terms = y0_term
-
-      ym = grad[..., -1:]
-      ym_term = math_ops.matmul(tiled_mask, math_ops.conj(ym))
-
-      inner_dim = input_shape[-1]
-      ym_term = array_ops.tile(
-          ym_term,
-          array_ops.concat([
-              array_ops.ones([array_ops.rank(grad) - 1], dtypes.int32),
-              [inner_dim]
-          ], 0)) * _YMMask(inner_dim)
-
-      extra_terms += is_even * ym_term
-
-    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
-    # factor, plus some additional terms to make up for the components dropped
-    # due to Hermitian symmetry.
-    input_size = math_ops.to_float(_FFTSizeForGrad(op.inputs[0], rank))
-    irfft = irfft_fn(grad, fft_length)
-    return 0.5 * (irfft * input_size + math_ops.real(extra_terms)), None
-
-  return _Grad
-
-
-def _IRFFTGradHelper(rank, rfft_fn):
-  """Returns a gradient function for an IRFFT of the provided rank."""
-  # Can't happen because we don't register a gradient for IRFFT3D.
-  assert rank in (1, 2), "Gradient for IRFFT3D is not implemented."
-
-  def _Grad(op, grad):
-    """A gradient function for IRFFT with the provided `rank` and `rfft_fn`."""
-    # Generate a simple mask like [1.0, 2.0, ..., 2.0, 1.0] for even-length FFTs
-    # and [1.0, 2.0, ..., 2.0] for odd-length FFTs. To reduce extra ops in the
-    # graph we special-case the situation where the FFT length and last
-    # dimension of the input are known at graph construction time.
-    fft_length = op.inputs[1]
-    is_odd = math_ops.mod(fft_length[-1], 2)
-    input_last_dimension = array_ops.shape(op.inputs[0])[-1]
-    mask = array_ops.concat(
-        [[1.0], 2.0 * array_ops.ones([input_last_dimension - 2 + is_odd]),
-         array_ops.ones([1 - is_odd])], 0)
-
-    rsize = math_ops.reciprocal(math_ops.to_float(_FFTSizeForGrad(grad, rank)))
-
-    # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
-    # factor and a mask. The mask scales the gradient for the Hermitian
-    # symmetric components of the RFFT by a factor of two, since these
-    # components are de-duplicated in the RFFT.
-    rfft = rfft_fn(grad, fft_length)
-    return rfft * math_ops.cast(rsize * mask, dtypes.complex64), None
-
-  return _Grad
-
-
-ops.RegisterGradient("RFFT")(_RFFTGradHelper(1, spectral_ops.irfft))
-ops.RegisterGradient("IRFFT")(_IRFFTGradHelper(1, spectral_ops.rfft))
-ops.RegisterGradient("RFFT2D")(_RFFTGradHelper(2, spectral_ops.irfft2d))
-ops.RegisterGradient("IRFFT2D")(_IRFFTGradHelper(2, spectral_ops.rfft2d))
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 4f1662ab086796af0fff9acbf4ad425c6460e37d..c614d072badbdf7927d6c889288e1cf4e8d988ef 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
 from tensorflow.python.ops import random_grad
 from tensorflow.python.ops import sparse_grad
-from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
 from tensorflow.python.ops import tensor_array_grad
 
@@ -51,6 +50,7 @@ from tensorflow.python.ops.control_flow_ops import group
 from tensorflow.python.ops.control_flow_ops import no_op
 from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=redefined-builtin
 # pylint: enable=redefined-builtin
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.ops.control_flow_ops import while_loop
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
@@ -72,6 +72,7 @@ from tensorflow.python.ops.partitioned_variables import *
 from tensorflow.python.ops.random_ops import *
 from tensorflow.python.ops.script_ops import py_func
 from tensorflow.python.ops.session_ops import *
+from tensorflow.python.ops.sort_ops import *
 from tensorflow.python.ops.sparse_ops import *
 from tensorflow.python.ops.state_ops import assign
 from tensorflow.python.ops.state_ops import assign_add
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 920047f38b07e62ec832f2cf411d83180b6fa160..76684f89f8ac9347486a115c12e0b4f5ff49ba30 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -15,7 +15,7 @@
 
 """Variables.
 
-See the [Variables](https://tensorflow.org/api_guides/python/state_ops) guide.
+See the [Variables](https://www.tensorflow.org/guide/variables) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py
index c6defabacdbe227d7e4ed20badae9c3ce0c553b0..b119049b163dd57aee08f078e5ab5ca913f61706 100644
--- a/tensorflow/python/ops/stateless_random_ops.py
+++ b/tensorflow/python/ops/stateless_random_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 ops.NotDifferentiable("StatelessMultinomial")
@@ -179,7 +180,9 @@ def stateless_truncated_normal(shape,
     return math_ops.add(rnd * stddev, mean, name=name)
 
 
-@tf_export("random.stateless_multinomial")
+@tf_export(v1=["random.stateless_multinomial"])
+@deprecation.deprecated(
+    date=None, instructions="Use tf.random.stateless_categorical instead.")
 def stateless_multinomial(logits,
                           num_samples,
                           seed,
@@ -207,13 +210,58 @@ def stateless_multinomial(logits,
       `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A shape [2] integer Tensor of seeds to the random number generator.
-    name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
+    name: Optional name for the operation.
 
   Returns:
     The drawn samples of shape `[batch_size, num_samples]`.
   """
   with ops.name_scope(name, "stateless_multinomial", [logits, seed]):
-    logits = ops.convert_to_tensor(logits, name="logits")
-    return gen_stateless_random_ops.stateless_multinomial(
-        logits, num_samples, seed, output_dtype=output_dtype)
+    return stateless_multinomial_categorical_impl(logits, num_samples,
+                                                  output_dtype, seed)
+
+
+@tf_export("random.stateless_categorical")
+def stateless_categorical(logits,
+                          num_samples,
+                          seed,
+                          dtype=dtypes.int64,
+                          name=None):
+  """Draws deterministic pseudorandom samples from a categorical distribution.
+
+  This is a stateless version of `tf.categorical`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.stateless_categorical(
+      tf.log([[10., 10.]]), 5, seed=[7, 17])
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    dtype: integer type to use for the output. Defaults to int64.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "stateless_categorical", [logits, seed]):
+    return stateless_multinomial_categorical_impl(logits, num_samples, dtype,
+                                                  seed)
+
+
+def stateless_multinomial_categorical_impl(logits, num_samples, dtype, seed):
+  """Implementation for stateless multinomial/categorical ops (v1/v2)."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  return gen_stateless_random_ops.stateless_multinomial(
+      logits, num_samples, seed, output_dtype=dtype)
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index ed14aa7d90060aca44687e9f83b7d83ab196e9c2..a20eec20b80217065d72135dcf0060d1823ad624 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Operations for working with string Tensors.
-
-See the [Strings](https://tensorflow.org/api_guides/python/string_ops) guide.
-"""
+"""Operations for working with string Tensors."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -340,10 +337,14 @@ reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
 
 # This wrapper provides backwards compatibility for code that predates the
 # unit argument and that passed 'name' as a positional argument.
-@tf_export("strings.length")
+@tf_export(v1=["strings.length"])
 def string_length(input, name=None, unit="BYTE"):
   return gen_string_ops.string_length(input, unit=unit, name=name)
 
+@tf_export("strings.length", v1=[])
+def string_length_v2(input, unit="BYTE", name=None):
+  return string_length(input, name, unit)
+
 
 string_length.__doc__ = gen_string_ops.string_length.__doc__
 
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 14aa44a920863b06221c8ba2d65c246979f3301d..c72a9aefc3fa53d2a94a5f84a44f728208d82915 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -22,6 +22,7 @@ import contextlib
 import re
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import distribution_strategy_context
 
@@ -44,13 +45,27 @@ _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
 def skip_summary():
-  # If using multiple replicas in distributed strategy, skip summaries on all
-  # replicas except the first one (replica_id=0).
+  """Determines if summary should be skipped.
+
+  If using multiple replicas in distributed strategy, skip summaries on all
+  replicas except the first one (replica_id=0).
+
+  Returns:
+    True if the summary is skipped; False otherwise.
+  """
+
   # TODO(priyag): Add a new optional argument that will provide multiple
   # alternatives to override default behavior. (e.g. run on last replica,
   # compute sum or mean across replicas).
   replica_context = distribution_strategy_context.get_replica_context()
-  return replica_context and replica_context.replica_id > 0
+  if not replica_context:
+    return False
+  # TODO(b/118385803): when replica_id of _TPUReplicaContext is properly
+  # initialized, remember to change here as well.
+  replica_id = replica_context.replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id and replica_id > 0
 
 
 def clean_tag(name):
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
deleted file mode 100644
index ec4d4a6e9242107fd7f4bebe1416198457e32cee..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/summary_ops.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Summary Operations."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_logging_ops
-from tensorflow.python.ops import summary_op_util
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_logging_ops import *
-from tensorflow.python.util.tf_export import tf_export
-# pylint: enable=wildcard-import
-
-
-@tf_export("summary.tensor_summary")
-def tensor_summary(name,
-                   tensor,
-                   summary_description=None,
-                   collections=None,
-                   summary_metadata=None,
-                   family=None,
-                   display_name=None):
-  """Outputs a `Summary` protocol buffer with a serialized tensor.proto.
-
-  Args:
-    name: A name for the generated node. If display_name is not set, it will
-      also serve as the tag name in TensorBoard. (In that case, the tag
-      name will inherit tf name scopes.)
-    tensor: A tensor of any type and shape to serialize.
-    summary_description: A long description of the summary sequence. Markdown
-      is supported.
-    collections: Optional list of graph collections keys. The new summary op is
-      added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
-    summary_metadata: Optional SummaryMetadata proto (which describes which
-      plugins may use the summary value).
-    family: Optional; if provided, used as the prefix of the summary tag,
-      which controls the name used for display on TensorBoard when
-      display_name is not set.
-    display_name: A string used to name this data in TensorBoard. If this is
-      not set, then the node name will be used instead.
-
-  Returns:
-    A scalar `Tensor` of type `string`. The serialized `Summary` protocol
-    buffer.
-  """
-
-  if summary_metadata is None:
-    summary_metadata = summary_pb2.SummaryMetadata()
-
-  if summary_description is not None:
-    summary_metadata.summary_description = summary_description
-
-  if display_name is not None:
-    summary_metadata.display_name = display_name
-
-  serialized_summary_metadata = summary_metadata.SerializeToString()
-
-  if summary_op_util.skip_summary():
-    return constant_op.constant("")
-  with summary_op_util.summary_scope(
-      name, family, values=[tensor]) as (tag, scope):
-    val = gen_logging_ops.tensor_summary_v2(
-        tensor=tensor,
-        tag=tag,
-        name=scope,
-        serialized_summary_metadata=serialized_summary_metadata)
-    summary_op_util.collect(val, collections, [ops.GraphKeys.SUMMARIES])
-  return val
-
-ops.NotDifferentiable("TensorSummary")
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 18cefb8e1c4f1eb03d2ac746e1864a48c9aec6b8..a0ad43b444c3f3f2052cd29196c3d01abcf6a44b 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -40,11 +40,14 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
 
 
-# A global dictionary mapping graph keys to boolean values indicating whether
-# we should record summaries for this particular graph or not.
+# Dictionary mapping graph keys to a boolean Tensor (or callable returning
+# a boolean Tensor) indicating whether we should record summaries for the
+# graph identified by the key of the dictionary.
 _SHOULD_RECORD_SUMMARIES = {}
 
 # A global dictionary mapping graph keys to a list of summary writer init ops.
@@ -55,62 +58,73 @@ _RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
 _USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
 
 
+@tf_export("summary.should_record_summaries", v1=[])
 def should_record_summaries():
   """Returns boolean Tensor which is true if summaries should be recorded."""
   global _SHOULD_RECORD_SUMMARIES
   key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  return _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  should = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  return should() if callable(should) else should
 
 
-# TODO(apassos) consider how to handle local step here.
+@tf_export("summary.record_summaries", v1=[])
 @tf_contextlib.contextmanager
-def record_summaries_every_n_global_steps(n, global_step=None):
-  """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
-  if global_step is None:
-    global_step = training_util.get_or_create_global_step()
+def record_summaries(boolean=True):
+  """Sets summary recording on or off per the provided boolean value.
+
+  The provided value can be a python boolean, a scalar boolean Tensor, or
+  or a callable providing such a value; if a callable is passed it will be
+  invoked each time should_record_summaries() is called to determine whether
+  summary writing should be enabled.
+
+  Args:
+    boolean: can be True, False, a bool Tensor, or a callable providing such.
+      Defaults to True.
+
+  Yields:
+    Returns a context manager that sets this value on enter and restores the
+    previous value on exit.
+  """
+  # TODO(nickfelt): make this threadlocal
   global _SHOULD_RECORD_SUMMARIES
   key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
   try:
-    with ops.device("cpu:0"):
-      _SHOULD_RECORD_SUMMARIES[key] = math_ops.equal(global_step % n, 0)
+    _SHOULD_RECORD_SUMMARIES[key] = boolean
     yield
   finally:
     _SHOULD_RECORD_SUMMARIES[key] = old
 
 
-@tf_contextlib.contextmanager
+# TODO(apassos) consider how to handle local step here.
+def record_summaries_every_n_global_steps(n, global_step=None):
+  """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
+  if global_step is None:
+    global_step = training_util.get_or_create_global_step()
+  with ops.device("cpu:0"):
+    should = lambda: math_ops.equal(global_step % n, 0)
+    if not context.executing_eagerly():
+      should = should()
+  return record_summaries(should)
+
+
 def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
-  try:
-    _SHOULD_RECORD_SUMMARIES[key] = True
-    yield
-  finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+  return record_summaries(True)
 
 
-@tf_contextlib.contextmanager
 def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
-  try:
-    _SHOULD_RECORD_SUMMARIES[key] = False
-    yield
-  finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+  return record_summaries(False)
 
 
+@tf_export("summary.SummaryWriter", v1=[])
 class SummaryWriter(object):
   """Encapsulates a stateful summary writer resource.
 
   See also:
-  - `tf.contrib.summary.create_file_writer`
-  - `tf.contrib.summary.create_db_writer`
+  - `tf.summary.create_file_writer`
+  - `tf.summary.create_db_writer`
   """
 
   def  __init__(self, resource, init_op_fn):
@@ -205,6 +219,7 @@ def initialize(
     session.run(_graph(x, 0), feed_dict={x: data})
 
 
+@tf_export("summary.create_file_writer", v1=[])
 def create_file_writer(logdir,
                        max_queue=None,
                        flush_millis=None,
@@ -280,7 +295,7 @@ def create_db_writer(db_uri,
       `tf.Graph`.
 
   Returns:
-    A `tf.contrib.summary.SummaryWriter` instance.
+    A `tf.summary.SummaryWriter` instance.
   """
   with ops.device("cpu:0"):
     if experiment_name is None:
@@ -329,7 +344,7 @@ def _nothing():
 def all_summary_ops():
   """Graph-mode only. Returns all summary ops.
 
-  Please note this excludes `tf.contrib.summary.graph` ops.
+  Please note this excludes `tf.summary.graph` ops.
 
   Returns:
     The summary ops.
@@ -497,7 +512,7 @@ def graph(param, step=None, name=None):
   """Writes a TensorFlow graph to the summary interface.
 
   The graph summary is, strictly speaking, not a summary. Conditions
-  like `tf.contrib.summary.never_record_summaries` do not apply. Only
+  like `tf.summary.should_record_summaries` do not apply. Only
   a single graph can be associated with a particular run. If multiple
   graphs are written, then only the last one will be considered by
   TensorBoard.
@@ -541,14 +556,13 @@ def graph(param, step=None, name=None):
 _graph = graph  # for functions with a graph parameter
 
 
+@tf_export("summary.import_event", v1=[])
 def import_event(tensor, name=None):
   """Writes a `tf.Event` binary proto.
 
-  When using create_db_writer(), this can be used alongside
-  `tf.TFRecordReader` to load event logs into the database. Please
-  note that this is lower level than the other summary functions and
-  will ignore any conditions set by methods like
-  `tf.contrib.summary.should_record_summaries`.
+  This can be used to import existing event logs into a new summary writer sink.
+  Please note that this is lower level than the other summary functions and
+  will ignore the `tf.summary.should_record_summaries` setting.
 
   Args:
     tensor: A `tf.Tensor` of type `string` containing a serialized
@@ -562,13 +576,14 @@ def import_event(tensor, name=None):
       context.context().summary_writer_resource, tensor, name=name)
 
 
+@tf_export("summary.flush", v1=[])
 def flush(writer=None, name=None):
   """Forces summary writer to send any buffered data to storage.
 
   This operation blocks until that finishes.
 
   Args:
-    writer: The `tf.contrib.summary.SummaryWriter` resource to flush.
+    writer: The `tf.summary.SummaryWriter` resource to flush.
       The thread default will be used if this parameter is None.
       Otherwise a `tf.no_op` is returned.
     name: A name for the operation (optional).
@@ -595,6 +610,8 @@ def eval_dir(model_dir, name=None):
   return os.path.join(model_dir, "eval" if not name else "eval_" + name)
 
 
+@deprecation.deprecated(date=None,
+                        instructions="Renamed to create_file_writer().")
 def create_summary_file_writer(*args, **kwargs):
   """Please use `tf.contrib.summary.create_file_writer`."""
   logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index e7ad261615f57c1e0ff967d0f7cd498571d21bc7..7c2d3be338766a4e25a817f824e06c665059bc01 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.tf_export import tf_export
 __all__ = ["make_template"]
 
 
-@tf_export("make_template")
+@tf_export(v1=["make_template"])
 def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
                   custom_getter_=None, **kwargs):
   """Given an arbitrary function, wrap it so that it does variable sharing.
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index f86dfb35276f608c5cb323fe5deceb58733be007..e3375ad0abe0edb93977a0b52e6143f5911bccdb 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -20,8 +20,10 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import os
 import weakref
 
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -30,12 +32,18 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
 
+ENABLE_TENSOR_ARRAY_V2 = (
+    tf2.enabled() or os.getenv("TF_ENABLE_TENSOR_ARRAY_V2") is not None)
+
+
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
 # fact built to wrap these methods.
 # pylint: disable=protected-access
@@ -393,6 +401,246 @@ class _GraphTensorArray(object):
     return gen_data_flow_ops.tensor_array_close_v3(
         handle=self._handle, name=name)
 
+
+class _GraphTensorArrayV2(object):
+  """Graph-mode implementation of TensorArray backed by TensorLists.
+
+  The backing tensor of this TensorArray is a TensorList variant tensor which is
+  stored in the `flow`. The `handle` is always none here. The reason we use the
+  `flow` field and not the `handle` field is to ensure backwards compatibility
+  with legacy control flow.
+  """
+
+  def __init__(self,
+               dtype,
+               size=None,
+               dynamic_size=None,
+               clear_after_read=None,
+               tensor_array_name=None,
+               handle=None,
+               flow=None,
+               infer_shape=True,
+               element_shape=None,
+               colocate_with_first_write_call=True,
+               name=None):
+    """Constructs a graph mode TensorArray.
+
+    Args:
+      dtype: (required) data type of the TensorArray.
+      size: (optional) int32 scalar `Tensor`: the size of the TensorArray.
+        Required if flow is not provided.
+      dynamic_size: (optional) Python bool: If true, writes to the TensorArray
+        can grow the TensorArray past its initial size.  Default: False.
+      clear_after_read: (optional) unused. Not supported in TensorLists.
+      tensor_array_name: (optional) unused.
+      handle: (optional) Must always be None.
+      flow: (optional) A variant `Tensor` scalar for a TensorList.
+      infer_shape: (optional, default: True) If True, shape inference is
+        enabled.  In this case, all elements must have the same shape.
+      element_shape: (optional, default: None) A `TensorShape` object specifying
+        the shape constraints of each of the elements of the TensorArray. Need
+        not be fully defined.
+      colocate_with_first_write_call: (optional). unused.
+      name: (optional) A name for the operation.
+
+    Raises:
+      ValueError: if both handle and tensor_array_name are provided.
+      TypeError: if handle is provided but is not a Tensor.
+    """
+    assert handle is None
+    del handle
+    del clear_after_read
+    del tensor_array_name
+    del colocate_with_first_write_call
+
+    del dynamic_size  # TODO(b/117943489): Unused for now.
+
+    if (flow is not None and
+        (not isinstance(flow, ops.Tensor) or flow.dtype != dtypes.variant)):
+      raise TypeError("flow must be a variant tensor")
+    if flow is None and size is None:
+      raise ValueError("Size must be provided if flow is not provided")
+    if flow is not None and size is not None:
+      raise ValueError("Cannot provide both a flow and size "
+                       "at the same time")
+    if flow is not None and element_shape is not None:
+      raise ValueError("Cannot provide both a flow and element_shape "
+                       "at the same time")
+
+    self._dtype = dtype
+
+    # Record the current static shape for the array elements. The element
+    # shape is defined either by `element_shape` or the shape of the tensor
+    # of the first write. If `infer_shape` is true, all writes checks for
+    # shape equality.
+    if element_shape is None:
+      self._infer_shape = infer_shape
+      self._element_shape = []
+    else:
+      self._infer_shape = True
+      self._element_shape = [tensor_shape.TensorShape(element_shape)]
+    with ops.name_scope(name, "TensorArrayV2", [size, flow]) as scope:
+      if flow is None:
+        self._flow = list_ops.tensor_list_reserve(
+            element_shape=element_shape,
+            num_elements=size,
+            element_dtype=dtype,
+            name=scope)
+      else:
+        self._flow = flow
+
+    # For backwards compatibility.
+    self._colocate_with_first_write_call = None
+    self._colocate_with = None
+
+  @property
+  def flow(self):
+    return self._flow
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def handle(self):
+    # We intentionally do not raise an error so that legacy while_loop does not
+    # complain.
+    return None
+
+  def _merge_element_shape(self, shape):
+    """Changes the element shape of the array given a shape to merge with.
+
+    Args:
+      shape: A `TensorShape` object to merge with.
+
+    Raises:
+      ValueError: if the provided shape is incompatible with the current
+          element shape of the `TensorArray`.
+    """
+
+    if self._element_shape:
+      if not shape.is_compatible_with(self._element_shape[0]):
+        raise ValueError(
+            "Inconsistent shapes: saw %s but expected %s "
+            "(and infer_shape=True)" % (shape, self._element_shape[0]))
+      self._element_shape[0] = self._element_shape[0].merge_with(shape)
+    else:
+      self._element_shape.append(shape)
+
+  def identity(self):
+    """See TensorArray."""
+    flow = array_ops.identity(self._flow)
+    ta = TensorArray(
+        dtype=self._dtype, flow=flow, infer_shape=self._infer_shape)
+    ta._element_shape = self._element_shape
+    return ta
+
+  def grad(self, source, flow=None, name=None):
+    """Not supported."""
+    raise NotImplementedError()
+
+  def read(self, index, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_get_item(
+        input_handle=self._flow,
+        index=index,
+        element_dtype=self._dtype,
+        name=name)
+    if self._element_shape:
+      value.set_shape(self._element_shape[0].dims)
+    return value
+
+  @tf_should_use.should_use_result
+  def write(self, index, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayV2Write", [self._flow, index, value]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape:
+        self._merge_element_shape(value.shape)
+      flow_out = list_ops.tensor_list_set_item(
+          input_handle=self._flow, index=index, item=value, name=name)
+      ta = TensorArray(dtype=self._dtype, handle=None, flow=flow_out)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      return ta
+
+  def stack(self, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayV2Stack", [self._flow]):
+      value = list_ops.tensor_list_stack(
+          input_handle=self._flow, element_dtype=self._dtype)
+      if self._element_shape and self._element_shape[0].dims is not None:
+        value.set_shape([None] + self._element_shape[0].dims)
+      return value
+
+  def gather(self, indices, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_gather(
+        input_handle=self._flow,
+        indices=indices,
+        element_dtype=self._dtype,
+        name=name)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims)
+    return value
+
+  def concat(self, name=None):
+    """See TensorArray."""
+    raise NotImplementedError("TensorArray.concat")
+
+  @tf_should_use.should_use_result
+  def unstack(self, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayUnstack", [self._flow, value]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape and not context.executing_eagerly():
+        self._merge_element_shape(value.shape[1:])
+      flow_out = list_ops.tensor_list_from_tensor(
+          tensor=value, element_shape=value.shape[1:])
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  @tf_should_use.should_use_result
+  def scatter(self, indices, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayScatter",
+                        [self._flow, value, indices]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape and not context.executing_eagerly():
+        self._merge_element_shape(value.shape[1:])
+      flow_out = list_ops.tensor_list_scatter(
+          tensor=value, indices=indices, element_shape=-1)
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  @tf_should_use.should_use_result
+  def split(self, value, lengths, name=None):
+    """See TensorArray."""
+    raise NotImplementedError("TensorArray.split")
+
+  def size(self, name=None):
+    """See TensorArray."""
+    return list_ops.tensor_list_length(input_handle=self._flow, name=name)
+
+  @tf_should_use.should_use_result
+  def close(self, name=None):
+    """See TensorArray."""
+    return gen_control_flow_ops.no_op(name=name)
+
 # pylint: enable=protected-access
 
 
@@ -738,8 +986,10 @@ class TensorArray(object):
     if context.executing_eagerly():
       implementation = _EagerTensorArray
     else:
-      implementation = _GraphTensorArray
-
+      if ENABLE_TENSOR_ARRAY_V2:
+        implementation = _GraphTensorArrayV2
+      else:
+        implementation = _GraphTensorArray
     self._implementation = implementation(
         dtype,
         size=size,
@@ -768,7 +1018,7 @@ class TensorArray(object):
   @property
   def handle(self):
     """The reference to the TensorArray."""
-    return self._implementation._handle
+    return self._implementation.handle
 
   @property
   def _infer_shape(self):
@@ -953,4 +1203,16 @@ class TensorArray(object):
     """Close the current TensorArray."""
     return self._implementation.close(name=name)
 
+
+def build_ta_with_new_flow(old_ta, flow):
+  ta = TensorArray(
+      dtype=old_ta.dtype,
+      handle=old_ta.handle,
+      flow=flow,
+      infer_shape=old_ta._infer_shape,
+      colocate_with_first_write_call=old_ta._colocate_with_first_write_call)
+  ta._colocate_with = old_ta._colocate_with
+  ta._element_shape = old_ta._element_shape
+  return ta
+
 # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/tensor_forest_ops.py b/tensorflow/python/ops/tensor_forest_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..842f0c648b12551624fc6306a6fa869392dd4465
--- /dev/null
+++ b/tensorflow/python/ops/tensor_forest_ops.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for tensor_forest."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import ops
+from tensorflow.python.ops import gen_tensor_forest_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.training import saver
+
+
+class TreeVariableSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Resource that holds a tree."""
+
+  def __init__(self, type_name, name, container, config, resource_handle_func,
+               create_op_func, is_initialized_op_func, serialize_op_func,
+               deserialize_op_func):
+
+    with ops.name_scope(name, type_name) as name:
+      self._resource_handle = resource_handle_func(
+          container, shared_name=name, name=name)
+
+    self._is_initialized_op = is_initialized_op_func(self._resource_handle)
+    tensor = serialize_op_func(self._resource_handle)
+    self._create_op = create_op_func(self._resource_handle, config)
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree variable. So we just pass an empty
+    # value.
+    slice_spec = ''
+    specs = [saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name)]
+    super(TreeVariableSaveable, self).__init__(self._resource_handle, specs,
+                                               name)
+
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+    resources.register_resource(self._resource_handle, self._create_op,
+                                self._is_initialized_op)
+    self._deserialize_op_func = deserialize_op_func
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return self._deserialize_op_func(
+          self._resource_handle,
+          restored_tensors[0],
+      )
+
+  @property
+  def resource(self):
+    return self._resource_handle
+
+
+def tree_variable(tree_config, name, container=None):
+  return TreeVariableSaveable(
+      'TreeVariable', name, container, tree_config,
+      gen_tensor_forest_ops.tensor_forest_tree_resource_handle_op,
+      gen_tensor_forest_ops.tensor_forest_create_tree_variable,
+      gen_tensor_forest_ops.tensor_forest_tree_is_initialized_op,
+      gen_tensor_forest_ops.tensor_forest_tree_serialize,
+      gen_tensor_forest_ops.tensor_forest_tree_deserialize).resource
+
+
+class ForestVariables(object):
+  """Resource that holds all trees from a forest."""
+
+  def __init__(self, params, tree_configs=None):
+
+    self._variables = []
+
+    for i in range(params.n_trees):
+      tree_config = ''
+      if tree_configs is not None:
+        tree_config = tree_configs[i]
+      self._variables.append(tree_variable(
+          tree_config,
+          'tree-%s' % i,
+      ))
+
+  def __getitem__(self, t):
+    return self._variables[t]
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 41a8f57642ea6c6a499e67f4e7802d0dce7a21b3..077bb647efbfeeef46a55e25ab9256d2c1ffad7f 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -437,37 +437,43 @@ class _VariableStore(object):
           raise ValueError(
               "Partitioner must be callable, but received: %s" % partitioner)
         with ops.name_scope(None):
-          return self._get_partitioned_variable(name=name,
-                                                shape=shape,
-                                                dtype=dtype,
-                                                initializer=initializer,
-                                                regularizer=regularizer,
-                                                reuse=reuse,
-                                                trainable=trainable,
-                                                collections=collections,
-                                                caching_device=caching_device,
-                                                partitioner=partitioner,
-                                                validate_shape=validate_shape,
-                                                use_resource=use_resource,
-                                                constraint=constraint)
+          return self._get_partitioned_variable(
+              name=name,
+              shape=shape,
+              dtype=dtype,
+              initializer=initializer,
+              regularizer=regularizer,
+              reuse=reuse,
+              trainable=trainable,
+              collections=collections,
+              caching_device=caching_device,
+              partitioner=partitioner,
+              validate_shape=validate_shape,
+              use_resource=use_resource,
+              constraint=constraint,
+              synchronization=synchronization,
+              aggregation=aggregation)
 
       # Special case for partitioned variable to allow reuse without having to
       # specify partitioner.
       if (reuse is True and partitioner is None
           and name in self._partitioned_vars):
-        return self._get_partitioned_variable(name=name,
-                                              shape=shape,
-                                              dtype=dtype,
-                                              initializer=initializer,
-                                              regularizer=regularizer,
-                                              reuse=reuse,
-                                              trainable=trainable,
-                                              collections=collections,
-                                              caching_device=caching_device,
-                                              partitioner=None,
-                                              validate_shape=validate_shape,
-                                              use_resource=use_resource,
-                                              constraint=constraint)
+        return self._get_partitioned_variable(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            initializer=initializer,
+            regularizer=regularizer,
+            reuse=reuse,
+            trainable=trainable,
+            collections=collections,
+            caching_device=caching_device,
+            partitioner=None,
+            validate_shape=validate_shape,
+            use_resource=use_resource,
+            constraint=constraint,
+            synchronization=synchronization,
+            aggregation=aggregation)
 
       # Single variable case
       if "%s/part_0" % name in self._vars:
@@ -553,7 +559,9 @@ class _VariableStore(object):
                                 caching_device=None,
                                 validate_shape=True,
                                 use_resource=None,
-                                constraint=None):
+                                constraint=None,
+                                synchronization=VariableSynchronization.AUTO,
+                                aggregation=VariableAggregation.NONE):
     """Gets or creates a sharded variable list with these parameters.
 
     The `partitioner` must be a callable that accepts a fully defined
@@ -619,6 +627,15 @@ class _VariableStore(object):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      synchronization: Indicates when a distributed a variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
 
     Returns:
       A `PartitionedVariable` object.
@@ -629,10 +646,6 @@ class _VariableStore(object):
         when violating reuse during variable creation, or if an existing
         sharded variable exists for the given name but with different sharding.
     """
-    if context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
-
     initializing_from_value = initializer is not None and isinstance(
         initializer, ops.Tensor)
     reuse_without_partition = reuse and not partitioner
@@ -776,7 +789,9 @@ class _VariableStore(object):
             caching_device=caching_device,
             validate_shape=validate_shape,
             use_resource=use_resource,
-            constraint=constraint)
+            constraint=constraint,
+            synchronization=synchronization,
+            aggregation=aggregation)
 
       # pylint: disable=protected-access
       var._set_save_slice_info(variables.Variable.SaveSliceInfo(
@@ -1061,9 +1076,6 @@ class VariableScope(object):
       if self._caching_device is not None:
         raise NotImplementedError("Caching devices is not yet supported "
                                   "when eager execution is enabled.")
-      if self._partitioner is not None:
-        raise NotImplementedError("Partitioned variables are not yet supported "
-                                  "when eager execution is enabled.")
       self._reuse = AUTO_REUSE
       self._use_resource = True
 
@@ -1143,9 +1155,6 @@ class VariableScope(object):
 
   def set_partitioner(self, partitioner):
     """Set partitioner for this scope."""
-    if partitioner and context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
     self._partitioner = partitioner
 
   def set_custom_getter(self, custom_getter):
@@ -1254,11 +1263,10 @@ class VariableScope(object):
                                 partitioner=None,
                                 validate_shape=True,
                                 use_resource=None,
-                                constraint=None):
+                                constraint=None,
+                                synchronization=VariableSynchronization.AUTO,
+                                aggregation=VariableAggregation.NONE):
     """Gets an existing variable with this name or create a new one."""
-    if context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
     if initializer is None:
       initializer = self._initializer
     if regularizer is None:
@@ -1300,11 +1308,21 @@ class VariableScope(object):
     with ops.name_scope(None):
       # pylint: disable=protected-access
       return var_store._get_partitioned_variable(
-          full_name, shape=shape, dtype=dtype, initializer=initializer,
-          regularizer=regularizer, reuse=self.reuse, trainable=trainable,
-          collections=collections, caching_device=caching_device,
-          partitioner=partitioner, validate_shape=validate_shape,
-          use_resource=use_resource, constraint=constraint)
+          full_name,
+          shape=shape,
+          dtype=dtype,
+          initializer=initializer,
+          regularizer=regularizer,
+          reuse=self.reuse,
+          trainable=trainable,
+          collections=collections,
+          caching_device=caching_device,
+          partitioner=partitioner,
+          validate_shape=validate_shape,
+          use_resource=use_resource,
+          constraint=constraint,
+          synchronization=synchronization,
+          aggregation=aggregation)
       # pylint: enable=protected-access
 
 
@@ -1661,7 +1679,9 @@ def _get_partitioned_variable(name,
                               partitioner=None,
                               validate_shape=True,
                               use_resource=None,
-                              constraint=None):
+                              constraint=None,
+                              synchronization=VariableSynchronization.AUTO,
+                              aggregation=VariableAggregation.NONE):
   """Gets or creates a sharded variable list with these parameters.
 
   The `partitioner` must be a callable that accepts a fully defined
@@ -1719,6 +1739,15 @@ def _get_partitioned_variable(name,
       variable and return the Tensor for the projected value
       (which must have the same shape). Constraints are not safe to
       use when doing asynchronous distributed training.
+    synchronization: Indicates when a distributed a variable will be
+      aggregated. Accepted values are constants defined in the class
+      `tf.VariableSynchronization`. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      `tf.VariableAggregation`.
 
   Returns:
     A tuple `(shards, partitions)` where `shards` is the list of `Variable`
@@ -1740,11 +1769,21 @@ def _get_partitioned_variable(name,
         "If so, consider instead using get_variable with a non-empty "
         "partitioner parameter instead." % scope.custom_getter)
   return scope._get_partitioned_variable(
-      _get_default_variable_store(), name, shape=shape, dtype=dtype,
-      initializer=initializer, regularizer=regularizer, trainable=trainable,
-      collections=collections, caching_device=caching_device,
-      partitioner=partitioner, validate_shape=validate_shape,
-      use_resource=use_resource, constraint=constraint)
+      _get_default_variable_store(),
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      validate_shape=validate_shape,
+      use_resource=use_resource,
+      constraint=constraint,
+      synchronization=synchronization,
+      aggregation=aggregation)
   # pylint: enable=protected-access
 
 
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index e43736069e38a69e26a1dae3c393ceca0eb94f71..5bee522481989a78af44e13c90bb965f28ebe799 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -2457,34 +2457,6 @@ class PartitionedVariable(object):
   @end_compatibility
   """
 
-  class PartitionedVariableIterator(object):
-    """An iterator that allows accessing the underlying `Variable` objects.
-
-    This iterator is necessary to control order of access when Variables
-    are not partitioned in a standard way along a single axis.
-
-    Allows e.g. `list(partitioned_variable)` to return a proper list.
-    """
-
-    def __init__(self, partitioned_variable):
-      self._ix = 0
-      self._partitioned_variable = partitioned_variable
-
-    def __iter__(self):
-      return self
-
-    def __next__(self):  # For python3 compatibility.
-      return self.next()
-
-    def next(self):
-      # pylint: disable=protected-access
-      if self._ix >= len(self._partitioned_variable._variable_list):
-        raise StopIteration()
-      variable = self._partitioned_variable._variable_list[self._ix]
-      # pylint: enable=protected-access
-      self._ix += 1
-      return variable
-
   def __init__(self, name, shape, dtype, variable_list, partitions):
     """Creates a new partitioned variable wrapper.
 
@@ -2504,11 +2476,7 @@ class PartitionedVariable(object):
         `partitions` is not a list.
       ValueError: If `variable_list` is empty, or the `Variable` shape
         information does not match `shape`, or `partitions` has invalid values.
-      RuntimeError: If eager execution is enabled
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "tf.PartitionedVariable not supported with eager execution enabled.")
     if not isinstance(variable_list, (list, tuple)):
       raise TypeError(
           "variable_list is not a list or tuple: %s" % variable_list)
@@ -2545,7 +2513,7 @@ class PartitionedVariable(object):
 
   def __iter__(self):
     """Return an iterable for accessing the underlying partition Variables."""
-    return self.PartitionedVariableIterator(self)
+    return iter(self._variable_list)
 
   def __len__(self):
     num_partition_axes = len(self._partition_axes())
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 254fae11f4b08e6555eb59991226c32857d13a95..5ab7bffedc72ed7568559cc0cbf1a03c9e2205dd 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -23,7 +23,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph as func_graph_module
@@ -31,6 +30,7 @@ from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -66,9 +66,7 @@ def while_loop(cond,
                maximum_iterations=None,
                name=None):
   """Like tf.while_loop, except emits a single While op."""
-  if _is_in_xla_context() and maximum_iterations is None:
-    raise ValueError("maximum_iterations is required in XLA context.")
-
+  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
   # Keep the original loop_vars around to know which args were TensorArrays.
   orig_loop_vars = loop_vars
   # Cache its length since we use it at multiple places below.
@@ -85,13 +83,6 @@ def while_loop(cond,
   else:
     shape_invariants = nest.map_structure(lambda t: t.shape, loop_vars)
 
-  if maximum_iterations is not None:
-    maximum_iterations = ops.convert_to_tensor(
-        maximum_iterations, name="maximum_iterations")
-    if maximum_iterations.shape.ndims != 0:
-      raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
-                       maximum_iterations.shape)
-
   if not name:
     name = "while"
 
@@ -108,7 +99,8 @@ def while_loop(cond,
     # Add loop counter needed for computing gradients.
     loop_vars = [loop_counter] + loop_vars
 
-    shape_invariants = [tensor_shape.scalar()] + shape_invariants
+    shape_invariants = type(shape_invariants)([tensor_shape.scalar()
+                                              ]) + shape_invariants
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
@@ -141,9 +133,8 @@ def while_loop(cond,
     # the value of that tensor in each iteration is the same as it was at the
     # beginning of the loop execution.
     loop_vars = loop_vars + cond_graph.external_captures
-    shape_invariants = shape_invariants + [
-        t.shape for t in cond_graph.external_captures
-    ]
+    shape_invariants = shape_invariants + type(shape_invariants)(
+        [t.shape for t in cond_graph.external_captures])
 
     def wrapped_body(loop_counter, *args):
       """Loop body augmented with counter update.
@@ -214,11 +205,10 @@ def while_loop(cond,
     intermediate_tensors = _get_intermediates(body_graph)
 
     for intermediate_tensor in intermediate_tensors:
-      # TODO(srbs): Cache and re-use empty tensor lists.
       tensor_list = list_ops.empty_tensor_list(
           element_dtype=intermediate_tensor.dtype,
-          element_shape=_get_tensor_convertible_shape(
-              intermediate_tensor.shape))
+          element_shape=intermediate_tensor.shape,
+          max_num_elements=maximum_iterations)
       loop_vars.append(tensor_list)
       with cond_graph.as_default():
         # Add a placeholder to cond_graph's inputs corresponding to the
@@ -253,7 +243,8 @@ def while_loop(cond,
         name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
-    _maybe_set_lowering_attr(outputs[0].op)
+    util.maybe_set_lowering_attr(outputs[0].op)
+    _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
 
     # Return identities for each output of the While op, rather than the output
     # of the While op directly. This makes pruning work if the output of
@@ -264,11 +255,14 @@ def while_loop(cond,
     outputs = tuple(array_ops.identity(t) for t in outputs)
 
   # First var is loop counter.
-  if num_flattened_outputs == 1:
-    return outputs[1]
+  outputs = _pack_sequence_as(orig_loop_vars,
+                              outputs[1:1 + num_flattened_outputs])
+
+  flattened_outputs = nest.flatten(outputs)
+  if len(flattened_outputs) == 1:
+    return flattened_outputs[0]
   else:
-    return _pack_sequence_as(orig_loop_vars,
-                             outputs[1:1 + num_flattened_outputs])
+    return outputs
 
 
 @ops.RegisterGradient("While")
@@ -314,10 +308,15 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
 
   intermediate_tensors = _get_intermediates(body_grad_graph)
 
+  maximum_iterations = op.get_attr(
+      "_maximum_iterations") if _is_in_xla_context() else None
+  assert not _is_in_xla_context() or maximum_iterations is not None
   for intermediate_tensor in intermediate_tensors:
     tensor_list = list_ops.empty_tensor_list(
         element_dtype=intermediate_tensor.dtype,
-        element_shape=_get_tensor_convertible_shape(intermediate_tensor.shape))
+        element_shape=intermediate_tensor.shape,
+        max_num_elements=maximum_iterations)
+
     with body_grad_graph.as_default():
       tensor_list_ph = body_grad_graph.capture(tensor_list, whitelisted=True)
       # Push the intermediate tensor to the tensor list.
@@ -345,7 +344,11 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       name="%s_grad" % op.name)
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
-  _maybe_set_lowering_attr(outputs[0].op)
+  util.maybe_set_lowering_attr(outputs[0].op)
+  _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
+
+  # See comment in while_loop.
+  outputs = [array_ops.identity(t) for t in outputs]
 
   # Set None as the output gradient for tensors with None input gradient
   # e.g. TensorArray handles.
@@ -362,6 +365,46 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   return none_padded_outputs
 
 
+def _validate_and_convert_to_tensor(maximum_iterations):
+  """Checks that `maximum_iterations` is valid.
+
+  In XLA context, `maximum_iterations` is required and must be statically
+  inferable, e.g. output tensor of a Const node.
+
+  Args:
+    maximum_iterations: The maximum_iterations passed to while_loop.
+
+  Returns:
+    A scalar valued tensor of type int32 or None.
+
+  Raises:
+    ValueError: If `maximum_iterations` is invalid.
+  """
+  if _is_in_xla_context():
+    if maximum_iterations is None:
+      raise ValueError("maximum_iterations is None. It is required and must "
+                       "be statically known (e.g. a constant value or known "
+                       "shape dimension) when building while_loop in XLA "
+                       "context.")
+    if isinstance(maximum_iterations, ops.Tensor):
+      # Get the constant value from the `maximum_iterations` tensor to avoid
+      # capturing a Const tensor from outside this graph.
+      maximum_iterations = tensor_util.constant_value(maximum_iterations)
+      if maximum_iterations is None:
+        raise ValueError("maximum_iterations must be statically known (e.g. a "
+                         "constant value or known shape dimension) when "
+                         "building while_loop in XLA context.")
+
+  if maximum_iterations is not None:
+    # EmptyTensorList expects `max_num_elements` to be of type int32.
+    maximum_iterations = ops.convert_to_tensor(
+        maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
+    if maximum_iterations.shape.ndims != 0:
+      raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                       maximum_iterations.shape)
+  return maximum_iterations
+
+
 # TODO(srbs): Pull this into common utils for cond_v2 and while_v2.
 def _get_body_graph(while_op):
   """Returns `FuncGraph` for the while body.
@@ -753,49 +796,28 @@ def _copy_handle_data(src_tensors, tgt_tensors):
     custom_gradient.copy_handle_data(src_t, tgt_t)
 
 
-# TODO(srbs): Move to common utils for cond_v2 and while_v2.
-def _maybe_set_lowering_attr(op):
-  """Sets the flag to enable lowering on the `While` op if necessary.
-
-  Lowering allows while_v2 to avoid some of the limitations of Functions,
-  allowing users to specify devices & colocation inside of while_v2
-  branches, and enabling non-strict evaluation & partial pruning of while_v2
-  branches. This brings while_v2 closer to feature parity with
-  tf.while_loop.
-
-  However, we do not lower `While` in the XLA context because it is easier
-  for XLA to apply its own optimizations when dealing with un-lowered
-  `While` operators than with low-level control flow primitives.
-
-  Args:
-    op: The While op.
-  """
-  if not control_flow_util.IsInXLAContext(op):
-    # pylint: disable=protected-access
-    op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
-    # pylint: enable=protected-access
+def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
+  if control_flow_util.IsInXLAContext(op):
+    # Store the maximum_iterations to use in the gradient pass.
+    op._set_attr(  # pylint: disable=protected-access
+        "_maximum_iterations",
+        attr_value_pb2.AttrValue(
+            i=tensor_util.constant_value(maximum_iterations)))
 
 
 # TODO(srbs): This method should be in control_flow_util but that introduces
 # a circular dependency ops -> control_flow_util -> ops.
 def _is_in_xla_context():
   """Returns whether the current context is inside an XLA context."""
-  cur_ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
+  outer_graph = ops.get_default_graph()
+  # The `_control_flow_context` is not copied when building a FuncGraph so
+  # we look it up from the base graph.
+  while isinstance(outer_graph, func_graph_module.FuncGraph):
+    outer_graph = outer_graph.outer_graph
+  cur_ctxt = outer_graph._get_control_flow_context()  # pylint: disable=protected-access
   return control_flow_util.GetContainingXLAContext(cur_ctxt) is not None
 
 
-def _get_tensor_convertible_shape(shape):
-  assert isinstance(shape, tensor_shape.TensorShape)
-  if shape.is_fully_defined():
-    return shape
-  if not shape:  # Unknown shape.
-    return -1
-  # Partially defined shape.
-  shape_list = shape.as_list()
-  shape_list = [s if s is not None else -1 for s in shape_list]
-  return ops.convert_to_tensor(shape_list)
-
-
 def _graph_name(graph):
   if isinstance(graph, func_graph_module.FuncGraph):
     return graph.name
diff --git a/tensorflow/python/platform/__init__.py b/tensorflow/python/platform/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 4c91bc3652dc77274acfbf43859c03fad8a46a38..7b917235c0a73421552b7aebaa3192de969e5f3a 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -108,7 +108,7 @@ def _define_help_flags():
     _define_help_flags_called = True
 
 
-@tf_export('app.run')
+@tf_export(v1=['app.run'])
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index 5927bc2409bb2744c2f6f003b90c0682e5ba5eb9..d0159e9e9816ba730c843d2b46936b142d47ff79 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('gfile.GFile', 'gfile.Open')
+@tf_export(v1=['gfile.GFile', 'gfile.Open'], v2=['io.gfile.GFile'])
 class GFile(_FileIO):
   """File I/O wrappers without thread locking.
 
@@ -52,7 +52,7 @@ class GFile(_FileIO):
     super(GFile, self).__init__(name=name, mode=mode)
 
 
-@tf_export('gfile.FastGFile')
+@tf_export(v1=['gfile.FastGFile'])
 class FastGFile(_FileIO):
   """File I/O wrappers without thread locking.
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index b2d95518552de3a170d1c04bfc3f061dc8f8f54a..8f4c5c190ccaa5a06beaf89430c33ad935c1df9d 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -24,7 +24,7 @@ from tensorflow.python.util import tf_inspect as _inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('resource_loader.load_resource')
+@tf_export(v1=['resource_loader.load_resource'])
 def load_resource(path):
   """Load the resource at given path, where path is relative to tensorflow/.
 
@@ -46,7 +46,7 @@ def load_resource(path):
 
 
 # pylint: disable=protected-access
-@tf_export('resource_loader.get_data_files_path')
+@tf_export(v1=['resource_loader.get_data_files_path'])
 def get_data_files_path():
   """Get a direct path to the data files colocated with the script.
 
@@ -57,7 +57,7 @@ def get_data_files_path():
   return _os.path.dirname(_inspect.getfile(_sys._getframe(1)))
 
 
-@tf_export('resource_loader.get_root_dir_with_all_resources')
+@tf_export(v1=['resource_loader.get_root_dir_with_all_resources'])
 def get_root_dir_with_all_resources():
   """Get a root directory containing all the data attributes in the build rule.
 
@@ -97,7 +97,7 @@ def get_root_dir_with_all_resources():
   return data_files_dir or script_dir
 
 
-@tf_export('resource_loader.get_path_to_datafile')
+@tf_export(v1=['resource_loader.get_path_to_datafile'])
 def get_path_to_datafile(path):
   """Get the path to the specified file in the data dependencies.
 
@@ -117,7 +117,7 @@ def get_path_to_datafile(path):
   return _os.path.join(data_files_path, path)
 
 
-@tf_export('resource_loader.readahead_file_path')
+@tf_export(v1=['resource_loader.readahead_file_path'])
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 5dc4037d62b478648baf2d57838c85aeda6cc738..943832af7a2c58d40cb2143048ddd6517596e406 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -46,9 +46,9 @@ from tensorflow.python.util.tf_export import tf_export
 if sys.version_info.major == 2:
   import mock                # pylint: disable=g-import-not-at-top,unused-import
 else:
-  from unittest import mock  # pylint: disable=g-import-not-at-top
+  from unittest import mock  # pylint: disable=g-import-not-at-top,g-importing-member
 
-tf_export('test.mock')(mock)
+tf_export(v1=['test.mock'])(mock)
 
 # Import Benchmark class
 Benchmark = _googletest.Benchmark  # pylint: disable=invalid-name
@@ -64,7 +64,7 @@ def main(argv=None):
   return _googletest.main(argv)
 
 
-@tf_export('test.get_temp_dir')
+@tf_export(v1=['test.get_temp_dir'])
 def get_temp_dir():
   """Returns a temporary directory for use during tests.
 
@@ -76,7 +76,7 @@ def get_temp_dir():
   return _googletest.GetTempDir()
 
 
-@tf_export('test.test_src_dir_path')
+@tf_export(v1=['test.test_src_dir_path'])
 def test_src_dir_path(relative_path):
   """Creates an absolute test srcdir path given a relative path.
 
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 59e60856ae80db76caa7ecd23db0db597bf60c6f..9f00abb201ae31fadb30e2a1063b741af0e46863 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -130,37 +130,37 @@ def _get_logger():
     _logger_lock.release()
 
 
-@tf_export('logging.log')
+@tf_export(v1=['logging.log'])
 def log(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
 
-@tf_export('logging.debug')
+@tf_export(v1=['logging.debug'])
 def debug(msg, *args, **kwargs):
   _get_logger().debug(msg, *args, **kwargs)
 
 
-@tf_export('logging.error')
+@tf_export(v1=['logging.error'])
 def error(msg, *args, **kwargs):
   _get_logger().error(msg, *args, **kwargs)
 
 
-@tf_export('logging.fatal')
+@tf_export(v1=['logging.fatal'])
 def fatal(msg, *args, **kwargs):
   _get_logger().fatal(msg, *args, **kwargs)
 
 
-@tf_export('logging.info')
+@tf_export(v1=['logging.info'])
 def info(msg, *args, **kwargs):
   _get_logger().info(msg, *args, **kwargs)
 
 
-@tf_export('logging.warn')
+@tf_export(v1=['logging.warn'])
 def warn(msg, *args, **kwargs):
   _get_logger().warn(msg, *args, **kwargs)
 
 
-@tf_export('logging.warning')
+@tf_export(v1=['logging.warning'])
 def warning(msg, *args, **kwargs):
   _get_logger().warning(msg, *args, **kwargs)
 
@@ -183,18 +183,18 @@ _log_prefix = None  # later set to google2_log_prefix
 _log_counter_per_token = {}
 
 
-@tf_export('logging.TaskLevelStatusMessage')
+@tf_export(v1=['logging.TaskLevelStatusMessage'])
 def TaskLevelStatusMessage(msg):
   error(msg)
 
 
-@tf_export('logging.flush')
+@tf_export(v1=['logging.flush'])
 def flush():
   raise NotImplementedError()
 
 
 # Code below is taken from pyglib/logging
-@tf_export('logging.vlog')
+@tf_export(v1=['logging.vlog'])
 def vlog(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
@@ -214,7 +214,7 @@ def _GetNextLogCountPerToken(token):
   return _log_counter_per_token[token]
 
 
-@tf_export('logging.log_every_n')
+@tf_export(v1=['logging.log_every_n'])
 def log_every_n(level, msg, n, *args):
   """Log 'msg % args' at level 'level' once per 'n' times.
 
@@ -231,7 +231,7 @@ def log_every_n(level, msg, n, *args):
   log_if(level, msg, not (count % n), *args)
 
 
-@tf_export('logging.log_first_n')
+@tf_export(v1=['logging.log_first_n'])
 def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   """Log 'msg % args' at level 'level' only first 'n' times.
 
@@ -247,7 +247,7 @@ def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   log_if(level, msg, count < n, *args)
 
 
-@tf_export('logging.log_if')
+@tf_export(v1=['logging.log_if'])
 def log_if(level, msg, condition, *args):
   """Log 'msg % args' at level 'level' only if condition is fulfilled."""
   if condition:
@@ -296,13 +296,13 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
   return s
 
 
-@tf_export('logging.get_verbosity')
+@tf_export(v1=['logging.get_verbosity'])
 def get_verbosity():
   """Return how much logging output will be produced."""
   return _get_logger().getEffectiveLevel()
 
 
-@tf_export('logging.set_verbosity')
+@tf_export(v1=['logging.set_verbosity'])
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
   _get_logger().setLevel(v)
@@ -318,8 +318,8 @@ def _get_thread_id():
 
 _log_prefix = google2_log_prefix
 
-tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
-tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
-tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
-tf_export('logging.INFO').export_constant(__name__, 'INFO')
-tf_export('logging.WARN').export_constant(__name__, 'WARN')
+tf_export(v1=['logging.DEBUG']).export_constant(__name__, 'DEBUG')
+tf_export(v1=['logging.ERROR']).export_constant(__name__, 'ERROR')
+tf_export(v1=['logging.FATAL']).export_constant(__name__, 'FATAL')
+tf_export(v1=['logging.INFO']).export_constant(__name__, 'INFO')
+tf_export(v1=['logging.WARN']).export_constant(__name__, 'WARN')
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index acf02096fffe8b38e68824878fa698ed69d3895c..4b2d9052b7879ceaf4a250ba56f438f3798b669b 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -122,7 +122,7 @@ def _build_advisor_options(options):
   return opts
 
 
-@tf_export('profiler.Profiler')
+@tf_export(v1=['profiler.Profiler'])
 class Profiler(object):
   """TensorFlow multi-step profiler.
 
@@ -306,7 +306,7 @@ class Profiler(object):
     print_mdl.WriteProfile(filename)
 
 
-@tf_export('profiler.profile')
+@tf_export(v1=['profiler.profile'])
 def profile(graph=None,
             run_meta=None,
             op_log=None,
@@ -381,7 +381,7 @@ def profile(graph=None,
   return tfprof_node
 
 
-@tf_export('profiler.advise')
+@tf_export(v1=['profiler.advise'])
 def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   """Auto profile and advise.
 
@@ -398,7 +398,7 @@ def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   Returns:
     Returns AdviceProto proto
   """
-  if not graph and context.in_eager_execution():
+  if not graph and not context.executing_eagerly():
     graph = ops.get_default_graph()
 
   if options == _DEFAULT_ADVISE_OPTIONS:
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 94c685274a764bb099da6c0501b397d73d239f35..8648f0b5148ecc6afcf0afe49ff91fe7c255e700 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -93,10 +93,10 @@ class PrintModelAnalysisTest(test.TestCase):
           config=self._no_rewrite_session_config()) as sess, ops.device(dev):
         x = lib.BuildSmallModel()
 
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         pctx.trace_next_step()
         pctx.dump_next_step()
-        _ = sess.run(x)
+        _ = self.evaluate(x)
 
         pctx.profiler.profile_name_scope(options=opts)
 
@@ -160,7 +160,7 @@ class PrintModelAnalysisTest(test.TestCase):
                         ) as sess, ops.device('/device:CPU:0'):
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -186,7 +186,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -220,9 +220,9 @@ class PrintModelAnalysisTest(test.TestCase):
       with session.Session(config=self._no_rewrite_session_config()) as sess:
         x = lib.BuildFullModel()
 
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         pctx.trace_next_step()
-        _ = sess.run(x)
+        _ = self.evaluate(x)
         tfprof_node = pctx.profiler.profile_python(options=opts)
 
         # pylint: disable=line-too-long
@@ -281,7 +281,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -309,7 +309,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -345,7 +345,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -391,7 +391,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -424,7 +424,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -490,7 +490,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -555,7 +555,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -587,10 +587,10 @@ class PrintModelAnalysisTest(test.TestCase):
   def _trainLoop(self, train_op, train_steps, time_dir, time_step,
                  memory_dir, memory_step, profile_dir, dump_step):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       # start from 1 because variable_initializer took one step.
       for i in range(1, train_steps + 1):
-        _ = sess.run(train_op)
+        _ = self.evaluate(train_op)
         if i in time_step:
           ret = gfile.ListDirectory(time_dir)
           self.assertEqual(len(ret), 1)
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 2ad7adf76933df65ca795dca361397f436adb995..9d8f7683a658e74c649d9ea337e7dbc10f870ef2 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -23,7 +23,7 @@ from tensorflow.python.profiler import tfprof_logger
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('profiler.ProfileOptionBuilder')
+@tf_export(v1=['profiler.ProfileOptionBuilder'])
 class ProfileOptionBuilder(object):
   # pylint: disable=line-too-long
   """Option Builder for Profiling API.
diff --git a/tensorflow/python/profiler/profile_context_test.py b/tensorflow/python/profiler/profile_context_test.py
index 107ad443c32e20ab69f3c2fb71c652d97a9c0cc6..abbeb8bedfde04efb37be29344ddb492fa7c6b32 100644
--- a/tensorflow/python/profiler/profile_context_test.py
+++ b/tensorflow/python/profiler/profile_context_test.py
@@ -48,7 +48,7 @@ class ProfilerContextTest(test.TestCase):
     with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
       pctx.add_auto_profiling("op", options=opts, profile_steps=[15, 50, 100])
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         total_steps = 101
         for i in range(total_steps):
           sess.run(x)
@@ -75,7 +75,7 @@ class ProfilerContextTest(test.TestCase):
 
     with profile_context.ProfileContext(test.get_temp_dir(), debug=True):
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
           sess.run(x)
           for f in gfile.ListDirectory(test.get_temp_dir()):
@@ -96,7 +96,7 @@ class ProfilerContextTest(test.TestCase):
     with profile_context.ProfileContext(test.get_temp_dir(),
                                         enabled=False) as pctx:
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
           sess.run(x)
       self.assertTrue(pctx.profiler is None)
@@ -105,7 +105,7 @@ class ProfilerContextTest(test.TestCase):
 
     with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
           sess.run(x)
       self.assertFalse(pctx.profiler is None)
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index efbdd1ba6842d85e82149346e9b4559527a1aacd..5f62690b54e2ff6e2c655eb5256299cce169f59a 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -49,7 +49,7 @@ _allowed_symbols.extend([
 ])
 
 # Export protos
-tf_export('profiler.GraphNodeProto')(GraphNodeProto)
-tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
-tf_export('profiler.AdviceProto')(AdviceProto)
-tf_export('profiler.OpLogProto')(OpLogProto)
+tf_export(v1=['profiler.GraphNodeProto'])(GraphNodeProto)
+tf_export(v1=['profiler.MultiGraphNodeProto'])(MultiGraphNodeProto)
+tf_export(v1=['profiler.AdviceProto'])(AdviceProto)
+tf_export(v1=['profiler.OpLogProto'])(OpLogProto)
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index e651de32ea3bce32a965bfbeefc76ff08a79ac38..6ccd0e0ff3b5f9f067f49b7a1b64e62af7c7af5d 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -188,7 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None,
   return tmp_op_log
 
 
-@tf_export('profiler.write_op_log')
+@tf_export(v1=['profiler.write_op_log'])
 def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 576ad8ed65cfa6aabf58fe053cc08d5f342f50cc..e7a3b8afd5daf279569d1866a700d8084633dfa9 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -21,9 +21,9 @@ py_library(
     deps = [
         ":builder",
         ":constants",
-        ":export",
         ":loader",
         ":main_op",
+        ":save",
         ":signature_constants",
         ":signature_def_utils",
         ":simple_save",
@@ -265,9 +265,9 @@ py_test(
 )
 
 py_library(
-    name = "export",
+    name = "save",
     srcs = [
-        "export.py",
+        "save.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -285,11 +285,11 @@ py_library(
 )
 
 py_test(
-    name = "export_test",
-    srcs = ["export_test.py"],
+    name = "save_test",
+    srcs = ["save_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":export",
+        ":save",
         ":signature_constants",
         ":tag_constants",
         "//tensorflow/python/eager:def_function",
diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py
index be49c70c60476ae8b95c07007abb32a222466958..b929934eebb14a340d89fbb570a322b2b7144154 100644
--- a/tensorflow/python/saved_model/builder.py
+++ b/tensorflow/python/saved_model/builder.py
@@ -24,5 +24,6 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.saved_model.builder_impl import _SavedModelBuilder
 from tensorflow.python.saved_model.builder_impl import SavedModelBuilder
 # pylint: enable=unused-import
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 4f68f7c5aeac4e8526dd3181a2eb347d52b8f550..ce7641cd98de7bc81f5f9070b065c583b0b746cd 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
 from google.protobuf.any_pb2 import Any
@@ -39,8 +40,7 @@ from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(v1=["saved_model.Builder", "saved_model.builder.SavedModelBuilder"])
-class SavedModelBuilder(object):
+class _SavedModelBuilder(object):
   """Builds the `SavedModel` protocol buffer and saves variables and assets.
 
   The `SavedModelBuilder` class provides functionality to build a `SavedModel`
@@ -68,7 +68,7 @@ class SavedModelBuilder(object):
     builder.add_meta_graph_and_variables(sess,
                                     ["foo-tag"],
                                     signature_def_map=foo_signatures,
-                                    assets_collection=foo_assets)
+                                    assets_list=foo_assets)
   ...
 
   with tf.Session(graph=tf.Graph()) as sess:
@@ -105,19 +105,8 @@ class SavedModelBuilder(object):
     # weights.
     self._has_saved_variables = False
 
-  def _save_and_write_assets(self, assets_collection_to_add=None):
-    """Saves asset to the meta graph and writes asset files to disk.
-
-    Args:
-      assets_collection_to_add: The collection where the asset paths are setup.
-    """
-    asset_filename_map = _maybe_save_assets(assets_collection_to_add)
-
-    # Return if there are no assets to write.
-    if not asset_filename_map:
-      tf_logging.info("No assets to write.")
-      return
-
+  def _copy_assets_to_destination_dir(self, asset_filename_map):
+    """Copy all assets from source path to destination path."""
     assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
         self._export_dir)
 
@@ -136,6 +125,25 @@ class SavedModelBuilder(object):
     tf_logging.info("Assets written to: %s",
                     compat.as_text(assets_destination_dir))
 
+  def _save_and_write_assets(self, meta_graph_def, assets_list=None):
+    """Saves asset to the meta graph and writes asset files to disk.
+
+    Args:
+      meta_graph_def: The meta graph def to which the assets will be added.
+      assets_list: The list where the asset paths are setup.
+    """
+    # Creates a function that adds assets into the meta graph def.
+    write_fn = functools.partial(_add_asset_to_metagraph, meta_graph_def)
+    asset_filename_map = _maybe_save_assets(write_fn, assets_list)
+
+    # Return if there are no assets to write.
+    if not asset_filename_map:
+      tf_logging.info("No assets to write.")
+      return
+
+    # Copy assets from source path to destination path.
+    self._copy_assets_to_destination_dir(asset_filename_map)
+
   def _maybe_add_main_op(self, main_op):
     """Adds main op to the SavedModel.
 
@@ -252,12 +260,8 @@ class SavedModelBuilder(object):
         for outputs_key in outputs:
           self._validate_tensor_info(outputs[outputs_key])
 
-  def _add_collections(
-      self, assets_collection, main_op, train_op):
+  def _add_collections(self, main_op, train_op):
     """Add asset and op collections to be saved."""
-    # Save asset files and write them to disk, if any.
-    self._save_and_write_assets(assets_collection)
-
     self._maybe_add_main_op(main_op)
 
     self._add_train_op(train_op)
@@ -280,7 +284,7 @@ class SavedModelBuilder(object):
   def add_meta_graph(self,
                      tags,
                      signature_def_map=None,
-                     assets_collection=None,
+                     assets_list=None,
                      legacy_init_op=None,
                      clear_devices=False,
                      main_op=None,
@@ -297,8 +301,8 @@ class SavedModelBuilder(object):
       tags: The set of tags to annotate the meta graph def with.
       signature_def_map: The map of signature defs to be added to the meta graph
           def.
-      assets_collection: Assets collection to be saved with SavedModel. Note
-          that this collection should be a subset of the assets saved as part of
+      assets_list: Assets to be saved with SavedModel. Note
+          that this list should be a subset of the assets saved as part of
           the first meta graph in the SavedModel.
       legacy_init_op: Legacy support for op or group of ops to execute after the
           restore op upon a load. Deprecated; please use main_op instead.
@@ -332,8 +336,8 @@ class SavedModelBuilder(object):
     # Re-mapping to main_op, as treatment is identical regardless.
     main_op = main_op or legacy_init_op
 
-    # Add assets and ops
-    self._add_collections(assets_collection, main_op, None)
+    # Add ops to collection.
+    self._add_collections(main_op=main_op, train_op=None)
 
     saver = self._maybe_create_saver(saver)
 
@@ -347,6 +351,9 @@ class SavedModelBuilder(object):
     meta_graph_def = saver.export_meta_graph(
         clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
 
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(meta_graph_def, assets_list)
+
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
@@ -357,7 +364,7 @@ class SavedModelBuilder(object):
                                    sess,
                                    tags,
                                    signature_def_map=None,
-                                   assets_collection=None,
+                                   assets_list=None,
                                    legacy_init_op=None,
                                    clear_devices=False,
                                    main_op=None,
@@ -378,7 +385,7 @@ class SavedModelBuilder(object):
       tags: The set of tags with which to save the meta graph.
       signature_def_map: The map of signature def map to add to the meta graph
         def.
-      assets_collection: Assets collection to be saved with SavedModel.
+      assets_list: Assets to be saved with SavedModel.
       legacy_init_op: Legacy support for op or group of ops to execute after the
           restore op upon a load. Deprecated; please use main_op instead.
       clear_devices: Set to true if the device info on the default graph should
@@ -408,8 +415,8 @@ class SavedModelBuilder(object):
     # Re-mapping to main_op, as treatment is identical regardless.
     main_op = main_op or legacy_init_op
 
-    # Add assets and ops
-    self._add_collections(assets_collection, main_op, None)
+    # Add ops to collection.
+    self._add_collections(main_op=main_op, train_op=None)
 
     saved_model_utils.get_or_create_variables_dir(self._export_dir)
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
@@ -434,6 +441,9 @@ class SavedModelBuilder(object):
     meta_graph_def = saver.export_meta_graph(
         clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
 
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(meta_graph_def, assets_list)
+
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
@@ -471,11 +481,157 @@ class SavedModelBuilder(object):
     return path
 
 
-def _maybe_save_assets(assets_collection_to_add=None):
+@tf_export(v1=["saved_model.Builder", "saved_model.builder.SavedModelBuilder"])  # pylint: disable=missing-docstring
+class SavedModelBuilder(_SavedModelBuilder):
+  __doc__ = _SavedModelBuilder.__doc__.replace("assets_list",
+                                               "assets_collection")
+
+  def __init__(self, export_dir):
+    super(SavedModelBuilder, self).__init__(export_dir=export_dir)
+
+  def _add_collections(self, assets_collection, main_op, train_op):
+    """Add asset and op collections to be saved."""
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(assets_collection)
+
+    self._maybe_add_main_op(main_op)
+
+    self._add_train_op(train_op)
+
+  def _save_and_write_assets(self, assets_collection_to_add=None):
+    """Saves asset to the meta graph and writes asset files to disk.
+
+    Args:
+      assets_collection_to_add: The collection where the asset paths are setup.
+    """
+    # Add assets to the collection with key `constants.ASSETS_KEY`, in the
+    # graph.
+    asset_filename_map = _maybe_save_assets(_add_asset_to_collection,
+                                            assets_collection_to_add)
+
+    # Return if there are no assets to write.
+    if not asset_filename_map:
+      tf_logging.info("No assets to write.")
+      return
+
+    # Copy assets from source path to destination path.
+    self._copy_assets_to_destination_dir(asset_filename_map)
+
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
+  def add_meta_graph(self,
+                     tags,
+                     signature_def_map=None,
+                     assets_collection=None,
+                     legacy_init_op=None,
+                     clear_devices=False,
+                     main_op=None,
+                     strip_default_attrs=False,
+                     saver=None):
+    if not self._has_saved_variables:
+      raise AssertionError(
+          "Graph state including variables and assets has not been saved yet. "
+          "Please invoke `add_meta_graph_and_variables()` first.")
+
+    # Validate the signature def map to ensure all included TensorInfos are
+    # properly populated.
+    self._validate_signature_def_map(signature_def_map)
+
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
+    # Add assets and ops
+    self._add_collections(assets_collection, main_op, None)
+
+    saver = self._maybe_create_saver(saver)
+
+    # The graph almost certainly previously contained at least one Saver, and
+    # possibly several (e.g. one for loading a pretrained embedding, and another
+    # for the model weights).  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+
+    # Tag the meta graph def and add it to the SavedModel.
+    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
+  def add_meta_graph_and_variables(self,
+                                   sess,
+                                   tags,
+                                   signature_def_map=None,
+                                   assets_collection=None,
+                                   legacy_init_op=None,
+                                   clear_devices=False,
+                                   main_op=None,
+                                   strip_default_attrs=False,
+                                   saver=None):
+    if self._has_saved_variables:
+      raise AssertionError("Graph state including variables and assets has "
+                           "already been saved. Please invoke "
+                           "`add_meta_graph()` instead.")
+
+    # Validate the signature def map to ensure all included TensorInfos are
+    # properly populated.
+    self._validate_signature_def_map(signature_def_map)
+
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
+    # Add assets and ops
+    self._add_collections(assets_collection, main_op, None)
+
+    saved_model_utils.get_or_create_variables_dir(self._export_dir)
+    variables_path = saved_model_utils.get_variables_path(self._export_dir)
+
+    saver = self._maybe_create_saver(saver)
+
+    # Save the variables. Also, disable writing the checkpoint state proto. The
+    # file is not used during SavedModel loading. In addition, since a
+    # SavedModel can be copied or moved, this avoids the checkpoint state to
+    # become outdated.
+    saver.save(sess, variables_path, write_meta_graph=False, write_state=False)
+
+    # Export the meta graph def.
+
+    # The graph almost certainly previously contained at least one Saver, and
+    # possibly several (e.g. one for loading a pretrained embedding, and another
+    # for the model weights).  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+
+    # Tag the meta graph def and add it to the SavedModel.
+    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+
+    # Mark this instance of SavedModel as having saved variables, such that
+    # subsequent attempts to save variables will fail.
+    self._has_saved_variables = True
+
+  add_meta_graph.__doc__ = _SavedModelBuilder.add_meta_graph.__doc__.replace(
+      "assets_list", "assets_collection")
+  add_meta_graph_and_variables.__doc__ = \
+      _SavedModelBuilder.add_meta_graph_and_variables.__doc__.replace(
+          "assets_list", "assets_collection")
+
+
+def _maybe_save_assets(write_fn, assets_to_add=None):
   """Saves assets to the meta graph.
 
   Args:
-    assets_collection_to_add: The collection where the asset paths are setup.
+    write_fn: A function callback that writes asset into meta graph.
+    assets_to_add: The list where the asset paths are setup.
 
   Returns:
     A dict of asset basenames for saving to the original full path to the asset.
@@ -486,14 +642,13 @@ def _maybe_save_assets(assets_collection_to_add=None):
   # Map of target file names to original filenames
   asset_filename_map = {}
 
-  if assets_collection_to_add is None:
+  if assets_to_add is None:
     tf_logging.info("No assets to save.")
     return asset_filename_map
 
-  # Iterate over the supplied asset collection, build the `AssetFile` proto
-  # and add them to the collection with key `constants.ASSETS_KEY`, in the
-  # graph.
-  for asset_tensor in assets_collection_to_add:
+  # Iterate over the supplied assets, build the `AssetFile` proto and add them
+  # to the meta graph.
+  for asset_tensor in assets_to_add:
     asset_source_filepath = _asset_path_from_tensor(asset_tensor)
     if not asset_source_filepath:
       raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
@@ -501,10 +656,11 @@ def _maybe_save_assets(assets_collection_to_add=None):
     asset_filename = _get_asset_filename_to_add(
         asset_source_filepath, asset_filename_map)
 
-    # Build `AssetFile` proto and add it to the asset collection in the graph.
+    # Call the passed-in function that builds AssetFileDef proto and adds it
+    # to either the collection or asset_file_def field of the meta graph.
     # Note that this should be done even when the file is a duplicate of an
     # already-added file, as the tensor reference should still exist.
-    _add_asset_to_collection(asset_filename, asset_tensor)
+    write_fn(asset_filename, asset_tensor)
 
     # In the cases where we are adding a duplicate, this will result in the
     # last of the filepaths being the one used for copying the file to the
@@ -542,7 +698,7 @@ def _get_asset_filename_to_add(asset_filepath, asset_filename_map):
 
   other_asset_filepath = asset_filename_map[asset_filename]
   if other_asset_filepath == asset_filepath:
-    # This is the same file, stored twice in the collection list. No need
+    # This is the same file, stored twice in the list. No need
     # to make unique.
     return asset_filename
 
@@ -589,6 +745,20 @@ def _asset_path_from_tensor(path_tensor):
   return str_values[0]
 
 
+def _add_asset_to_metagraph(meta_graph_def, asset_filename, asset_tensor):
+  """Builds an asset proto and adds it to the meta graph def.
+
+  Args:
+    meta_graph_def: The meta graph def to which the asset will be added.
+    asset_filename: The filename of the asset to be added.
+    asset_tensor: The asset tensor used to populate the tensor info of the asset
+      proto.
+  """
+  asset_proto = meta_graph_def.asset_file_def.add()
+  asset_proto.filename = asset_filename
+  asset_proto.tensor_info.name = asset_tensor.name
+
+
 def _add_asset_to_collection(asset_filename, asset_tensor):
   """Builds an asset proto and adds it to the asset collection of the graph.
 
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 0addbdc9686316bb27f6eaf65006daf4e88a4706..f696d4815bab96e5887fe9c3608d0b8d6ab9ffe5 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -40,7 +40,6 @@ tf_export(
 # CollectionDef key for the legacy init op.
 LEGACY_INIT_OP_KEY = "legacy_init_op"
 tf_export(
-    "saved_model.LEGACY_INIT_OP_KEY",
     v1=[
         "saved_model.LEGACY_INIT_OP_KEY",
         "saved_model.constants.LEGACY_INIT_OP_KEY"
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 8c8eaf038a1b908e48ee7ad23a48d064f06102ca..f50a07fee4e71ad17fc8addf562cb0d77aa40673 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -99,22 +99,29 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   collection_def = meta_graph_def_to_load.collection_def
 
   asset_tensor_dict = {}
-  if constants.ASSETS_KEY in collection_def:
-    # Location of the assets for SavedModel.
-    assets_directory = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes(constants.ASSETS_DIRECTORY))
+  asset_protos = []
+
+  if meta_graph_def_to_load.asset_file_def:
+    asset_protos = meta_graph_def_to_load.asset_file_def
+  elif constants.ASSETS_KEY in collection_def:
     assets_any_proto = collection_def[constants.ASSETS_KEY].any_list.value
-    # Process each asset and add it to the asset tensor dictionary.
     for asset_any_proto in assets_any_proto:
       asset_proto = meta_graph_pb2.AssetFileDef()
       asset_any_proto.Unpack(asset_proto)
-      tensor_name = asset_proto.tensor_info.name
-      if import_scope:
-        tensor_name = "%s/%s" % (import_scope, tensor_name)
-      asset_tensor_dict[tensor_name] = os.path.join(
-          compat.as_bytes(assets_directory),
-          compat.as_bytes(asset_proto.filename))
+      asset_protos.append(asset_proto)
+
+  # Location of the assets for SavedModel.
+  assets_directory = os.path.join(
+      compat.as_bytes(export_dir), compat.as_bytes(constants.ASSETS_DIRECTORY))
+  # Process each asset and add it to the asset tensor dictionary.
+  for asset_proto in asset_protos:
+    tensor_name = asset_proto.tensor_info.name
+    if import_scope:
+      tensor_name = "%s/%s" % (import_scope, tensor_name)
+    asset_tensor_dict[tensor_name] = os.path.join(
+        compat.as_bytes(assets_directory),
+        compat.as_bytes(asset_proto.filename))
+
   return asset_tensor_dict
 
 
@@ -145,12 +152,11 @@ def _get_main_op_tensor(
   return main_op_tensor
 
 
-@tf_export(
+@tf_export(v1=[
+    "saved_model.contains_saved_model",
     "saved_model.maybe_saved_model_directory",
-    v1=[
-        "saved_model.maybe_saved_model_directory",
-        "saved_model.loader.maybe_saved_model_directory"
-    ])
+    "saved_model.loader.maybe_saved_model_directory"
+])
 @deprecation.deprecated_endpoints(
     "saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
@@ -173,6 +179,25 @@ def maybe_saved_model_directory(export_dir):
   return file_io.file_exists(txt_path) or file_io.file_exists(pb_path)
 
 
+@tf_export("saved_model.contains_saved_model", v1=[])
+def contains_saved_model(export_dir):
+  """Checks whether the provided export directory could contain a SavedModel.
+
+  Note that the method does not load any data by itself. If the method returns
+  `false`, the export directory definitely does not contain a SavedModel. If the
+  method returns `true`, the export directory may contain a SavedModel but
+  provides no guarantee that it can be loaded.
+
+  Args:
+    export_dir: Absolute string path to possible export location. For example,
+                '/my/foo/model'.
+
+  Returns:
+    True if the export directory contains SavedModel files, False otherwise.
+  """
+  return maybe_saved_model_directory(export_dir)
+
+
 @tf_export(v1=["saved_model.load", "saved_model.loader.load"])
 @deprecation.deprecated(
     None,
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index 924b2e7c0655130df9c0f7c5fe7742fc5ebaddc6..0b97a734415bd066c859952a0cd020a0f501c544 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -50,7 +50,7 @@ class SavedModelLoaderTest(test.TestCase):
       x = variables.VariableV1(5, name="x")
       y = variables.VariableV1(11, name="y")
       z = x + y
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       foo_sig_def = signature_def_utils.build_signature_def(
           {"foo_input": utils.build_tensor_info(x)},
@@ -138,14 +138,14 @@ class SavedModelLoaderTest(test.TestCase):
       y = variables.VariableV1(0, name="y")
       z = x * y
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       # There are variables to restore, so a saver must be created.
       with self.assertRaises(ValueError):
         loader.restore_variables(sess, None)
 
       loader.restore_variables(sess, tf_saver.Saver())
-      self.assertEqual(55, z.eval())
+      self.assertEqual(55, self.evaluate(z))
 
   def test_run_init_op(self):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
diff --git a/tensorflow/python/saved_model/export.py b/tensorflow/python/saved_model/save.py
similarity index 75%
rename from tensorflow/python/saved_model/export.py
rename to tensorflow/python/saved_model/save.py
index 2c64ddfc070f226ca7d57c18f9c8500bba4e8a37..02c8dc7c13b9401b6dc782584d045b6db4ecd2ec 100644
--- a/tensorflow/python/saved_model/export.py
+++ b/tensorflow/python/saved_model/save.py
@@ -25,6 +25,7 @@ from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
+from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
@@ -32,18 +33,47 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils_impl
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import util
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _find_function_to_export(root):
+  """Iterate over `root`'s attributes, finding traced functions."""
+  functions = []
+  function_attribute_names = []
+  for attribute_name in dir(root):
+    attribute_value = getattr(root, attribute_name, None)
+    if isinstance(attribute_value, def_function.PolymorphicFunction):
+      functions.append(attribute_value)
+      function_attribute_names.append(attribute_name)
+  # TODO(allenl): Automatically infer signatures for Keras functional models?
+  if not functions:
+    raise ValueError(
+        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
+         "argument specified, and with no @tf.function-decorated methods "
+         "attached to it. In the future this will be a supported use-case for "
+         "Python re-import, but at the moment saving a SavedModel without "
+         "signatures does not make sense, as the only consumers will expect "
+         "signatures. Either decorate a method or specify a signature function "
+         "explicitly."))
+  elif len(functions) > 1:
+    raise ValueError(
+        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
+         "argument specified, and with more than one @tf.function-decorated "
+         "method attached to it: {}. The signature keys for these functions "
+         "are ambiguous. Specify signature functions explicitly.").format(
+             function_attribute_names))
+  return functions[0]
 
 
 def _canonicalize_signatures(signatures):
   """Converts `signatures` into a dictionary of concrete functions."""
-  if signatures is None:
-    signatures = {}
-  elif not isinstance(signatures, collections.Mapping):
+  if not isinstance(signatures, collections.Mapping):
     signatures = {
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
   concrete_signatures = {}
@@ -343,38 +373,66 @@ def _make_graph_def(root, signature_functions, object_saver):
   return graph_def, signatures, saver_def
 
 
-def export(obj, export_dir, signatures=None):
+@tf_export("saved_model.save", v1=["saved_model.experimental.save"])
+def save(obj, export_dir, signatures=None):
   # pylint: disable=line-too-long
   """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
 
-  The `signatures` argument indicates TensorFlow functions which will be
+  Example usage:
+
+  ```python
+  class Adder(tf.train.Checkpoint):
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+    def add(self, x):
+      return x + x + 1.
+
+  to_export = Adder()
+  tf.saved_model.save(to_export, '/tmp/adder')
+  ```
+
+  The resulting SavedModel is then servable with an input named "x", its value
+  having any shape and dtype float32.
+
+  The optional `signatures` argument controls which methods in `obj` will be
   available to programs which consume `SavedModel`s, for example serving
   APIs. Python functions may be decorated with
   `@tf.function(input_signature=...)` and passed as signatures directly, or
-  created without a signature using `@tf.function` and then converted to a
-  concrete TensorFlow function using `f.get_concrete_function(...)`.
-
-  In either case, `Tensor` inputs to `signatures` functions which are not
-  associated with a unique Python argument name must have names explicitly
-  specified in their `tf.TensorSpec` objects. Cases where this is necessary
-  include positional arguments passed through variadic `*args` and multiple
-  `Tensor` inputs which are part of the same nested structure.
+  lazily with a call to `get_concrete_function` on the method decorated with
+  `@tf.function`.
+
+  If the `signatures` argument is omitted, `obj` will be searched for
+  `@tf.function`-decorated methods. If exactly one `@tf.function` is found, that
+  method will be used as the default signature for the SavedModel. This behavior
+  is expected to change in the future, when a corresponding
+  `tf.saved_model.load` symbol is added. At that point signatures will be
+  completely optional, and any `@tf.function` attached to `obj` or its
+  dependencies will be exported for use with `load`.
+
+  When invoking a signature in an exported SavedModel, `Tensor` arguments are
+  identified by name. These names will come from the Python function's argument
+  names by default. They may be overridden by specifying a `name=...` argument
+  in the corresponding `tf.TensorSpec` object. Explicit naming is required if
+  multiple `Tensor`s are passed through a single argument to the Python
+  function.
 
   The outputs of functions used as `signatures` must either be flat lists, in
   which case outputs will be numbered, or a dictionary mapping string keys to
-  Tensors, in which case the string keys will be used to name outputs.
+  `Tensor`, in which case the keys will be used to name outputs.
 
-  Exporting with a signature specified:
+  Since `tf.keras.Model` objects are also Checkpointable, this function can be
+  used to export Keras models. For example, exporting with a signature
+  specified:
 
   ```python
   class Model(tf.keras.Model):
 
-    @tf.function(input_signature=tf.TensorSpec(shape=[None], dtype=tf.string))
-    def serve(serialized):
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
+    def serve(self, serialized):
       ...
 
   m = Model()
-  tf.saved_model.export(m, '/tmp/saved_model/', signatures=m.serve)
+  tf.saved_model.save(m, '/tmp/saved_model/')
   ```
 
   Exporting from a function without a fixed signature:
@@ -383,13 +441,13 @@ def export(obj, export_dir, signatures=None):
   class Model(tf.keras.Model):
 
     @tf.function
-    def compute(x):
+    def call(self, x):
       ...
 
   m = Model()
-  tf.saved_model.export(
+  tf.saved_model.save(
       m, '/tmp/saved_model/',
-      signatures=m.compute.get_concrete_function(
+      signatures=m.call.get_concrete_function(
           tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name="inp")))
   ```
 
@@ -399,14 +457,47 @@ def export(obj, export_dir, signatures=None):
   automatically. This is the same tracking scheme that `tf.train.Checkpoint`
   uses, and an exported `Checkpoint` object may be restored as a training
   checkpoint by pointing `tf.train.Checkpoint.restore` to the SavedModel's
-  "variables/" subdirectory.
+  "variables/" subdirectory. Currently variables are the only stateful objects
+  supported by `tf.saved_model.save`, but others (e.g. tables) will be supported
+  in the future.
+
+  `tf.function` does not hard-code device annotations from outside the function
+  body, instead using the calling context's device. This means for example that
+  exporting a model which runs on a GPU and serving it on a CPU will generally
+  work, with some exceptions. `tf.device` annotations inside the body of the
+  function will be hard-coded in the exported model; this type of annotation is
+  discouraged. Device-specific operations, e.g. with "cuDNN" in the name or with
+  device-specific layouts, may cause issues. Currently a `DistributionStrategy`
+  is another exception: active distribution strategies will cause device
+  placements to be hard-coded in a function. Exporting a single-device
+  computation and importing under a `DistributionStrategy` is not currently
+  supported, but may be in the future.
+
+  SavedModels exported with `tf.saved_model.save` [strip default-valued
+  attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes)
+  automatically, which removes one source of incompatibilities when the consumer
+  of a SavedModel is running an older TensorFlow version than the
+  producer. There are however other sources of incompatibilities which are not
+  handled automatically, such as when the exported model contains operations
+  which the consumer does not have definitions for.
+
+  The current implementation of `tf.saved_model.save` targets serving use-cases,
+  but omits information which will be necessary for the planned future
+  implementation of `tf.saved_model.load`. Exported models using the current
+  `save` implementation, and other existing SavedModels, will not be compatible
+  with `tf.saved_model.load` when it is implemented. Further, `save` will in the
+  future attempt to export `@tf.function`-decorated methods which it does not
+  currently inspect, so some objects which are exportable today will raise
+  exceptions on export in the future (e.g. due to complex/non-serializable
+  default arguments). Such backwards-incompatible API changes are expected only
+  prior to the TensorFlow 2.0 release.
 
   Args:
     obj: A checkpointable object to export.
     export_dir: A directory in which to write the SavedModel.
     signatures: Optional, either a `tf.function` with an input signature
       specified or the result of `f.get_concrete_function` on a
-      `tf.function`-decorated function `f`, in which case `f` will be used to
+      `@tf.function`-decorated function `f`, in which case `f` will be used to
       generate a signature for the SavedModel under the default serving
       signature key. `signatures` may also be a dictionary, in which case it
       maps from signature keys to either `tf.function` instances with input
@@ -421,6 +512,10 @@ def export(obj, export_dir, signatures=None):
   if not isinstance(obj, base.CheckpointableBase):
     raise ValueError(
         "Expected a Checkpointable object for export, got {}.".format(obj))
+  if signatures is None:
+    # Note that we run this before saving the checkpoint, since looping over
+    # attributes may have the side effect of creating variables in some cases.
+    signatures = _find_function_to_export(obj)
   object_saver = util.CheckpointableSaver(obj)
   utils_impl.get_or_create_variables_dir(export_dir)
   object_saver.save(utils_impl.get_variables_path(export_dir))
@@ -432,6 +527,7 @@ def export(obj, export_dir, signatures=None):
   saved_model.saved_model_schema_version = (
       constants.SAVED_MODEL_SCHEMA_VERSION)
   meta_graph_def = saved_model.meta_graphs.add()
+  meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
   meta_graph_def.saver_def.CopyFrom(saver_def)
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
@@ -439,6 +535,7 @@ def export(obj, export_dir, signatures=None):
   meta_graph_def.graph_def.MergeFrom(graph_def)
   for signature_key, signature in signatures.items():
     meta_graph_def.signature_def[signature_key].MergeFrom(signature)
+  meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
   path = os.path.join(
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
diff --git a/tensorflow/python/saved_model/export_test.py b/tensorflow/python/saved_model/save_test.py
similarity index 58%
rename from tensorflow/python/saved_model/export_test.py
rename to tensorflow/python/saved_model/save_test.py
index 8c8ac861ac8cbf4d18bdf42123e09eab7129faac..04cd9d0683cfeb003e3bf9265a605cc9a386ff94 100644
--- a/tensorflow/python/saved_model/export_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel export."""
+"""Tests for checkpointable object SavedModel save."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -33,11 +33,13 @@ from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.saved_model import export
 from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import adam
 from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
 
 
 class _ModelWithOptimizer(training.Model):
@@ -59,15 +61,15 @@ class _ModelWithOptimizer(training.Model):
     return {"loss": loss}
 
 
-class ExportTest(test.TestCase):
+class SaveTest(test.TestCase):
 
   def _import_and_infer(
-      self, export_dir, inputs,
+      self, save_dir, inputs,
       signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
     """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
     graph = ops.Graph()
     with graph.as_default(), self.session(graph) as session:
-      model = loader.load(session, [], export_dir)
+      model = loader.load(session, [tag_constants.SERVING], save_dir)
       signature = model.signature_def[signature_key]
       self.assertEqual(set(inputs.keys()), set(signature.inputs.keys()))
       feed_dict = {}
@@ -80,42 +82,42 @@ class ExportTest(test.TestCase):
             output_tensor_info.name)
       return session.run(output_dict, feed_dict=feed_dict)
 
-  def test_method_export_signature(self):
+  def test_method_save_signature(self):
     root = tracking.Checkpointable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
     root.f(constant_op.constant(1.))
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    export.export(root, export_dir, root.f)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir, root.f)
     self.assertEqual(
         {"output_0": 2.},
-        self._import_and_infer(export_dir, {"x": 1.}))
+        self._import_and_infer(save_dir, {"x": 1.}))
 
-  def test_method_export_concrete(self):
+  def test_method_save_concrete(self):
     root = tracking.Checkpointable()
     root.f = def_function.function(
         lambda z: {"out": 2. * z})
     root.f(constant_op.constant(1.))
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    export.export(
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(
         root,
-        export_dir,
+        save_dir,
         {"non_default_key": root.f.get_concrete_function(
             tensor_spec.TensorSpec(None, dtypes.float32))})
     self.assertEqual(
         {"out": 2.},
         self._import_and_infer(
-            export_dir, {"z": 1.}, signature_key="non_default_key"))
+            save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
     root = tracking.Checkpointable()
     root.f = def_function.function(lambda x: 2. * x)
     root.f(constant_op.constant(1.))
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(
         ValueError, "must be converted to concrete functions"):
-      export.export(root, export_dir, root.f)
+      save.save(root, save_dir, root.f)
 
   def test_nested_inputs(self):
     root = tracking.Checkpointable()
@@ -124,7 +126,7 @@ class ExportTest(test.TestCase):
         input_signature=([tensor_spec.TensorSpec(None, dtypes.float32),
                           tensor_spec.TensorSpec(None, dtypes.float32)],))
     root.f([constant_op.constant(1.), constant_op.constant(1.)])
-    # Concrete functions must always have uniquely named Tensor inputs. Export
+    # Concrete functions must always have uniquely named Tensor inputs. Save
     # relies on this.
     with self.assertRaisesRegexp(
         ValueError, "two arguments named 'x'"):
@@ -134,22 +136,22 @@ class ExportTest(test.TestCase):
     root = tracking.Checkpointable()
     root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x)))
     root.f(constant_op.constant(1.))
-    to_export = root.f.get_concrete_function(constant_op.constant(1.))
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    to_save = root.f.get_concrete_function(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(
         ValueError, "non-flat outputs"):
-      export.export(root, export_dir, to_export)
+      save.save(root, save_dir, to_save)
 
   def test_nested_dict_outputs(self):
     root = tracking.Checkpointable()
     root.f = def_function.function(
         lambda x: {"a": 2. * x, "b": (3. * x, 4. * x)})
     root.f(constant_op.constant(1.))
-    to_export = root.f.get_concrete_function(constant_op.constant(1.))
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    to_save = root.f.get_concrete_function(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(
         ValueError, "dictionary containing non-Tensor value"):
-      export.export(root, export_dir, to_export)
+      save.save(root, save_dir, to_save)
 
   def test_variable(self):
     root = tracking.Checkpointable()
@@ -158,24 +160,99 @@ class ExportTest(test.TestCase):
     root.f = def_function.function(
         lambda x: root.v1 * root.v2 * x)
     root.f(constant_op.constant(1.))
-    to_export = root.f.get_concrete_function(constant_op.constant(1.))
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    export.export(root, export_dir, to_export)
+    to_save = root.f.get_concrete_function(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir, to_save)
     self.assertAllEqual({"output_0": 12.},
-                        self._import_and_infer(export_dir, {"x": 2.}))
+                        self._import_and_infer(save_dir, {"x": 2.}))
 
   def test_optimizer(self):
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
     model = _ModelWithOptimizer()
     first_loss = model(x, y)
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    export.export(model, export_dir, model.call)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir, model.call)
     second_loss = model(x, y)
     self.assertNotEqual(first_loss, second_loss)
     self.assertAllClose(
         second_loss,
-        self._import_and_infer(export_dir, {"x": [[3., 4.]], "y": [2.]}))
+        self._import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
+
+  def test_trivial_save_exception(self):
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(ValueError, "signature"):
+      save.save(tracking.Checkpointable(), save_dir)
+
+  def test_single_method_default_signature(self):
+    model = _ModelWithOptimizer()
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    model(x, y)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertIn("loss",
+                  self._import_and_infer(save_dir,
+                                         {"x": [[3., 4.]], "y": [2.]}))
+
+  def test_single_function_default_signature(self):
+    model = tracking.Checkpointable()
+    model.f = def_function.function(lambda: 3., input_signature=())
+    model.f()
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertAllClose({"output_0": 3.},
+                        self._import_and_infer(save_dir, {}))
+
+  def test_ambiguous_signatures(self):
+    model = _ModelWithOptimizer()
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    model(x, y)
+    model.second_function = def_function.function(lambda: 1.)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(ValueError, "call.*second_function"):
+      save.save(model, save_dir)
+
+  def test_docstring(self):
+
+    class Adder(util.Checkpoint):
+
+      @def_function.function(input_signature=[tensor_spec.TensorSpec(
+          shape=None, dtype=dtypes.float32)])
+      def add(self, x):
+        return x + x + 1.
+
+    to_save = Adder()
+    to_save.add(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(to_save, save_dir)
+    self.assertAllClose({"output_0": 7.},
+                        self._import_and_infer(save_dir, {"x": 3.}))
+
+  def test_default_attr_stripping(self):
+
+    class Complex(util.Checkpoint):
+
+      @def_function.function(input_signature=[])
+      def __call__(self):
+        return math_ops.complex(
+            constant_op.constant(1.),
+            constant_op.constant(2.),
+            name="complex")
+
+    to_save = Complex()
+    to_save()
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(to_save, save_dir)
+    graph = ops.Graph()
+    with graph.as_default(), self.session(graph) as session:
+      loader.load(session, [tag_constants.SERVING], save_dir)
+      func, = graph._functions.values()
+      complex_node, = [
+          node for node in func.definition.node_def if node.op == "Complex"]
+      self.assertNotIn("T", complex_node.attr)
+      self.assertNotIn("Tout", complex_node.attr)
 
 
 class MemoryTests(test.TestCase):
@@ -192,8 +269,8 @@ class MemoryTests(test.TestCase):
       # TODO(allenl): debug reference cycles in Python 2.x
       self.skipTest("This test only works in Python 3+. Reference cycles are "
                     "created in older Python versions.")
-    export_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    export.export(self._model, export_dir, self._model.call)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(self._model, save_dir, self._model.call)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index 6702c99607136475cdf096f863ccd0bbddd57845..fcde6b47e4ff10dbd84801e08597591a10818d51 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -29,8 +29,8 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model.save import save
 # pylint: enable=unused-import
 # pylint: disable=wildcard-import
 from tensorflow.python.saved_model.simple_save import *
 # pylint: enable=wildcard-import
-
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 5d6167ab38f5a07d56143f608770d1aadb17a2fb..e722b6ceaeac0aa95d5c89f8c32d71c2b2fd56ac 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -54,15 +54,15 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
-class SavedModelTest(test.TestCase):
+class SavedModelTestBase(test.TestCase):
 
   def _get_export_dir(self, label):
     return os.path.join(test.get_temp_dir(), label)
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.VariableV1(variable_value, name=variable_name)
-    sess.run(variables.global_variables_initializer())
-    self.assertEqual(variable_value, v.eval())
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(variable_value, self.evaluate(v))
 
   def _build_asset_collection(self, asset_file_name, asset_file_contents,
                               asset_file_tensor_name, asset_subdir=""):
@@ -78,14 +78,16 @@ class SavedModelTest(test.TestCase):
     asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
     return asset_collection
 
-  def _validate_asset_collection(self, export_dir, graph_collection_def,
-                                 expected_asset_file_name,
-                                 expected_asset_file_contents,
-                                 expected_asset_tensor_name,
-                                 asset_id=0):
-    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
-    asset = meta_graph_pb2.AssetFileDef()
-    assets_any[asset_id].Unpack(asset)
+
+class SavedModelTest(SavedModelTestBase):
+
+  def _validate_assets(self,
+                       export_dir,
+                       asset_file_def,
+                       expected_asset_file_name,
+                       expected_asset_file_contents,
+                       expected_asset_tensor_name,
+                       asset_id=0):
     assets_path = os.path.join(
         compat.as_bytes(export_dir),
         compat.as_bytes(constants.ASSETS_DIRECTORY),
@@ -93,8 +95,10 @@ class SavedModelTest(test.TestCase):
     actual_asset_contents = file_io.read_file_to_string(assets_path)
     self.assertEqual(expected_asset_file_contents,
                      compat.as_text(actual_asset_contents))
-    self.assertEqual(expected_asset_file_name, asset.filename)
-    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
+    self.assertEqual(expected_asset_file_name,
+                     asset_file_def[asset_id].filename)
+    self.assertEqual(expected_asset_tensor_name,
+                     asset_file_def[asset_id].tensor_info.name)
 
   def _validate_inputs_tensor_info_fail(self, builder, tensor_info):
     with self.session(graph=ops.Graph()) as sess:
@@ -185,7 +189,7 @@ class SavedModelTest(test.TestCase):
 
   def testVerifySessionGraphUsage(self):
     export_dir = self._get_export_dir("test_verify_session_graph_usage")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
@@ -205,7 +209,7 @@ class SavedModelTest(test.TestCase):
 
   def testSequence(self):
     export_dir = self._get_export_dir("test_sequence")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
     # invoked before any add_meta_graph() calls.
@@ -222,7 +226,7 @@ class SavedModelTest(test.TestCase):
 
   def testTags(self):
     export_dir = self._get_export_dir("test_tags")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -311,7 +315,7 @@ class SavedModelTest(test.TestCase):
 
   def testVariables(self):
     export_dir = self._get_export_dir("test_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with two variables. SavedModel invoked to:
     # - add with weights.
@@ -363,7 +367,7 @@ class SavedModelTest(test.TestCase):
 
   def testGraphWithoutVariables(self):
     export_dir = self._get_export_dir("test_graph_has_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with no variables.
     with self.session(graph=ops.Graph()) as sess:
@@ -385,7 +389,7 @@ class SavedModelTest(test.TestCase):
       a = ops.get_default_graph().get_tensor_by_name(constant_5_name)
       b = constant_op.constant(6.0)
       c = a * b
-      self.assertEqual(30.0, sess.run(c))
+      self.assertEqual(30.0, self.evaluate(c))
 
     # Restore the graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
@@ -394,11 +398,11 @@ class SavedModelTest(test.TestCase):
       a = ops.get_default_graph().get_tensor_by_name(constant_6_name)
       b = constant_op.constant(5.0)
       c = a * b
-      self.assertEqual(30.0, sess.run(c))
+      self.assertEqual(30.0, self.evaluate(c))
 
   def testNoOverwrite(self):
     export_dir = self._get_export_dir("test_no_overwrite")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -417,12 +421,12 @@ class SavedModelTest(test.TestCase):
 
     # An attempt to create another builder with the same export directory should
     # result in an assertion error.
-    self.assertRaises(AssertionError, saved_model_builder.SavedModelBuilder,
+    self.assertRaises(AssertionError, saved_model_builder._SavedModelBuilder,
                       export_dir)
 
   def testSaveAsText(self):
     export_dir = self._get_export_dir("test_astext")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -453,15 +457,15 @@ class SavedModelTest(test.TestCase):
 
   def testCollections(self):
     export_dir = self._get_export_dir("test_collections")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
     # - add with weights.
     with self.session(graph=ops.Graph()) as sess:
       v = variables.VariableV1(42, name="v")
       ops.add_to_collection("foo_vars", v)
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(42, v.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(42, self.evaluate(v))
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Graph with the same single variable added to a different collection.
@@ -470,8 +474,8 @@ class SavedModelTest(test.TestCase):
     with self.session(graph=ops.Graph()) as sess:
       v = variables.VariableV1(43, name="v")
       ops.add_to_collection("bar_vars", v)
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(43, v.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(43, self.evaluate(v))
       builder.add_meta_graph(["bar"])
 
     # Save the SavedModel to disk.
@@ -503,7 +507,7 @@ class SavedModelTest(test.TestCase):
 
   def testSignatureDefs(self):
     export_dir = self._get_export_dir("test_signature_defs")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable and a single entry in the signature def map.
     # SavedModel is invoked to add with weights.
@@ -563,7 +567,7 @@ class SavedModelTest(test.TestCase):
 
   def testSignatureDefValidationFails(self):
     export_dir = self._get_export_dir("test_signature_def_validation_fail")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     tensor_without_encoding = meta_graph_pb2.TensorInfo()
     tensor_without_encoding.dtype = types_pb2.DT_FLOAT
@@ -585,11 +589,11 @@ class SavedModelTest(test.TestCase):
     tensor_with_name.dtype = types_pb2.DT_FLOAT
 
     export_dir = self._get_export_dir("test_signature_def_validation_name_1")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_inputs_tensor_info_accept(builder, tensor_with_name)
 
     export_dir = self._get_export_dir("test_signature_def_validation_name_2")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_outputs_tensor_info_accept(builder, tensor_with_name)
 
   def testSignatureDefValidationSucceedsWithCoo(self):
@@ -599,16 +603,16 @@ class SavedModelTest(test.TestCase):
     tensor_with_coo.dtype = types_pb2.DT_FLOAT
 
     export_dir = self._get_export_dir("test_signature_def_validation_coo_1")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_inputs_tensor_info_accept(builder, tensor_with_coo)
 
     export_dir = self._get_export_dir("test_signature_def_validation_coo_2")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_outputs_tensor_info_accept(builder, tensor_with_coo)
 
   def testAssets(self):
     export_dir = self._get_export_dir("test_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
@@ -618,21 +622,19 @@ class SavedModelTest(test.TestCase):
           compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
       file_io.write_string_to_file(ignored_filepath, "will be ignored")
 
-      asset_collection = self._build_asset_collection("hello42.txt",
-                                                      "foo bar baz",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
@@ -641,64 +643,66 @@ class SavedModelTest(test.TestCase):
 
   def testAssetsNameCollisionDiffFile(self):
     export_dir = self._get_export_dir("test_assets_name_collision_diff_file")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar bak", "asset_file_tensor",
-          asset_subdir="1")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar bak", "asset_file_tensor", asset_subdir="1")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1",
-          asset_subdir="2")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor_1", asset_subdir="2")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar bak",
-                                      "asset_file_tensor:0")
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt_1", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar bak", "asset_file_tensor:0")
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt_1",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
 
   def testAssetsNameCollisionSameFilepath(self):
     export_dir = self._get_export_dir("test_assets_name_collision_same_path")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor_1")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       # The second tensor should be recorded, but the same.
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
@@ -707,35 +711,35 @@ class SavedModelTest(test.TestCase):
 
   def testAssetsNameCollisionSameFile(self):
     export_dir = self._get_export_dir("test_assets_name_collision_same_file")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor",
-          asset_subdir="1")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor", asset_subdir="1")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1",
-          asset_subdir="2")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor_1", asset_subdir="2")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       # The second tensor should be recorded, but the same.
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
@@ -744,19 +748,21 @@ class SavedModelTest(test.TestCase):
 
   def testAssetsNameCollisionManyFiles(self):
     export_dir = self._get_export_dir("test_assets_name_collision_many_files")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       for i in range(5):
         idx = str(i)
-        asset_collection = self._build_asset_collection(
-            "hello42.txt", "foo bar baz " + idx, "asset_file_tensor_" + idx,
+        asset_list = self._build_asset_collection(
+            "hello42.txt",
+            "foo bar baz " + idx,
+            "asset_file_tensor_" + idx,
             asset_subdir=idx)
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -765,18 +771,20 @@ class SavedModelTest(test.TestCase):
       foo_graph = loader.load(sess, ["foo"], export_dir)
       for i in range(1, 5):
         idx = str(i)
-        self._validate_asset_collection(
-            export_dir, foo_graph.collection_def, "hello42.txt_" + idx,
-            "foo bar baz " + idx, "asset_file_tensor_{}:0".format(idx),
+        self._validate_assets(
+            export_dir,
+            foo_graph.asset_file_def,
+            "hello42.txt_" + idx,
+            "foo bar baz " + idx,
+            "asset_file_tensor_{}:0".format(idx),
             asset_id=i)
 
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz 0",
-                                      "asset_file_tensor_0:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz 0", "asset_file_tensor_0:0")
 
   def testCustomMainOp(self):
     export_dir = self._get_export_dir("test_main_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -794,7 +802,7 @@ class SavedModelTest(test.TestCase):
         add_v1_v2 = math_ops.add(v1._ref(), v2._ref())
         custom_main_op = control_flow_ops.group(state_ops.assign(v3, add_v1_v2))
 
-      sess.run(custom_main_op)
+      self.evaluate(custom_main_op)
       builder.add_meta_graph_and_variables(
           sess, ["foo"], main_op=custom_main_op)
 
@@ -811,7 +819,7 @@ class SavedModelTest(test.TestCase):
 
   def testLegacyInitOp(self):
     export_dir = self._get_export_dir("test_legacy_init_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -828,7 +836,7 @@ class SavedModelTest(test.TestCase):
       assign_v3 = state_ops.assign(v3, math_ops.add(v1, v2))
       legacy_init_op = control_flow_ops.group(assign_v3, name="legacy_init_op")
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(
           sess, ["foo"], legacy_init_op=legacy_init_op)
 
@@ -855,7 +863,7 @@ class SavedModelTest(test.TestCase):
     self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
 
   def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     g = ops.Graph()
     with self.session(graph=g) as sess:
@@ -871,7 +879,7 @@ class SavedModelTest(test.TestCase):
       assign_v2 = state_ops.assign(v2, v1)
       init_op = control_flow_ops.group(assign_v2, name="init_op")
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       ops.add_to_collection(key, control_flow_ops.no_op())
       # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
@@ -885,7 +893,7 @@ class SavedModelTest(test.TestCase):
 
   def testTrainOp(self):
     export_dir = self._get_export_dir("test_train_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -894,10 +902,10 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       train_op = state_ops.assign_add(v1, v2)
 
-      sess.run(train_op)
+      self.evaluate(train_op)
       # TODO(karmel): remove explicit call when in the public method.
       builder._add_train_op(train_op)
       builder.add_meta_graph_and_variables(sess, ["foo"])
@@ -914,7 +922,7 @@ class SavedModelTest(test.TestCase):
 
   def testTrainOpGroup(self):
     export_dir = self._get_export_dir("test_train_op_group")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -923,10 +931,10 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       train_op = control_flow_ops.group()
 
-      sess.run(train_op)
+      self.evaluate(train_op)
       # TODO(karmel): remove explicit call when in the public method.
       builder._add_train_op(train_op)
       builder.add_meta_graph_and_variables(sess, ["foo"])
@@ -943,7 +951,7 @@ class SavedModelTest(test.TestCase):
 
   def testTrainOpAfterVariables(self):
     export_dir = self._get_export_dir("test_train_op_after_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -952,11 +960,11 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["pre_foo"])
 
       train_op = state_ops.assign_add(v1, v2)
-      sess.run(train_op)
+      self.evaluate(train_op)
       # TODO(karmel): remove explicit call when in the public method.
       builder._add_train_op(train_op)
       builder.add_meta_graph(["foo"])
@@ -975,28 +983,28 @@ class SavedModelTest(test.TestCase):
 
   def testMultipleAssets(self):
     export_dir = self._get_export_dir("test_multiple_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `foo` graph.
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "foo".
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `bar` graph.
-      asset_collection = self._build_asset_collection("bar.txt", "content_bar",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("bar.txt", "content_bar",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "bar".
-      builder.add_meta_graph(["bar"], assets_collection=asset_collection)
+      builder.add_meta_graph(["bar"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1004,43 +1012,41 @@ class SavedModelTest(test.TestCase):
     # Check assets restored for graph with tag "foo".
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
       bar_graph = loader.load(sess, ["bar"], export_dir)
-      self._validate_asset_collection(export_dir, bar_graph.collection_def,
-                                      "bar.txt", "content_bar",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, bar_graph.asset_file_def, "bar.txt",
+                            "content_bar", "asset_file_tensor:0")
 
   def testDuplicateAssets(self):
     export_dir = self._get_export_dir("test_duplicate_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `foo` specific
       # content.
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "foo".
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `bar` specific
       # content.
-      asset_collection = self._build_asset_collection("foo.txt", "content_bar",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_bar",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "bar".
-      builder.add_meta_graph(["bar"], assets_collection=asset_collection)
+      builder.add_meta_graph(["bar"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1048,9 +1054,8 @@ class SavedModelTest(test.TestCase):
     # Check assets restored for graph with tag "foo".
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
@@ -1059,13 +1064,12 @@ class SavedModelTest(test.TestCase):
       # Validate the assets for `bar` graph. `foo.txt` should contain the
       # original contents corresponding to `foo` graph since an asset with the
       # same name across multiple graphs is only stored the first time
-      self._validate_asset_collection(export_dir, bar_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, bar_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
   def testOp(self):
     export_dir = self._get_export_dir("test_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with session.Session(
         graph=ops.Graph(),
@@ -1086,7 +1090,7 @@ class SavedModelTest(test.TestCase):
       ops.add_to_collection("v", v3)
       ops.add_to_collection("init_op", init_op)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
 
@@ -1108,7 +1112,7 @@ class SavedModelTest(test.TestCase):
 
   def testCustomSaveable(self):
     export_dir = self._get_export_dir("custom_saveable")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with session.Session(
         graph=ops.Graph(),
@@ -1137,11 +1141,11 @@ class SavedModelTest(test.TestCase):
 
   def testCustomSaver(self):
     export_dir = self._get_export_dir("test_custom_saver")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       custom_saver = training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"], saver=custom_saver)
 
@@ -1159,11 +1163,11 @@ class SavedModelTest(test.TestCase):
 
   def testNoCustomSaver(self):
     export_dir = self._get_export_dir("test_no_custom_saver")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"])
 
@@ -1181,11 +1185,11 @@ class SavedModelTest(test.TestCase):
 
   def testMultipleCustomSavers(self):
     export_dir = self._get_export_dir("test_multiple_custom_savers")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["tag_0"])
 
       saver_1 = training.Saver()
@@ -1211,19 +1215,19 @@ class SavedModelTest(test.TestCase):
 
   def testImportScope(self):
     export_dir = self._get_export_dir("test_scoped_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Build a SavedModel with a variable, an asset, and a constant tensor.
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
       constant_op.constant("constant value", name="constant_tensor_name")
       builder.add_meta_graph_and_variables(
-          sess, ["tag_name"], assets_collection=asset_collection)
+          sess, ["tag_name"], assets_list=asset_list)
 
       # Save the asset file path for later comparison.
-      asset_file_path = asset_collection[0].eval()
+      asset_file_path = asset_list[0].eval()
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1244,16 +1248,14 @@ class SavedModelTest(test.TestCase):
 
       # The loaded asset tensor should be scoped, but the asset file path and
       # contents should be unchanged.
-      asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-      self.assertEqual(1, len(asset_collection))
-      self.assertEqual(asset_file_path, asset_collection[0].eval())
-      self.assertEqual("scope_name/asset_file_tensor:0",
-                       asset_collection[0].name)
+      asset_list = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      self.assertEqual(1, len(asset_list))
+      self.assertEqual(asset_file_path, asset_list[0].eval())
+      self.assertEqual("scope_name/asset_file_tensor:0", asset_list[0].name)
       # The static asset data inside graph_proto.collection_def should not be
       # scoped.
-      self._validate_asset_collection(export_dir, graph_proto.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, graph_proto.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
       # The constant tensor should be scoped, but its contents should be
       # unchanged.
@@ -1264,7 +1266,7 @@ class SavedModelTest(test.TestCase):
 
   def testClearDevices(self):
     export_dir = self._get_export_dir("test_clear_devices")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Specify a device and save a variable.
     ops.reset_default_graph()
@@ -1288,7 +1290,7 @@ class SavedModelTest(test.TestCase):
 
   def testStripDefaultAttrs(self):
     export_dir = self._get_export_dir("test_strip_default_attrs")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Add a graph with two float32 variables and a Complex Op composing them
     # with strip_default_attrs enabled.
@@ -1296,7 +1298,7 @@ class SavedModelTest(test.TestCase):
       real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(
           sess, ["foo"], strip_default_attrs=True)
 
@@ -1306,7 +1308,7 @@ class SavedModelTest(test.TestCase):
       real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph(["bar"], strip_default_attrs=False)
 
     # Save the SavedModel to disk in text format.
@@ -1361,14 +1363,14 @@ class SavedModelTest(test.TestCase):
   def testInconsistentConsumerDefaultAttrs(self):
     export_dir = self._get_export_dir(
         "test_strip_default_attrs_no_consumer_defaults")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Add a graph with a single variable and a test op with a defaultless
     # float32 attr, "test_attr".
     with session.Session(graph=ops.Graph()) as sess:
       variables.VariableV1(1.0, dtype=dtypes.float64, name="var")
       test_ops.test_attr(T=dtypes.float32, name="test_attr")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Save the SavedModel to disk in text format.
@@ -1428,5 +1430,60 @@ class SavedModelTest(test.TestCase):
       loader.load(sess, ["foo"], export_dir)
 
 
+class SavedModelV1Test(SavedModelTestBase):
+
+  def _validate_asset_collection(self,
+                                 export_dir,
+                                 graph_collection_def,
+                                 expected_asset_file_name,
+                                 expected_asset_file_contents,
+                                 expected_asset_tensor_name,
+                                 asset_id=0):
+    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
+    asset = meta_graph_pb2.AssetFileDef()
+    assets_any[asset_id].Unpack(asset)
+    assets_path = os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes(constants.ASSETS_DIRECTORY),
+        compat.as_bytes(expected_asset_file_name))
+    actual_asset_contents = file_io.read_file_to_string(assets_path)
+    self.assertEqual(expected_asset_file_contents,
+                     compat.as_text(actual_asset_contents))
+    self.assertEqual(expected_asset_file_name, asset.filename)
+    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
+
+  def testWritingAssetsToCollection(self):
+    export_dir = self._get_export_dir("test_writing_assets_to_collection")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      # Build an asset list.
+      ignored_filepath = os.path.join(
+          compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
+      file_io.write_string_to_file(ignored_filepath, "will be ignored")
+
+      asset_collection = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor")
+
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"], assets_collection=asset_collection)
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.session(graph=ops.Graph()) as sess:
+      foo_graph = loader.load(sess, ["foo"], export_dir)
+      self._validate_asset_collection(export_dir, foo_graph.collection_def,
+                                      "hello42.txt", "foo bar baz",
+                                      "asset_file_tensor:0")
+      ignored_asset_path = os.path.join(
+          compat.as_bytes(export_dir),
+          compat.as_bytes(constants.ASSETS_DIRECTORY),
+          compat.as_bytes("ignored.txt"))
+      self.assertFalse(file_io.file_exists(ignored_asset_path))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/simple_save_test.py b/tensorflow/python/saved_model/simple_save_test.py
index 18f82daadad6ae7142c249c66e61ea13782b33ac..0d0665072ac96a6085fb479261344083b6e1e941 100644
--- a/tensorflow/python/saved_model/simple_save_test.py
+++ b/tensorflow/python/saved_model/simple_save_test.py
@@ -33,8 +33,8 @@ class SimpleSaveTest(test.TestCase):
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
-    sess.run(variables.global_variables_initializer())
-    self.assertEqual(variable_value, v.eval())
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(variable_value, self.evaluate(v))
     return v
 
   def _check_variable_info(self, actual_variable, expected_variable):
diff --git a/tensorflow/python/saved_model/utils.py b/tensorflow/python/saved_model/utils.py
index 27c355490934e7d20ee72ae10eca9fdb8bbfca14..9bd0126ae3aac4130f0ef2f6a38cfb9abd2c6f8b 100644
--- a/tensorflow/python/saved_model/utils.py
+++ b/tensorflow/python/saved_model/utils.py
@@ -22,5 +22,6 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 from tensorflow.python.saved_model.utils_impl import build_tensor_info
+from tensorflow.python.saved_model.utils_impl import build_tensor_info_from_op
 from tensorflow.python.saved_model.utils_impl import get_tensor_from_tensor_info
 # pylint: enable=unused-import
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 2ee4d9f4e042f2b0c27d0614c59617d688d07bba..10667419761214fe1830199d86e9cf9bf577d7dd 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -20,10 +20,12 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import constants
 from tensorflow.python.util import compat
@@ -42,7 +44,7 @@ from tensorflow.python.util.tf_export import tf_export
     "library as tf.compat.v1.saved_model.utils.build_tensor_info or "
     "tf.compat.v1.saved_model.build_tensor_info.")
 def build_tensor_info(tensor):
-  """Utility function to build TensorInfo proto.
+  """Utility function to build TensorInfo proto from a Tensor.
 
   Args:
     tensor: Tensor or SparseTensor whose name, dtype and shape are used to
@@ -64,6 +66,41 @@ def build_tensor_info(tensor):
   return tensor_info
 
 
+def build_tensor_info_from_op(op):
+  """Utility function to build TensorInfo proto from an Op.
+
+  Note that this function should be used with caution. It is strictly restricted
+  to TensorFlow internal use-cases only. Please make sure you do need it before
+  using it.
+
+  This utility function overloads the TensorInfo proto by setting the name to
+  the Op's name, dtype to DT_INVALID and tensor_shape as None. One typical usage
+  is for the Op of the call site for the defunned function:
+  ```python
+    @function.defun
+    def some_vairable_initialiation_fn(value_a, value_b):
+      a = value_a
+      b = value_b
+
+    value_a = constant_op.constant(1, name="a")
+    value_b = constant_op.constant(2, name="b")
+    op_info = utils.build_op_info(
+        some_vairable_initialiation_fn(value_a, value_b))
+  ```
+
+  Args:
+    op: An Op whose name is used to build the TensorInfo. The name that points
+        to the Op could be fetched at run time in the Loader session.
+
+  Returns:
+    A TensorInfo protocol buffer constructed based on the supplied argument.
+  """
+  return meta_graph_pb2.TensorInfo(
+      dtype=types_pb2.DT_INVALID,
+      tensor_shape=tensor_shape.unknown_shape().as_proto(),
+      name=op.name)
+
+
 @tf_export(v1=["saved_model.get_tensor_from_tensor_info",
                "saved_model.utils.get_tensor_from_tensor_info"])
 @deprecation.deprecated(
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index c9b38ed60323332e430ef109c039898e1f8c8130..0888dcb411e34b030416362663fe4e2d11899cfd 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -19,16 +19,41 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import utils
 
 
 class UtilsTest(test.TestCase):
 
+  def testBuildTensorInfoOp(self):
+    x = constant_op.constant(1, name="x")
+    y = constant_op.constant(2, name="y")
+    z = control_flow_ops.group([x, y], name="op_z")
+    z_op_info = utils.build_tensor_info_from_op(z)
+    self.assertEqual("op_z", z_op_info.name)
+    self.assertEqual(types_pb2.DT_INVALID, z_op_info.dtype)
+    self.assertEqual(0, len(z_op_info.tensor_shape.dim))
+
+  def testBuildTensorInfoDefunOp(self):
+    @function.defun
+    def my_init_fn(x, y):
+      self.x_var = x
+      self.y_var = y
+
+    x = constant_op.constant(1, name="x")
+    y = constant_op.constant(2, name="y")
+    init_op_info = utils.build_tensor_info_from_op(my_init_fn(x, y))
+    self.assertEqual("PartitionedFunctionCall", init_op_info.name)
+    self.assertEqual(types_pb2.DT_INVALID, init_op_info.dtype)
+    self.assertEqual(0, len(init_op_info.tensor_shape.dim))
+
   def testBuildTensorInfoDense(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index fbae2b77fafaac921f4419df4b8fa4378f9554b1..0c13016712f316e113723c4c0c250ef636a3fcf0 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -28,12 +28,12 @@ from google.protobuf import json_format as _json_format
 # pylint: disable=unused-import
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.framework.summary_pb2 import SummaryDescription
+from tensorflow.core.framework.summary_pb2 import SummaryMetadata as _SummaryMetadata  # pylint: enable=unused-import
 from tensorflow.core.util.event_pb2 import Event
 from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.core.util.event_pb2 import TaggedRunMetadata
 # pylint: enable=unused-import
 
-
 from tensorflow.python.eager import context as _context
 from tensorflow.python.framework import constant_op as _constant_op
 from tensorflow.python.framework import dtypes as _dtypes
@@ -42,16 +42,6 @@ from tensorflow.python.ops import gen_logging_ops as _gen_logging_ops
 from tensorflow.python.ops import gen_summary_ops as _gen_summary_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import summary_op_util as _summary_op_util
 
-# exports tensor-related summaries
-# pylint: disable=unused-import
-from tensorflow.python.ops.summary_ops import tensor_summary
-# pylint: enable=unused-import
-
-# exports text
-# pylint: disable=unused-import
-from tensorflow.python.summary.text_summary import text_summary as text
-# pylint: enable=unused-import
-
 # exports FileWriter, FileWriterCache
 # pylint: disable=unused-import
 from tensorflow.python.summary.writer.writer import FileWriter
@@ -62,7 +52,7 @@ from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('summary.scalar')
+@tf_export(v1=['summary.scalar'])
 def scalar(name, tensor, collections=None, family=None):
   """Outputs a `Summary` protocol buffer containing a single scalar value.
 
@@ -92,7 +82,7 @@ def scalar(name, tensor, collections=None, family=None):
   return val
 
 
-@tf_export('summary.image')
+@tf_export(v1=['summary.image'])
 def image(name, tensor, max_outputs=3, collections=None, family=None):
   """Outputs a `Summary` protocol buffer with images.
 
@@ -148,7 +138,7 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
   return val
 
 
-@tf_export('summary.histogram')
+@tf_export(v1=['summary.histogram'])
 def histogram(name, values, collections=None, family=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
@@ -189,7 +179,7 @@ def histogram(name, values, collections=None, family=None):
   return val
 
 
-@tf_export('summary.audio')
+@tf_export(v1=['summary.audio'])
 def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
           family=None):
   # pylint: disable=line-too-long
@@ -238,7 +228,104 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
   return val
 
 
-@tf_export('summary.merge')
+@tf_export(v1=['summary.text'])
+def text(name, tensor, collections=None):
+  """Summarizes textual data.
+
+  Text data summarized via this plugin will be visible in the Text Dashboard
+  in TensorBoard. The standard TensorBoard Text Dashboard will render markdown
+  in the strings, and will automatically organize 1d and 2d tensors into tables.
+  If a tensor with more than 2 dimensions is provided, a 2d subarray will be
+  displayed along with a warning message. (Note that this behavior is not
+  intrinsic to the text summary api, but rather to the default TensorBoard text
+  plugin.)
+
+  Args:
+    name: A name for the generated node. Will also serve as a series name in
+      TensorBoard.
+    tensor: a string-type Tensor to summarize.
+    collections: Optional list of ops.GraphKeys.  The collections to add the
+      summary to.  Defaults to [_ops.GraphKeys.SUMMARIES]
+
+  Returns:
+    A TensorSummary op that is configured so that TensorBoard will recognize
+    that it contains textual data. The TensorSummary is a scalar `Tensor` of
+    type `string` which contains `Summary` protobufs.
+
+  Raises:
+    ValueError: If tensor has the wrong type.
+  """
+  if tensor.dtype != _dtypes.string:
+    raise ValueError('Expected tensor %s to have dtype string, got %s' %
+                     (tensor.name, tensor.dtype))
+
+  summary_metadata = _SummaryMetadata(
+      plugin_data=_SummaryMetadata.PluginData(plugin_name='text'))
+  t_summary = tensor_summary(
+      name=name,
+      tensor=tensor,
+      summary_metadata=summary_metadata,
+      collections=collections)
+  return t_summary
+
+
+@tf_export(v1=['summary.tensor_summary'])
+def tensor_summary(name,
+                   tensor,
+                   summary_description=None,
+                   collections=None,
+                   summary_metadata=None,
+                   family=None,
+                   display_name=None):
+  """Outputs a `Summary` protocol buffer with a serialized tensor.proto.
+
+  Args:
+    name: A name for the generated node. If display_name is not set, it will
+      also serve as the tag name in TensorBoard. (In that case, the tag
+      name will inherit tf name scopes.)
+    tensor: A tensor of any type and shape to serialize.
+    summary_description: A long description of the summary sequence. Markdown
+      is supported.
+    collections: Optional list of graph collections keys. The new summary op is
+      added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
+    summary_metadata: Optional SummaryMetadata proto (which describes which
+      plugins may use the summary value).
+    family: Optional; if provided, used as the prefix of the summary tag,
+      which controls the name used for display on TensorBoard when
+      display_name is not set.
+    display_name: A string used to name this data in TensorBoard. If this is
+      not set, then the node name will be used instead.
+
+  Returns:
+    A scalar `Tensor` of type `string`. The serialized `Summary` protocol
+    buffer.
+  """
+
+  if summary_metadata is None:
+    summary_metadata = _SummaryMetadata()
+
+  if summary_description is not None:
+    summary_metadata.summary_description = summary_description
+
+  if display_name is not None:
+    summary_metadata.display_name = display_name
+
+  serialized_summary_metadata = summary_metadata.SerializeToString()
+
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
+  with _summary_op_util.summary_scope(
+      name, family, values=[tensor]) as (tag, scope):
+    val = _gen_logging_ops.tensor_summary_v2(
+        tensor=tensor,
+        tag=tag,
+        name=scope,
+        serialized_summary_metadata=serialized_summary_metadata)
+    _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
+  return val
+
+
+@tf_export(v1=['summary.merge'])
 def merge(inputs, collections=None, name=None):
   # pylint: disable=line-too-long
   """Merges summaries.
@@ -284,7 +371,7 @@ def merge(inputs, collections=None, name=None):
   return val
 
 
-@tf_export('summary.merge_all')
+@tf_export(v1=['summary.merge_all'])
 def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
   """Merges all summaries collected in the default graph.
 
@@ -317,7 +404,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
     return merge(summary_ops, name=name)
 
 
-@tf_export('summary.get_summary_description')
+@tf_export(v1=['summary.get_summary_description'])
 def get_summary_description(node_def):
   """Given a TensorSummary node_def, retrieve its SummaryDescription.
 
diff --git a/tensorflow/python/summary/summary_test.py b/tensorflow/python/summary/summary_test.py
index ac5eb4dbbe3b652dc69d34922f4dc5d33de5e28a..cacc28cc596f9f0bb0694f7675e56d92fe1a6d6d 100644
--- a/tensorflow/python/summary/summary_test.py
+++ b/tensorflow/python/summary/summary_test.py
@@ -12,6 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Tests for the API surface of the V1 tf.summary ops.
+
+These tests don't check the actual serialized proto summary value for the
+more complex summaries (e.g. audio, image).  Those test live separately in
+tensorflow/python/kernel_tests/summary_v1_*.py.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +27,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -29,7 +36,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary as summary_lib
 
 
-class ScalarSummaryTest(test.TestCase):
+class SummaryTest(test.TestCase):
 
   def testScalarSummary(self):
     with self.cached_session() as s:
@@ -135,6 +142,12 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(len(summary.value), 1)
     self.assertEqual(summary.value[0].tag, 'family/outer/family/inner')
 
+  def testHistogramSummaryTypes(self):
+    for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
+                  dtypes.float32, dtypes.float64):
+      const = constant_op.constant(10, dtype=dtype)
+      summary_lib.histogram('h', const)
+
   def testAudioSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
@@ -165,6 +178,21 @@ class ScalarSummaryTest(test.TestCase):
                       for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  def testTextSummary(self):
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        num = array_ops.constant(1)
+        summary_lib.text('foo', num)
+
+      # The API accepts vectors.
+      arr = array_ops.constant(['one', 'two', 'three'])
+      summ = summary_lib.text('foo', arr)
+      self.assertEqual(summ.op.type, 'TensorSummaryV2')
+
+      # the API accepts scalars
+      summ = summary_lib.text('foo', array_ops.constant('one'))
+      self.assertEqual(summ.op.type, 'TensorSummaryV2')
+
   def testSummaryNameConversion(self):
     c = constant_op.constant(3)
     s = summary_lib.scalar('name with spaces', c)
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
deleted file mode 100644
index 6418c847f3c819cf2491bb449921d15c39eae288..0000000000000000000000000000000000000000
--- a/tensorflow/python/summary/text_summary.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implements text_summary in TensorFlow, with TensorBoard support.
-
-The text_summary is a wrapper around the generic tensor_summary that takes a
-string-type tensor and emits a TensorSummary op with SummaryMetadata that
-notes that this summary is textual data for the TensorBoard text plugin.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops.summary_ops import tensor_summary
-from tensorflow.python.util.tf_export import tf_export
-
-PLUGIN_NAME = "text"
-
-
-@tf_export("summary.text")
-def text_summary(name, tensor, collections=None):
-  """Summarizes textual data.
-
-  Text data summarized via this plugin will be visible in the Text Dashboard
-  in TensorBoard. The standard TensorBoard Text Dashboard will render markdown
-  in the strings, and will automatically organize 1d and 2d tensors into tables.
-  If a tensor with more than 2 dimensions is provided, a 2d subarray will be
-  displayed along with a warning message. (Note that this behavior is not
-  intrinsic to the text summary api, but rather to the default TensorBoard text
-  plugin.)
-
-  Args:
-    name: A name for the generated node. Will also serve as a series name in
-      TensorBoard.
-    tensor: a string-type Tensor to summarize.
-    collections: Optional list of ops.GraphKeys.  The collections to add the
-      summary to.  Defaults to [_ops.GraphKeys.SUMMARIES]
-
-  Returns:
-    A TensorSummary op that is configured so that TensorBoard will recognize
-    that it contains textual data. The TensorSummary is a scalar `Tensor` of
-    type `string` which contains `Summary` protobufs.
-
-  Raises:
-    ValueError: If tensor has the wrong type.
-  """
-  if tensor.dtype != dtypes.string:
-    raise ValueError("Expected tensor %s to have dtype string, got %s" %
-                     (tensor.name, tensor.dtype))
-
-  summary_metadata = summary_pb2.SummaryMetadata(
-      plugin_data=summary_pb2.SummaryMetadata.PluginData(
-          plugin_name=PLUGIN_NAME))
-  t_summary = tensor_summary(
-      name=name,
-      tensor=tensor,
-      summary_metadata=summary_metadata,
-      collections=collections)
-  return t_summary
diff --git a/tensorflow/python/summary/text_summary_test.py b/tensorflow/python/summary/text_summary_test.py
deleted file mode 100644
index 5b0db43cc1caeb7eb847ea53df57b8d49a302e08..0000000000000000000000000000000000000000
--- a/tensorflow/python/summary/text_summary_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import googletest
-from tensorflow.python.summary import text_summary
-
-
-class TextPluginTest(test_util.TensorFlowTestCase):
-  """Test the Text Summary API.
-
-  These tests are focused on testing the API design of the text_summary method.
-  It doesn't test the PluginAsset and tensors registry functionality, because
-  that is better tested by the text_plugin test that actually consumes that
-  metadata.
-  """
-
-  def testTextSummaryAPI(self):
-    with self.cached_session():
-
-      with self.assertRaises(ValueError):
-        num = array_ops.constant(1)
-        text_summary.text_summary("foo", num)
-
-      # The API accepts vectors.
-      arr = array_ops.constant(["one", "two", "three"])
-      summ = text_summary.text_summary("foo", arr)
-      self.assertEqual(summ.op.type, "TensorSummaryV2")
-
-      # the API accepts scalars
-      summ = text_summary.text_summary("foo", array_ops.constant("one"))
-      self.assertEqual(summ.op.type, "TensorSummaryV2")
-
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 09d4b63fbb61780db1aa9341cd2d98010b839989..20b62e5016afbefbc42cb8927a997c83d16d89ef 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -309,12 +309,11 @@ class FileWriterTestCase(test.TestCase):
       summ = summary_pb2.Summary(
           value=[summary_pb2.Summary.Value(
               tag="i", simple_value=1.0)])
-      sw.add_summary(summ.SerializeToString(), i.eval())
+      sw.add_summary(summ.SerializeToString(), self.evaluate(i))
       sw.add_summary(
           summary_pb2.Summary(
-              value=[summary_pb2.Summary.Value(
-                  tag="l", simple_value=2.0)]),
-          l.eval())
+              value=[summary_pb2.Summary.Value(tag="l", simple_value=2.0)]),
+          self.evaluate(l))
       sw.close()
 
     rr = self._EventsReader(test_dir)
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index 39174fa5890c9cfbaf0f7139f0ba6f853bc303e5..4e1bf3d8362dbcf78a1aa93b620694603d3a9532 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -38,7 +38,6 @@ limitations under the License.
 
 %include "tensorflow/python/lib/io/file_io.i"
 %include "tensorflow/python/training/quantize_training.i"
-%include "tensorflow/python/training/server_lib.i"
 
 %include "tensorflow/python/framework/python_op_gen.i"
 
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 384c7a82d27b786839545a6ad979e12a73ee88c1..901d6bc335f3a10439e2f02d0db2b237a89fece0 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -29,6 +29,8 @@ py_library(
         ":optimize_for_inference_lib",
         ":selective_registration_header_lib",
         ":strip_unused_lib",
+        # Include the TF upgrade script to users can run it directly after install TF
+        "//tensorflow/tools/compatibility:tf_upgrade_v2",
     ],
 )
 
diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl
index 2e5d875a58ae4af1fb164694f925383b0d952fc3..5e64cc64d2408fa459b6daa0c9134793bd9d5327 100644
--- a/tensorflow/python/tools/api/generator/api_gen.bzl
+++ b/tensorflow/python/tools/api/generator/api_gen.bzl
@@ -20,7 +20,8 @@ def gen_api_init_files(
         packages = ["tensorflow.python", "tensorflow.lite.python.lite"],
         package_deps = ["//tensorflow/python:no_contrib"],
         output_package = "tensorflow",
-        output_dir = ""):
+        output_dir = "",
+        root_file_name = "__init__.py"):
     """Creates API directory structure and __init__.py files.
 
     Creates a genrule that generates a directory structure with __init__.py
@@ -54,13 +55,14 @@ def gen_api_init_files(
       output_package: Package where generated API will be added to.
       output_dir: Subdirectory to output API to.
         If non-empty, must end with '/'.
+      root_file_name: Name of the root file with all the root imports.
     """
     root_init_template_flag = ""
     if root_init_template:
         root_init_template_flag = "--root_init_template=$(location " + root_init_template + ")"
 
     primary_package = packages[0]
-    api_gen_binary_target = ("create_" + primary_package + "_api_%d") % api_version
+    api_gen_binary_target = ("create_" + primary_package + "_api_%d_%s") % (api_version, name)
     native.py_binary(
         name = api_gen_binary_target,
         srcs = ["//tensorflow/python/tools/api/generator:create_python_api.py"],
@@ -73,6 +75,11 @@ def gen_api_init_files(
         ],
     )
 
+    # Replace name of root file with root_file_name.
+    output_files = [
+        root_file_name if f == "__init__.py" else f
+        for f in output_files
+    ]
     all_output_files = ["%s%s" % (output_dir, f) for f in output_files]
     compat_api_version_flags = ""
     for compat_api_version in compat_api_versions:
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index f7de02ed6fa13a0668a2e7870d6e6b053efb2b71..b41a1bc8f6f3628cb2328c30cd112129b1100a26 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -4,17 +4,18 @@
 TENSORFLOW_API_INIT_FILES = [
     # BEGIN GENERATED FILES
     "__init__.py",
-    "app/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
-    "distributions/__init__.py",
+    "distribute/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
+    "experimental/__init__.py",
     "feature_column/__init__.py",
     "gfile/__init__.py",
+    "io/gfile/__init__.py",
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
@@ -62,26 +63,22 @@ TENSORFLOW_API_INIT_FILES = [
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
-    "logging/__init__.py",
     "losses/__init__.py",
     "math/__init__.py",
     "metrics/__init__.py",
     "nn/__init__.py",
     "nn/rnn_cell/__init__.py",
-    "profiler/__init__.py",
     "quantization/__init__.py",
     "random/__init__.py",
-    "resource_loader/__init__.py",
-    "strings/__init__.py",
     "saved_model/__init__.py",
     "sets/__init__.py",
+    "signal/__init__.py",
     "sparse/__init__.py",
-    "spectral/__init__.py",
+    "strings/__init__.py",
     "summary/__init__.py",
     "sysconfig/__init__.py",
     "test/__init__.py",
     "train/__init__.py",
-    "user_ops/__init__.py",
     "version/__init__.py",
     # END GENERATED FILES
 ]
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 520ce54a8819918ae2f4eb467133fc8dc77cf8de..0fadec00ab0459d5482afaef0dec42dd28be7460 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -10,9 +10,11 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
+    "distribute/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
+    "experimental/__init__.py",
     "feature_column/__init__.py",
     "gfile/__init__.py",
     "graph_util/__init__.py",
@@ -80,6 +82,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "saved_model/__init__.py",
     "saved_model/builder/__init__.py",
     "saved_model/constants/__init__.py",
+    "saved_model/experimental/__init__.py",
     "saved_model/loader/__init__.py",
     "saved_model/main_op/__init__.py",
     "saved_model/signature_constants/__init__.py",
@@ -87,6 +90,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "saved_model/tag_constants/__init__.py",
     "saved_model/utils/__init__.py",
     "sets/__init__.py",
+    "signal/__init__.py",
     "sparse/__init__.py",
     "spectral/__init__.py",
     "summary/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/create_python_api.py b/tensorflow/python/tools/api/generator/create_python_api.py
index f62580342136938d847af1b48ed62856fc8c522e..be988c572cb6c2b5157642715c788eb1b43e562d 100644
--- a/tensorflow/python/tools/api/generator/create_python_api.py
+++ b/tensorflow/python/tools/api/generator/create_python_api.py
@@ -45,10 +45,10 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit.
 \"\"\"%s
 \"\"\"
 
-from __future__ import print_function
+from __future__ import print_function as _print_function
 
 """
-_GENERATED_FILE_FOOTER = '\n\ndel print_function\n'
+_GENERATED_FILE_FOOTER = '\n\ndel _print_function\n'
 
 
 class SymbolExposedTwiceError(Exception):
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index fbec9c6635c060aa846c704f49921a4b5ceed42c..9e211d172ec6bc6a4e373a7be3c46b7650141cdf 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -35,10 +35,11 @@ DocSource.__new__.__defaults__ = (None,) * len(DocSource._fields)
 
 _TENSORFLOW_DOC_SOURCES = {
     'app': DocSource(docstring_module_name='platform.app'),
+    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
     'compat': DocSource(docstring_module_name='util.compat'),
+    'distribute': DocSource(docstring_module_name='training.distribute'),
     'distributions': DocSource(
         docstring_module_name='ops.distributions.distributions'),
-    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
     'errors': DocSource(docstring_module_name='framework.errors'),
     'gfile': DocSource(docstring_module_name='platform.gfile'),
     'graph_util': DocSource(docstring_module_name='framework.graph_util'),
@@ -56,8 +57,8 @@ _TENSORFLOW_DOC_SOURCES = {
     'resource_loader': DocSource(
         docstring_module_name='platform.resource_loader'),
     'sets': DocSource(docstring_module_name='ops.sets'),
+    'signal': DocSource(docstring_module_name='ops.signal.signal'),
     'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
-    'spectral': DocSource(docstring_module_name='ops.spectral_ops'),
     'strings': DocSource(docstring_module_name='ops.string_ops'),
     'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
     'test': DocSource(docstring_module_name='platform.test'),
diff --git a/tensorflow/python/tools/strip_unused_test.py b/tensorflow/python/tools/strip_unused_test.py
index 7cf0c3e3ed9b5748b263913566150eff8acf857a..e906ff94ba8c0ad5ebb5014f244b0ef128d23a7a 100644
--- a/tensorflow/python/tools/strip_unused_test.py
+++ b/tensorflow/python/tools/strip_unused_test.py
@@ -50,7 +50,7 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
           wanted_input_node, 2.0, name="output_node")
       math_ops.add(output_node, 2.0, name="later_node")
       sess = session.Session()
-      output = sess.run(output_node)
+      output = self.evaluate(output_node)
       self.assertNear(-4.0, output, 0.00001)
       graph_io.write_graph(sess.graph, self.get_temp_dir(), input_graph_name)
 
@@ -113,7 +113,7 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
           input_node1, input_node2, name="output_node")
       math_ops.add(output_node, 2.0, name="later_node")
       sess = session.Session()
-      output = sess.run(output_node)
+      output = self.evaluate(output_node)
       self.assertNear(6.0, output, 0.00001)
       graph_io.write_graph(sess.graph, self.get_temp_dir(), input_graph_name)
 
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index a14ac895ac096e351cad91aa8a53ca0026b18c9d..7cbaf1039f94d12570c43b98f79e273877871ab2 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -177,12 +177,11 @@ class AdadeltaOptimizerTest(test.TestCase):
             1.0, 1.0, 1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval())
+        self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/adagrad_da_test.py b/tensorflow/python/training/adagrad_da_test.py
index 00801be3b4da878619cac753707b088352afe803..761f703cb5bf7e7a3a938db9f9866495376fb2e1 100644
--- a/tensorflow/python/training/adagrad_da_test.py
+++ b/tensorflow/python/training/adagrad_da_test.py
@@ -92,12 +92,13 @@ class AdagradDAOptimizerTest(test.TestCase):
             1.0, global_step).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-1, -1]], var0.eval(), rtol=0.01)
+        self.assertAllCloseAccordingToType([[-1, -1]],
+                                           self.evaluate(var0),
+                                           rtol=0.01)
 
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in [dtypes.float64, dtypes.float32]:
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index 7caf01f64d5e1cf7a4084444721aff9c55a9fb0b..962e65c41f510ab315c0934330b77a4dbbf2b1e2 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -107,13 +107,14 @@ class AdagradOptimizerTest(test.TestCase):
         sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType(
-            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
+                                           self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1], [3, 4]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -128,16 +129,18 @@ class AdagradOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1))
 
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -159,16 +162,16 @@ class AdagradOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([[1.0], [2.0]], var0.eval())
-        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        self.assertAllClose([[1.0], [2.0]], self.evaluate(var0))
+        self.assertAllClose([[3.0], [4.0]], self.evaluate(var1))
         # Run 3 step of sgd
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+            np.array([[-1.6026098728179932], [2.0]]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([[3.0], [3.715679168701172]]), var1.eval())
+            np.array([[3.0], [3.715679168701172]]), self.evaluate(var1))
 
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -193,12 +196,12 @@ class AdagradOptimizerTest(test.TestCase):
             [(grad_aggregated, aggregated_update_var)])
         variables.global_variables_initializer().run()
         self.assertAllClose(aggregated_update_var.eval(),
-                            repeated_index_update_var.eval())
+                            self.evaluate(repeated_index_update_var))
         for _ in range(3):
           repeated_update.run()
           aggregated_update.run()
           self.assertAllClose(aggregated_update_var.eval(),
-                              repeated_index_update_var.eval())
+                              self.evaluate(repeated_index_update_var))
 
   def testSparseRepeatedIndicesResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -217,12 +220,12 @@ class AdagradOptimizerTest(test.TestCase):
             2.0).minimize(loss_aggregated)
         variables.global_variables_initializer().run()
         self.assertAllCloseAccordingToType(
-            var_repeated.eval(), var_aggregated.eval())
+            self.evaluate(var_repeated), self.evaluate(var_aggregated))
         for _ in range(3):
           update_op_repeated.run()
           update_op_aggregated.run()
           self.assertAllCloseAccordingToType(
-              var_repeated.eval(), var_aggregated.eval())
+              self.evaluate(var_repeated), self.evaluate(var_aggregated))
 
   def testSparseStability(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -253,12 +256,12 @@ class AdagradOptimizerTest(test.TestCase):
           init.run()
           ada_update.run()
           self.assertAllCloseAccordingToType(
-              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), self.evaluate(slot0))
           self.assertAllCloseAccordingToType(
               np.array([[
                   0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
                   -0.01029443
-              ]]), var0.eval())
+              ]]), self.evaluate(var0))
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -282,17 +285,19 @@ class AdagradOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Mix the first and the second adagrad for 3 steps.
         ada_update1.run()
         ada_update2.run()
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1))
 
   def testDynamicShapeVariable_Ok(self):
     with self.cached_session():
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 0d42cc7b9c690d9c5582bc6282739b7abb4739c1..87dad0a8a6513df11cb52295d888aa9c9a55b442 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -83,23 +83,24 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run()
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testSparse(self):
     self.doTestSparse(use_resource=False)
@@ -143,12 +144,12 @@ class AdamOptimizerTest(test.TestCase):
             [(grad_aggregated, aggregated_update_var)])
         variables.global_variables_initializer().run()
         self.assertAllClose(aggregated_update_var.eval(),
-                            repeated_index_update_var.eval())
+                            self.evaluate(repeated_index_update_var))
         for _ in range(3):
           repeated_update.run()
           aggregated_update.run()
           self.assertAllClose(aggregated_update_var.eval(),
-                              repeated_index_update_var.eval())
+                              self.evaluate(repeated_index_update_var))
 
   def doTestBasic(self, use_resource=False, use_callable_params=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
@@ -254,23 +255,24 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run()
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -294,13 +296,14 @@ class AdamOptimizerTest(test.TestCase):
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 3 steps of intertwined Adam1 and Adam2.
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           if t % 2 == 0:
             update1.run()
           else:
@@ -310,8 +313,8 @@ class AdamOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTwoSessions(self):
     optimizer = adam.AdamOptimizer()
diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py
index 7af821c81928e67e0f258bc064d582a4186995c1..68fcb97a1c32f00ee059d78f1198d63218192763 100644
--- a/tensorflow/python/training/basic_loops.py
+++ b/tensorflow/python/training/basic_loops.py
@@ -21,7 +21,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.basic_train_loop")
+@tf_export(v1=["train.basic_train_loop"])
 def basic_train_loop(supervisor, train_step_fn, args=None,
                      kwargs=None, master=""):
   """Basic loop to train a model.
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 1efabcd854d7f72c51e39dcf1f5ce65b0168cbcc..60db654e9cc34a55cfb9406034e7ef3da7efc703 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -976,7 +976,7 @@ class FeedFnHook(session_run_hook.SessionRunHook):
         fetches=None, feed_dict=self.feed_fn())
 
 
-@tf_export("train.ProfilerHook")
+@tf_export(v1=["train.ProfilerHook"])
 class ProfilerHook(session_run_hook.SessionRunHook):
   """Captures CPU/GPU profiling information every N steps or seconds.
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2d469634e0ec99d71e244eb85c8f493759c79738..13c9e9aa67b7f8cb238ce77d40d59122005387eb 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -243,7 +243,7 @@ class LoggingTensorHookTest(test.TestCase):
           tensors=[t.name], at_end=True)
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       self.logged_message = ''
       for _ in range(3):
         mon_sess.run(train_op)
@@ -261,7 +261,7 @@ class LoggingTensorHookTest(test.TestCase):
         tensors=[t.name], every_n_iter=10, at_end=at_end)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
     mon_sess.run(train_op)
     self.assertRegexpMatches(str(self.logged_message), t.name)
     for _ in range(3):
@@ -308,7 +308,7 @@ class LoggingTensorHookTest(test.TestCase):
           tensors={'foo': t}, every_n_iter=1)
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess.run(train_op)
       self.assertRegexpMatches(str(self.logged_message), 'foo')
       # in first run, elapsed time is None.
@@ -322,7 +322,7 @@ class LoggingTensorHookTest(test.TestCase):
         tensors=[t.name], every_n_secs=1.0, at_end=at_end)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
 
     mon_sess.run(train_op)
     self.assertRegexpMatches(str(self.logged_message), t.name)
@@ -366,7 +366,7 @@ class LoggingTensorHookTest(test.TestCase):
           formatter=lambda items: 'qqq=%s' % items[t.name])
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess.run(train_op)
       self.assertEqual(self.logged_message[0], 'qqq=42.0')
 
@@ -921,7 +921,7 @@ class StepCounterHookTest(test.TestCase):
       hook = basic_session_run_hooks.StepCounterHook(
           summary_writer=summary_writer, every_n_steps=10)
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       with test.mock.patch.object(tf_logging, 'warning') as mock_log:
         for _ in range(30):
@@ -950,7 +950,7 @@ class StepCounterHookTest(test.TestCase):
           summary_writer=summary_writer, every_n_steps=None, every_n_secs=0.1)
 
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       mon_sess.run(train_op)
       time.sleep(0.2)
@@ -987,7 +987,7 @@ class StepCounterHookTest(test.TestCase):
           summary_writer=summary_writer, every_n_steps=1, every_n_secs=None)
 
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       mon_sess.run(train_op)
       mon_sess.run(train_op)
@@ -1007,7 +1007,7 @@ class StepCounterHookTest(test.TestCase):
     with ops.Graph().as_default(), session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(0)  # keep same.
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       hook = basic_session_run_hooks.StepCounterHook(
           every_n_steps=1, every_n_secs=None)
       hook.begin()
@@ -1034,7 +1034,7 @@ class StepCounterHookTest(test.TestCase):
         summary_writer=self.summary_writer, every_n_steps=every_n_steps)
     self.hook._set_steps_per_run(steps_per_run)
     self.hook.begin()
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
     self.mon_sess = monitored_session._HookedSession(sess, [self.hook])
 
   def test_steps_per_run_less_than_every_n_steps(self):
@@ -1147,7 +1147,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(30):
         mon_sess.run(self.train_op)
@@ -1179,7 +1179,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(10):
         mon_sess.run(self.train_op)
@@ -1207,7 +1207,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(4):
         mon_sess.run(self.train_op)
@@ -1242,7 +1242,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(8):
         mon_sess.run(self.train_op)
@@ -1285,7 +1285,7 @@ class GlobalStepWaiterHookTest(test.TestCase):
       hook = basic_session_run_hooks.GlobalStepWaiterHook(wait_until_step=1000)
       hook.begin()
       with session_lib.Session() as sess:
-        sess.run(variables_lib.global_variables_initializer())
+        self.evaluate(variables_lib.global_variables_initializer())
         waiter = threading.Thread(
             target=hook.before_run,
             args=(session_run_hook.SessionRunContext(
@@ -1390,7 +1390,7 @@ class ResourceSummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(30):
         mon_sess.run(self.train_op)
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index 38910fb246d6dc149520f41aa161635497fd5cca..f745ab4824ac364b51758e6c3fb60a5679d210fb 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -36,6 +36,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -55,7 +56,11 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
-@tf_export("train.generate_checkpoint_state_proto")
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
+                  "than editing the Checkpoint proto manually."))
+@tf_export(v1=["train.generate_checkpoint_state_proto"])
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
                                     all_model_checkpoint_paths=None,
@@ -121,7 +126,11 @@ def generate_checkpoint_state_proto(save_dir,
   return coord_checkpoint_proto
 
 
-@tf_export("train.update_checkpoint_state")
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
+                  "than manually editing the Checkpoint proto."))
+@tf_export(v1=["train.update_checkpoint_state"])
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
                             all_model_checkpoint_paths=None,
@@ -344,7 +353,10 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None):
   return None
 
 
-@tf_export("train.checkpoint_exists")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use standard file APIs to check for files with this prefix.")
+@tf_export(v1=["train.checkpoint_exists"])
 def checkpoint_exists(checkpoint_prefix):
   """Checks whether a V1 or V2 checkpoint exists with the specified prefix.
 
@@ -369,7 +381,10 @@ def checkpoint_exists(checkpoint_prefix):
     return False
 
 
-@tf_export("train.get_checkpoint_mtimes")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use standard file utilities to get mtimes.")
+@tf_export(v1=["train.get_checkpoint_mtimes"])
 def get_checkpoint_mtimes(checkpoint_prefixes):
   """Returns the mtimes (modification timestamps) of the checkpoints.
 
@@ -408,7 +423,10 @@ def get_checkpoint_mtimes(checkpoint_prefixes):
   return mtimes
 
 
-@tf_export("train.remove_checkpoint")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use standard file APIs to delete files with this prefix.")
+@tf_export(v1=["train.remove_checkpoint"])
 def remove_checkpoint(checkpoint_prefix,
                       checkpoint_format_version=saver_pb2.SaverDef.V2,
                       meta_graph_suffix="meta"):
@@ -458,6 +476,7 @@ def meta_graph_filename(checkpoint_filename, meta_graph_suffix="meta"):
 
 
 # TODO(allenl): Allow tf.keras.Model instances in the constructor directly?
+@tf_export("train.CheckpointManager")
 class CheckpointManager(object):
   """Deletes old checkpoints.
 
@@ -634,13 +653,10 @@ class CheckpointManager(object):
     """
     return self._checkpoint_prefix
 
-  def save(self, session=None, checkpoint_number=None):
+  def save(self, checkpoint_number=None):
     """Creates a new checkpoint and manages it.
 
     Args:
-      session: The session to evaluate variables in. Ignored when executing
-        eagerly. If not provided when graph building, the default session is
-        used.
       checkpoint_number: An optional integer, or an integer-dtype `Variable` or
         `Tensor`, used to number the checkpoint. If `None` (default),
         checkpoints are numbered using `checkpoint.save_counter`. Even if
@@ -657,9 +673,9 @@ class CheckpointManager(object):
     if context.executing_eagerly():
       save_counter = self._checkpoint.save_counter
       save_counter.assign_add(1)
+      session = None
     else:
-      if session is None:
-        session = ops.get_default_session()
+      session = ops.get_default_session()
 
       def _initializing_creator(next_creator, **kwargs):
         """Initialize the save counter if it has been newly created."""
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 3a061bcb35c1c1a6ef31645c8e0ef892e9d9aa62..b61ed17531a872587ddec38a7134f061b2b8c347 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -123,9 +123,9 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           # Record a short training history.
           variables.global_variables_initializer().run()
           save.save(sess, filepath, global_step=0)
-          inc.eval()
+          self.evaluate(inc)
           save.save(sess, filepath, global_step=1)
-          inc.eval()
+          self.evaluate(inc)
           save.save(sess, filepath, global_step=2)
 
         with self.cached_session() as sess:
diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py
index dde84314977f6ffc8c93e90f6ad76e13c2f02cb0..21ad3df1c8f4c71ff43dddb6681f167b873efd76 100644
--- a/tensorflow/python/training/checkpoint_ops_test.py
+++ b/tensorflow/python/training/checkpoint_ops_test.py
@@ -47,7 +47,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
       with variable_scope.variable_scope('some_scope'):
         variable_scope.get_variable(name='embeddings', shape=[5, 16],
                                     initializer=initializer)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver = saver_lib.Saver()
       saver.save(sess, checkpoint_prefix, global_step=5)
     self.checkpoint_file = '{}-5'.format(checkpoint_prefix)
@@ -115,7 +115,8 @@ class LoadAndRemapWrappersTest(test.TestCase):
         axis=1)
 
     with self.cached_session():
-      self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval())
+      self.assertAllClose(expected_remapped_matrix,
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_output_layer_weight_initializer_linear(self):
     """Tests for the output layer initializer in the linear multi-class case."""
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 57954ec56a5943eab821f6e0e5c4872f7b52e595..58166dbb6818e686bbb938f71ed36ec3786cc2a3 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -101,7 +101,7 @@ def list_variables(ckpt_dir_or_file):
   return result
 
 
-@tf_export("train.init_from_checkpoint")
+@tf_export(v1=["train.init_from_checkpoint"])
 def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   """Replaces `tf.Variable` initializers so they load from a checkpoint file.
 
@@ -187,7 +187,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
   else:
     distribution_strategy_context.get_replica_context().merge_call(
-        _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
+        _init_from_checkpoint, args=(ckpt_dir_or_file, assignment_map))
 
 
 def _init_from_checkpoint(_, ckpt_dir_or_file, assignment_map):
@@ -318,13 +318,13 @@ def _set_checkpoint_initializer(variable,
         saveable_objects.append(s)
 
     assert len(saveable_objects) == 1  # Should be only one variable.
-    init_op = saveable_objects[0].restore([restore_op], restored_shapes=None)
+  init_op = saveable_objects[0].restore([restore_op], restored_shapes=None)
 
-    # pylint:disable=protected-access
-    variable._initializer_op = init_op
-    restore_op.set_shape(variable.shape)
-    variable._initial_value = restore_op
-    # pylint:enable=protected-access
+  # pylint:disable=protected-access
+  variable._initializer_op = init_op
+  restore_op.set_shape(variable.shape)
+  variable._initial_value = restore_op
+  # pylint:enable=protected-access
 
 
 def _set_variable_or_list_initializer(variable_or_list, ckpt_file,
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 61dcbdb2b8f92256c97b0bd62c1944516646fa03..a3e58de4a31bca7ccb20606bd43ebdb732137f4b 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -207,9 +207,6 @@ class CheckpointsTest(test.TestCase):
 
       checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                             {"useful_scope/": "useful_scope/"})
-      # initializer runs on the same task but always on CPU.
-      self.assertEqual(my4._initializer_op.op.inputs[1].device,
-                       "/job:ps/device:CPU:0")
 
   def testInitFromRootCheckpoint(self):
     checkpoint_dir = self.get_temp_dir()
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/checkpointable/tracking.py
index 558ae0855e4c46883bf1d21abeed01e5b6f9fcfa..c85b208d47985553ced692ccf0ef1627f9428a89 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/checkpointable/tracking.py
@@ -19,6 +19,11 @@ from __future__ import print_function
 
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.util import tf_contextlib
+
+
+# global _RESOURCE_TRACKER_STACK
+_RESOURCE_TRACKER_STACK = []
 
 
 class NotCheckpointable(object):
@@ -72,10 +77,57 @@ class Checkpointable(base.CheckpointableBase):
     return data_structures.NoDependency(value)
 
 
+class ResourceTracker(object):
+  """An object that tracks a list of resources."""
+
+  def __init__(self):
+    self._resources = []
+
+  @property
+  def resources(self):
+    return self._resources
+
+  def add_resource(self, resource):
+    self._resources.append(resource)
+
+
+@tf_contextlib.contextmanager
+def resource_tracker_scope(resource_tracker):
+  """A context to manage resource trackers.
+
+  Use this in order to collect up all resources created within a block of code.
+  Example usage:
+
+  ```python
+  resource_tracker = ResourceTracker()
+  with resource_tracker_scope(resource_tracker):
+    resource = TrackableResource()
+
+  assert resource_tracker.resources == [resource]
+
+  Args:
+    resource_tracker: The passed in ResourceTracker object
+
+  Yields:
+    A scope in which the resource_tracker is active.
+  """
+  global _RESOURCE_TRACKER_STACK
+  old = list(_RESOURCE_TRACKER_STACK)
+  _RESOURCE_TRACKER_STACK.append(resource_tracker)
+  try:
+    yield
+  finally:
+    _RESOURCE_TRACKER_STACK = old
+
+
 class TrackableResource(base.CheckpointableBase):
   """Base class for all resources that need to be tracked."""
 
   def __init__(self):
+    global _RESOURCE_TRACKER_STACK
+    for resource_tracker in _RESOURCE_TRACKER_STACK:
+      resource_tracker.add_resource(self)
+
     self._resource_handle = None
 
   def create_resource(self):
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/checkpointable/tracking_test.py
index a44c570fb9fe4104e44588c40069ddaa4b97c282..17c5461bc25e5e409cc04d0182603e8406dc7d47 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/checkpointable/tracking_test.py
@@ -193,5 +193,62 @@ class InterfaceTests(test.TestCase):
     self.assertAllClose({"k": [numpy.ones([2, 2]), numpy.zeros([3, 3])]},
                         self.evaluate(a.tensors))
 
+
+class _DummyResource(tracking.TrackableResource):
+
+  def __init__(self, handle_name):
+    self._handle_name = handle_name
+    super(_DummyResource, self).__init__()
+
+  def create_resource(self):
+    return self._handle_name
+
+
+class ResourceTrackerTest(test.TestCase):
+
+  def testBasic(self):
+    resource_tracker = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker):
+      dummy_resource1 = _DummyResource("test1")
+      dummy_resource2 = _DummyResource("test2")
+
+    self.assertEqual(2, len(resource_tracker.resources))
+    self.assertEqual("test1", resource_tracker.resources[0].resource_handle)
+    self.assertEqual("test2", resource_tracker.resources[1].resource_handle)
+
+  def testTwoScopes(self):
+    resource_tracker1 = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker1):
+      dummy_resource1 = _DummyResource("test1")
+
+    resource_tracker2 = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker2):
+      dummy_resource2 = _DummyResource("test2")
+
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test1", resource_tracker1.resources[0].resource_handle)
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test2", resource_tracker2.resources[0].resource_handle)
+
+  def testNestedScopesScopes(self):
+    resource_tracker = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker):
+      resource_tracker1 = tracking.ResourceTracker()
+      with tracking.resource_tracker_scope(resource_tracker1):
+        dummy_resource1 = _DummyResource("test1")
+
+      resource_tracker2 = tracking.ResourceTracker()
+      with tracking.resource_tracker_scope(resource_tracker2):
+        dummy_resource2 = _DummyResource("test2")
+
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test1", resource_tracker1.resources[0].resource_handle)
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test2", resource_tracker2.resources[0].resource_handle)
+    self.assertEqual(2, len(resource_tracker.resources))
+    self.assertEqual("test1", resource_tracker.resources[0].resource_handle)
+    self.assertEqual("test2", resource_tracker.resources[1].resource_handle)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 137d29b7859c7a2f57b044f7bcc2c79f46369a45..f45f7445f137058b7d78ad7a9c3e2e6a1cd008d7 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -821,7 +821,7 @@ def capture_dependencies(template):
     """
     def _call_next_creator_renaming_initializer(initializer, **inner_kwargs):
       inner_kwargs.pop("name")  # Ignored; this is the scope-stripped name which
-                                # we don't want to propagate.
+      # we don't want to propagate.
       return next_creator(
           initial_value=initializer,
           name=name,
@@ -982,6 +982,12 @@ class CheckpointLoadStatus(_LoadStatus):
         raise AssertionError(
             "Object not assigned a value from checkpoint: %s" % (node,))
     for checkpointable_object in list_objects(self._root_checkpointable):
+      # Remove data structures that do not contain any variables from
+      # restoration checks.
+      if (isinstance(checkpointable_object,
+                     data_structures.CheckpointableDataStructure) and
+          not checkpointable_object._checkpoint_dependencies):
+        continue
       self._checkpoint.all_python_objects.add(checkpointable_object)
     unused_python_objects = (
         _ObjectIdentitySet(self._checkpoint.all_python_objects)
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index 24fd42f6d2e466878c1d5a1bda066336c4dd1ff4..19955140123afcf7addfa94550a8352f3acf810f 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -1313,6 +1313,24 @@ class CheckpointingTests(test.TestCase):
       train_fn()
       self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_restore_after_adding_empty_checkpointable_data_structure(self):
+    model = NonLayerCheckpointable()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint.restore(None).initialize_or_restore()
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+
+    del model, checkpoint
+
+    model = NonLayerCheckpointable()
+    model.dict = {"a": 1}
+    model.list = {"b": 1}
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    load_status = checkpoint.restore(save_path)
+    load_status.assert_existing_objects_matched().run_restore_ops()
+
 
 class _ManualScope(tracking.Checkpointable):
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index be80c3657158b52d063b5d2b7731f25d184794a0..5874a1ff4152d835263cdc1ad87002b64c026eb8 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -130,7 +130,7 @@ class _ReplicaDeviceChooser(object):
     return worker_device.to_string()
 
 
-@tf_export("train.replica_device_setter")
+@tf_export(v1=["train.replica_device_setter"])
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
                           worker_device="/job:worker", merge_devices=True,
                           cluster=None, ps_ops=None, ps_strategy=None):
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 35ed52fa1293b1d146bcb1a515c3af9bf029b254..f930a89f999798077f329a9acdfb75bf297fae10 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -12,16 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class DistributionStrategy, ReplicaContext, and supporting APIs."""
+"""Library for running a computation across multiple devices."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import threading
+import weakref
+import enum
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context as eager_context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -32,18 +38,19 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
 
 
 # ------------------------------------------------------------------------------
-# Context tracking whether in a distribution.update() or .update_non_slot()
-# call.
+# Context tracking whether in a strategy.update() or .update_non_slot() call.
 
 
 _update_device = threading.local()
 
 
 def get_update_device():
-  """Get the current device if in a `DistributionStrategy.update()` call."""
+  """Get the current device if in a `tf.distribute.Strategy.update()` call."""
   try:
     return _update_device.current
   except AttributeError:
@@ -70,38 +77,46 @@ class UpdateContext(object):
 # Public utility functions.
 
 
+@tf_export("distribute.get_loss_reduction")
 def get_loss_reduction():
-  """Reduce `aggregation` corresponding to the last loss reduction."""
+  """`tf.distribute.ReduceOp` corresponding to the last loss reduction."""
   loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
   if loss_reduction == losses_impl.Reduction.SUM:
-    return variable_scope.VariableAggregation.SUM
-  return variable_scope.VariableAggregation.MEAN
+    return reduce_util.ReduceOp.SUM
+  return reduce_util.ReduceOp.MEAN
 
 
 # ------------------------------------------------------------------------------
 # Internal API for validating the current thread mode
 
 
-def _require_cross_replica_context(distribution_strategy):
-  """Verify in cross-replica context for `distribution_strategy`."""
+def _require_cross_replica_context_extended(extended):
+  """Verify in cross-replica context."""
   context = _get_per_thread_mode()
-  if context.cross_replica_context is distribution_strategy: return
+  cross_replica = context.cross_replica_context
+  if cross_replica is not None and cross_replica.extended is extended:
+    return
+  strategy = extended._container_strategy()  # pylint: disable=protected-access
   # We have an error to report, figure out the right message.
-  if context.distribution_strategy is not distribution_strategy:
-    if (context.distribution_strategy is
-        distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
-      raise RuntimeError(
-          'Need to be inside "with distribution_strategy.scope()" for %s' %
-          (distribution_strategy,))
-    else:
-      raise RuntimeError(
-          "Mixing different DistributionStrategy objects: %s is not %s" %
-          (context.distribution_strategy, distribution_strategy))
-  assert context.cross_replica_context is None
+  if context.distribution_strategy is not strategy:
+    _wrong_strategy_scope(strategy, context)
+  assert cross_replica is None
   raise RuntimeError("Method requires being in cross-replica context, use "
                      "get_replica_context().merge_call()")
 
 
+def _wrong_strategy_scope(strategy, context):
+  # Figure out the right error message.
+  if not distribution_strategy_context.has_distribution_strategy():
+    raise RuntimeError(
+        'Need to be inside "with strategy.scope()" for %s' %
+        (strategy,))
+  else:
+    raise RuntimeError(
+        "Mixing different tf.distribute.Strategy objects: %s is not %s" %
+        (context.distribution_strategy, strategy))
+
+
 def require_replica_context(replica_ctx):
   """Verify in `replica_ctx` replica context."""
   context = _get_per_thread_mode()
@@ -110,27 +125,27 @@ def require_replica_context(replica_ctx):
   if context.replica_context is None:
     raise RuntimeError("Need to be inside `call_for_each_replica()`")
   if context.distribution_strategy is replica_ctx.distribution_strategy:
-    # Two different ReplicaContexts with the same DistributionStrategy.
-    raise RuntimeError("Mismatching replica context.")
+    # Two different ReplicaContexts with the same tf.distribute.Strategy.
+    raise RuntimeError("Mismatching ReplicaContext.")
   raise RuntimeError(
-      "Mismatching DistributionStrategy objects: %s is not %s." %
+      "Mismatching tf.distribute.Strategy objects: %s is not %s." %
       (context.distribution_strategy, replica_ctx.distribution_strategy))
 
 
-def _require_distribution_strategy_scope(distribution_strategy):
+def _require_distribution_strategy_scope_strategy(strategy):
+  """Verify in a `strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy is strategy: return
+  _wrong_strategy_scope(strategy, context)
+
+
+def _require_distribution_strategy_scope_extended(extended):
   """Verify in a `distribution_strategy.scope()` in this thread."""
   context = _get_per_thread_mode()
-  if context.distribution_strategy is distribution_strategy: return
-  # We have an error to report, figure out the right message.
-  if (context.distribution_strategy is
-      distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
-    raise RuntimeError(
-        'Need to be inside "with distribution_strategy.scope()" for %s' %
-        (distribution_strategy,))
-  else:
-    raise RuntimeError(
-        "Mixing different DistributionStrategy objects: %s is not %s" %
-        (context.distribution_strategy, distribution_strategy))
+  if context.distribution_strategy.extended is extended: return
+  # Report error.
+  strategy = extended._container_strategy()  # pylint: disable=protected-access
+  _wrong_strategy_scope(strategy, context)
 
 
 # ------------------------------------------------------------------------------
@@ -139,15 +154,18 @@ def _require_distribution_strategy_scope(distribution_strategy):
 
 
 class _CurrentDistributionContext(object):
-  """Context manager for setting the `DistributionStrategy` and var creator."""
+  """Context manager setting the current `tf.distribute.Strategy`.
+
+  Also: overrides the variable creator and optionally the current device.
+  """
 
   def __init__(self,
-               distribution_strategy,
+               strategy,
                var_creator_scope,
                var_scope=None,
                default_device=None):
     self._context = distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
-        distribution_strategy)
+        strategy)
     self._var_creator_scope = var_creator_scope
     self._var_scope = var_scope
     if default_device:
@@ -176,8 +194,8 @@ class _CurrentDistributionContext(object):
 class _SameScopeAgainContext(object):
   """Trivial context manager when you are already in `scope()`."""
 
-  def __init__(self, distribution_strategy):
-    self._distribution_strategy = distribution_strategy
+  def __init__(self, strategy):
+    self._distribution_strategy = strategy
 
   def __enter__(self):
     return self._distribution_strategy
@@ -186,25 +204,463 @@ class _SameScopeAgainContext(object):
     del exception_type, exception_value, traceback
 
 
+# TODO(yuefengz): add more replication modes.
+@tf_export("distribute.InputReplicationMode")
+class InputReplicationMode(enum.Enum):
+  """Replication mode for input function."""
+
+  # The input function will be called on each worker independently, creating as
+  # many input pipelines as number of workers. Replicas will dequeue from the
+  # local Dataset on their worker. Distribution Strategy doesn't manage any
+  # state sharing between such separate input pipelines.
+  PER_WORKER = "PER_WORKER"
+
+
+@tf_export("distribute.InputContext")
+class InputContext(object):
+  """A class wrapping information needed by an input function.
+
+  This is a context class that is passed to the user's input fn and contains
+  information about the compute replicas and input pipelines. The number of
+  compute replicas (in sync training) helps compute per input pipeline batch
+  size from the desired global batch size. Input pipeline information can be
+  used to return a different subset of the input in each input pipeline (for
+  e.g. shard the input pipeline, use a different input source etc).
+  """
+
+  def __init__(self,
+               num_input_pipelines=1,
+               input_pipeline_id=0,
+               num_replicas_in_sync=1):
+    """Initializes an InputContext object.
+
+    Args:
+      num_input_pipelines: the number of input pipelines in a cluster.
+      input_pipeline_id: the current input pipeline id, should be an int in
+        [0,`num_input_pipelines`).
+      num_replicas_in_sync: the number of replicas that are in sync.
+    """
+    self._num_input_pipelines = num_input_pipelines
+    self._input_pipeline_id = input_pipeline_id
+    self._num_replicas_in_sync = num_replicas_in_sync
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns the number of compute replicas in sync."""
+    return self._num_replicas_in_sync
+
+  @property
+  def input_pipeline_id(self):
+    """Returns the input pipeline ID."""
+    return self._input_pipeline_id
+
+  @property
+  def num_input_pipelines(self):
+    """Returns the number of input pipelines."""
+    return self._num_input_pipelines
+
+  def get_per_replica_batch_size(self, global_batch_size):
+    """Returns the per-replica batch size.
+
+    Args:
+      global_batch_size: the global batch size which should be divisible by
+        `num_replicas_in_sync`.
+
+    Returns:
+      the per-replica batch size.
+
+    Raises:
+      ValueError: if `global_batch_size` not divisible by
+        `num_replicas_in_sync`.
+    """
+    if global_batch_size % self._num_replicas_in_sync != 0:
+      raise ValueError("The `global_batch_size` %r is not divisible by "
+                       "`num_replicas_in_sync` %r " %
+                       (global_batch_size, self._num_replicas_in_sync))
+    return global_batch_size // self._num_replicas_in_sync
+
+
 # ------------------------------------------------------------------------------
 # Base classes for all distribution strategies.
 
 
+@tf_export("distribute.Strategy")
 class DistributionStrategy(object):
   """A list of devices with a state & compute distribution policy.
 
   See [tensorflow/contrib/distribute/README.md](
   https://www.tensorflow.org/code/tensorflow/contrib/distribute/README.md)
   for overview and examples.
+  """
+
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
+  #   we add support.
+  # TODO(josh11b): Also `parameter_device_index` property?
+  # TODO(josh11b): `map()`
+  # TODO(josh11b): ClusterSpec/ClusterResolver
+  # TODO(josh11b): Partitioned computations, state; sharding
+  # TODO(josh11b): Model parallelism: "replicas" with multiple devices; shuffling
+  # TODO(josh11b): List of replicas with their worker and parameter devices
+  #   (where the parameter devices may overlap in the ps case).
+
+  def __init__(self, extended):
+    self._extended = extended
+
+  @property
+  def extended(self):
+    """`tf.distribute.StrategyExtended` with additional methods."""
+    return self._extended
+
+  def scope(self):
+    """Returns a context manager selecting this Strategy as current.
+
+    Inside a `with strategy.scope():` code block, this thread
+    will use a variable creator set by `strategy`, and will
+    enter its "cross-replica context".
+
+    Returns:
+      A context manager.
+    """
+    return self._extended._scope(self)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def read_var(self, v):
+    """DEPRECATED: use extended.read_var() instead."""
+    return self._extended.read_var(v)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def colocate_vars_with(self, colocate_with_variable):
+    """DEPRECATED: use extended.colocate_vars_with() instead."""
+    return self._extended.colocate_vars_with(colocate_with_variable)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED
+  def distribute_dataset(self, dataset_fn):
+    """Return a `dataset` split across all replicas.  DEPRECATED.
+
+    DEPRECATED: Please use `make_dataset_iterator` or
+    `make_input_fn_iterator` instead.
+
+    Suitable for providing input to `extended.call_for_each_replica()` by
+    creating an iterator:
+
+    ```
+    def dataset_fn():
+      return tf.data.Dataset.from_tensors([[1.]]).repeat()
+
+    with strategy.scope():
+      distributed_dataset = strategy.distribute_dataset(dataset_fn)
+      iterator = distributed_dataset.make_initializable_iterator()
+      replica_results = strategy.extended.call_for_each_replica(
+          replica_fn, args=(iterator.get_next(),))
+    ```
+
+    Args:
+      dataset_fn: A function that returns a `tf.data.Dataset`.
+
+    Returns:
+      A `PerReplicaDataset` that will produce data for each replica.
+    """
+    return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
+
+  def make_dataset_iterator(self, dataset):
+    """Makes an iterator for input provided via input_dataset.
+
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    global batch size. With this assumption, we will make a best effort to
+    divide each batch across all the replicas (one or more workers).
+    If this effort fails, an error will be thrown, and the user should instead
+    use `make_input_fn_iterator` which provides more control to the user, and
+    does not try to divide a batch across replicas.
+
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return self._extended._make_dataset_iterator(dataset)  # pylint: disable=protected-access
+
+  def make_input_fn_iterator(self,
+                             input_fn,
+                             replication_mode=InputReplicationMode.PER_WORKER):
+    """Returns an iterator split across replicas created from an input function.
+
+    The `input_fn` should take an `tf.distribute.InputContext` object where
+    information about input sharding can be accessed:
+
+    ```
+    def input_fn(input_context):
+      d = tf.data.Dataset.from_tensors([[1.]]).repeat()
+      return d.shard(input_context.num_input_pipelines,
+                     input_context.input_pipeline_id)
+    with strategy.scope():
+      iterator = strategy.make_input_fn_iterator(
+          input_fn)
+      replica_results = strategy.extended.call_for_each_replica(
+          replica_fn, iterator.get_next())
+    ```
+
+    Args:
+      input_fn: A function that returns a `tf.data.Dataset`. This function is
+        expected to take an `tf.distribute.InputContext` object.
+      replication_mode: an enum value of `tf.distribute.InputReplicationMode`.
+        Only `PER_WORKER` is supported currently.
+
+    Returns:
+      An iterator object that can be initialized and fetched next element.
+    """
+    if replication_mode != InputReplicationMode.PER_WORKER:
+      raise ValueError(
+          "Input replication mode not supported: %r" % replication_mode)
+    return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
+        input_fn, replication_mode=replication_mode)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def broadcast(self, tensor, destinations=None):
+    """DEPRECATED: use extended.broadcast_to() instead."""
+    return self._extended.broadcast_to(tensor, destinations)
+
+  @doc_controls.do_not_generate_docs  # Use experimental_initialize() instead.
+  def initialize(self):
+    """DEPRECATED: Use `experimental_initialize()` instead."""
+    return self._extended._initialize()  # pylint: disable=protected-access
+
+  def experimental_initialize(self):
+    """Any initialization to be done before running any computations.
+
+    In eager mode, it executes any initialization as a side effect.
+    In graph mode, it creates the initialization ops and returns them.
+
+    For example, TPU initialize_system ops.
+
+    Returns:
+      A list of ops to execute.
+    """
+    return self._extended._initialize()  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # Use experimental_finalize() instead.
+  def finalize(self):
+    """DEPRECATED: Use `experimental_finalize()` instead."""
+    return self._extended._finalize()  # pylint: disable=protected-access
+
+  def experimental_finalize(self):
+    """Any final actions to be done at the end of all computations.
+
+    In eager mode, it executes any finalize actions as a side effect.
+    In graph mode, it creates the finalize ops and returns them.
+
+    For example, TPU shutdown ops.
+
+    Returns:
+      A list of ops to execute.
+    """
+    return self._extended._finalize()  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def run_steps_on_dataset(self, fn, iterator, iterations=1,
+                           initial_loop_values=None):
+    """DEPRECATED: use extended.experimental_run_steps_on_iterator() instead."""
+    return self._extended.experimental_run_steps_on_iterator(
+        fn, iterator, iterations, initial_loop_values)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def call_for_each_replica(self, fn, *args, **kwargs):
+    """DEPRECATED: use extended.call_for_each_replica() instead."""
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to call_for_each_replica")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to call_for_each_replica")
+      kwargs = k
+    kwargs.pop("run_concurrently", None)  # Ignore old option.
+    return self._extended.call_for_each_replica(fn, args, kwargs)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def reduce(self, aggregation, value, destinations):
+    """DEPRECATED: use extended.reduce_to() instead."""
+    return self._extended.reduce_to(aggregation, value, destinations)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def batch_reduce(self, aggregation, value_destination_pairs):
+    """DEPRECATED: use extended.batch_reduce_to() instead."""
+    return self._extended.batch_reduce_to(aggregation, value_destination_pairs)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def update(self, var, fn, *args, **kwargs):
+    """DEPRECATED: use extended.update() instead."""
+    group = kwargs.pop("group", True)
+    # We temporarily support "grouped" in addition to "group" for backward-
+    # compatibility.
+    group = kwargs.pop("grouped", True) and group
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to update")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to update")
+      kwargs = k
+    return self._extended.update(var, fn, args, kwargs, group)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    """DEPRECATED: use extended.update_non_slot() instead."""
+    group = kwargs.pop("group", True)
+    # We temporarily support "grouped" in addition to "group" for backward-
+    # compatibility.
+    group = kwargs.pop("grouped", True) and group
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to update_non_slot")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to update_non_slot")
+      kwargs = k
+    return self._extended.update_non_slot(
+        colocate_with, fn, args, kwargs, group)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  def unwrap(self, value):
+    """Returns the list of all per-replica values contained in `value`.
+
+    Args:
+      value: A value returned by `extended.call_for_each_replica()` or a
+        variable created in `scope`.
+
+    Returns:
+      A list of values contained in `value`. If `value` represents a single
+      value, this returns `[value].`
+    """
+    return self._extended._unwrap(value)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def value_container(self, value):
+    """DEPRECATED: use extended.value_container() instead."""
+    return self._extended.value_container(value)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  def group(self, value, name=None):
+    """Shortcut for `tf.group(self.unwrap(value))`."""
+    return self._extended._group(value, name)  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def require_static_shapes(self):
+    """DEPRECATED: use extended.require_static_shapes instead."""
+    return self._extended.experimental_require_static_shapes
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    return self._extended._num_replicas_in_sync  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def worker_devices(self):
+    """DEPRECATED: use extended.worker_devices instead."""
+    return self._extended.worker_devices
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def parameter_devices(self):
+    """DEPRECATED: use extended.parameter_devices instead."""
+    return self._extended.parameter_devices
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def non_slot_devices(self, var_list):
+    """DEPRECATED: use extended.non_slot_devices instead."""
+    return self._extended.non_slot_devices(var_list)
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def between_graph(self):
+    """DEPRECATED: use extended.experimental_between_graph instead."""
+    return self._extended.experimental_between_graph
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, being replaced by a new API.
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    """Configures the strategy class."""
+    return self._extended._configure(  # pylint: disable=protected-access
+        session_config, cluster_spec, task_type, task_id)
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_init(self):
+    """DEPRECATED: use extended.should_init instead."""
+    return self._extended.experimental_should_init
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_checkpoint(self):
+    """DEPRECATED: use extended.should_checkpoint instead."""
+    return self._extended.should_checkpoint
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_save_summary(self):
+    """DEPRECATED: use extended.should_save_summary instead."""
+    return self._extended.should_save_summary
+
+  def __deepcopy__(self, memo):
+    # First do a regular deepcopy of `self`.
+    cls = self.__class__
+    result = cls.__new__(cls)
+    memo[id(self)] = result
+    for k, v in self.__dict__.items():
+      setattr(result, k, copy.deepcopy(v, memo))
+    # One little fix-up: we want `result._extended` to reference `result`
+    # instead of `self`.
+    result._extended._container_strategy_weakref = weakref.ref(result)  # pylint: disable=protected-access
+    return result
+
+  def __copy__(self):
+    raise RuntimeError("Must only deepcopy DistributionStrategy.")
+
+
+@tf_export("distribute.StrategyExtended")
+class DistributionStrategyExtended(object):
+  """Additional APIs for algorithms that need to be distribution-aware.
 
   The intent is that you can write an algorithm in a stylized way and
-  it will be usable with a variety of different `DistributionStrategy`
+  it will be usable with a variety of different
+  `tf.distribute.Strategy`
   implementations. Each descendant will implement a different strategy
   for distributing the algorithm across multiple devices/machines.
   Furthermore, these changes can be hidden inside the specific layers
   and other library classes that need special treatment to run in a
   distributed setting, so that most users' model definition code can
-  run unchanged. The `DistributionStrategy` API works the same way
+  run unchanged. The `tf.distribute.Strategy` API works the same way
   with eager and graph execution.
 
   First let's introduce a few high-level concepts:
@@ -252,72 +708,61 @@ class DistributionStrategy(object):
 
   We have then a few approaches we want to support:
 
-  * Code written (as if) with no knowledge of class `DistributionStrategy`.
+  * Code written (as if) with no knowledge of class `tf.distribute.Strategy`.
     This code should work as before, even if some of the layers, etc.
     used by that code are written to be distribution-aware. This is done
-    by having a default `DistributionStrategy` that gives ordinary behavior,
+    by having a default `tf.distribute.Strategy` that gives ordinary behavior,
     and by default being in a single replica context.
   * Ordinary model code that you want to run using a specific
-    `DistributionStrategy`. This can be as simple as:
+    `tf.distribute.Strategy`. This can be as simple as:
 
     ```
-    with my_distribution.scope():
-      iterator = my_distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
-      replica_train_ops = my_distribution.call_for_each_replica(
-          replica_fn, iterator.get_next())
-      train_op = tf.group(my_distribution.unwrap(replica_train_ops))
+    with my_strategy.scope():
+      iterator = my_strategy.make_dataset_iterator(dataset)
+      session.run(iterator.initialize())
+      replica_train_ops = my_strategy.extended.call_for_each_replica(
+          replica_fn, args=(iterator.get_next(),))
+      train_op = my_strategy.group(replica_train_ops)
     ```
 
     This takes an ordinary `dataset` and `replica_fn` and runs it
-    distributed using a particular `DistributionStrategy` in
-    `my_distribution`. Any variables created in `replica_fn` are created
-    using `my_distribution`'s policy, and library functions called by
+    distributed using a particular `tf.distribute.Strategy` in
+    `my_strategy`. Any variables created in `replica_fn` are created
+    using `my_strategy`'s policy, and library functions called by
     `replica_fn` can use the `get_replica_context()` API to get enhanced
     behavior in this case.
 
-    You can also create an initializable iterator instead of a one-shot
-    iterator. In that case, you will need to ensure that you initialize the
-    iterator before calling get_next.
-    ```
-    iterator = my_distribution.distribute_dataset(
-        dataset).make_initializable_iterator())
-    session.run(iterator.initializer)
-    ```
-
   * If you want to write a distributed algorithm, you may use any of
-    the `DistributionStrategy` APIs inside a
-    `with my_distribution.scope():` block of code.
+    the `tf.distribute.Strategy` APIs inside a
+    `with my_strategy.scope():` block of code.
 
   Lower-level concepts:
 
   * Wrapped values: In order to represent values parallel across devices
     (either replicas or the devices associated with a particular value), we
-    wrap them in a "PerDevice" or "Mirrored" object that contains a map
-    from device to values. "PerDevice" is used when the value may be
-    different across devices, and "Mirrored" when the value are the same.
-  * Unwrapping and merging: Consider calling a function `fn` on
-    multiple devices, like `call_for_each_replica(fn, w)` with an
-    argument `w` that is a wrapped value. This means `w` will have a
-    map taking replica device `d0` to `w0`, replica device `d1` to `w1`,
-    etc. `call_for_each_replica()` unwraps `w` before calling `fn`, so
-    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges
-    the return values from `fn()`, which can possibly result in
-    wrapped values. For example, let's say `fn()` returns a tuple with
-    three components: `(x, a, v0)` from replica 0, `(x, b, v1)` on replica 1,
-    etc. If the first component is the same object `x` from every
-    replica, then the first component of the merged result will also be
-    `x`. If the second component is different (`a`, `b`, ...)  from
-    each replica, then the merged value will have a wrapped map from
-    replica device to the different values. If the third component is
-    the members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to
-    `v1`, etc.), then the merged result will be that mirrored variable
-    (`v`).
+    wrap them in a "PerReplica" or "Mirrored" object that contains a map
+    from device to values. "PerReplica" is used when the value may be
+    different across replicas, and "Mirrored" when the value are the same.
+  * Unwrapping and merging: Consider calling a function `fn` on multiple
+    replicas, like `extended.call_for_each_replica(fn, args=[w])` with an
+    argument `w` that is a wrapped value. This means `w` will have a map taking
+    replica device `d0` to `w0`, replica device `d1` to `w1`,
+    etc. `extended.call_for_each_replica()` unwraps `w` before calling `fn`, so
+    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges the return
+    values from `fn()`, which can possibly result in wrapped values. For
+    example, let's say `fn()` returns a tuple with three components: `(x, a,
+    v0)` from replica 0, `(x, b, v1)` on replica 1, etc. If the first component
+    is the same object `x` from every replica, then the first component of the
+    merged result will also be `x`. If the second component is different (`a`,
+    `b`, ...)  from each replica, then the merged value will have a wrapped map
+    from replica device to the different values. If the third component is the
+    members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to `v1`, etc.),
+    then the merged result will be that mirrored variable (`v`).
   * Replica context vs. Cross-replica context: _replica context_ is when we
     are in some function that is being called once for each replica.
     Otherwise we are in cross-replica context, which is useful for
-    calling `DistributionStrategy` methods which operate across the
-    replicas (like `reduce()`). By default you start in a replica context
+    calling `tf.distribute.Strategy` methods which operate across the
+    replicas (like `reduce_to()`). By default you start in a replica context
     (the default "single replica context") and then some methods can
     switch you back and forth, as described below.
   * Worker devices vs. parameter devices: Most replica computations will
@@ -336,11 +781,11 @@ class DistributionStrategy(object):
     pick a consistent set of devices to pass to both
     `colocate_vars_with()` and `update_non_slot()`.
 
-  When using a `DistributionStrategy`, we have a new type dimension
+  When using a `tf.distribute.Strategy`, we have a new type dimension
   called _locality_ that says what values are compatible with which
   APIs:
 
-  * T: different value for each replica (e.g. a PerDevice-wrapped value).
+  * T: different value for each replica (e.g. a PerReplica-wrapped value).
   * M: value is "mirrored" across replicas, i.e. there are copies with the
     same value on each replica (e.g. a Mirrored-wrapped value).
   * V(`v`): value is "mirrored" across all the devices which have a
@@ -353,38 +798,40 @@ class DistributionStrategy(object):
 
   * `with d.scope()`: default single-replica context -> cross-replica context
     for `d`
-  * `with d.colocate_vars_with(v)`: in replica/cross-replica context, variables
-    will be created with locality V(`v`). That is, if we write
-    `with d.colocate_vars_with(v1): v2 = tf.get_variable(...)`, then
-    `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
+  * `with d.extended.colocate_vars_with(v)`: in replica/cross-replica context,
+    variables will be created with locality V(`v`). That is, if we write
+    `with d.extended.colocate_vars_with(v1): v2 = tf.get_variable(...)`,
+    then `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
     V(`v1`).
-  * `with d.colocate_vars_with(d.non_slot_devices(...))`: in
+  * `with d.extended.colocate_vars_with(d.extended.non_slot_devices(...))`: in
     replica/cross-replica context, variables will be created with locality N
   * `v = tf.get_variable(...)`: in replica/cross-replica context, creates
     a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
-  * `d.distribute_dataset(dataset).make_one_shot_iterator()`: in cross-replica
+  * `d.make_dataset_iterator(dataset)` (or the deprecated
+    `d.distribute_dataset(dataset).make_one_shot_iterator()`): in cross-replica
     context, produces an iterator with locality T
-  * `d.broadcast(t)`: in cross-replica context, produces a value with locality M
-  * `d.broadcast(t, v)`: in cross-replica context, produces a value with
-    locality V(`v`)
-  * `d.call_for_each_replica(fn, ...)`: in cross-replica context, runs
+  * `d.extended.broadcast_to(t)`: in cross-replica context, produces a value
+    with locality M
+  * `d.extended.broadcast_to(t, v)`: in cross-replica context, produces a value
+    with locality V(`v`)
+  * `d.extended.call_for_each_replica(fn, ...)`: in cross-replica context, runs
     `fn()` in a replica context (and so may call `get_replica_context()` and
     use its API, including `merge_call()` to get back to cross-replica
     context), once for each replica. May use values with locality T or
     M, and any variable.
-  * `d.reduce(m, t, t)`: in cross-replica context, accepts t with locality T
-    and produces a value with locality M.
-  * `d.reduce(m, t, v)`: in cross-replica context, accepts t with
+  * `d.extended.reduce_to(m, t, t)`: in cross-replica context, accepts t with
+    locality T and produces a value with locality M.
+  * `d.extended.reduce_to(m, t, v)`: in cross-replica context, accepts t with
     locality T and produces a value with locality V(`v`).
-  * `d.batch_reduce(m, [(t, v)]): see `d.reduce()`
-  * `d.update(v, fn, ...)`: in cross-replica context, runs `fn()` once
+  * `d.extended.batch_reduce_to(m, [(t, v)]): see `d.extended.reduce_to()`
+  * `d.extended.update(v, fn, ...)`: in cross-replica context, runs `fn()` once
     for each device `v` is copied to, all inputs should have locality
     V(`v`), output will have locality V(`v`) as well.
-  * `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-replica
-    context, like `d.update()` except with locality N.
-  * `d.read_var(v)`: Gets the (read-only) value of the variable `v` (on
+  * `d.extended.update_non_slot(d.extended.non_slot_devices(), fn)`: in
+    cross-replica context, like `d.extended.update()` except with locality N.
+  * `d.extended.read_var(v)`: Gets the (read-only) value of the variable `v` (on
     the device determined by the current device scope), aggregating
     across replicas for replica-local variables. Frequently, this will be
     done automatically when using `v` in an expression or fetching it in
@@ -394,83 +841,89 @@ class DistributionStrategy(object):
 
   The standard pattern for updating variables is to:
 
-  1. Wrap your input dataset in `d.distribute_dataset()` and create an iterator.
-  2. Define each replica `d.call_for_each_replica()` up to the point of
+  1. Create an input iterator with `d.make_dataset_iterator()`.
+  2. Define each replica `d.extended.call_for_each_replica()` up to the point of
      getting a list of gradient, variable pairs.
-  3. Call `d.reduce(VariableAggregation.SUM, t, v)` or `d.batch_reduce()` to sum
-     the gradients (with locality T) into values with locality V(`v`).
-  4. Call `d.update(v)` for each variable to update its value.
+  3. Call `d.extended.reduce_to(VariableAggregation.SUM, t, v)` or
+     `d.extended.batch_reduce_to()` to sum the gradients (with locality T)
+     into values with locality V(`v`).
+  4. Call `d.extended.update(v)` for each variable to update its value.
 
   Steps 3 and 4 are done automatically by class `Optimizer` if you call
   its `apply_gradients` method in a replica context. Otherwise you can
   manually call its `_distributed_apply` method in a cross-replica context.
 
-  Another thing you might want to do in the middle of your replica function
-  is an all-reduce of some intermediate value, using `d.reduce()` or
-  `d.batch_reduce()`. You simply provide the same tensor as the input and
-  destination.
+  Another thing you might want to do in the middle of your replica function is
+  an all-reduce of some intermediate value, using `d.extended.reduce_to()` or
+  `d.extended.batch_reduce_to()`. You simply provide the same tensor as the
+  input and destination.
 
   Layers should expect to be called in a replica context, and can use
-  the `get_replica_context()` function to get a `ReplicaContext` object. The
+  the `tf.distribute.get_replica_context` function to get a
+  `tf.distribute.ReplicaContext` object. The
   `ReplicaContext` object has a `merge_call()` method for entering
-  cross-replica context where you can use `reduce()` (or
-  `batch_reduce()`) and then optionally `update()` to update state.
+  cross-replica context where you can use `reduce_to()` (or
+  `batch_reduce_to()`) and then optionally `update()` to update state.
 
-  You may use this API whether or not a `DistributionStrategy` is
+  You may use this API whether or not a `tf.distribute.Strategy` is
   being used, since there is a default implementation of
-  `ReplicaContext` and `DistributionStrategy`. Or you can use the
-  `get_replica_context().is_single_replica` property to run different code
-  in the distributed vs. single replica cases.
-  """
+  `ReplicaContext` and `tf.distribute.Strategy`.
 
-  # TODO(josh11b): Raise an exception if variable partitioning requested before
-  #   we add support.
-  # TODO(josh11b): Also `parameter_device_index` property?
-  # TODO(josh11b): `map()`
-  # TODO(josh11b): ClusterSpec/ClusterResolver
-  # TODO(josh11b): Partitioned computations, state; sharding
-  # TODO(josh11b): Model parallelism: "replicas" with multiple devices; shuffling
-  # TODO(josh11b): List of replicas with their worker and parameter devices
-  #   (where the parameter devices may overlap in the ps case).
+  NOTE for new `tf.distribute.Strategy` implementations: Please put all logic
+  in a subclass of `tf.distribute.StrategyExtended`. The only code needed for
+  the `tf.distribute.Strategy` subclass is for instantiating your subclass of
+  `tf.distribute.StrategyExtended` in the `__init__` method.
+  """
 
-  def __init__(self):
+  def __init__(self, container_strategy):
+    self._container_strategy_weakref = weakref.ref(container_strategy)
     self._default_device = None
     # This property is used to determine if we should set drop_remainder=True
     # when creating Datasets from numpy array inputs.
     self._require_static_shapes = False
 
-  def scope(self):
-    """Returns a context manager selecting this DistributionStrategy as current.
+  def _container_strategy(self):
+    """Get the containing `DistributionStrategy`.
 
-    Inside a `with distribution_strategy.scope():` code block, this thread
-    will use a variable creator set by `distribution_strategy`, and will
-    enter its "cross-replica context".
+    This should not generally be needed except when creating a new
+    `ReplicaContext` and to validate that the caller is in the correct
+    `scope()`.
 
     Returns:
-      A context manager.
+      The `DistributionStrategy` such that `strategy.extended` is `self`.
     """
+    container_strategy = self._container_strategy_weakref()
+    assert container_strategy is not None
+    return container_strategy
+
+  def _scope(self, strategy):
+    """Implementation of DistributionStrategy.scope()."""
     if distribution_strategy_context.has_distribution_strategy():
-      _require_cross_replica_context(self)
-      return _SameScopeAgainContext(self)
+      _require_cross_replica_context_extended(self)
+      return _SameScopeAgainContext(strategy)
 
     def creator_with_resource_vars(*args, **kwargs):
-      _require_distribution_strategy_scope(self)
+      _require_distribution_strategy_scope_extended(self)
       kwargs["use_resource"] = True
       return self._create_variable(*args, **kwargs)
 
-    def disable_partitioned_variables(getter, *args, **kwargs):
-      if kwargs.pop("partitioner", None) is not None:
-        tf_logging.log_first_n(
-            tf_logging.WARN, "Partitioned variables are disabled when using "
-            "DistributionStrategy.", 1)
+    def distributed_getter(getter, *args, **kwargs):
+      if not self._allow_variable_partition():
+        if kwargs.pop("partitioner", None) is not None:
+          tf_logging.log_first_n(
+              tf_logging.WARN, "Partitioned variables are disabled when using "
+              "current tf.distribute.Strategy.", 1)
       return getter(*args, **kwargs)
 
     return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator_with_resource_vars),
+        strategy,
+        variable_scope.variable_creator_scope(creator_with_resource_vars),
         variable_scope.variable_scope(
             variable_scope.get_variable_scope(),
-            custom_getter=disable_partitioned_variables),
-        self._default_device)
+            custom_getter=distributed_getter), self._default_device)
+
+  def _allow_variable_partition(self):
+    return False
 
   def _create_variable(self, next_creator, *args, **kwargs):
     # Note: should support "colocate_with" argument.
@@ -483,7 +936,7 @@ class DistributionStrategy(object):
     (read-only) value of any other variable.
 
     Args:
-      v: A variable allocated within the scope of this `DistributionStrategy`.
+      v: A variable allocated within the scope of this `tf.distribute.Strategy`.
 
     Returns:
       A tensor representing the value of `v`, aggregated across replicas if
@@ -504,9 +957,9 @@ class DistributionStrategy(object):
     Example usage:
 
     ```
-    with distribution_strategy.scope():
+    with strategy.scope():
       var1 = tf.get_variable(...)
-      with distribution_strategy.colocate_vars_with(v1):
+      with strategy.extended.colocate_vars_with(v1):
         # var2 and var3 will be created on the same device(s) as var1
         var2 = tf.get_variable(...)
         var3 = tf.get_variable(...)
@@ -515,7 +968,7 @@ class DistributionStrategy(object):
         # operates on v1 from var1, v2 from var2, and v3 from var3
 
       # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
-      distribution_strategy.update(v1, fn, v2, v3)
+      strategy.extended.update(v1, fn, args=(v2, v3))
     ```
 
     Args:
@@ -527,104 +980,61 @@ class DistributionStrategy(object):
       A context manager.
     """
     def create_colocated_variable(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope(self)
+      _require_distribution_strategy_scope_extended(self)
       kwargs["use_resource"] = True
       kwargs["colocate_with"] = colocate_with_variable
       return next_creator(*args, **kwargs)
 
-    _require_distribution_strategy_scope(self)
+    _require_distribution_strategy_scope_extended(self)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
   def _call_dataset_fn(self, dataset_fn):
+    """Call the `dataset_fn` with `input_context` as argument."""
     result = dataset_fn()
     if not isinstance(result, dataset_ops.Dataset):
       raise ValueError(
           "dataset_fn() must return a tf.data.Dataset when using a "
-          "DistributionStrategy.")
+          "tf.distribute.Strategy.")
     return result
 
-  # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
+  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
   # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
   # Extend to implement more functionality of datasets.
-  def distribute_dataset(self, dataset_fn):
-    """Return a `dataset` split across all replicas.
-
-    Suitable for providing input to for `call_for_each_replica()` by creating an
-    iterator:
-
-    ```
-    def dataset_fn():
-      return tf.data.Dataset.from_tensors([[1.]]).repeat()
-    with distribution_strategy.scope():
-      distributed_dataset = distribution_strategy.distribute_dataset(dataset_fn)
-      iterator = distributed_dataset.make_one_shot_iterator()
-      replica_results = distribution_strategy.call_for_each_replica(
-          replica_fn, iterator.get_next())
-    ```
+  def _distribute_dataset(self, dataset_fn):
+    raise NotImplementedError("must be implemented in descendants")
 
-    Args:
-      dataset_fn: A function that returns a `tf.data.Dataset`.
+  def _make_dataset_iterator(self, dataset):
+    raise NotImplementedError("must be implemented in descendants")
 
-    Returns:
-      A `PerDeviceDataset` that will produce data for each replica.
-    """
+  def _make_input_fn_iterator(self, input_fn, replication_mode):
     raise NotImplementedError("must be implemented in descendants")
 
-  def broadcast(self, tensor, destinations=None):
+  def broadcast_to(self, tensor, destinations):
     """Mirror a tensor on one device to all worker devices.
 
     Args:
       tensor: A Tensor value to broadcast.
-      destinations: An optional mirrored variable, device string, or
-        list of device strings, specifying the destination devices
-        to copy `tensor` to. Defaults to `self.worker_devices`.
+      destinations: A mirrored variable, device string, or list of device
+        strings, specifying the destination devices to copy `tensor` to.
 
     Returns:
       A value mirrored to `destinations` devices.
     """
     # TODO(josh11b): More docstring
-    _require_cross_replica_context(self)
-    return self._broadcast(tensor, destinations)
+    _require_cross_replica_context_extended(self)
+    return self._broadcast_to(tensor, destinations)
 
-  def _broadcast(self, tensor, destinations):
+  def _broadcast_to(self, tensor, destinations):
     raise NotImplementedError("must be implemented in descendants")
 
-  def initialize(self):
-    """Any initialization to be done before running any computations.
-
-    In eager mode, it executes any initialization as a side effect.
-    In graph mode, it creates the initialization ops and returns them.
-
-    For example, TPU initialize_system ops.
-
-    Returns:
-      In eager mode, returns `None`.
-      In graph mode, a list of ops to execute. Empty list if nothing to be done.
-    """
-    if eager_context.executing_eagerly():
-      return
-    else:
-      return []
-
-  def finalize(self):
-    """Any final actions to be done at the end of all computations.
-
-    In eager mode, it executes any finalize actions as a side effect.
-    In graph mode, it creates the finalize ops and returns them.
-
-    For example, TPU shutdown ops.
+  def _initialize(self):
+    return []
 
-    Returns:
-      In eager mode, returns `None`.
-      In graph mode, a list of ops to execute. Empty list if nothing to be done.
-    """
-    if eager_context.executing_eagerly():
-      return
-    else:
-      return []
+  def _finalize(self):
+    return []
 
-  def run_steps_on_dataset(self, fn, iterator, iterations=1,
-                           initial_loop_values=None):
+  def experimental_run_steps_on_iterator(self, fn, iterator, iterations=1,
+                                         initial_loop_values=None):
     """Run `fn` with input from `iterator` for `iterations` times.
 
     This method can be used to run a step function for training a number of
@@ -632,18 +1042,13 @@ class DistributionStrategy(object):
 
     Args:
       fn: function to run using this distribution strategy. The function must
-        have the following signature: `def fn(context, *inputs)`.
+        have the following signature: `def fn(context, inputs)`.
         `context` is an instance of `MultiStepContext` that will be passed when
         `fn` is run. `context` can be used to specify the outputs to be returned
         from `fn` by calling `context.set_last_step_output`. It can also be used
         to capture non tensor outputs by `context.set_non_tensor_output`.
         See `MultiStepContext` documentation for more information.
-        `inputs` will have same type/structure as `iterator.get_next()`. If the
-        `iterator.get_next()` returns a tuple say `return x, y` then whose will
-        be unpacked and passed to the `step_fn`; and step_fn signature would
-        look like `def step_fn(context, x, y)`. If the iterator returns a single
-        value say `return x` then the value is passed as is; the step_fn
-        signature would look like `def step_fn(context, x)`.
+        `inputs` will have same type/structure as `iterator.get_next()`.
         Typically, `fn` will use `call_for_each_replica` method of the strategy
         to distribute the computation over multiple replicas.
       iterator: Iterator of a dataset that represents the input for `fn`. The
@@ -665,70 +1070,19 @@ class DistributionStrategy(object):
         - non_tensor_outputs: A dictionatry containing anything that was set by
           `fn` by calling `context.set_non_tensor_output`.
     """
-    _require_cross_replica_context(self)
-    return self._run_steps_on_dataset(fn, iterator, iterations,
-                                      initial_loop_values)
+    _require_cross_replica_context_extended(self)
+    return self._experimental_run_steps_on_iterator(
+        fn, iterator, iterations, initial_loop_values)
 
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values):
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values):
     raise NotImplementedError("must be implemented in descendants")
 
-  def call_for_each_replica(self, fn, *args, **kwargs):
+  def call_for_each_replica(self, fn, args=(), kwargs=None):
     """Run `fn` once per replica.
 
     `fn` may call `tf.get_replica_context()` to access methods such as
-    `replica_id()` and `merge_call()`.
-
-    `merge_call()` is used to communicate between the replicas and
-    re-enter the cross-replica context. All replicas pause their execution
-    having encountered a `merge_call()` call. After that the
-    `merge_fn`-function is executed. Its results are then unwrapped and
-    given back to each replica call. After that execution resumes until
-    `fn` is complete or encounters another `merge_call()`.  Example:
-
-    ```python
-    # Called once in "cross-replica" context.
-    def merge_fn(distribution, three_plus_replica_id):
-      # sum the values across replicas
-      return sum(distribution.unwrap(three_plus_replica_id))
-
-    # Called once per replica in `distribution`, in a "replica" context.
-    def fn(three):
-      replica_ctx = tf.get_replica_context()
-      v = three + replica_ctx.replica_id
-      # Computes the sum of the `v` values across all replicas.
-      s = replica_ctx.merge_call(merge_fn, v)
-      return s + v
-
-    with distribution.scope():
-      # in "cross-replica" context
-      ...
-      merged_results = distribution.call_for_each_replica(fn, 3)
-      # merged_results has the values from every replica execution of `fn`.
-      print(distribution.unwrap(merged_results))  # Prints a list
-    ```
-
-    Args:
-      fn: function to run (will be run once per replica).
-      *args: positional arguments for `fn`
-      **kwargs: keyword arguments for `fn`.
-          `"run_concurrently"`: Boolean indicating whether executions of `fn`
-             can be run concurrently (under eager execution only), defaults to
-             `True`.
-
-    Returns:
-      Merged return value of `fn` across all replicas.
-    """
-    _require_cross_replica_context(self)
-    return self._call_for_each_replica(fn, *args, **kwargs)
-
-  def call_for_each_tower(self, fn, *args, **kwargs):
-    """Run `fn` once per replica. DEPRECATED.
-
-    DEPRECATED: Use `call_for_each_replica` instead.
-
-    `fn` may call `tf.get_replica_context()` to access methods such as
-    `replica_id()` and `merge_call()`.
+    `replica_id_in_sync_group` and `merge_call()`.
 
     `merge_call()` is used to communicate between the replicas and
     re-enter the cross-replica context. All replicas pause their execution
@@ -746,45 +1100,45 @@ class DistributionStrategy(object):
     # Called once per replica in `distribution`, in a "replica" context.
     def fn(three):
       replica_ctx = tf.get_replica_context()
-      v = three + replica_ctx.replica_id
+      v = three + replica_ctx.replica_id_in_sync_group
       # Computes the sum of the `v` values across all replicas.
-      s = replica_ctx.merge_call(merge_fn, v)
+      s = replica_ctx.merge_call(merge_fn, args=(v,))
       return s + v
 
     with distribution.scope():
       # in "cross-replica" context
       ...
-      merged_results = distribution.call_for_each_replica(fn, 3)
+      merged_results = distribution.call_for_each_replica(fn, args=[3])
       # merged_results has the values from every replica execution of `fn`.
       print(distribution.unwrap(merged_results))  # Prints a list
     ```
 
     Args:
       fn: function to run (will be run once per replica).
-      *args: positional arguments for `fn`
-      **kwargs: keyword arguments for `fn`.
-          `"run_concurrently"`: Boolean indicating whether executions of `fn`
-             can be run concurrently (under eager execution only), defaults to
-             `True`.
+      args: Tuple or list with positional arguments for `fn`.
+      kwargs: Dict with keyword arguments for `fn`.
 
     Returns:
       Merged return value of `fn` across all replicas.
     """
-    _require_cross_replica_context(self)
-    return self._call_for_each_replica(fn, *args, **kwargs)
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._call_for_each_replica(fn, args, kwargs)
 
-  def _call_for_each_replica(self, fn, *args, **kwargs):
+  def _call_for_each_replica(self, fn, args, kwargs):
     raise NotImplementedError("must be implemented in descendants")
 
-  def reduce(self, aggregation, value, destinations):
+  def reduce_to(self, reduce_op, value, destinations):
     """Combine (via e.g. sum or mean) values across replicas.
 
     Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`,
-        `tf.VariableAggregation.ONLY_FIRST_REPLICA`.
-      value: A per-device value with one value per replica.
-      destinations: A mirrored variable, a per-device tensor, a device string,
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+        DEPRECATED but still accepted values:
+        `tf.VariableAggregation.SUM`,
+        `tf.VariableAggregation.MEAN`,
+      value: A per-replica value with one value per replica.
+      destinations: A mirrored variable, a per-replica tensor, a device string,
         or list of device strings. The return value will be copied to all
         destination devices (or all the devices where the `destinations` value
         resides). To perform an all-reduction, pass `value` to `destinations`.
@@ -795,46 +1149,53 @@ class DistributionStrategy(object):
     # TODO(josh11b): More docstring
     # TODO(josh11b): Return an unwrapped value if colocate_with is a
     # single device.
-    _require_cross_replica_context(self)
-    assert aggregation in [
-        variable_scope.VariableAggregation.SUM,
-        variable_scope.VariableAggregation.MEAN,
-        variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
-    ]
-    return self._reduce(aggregation, value, destinations)
-
-  def _reduce(self, aggregation, value, destinations):
+    _require_cross_replica_context_extended(self)
+
+    # TODO(priyag): Remove this when all callers have been updated.
+    if isinstance(reduce_op, variable_scope.VariableAggregation):
+      assert reduce_op in [
+          variable_scope.VariableAggregation.SUM,
+          variable_scope.VariableAggregation.MEAN,
+      ]
+      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    return self._reduce_to(reduce_op, value, destinations)
+
+  def _reduce_to(self, reduce_op, value, destinations):
     raise NotImplementedError("must be implemented in descendants")
 
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """Combine multiple `reduce` calls into one for faster execution.
+  def batch_reduce_to(self, reduce_op, value_destination_pairs):
+    """Combine multiple `reduce_to` calls into one for faster execution.
 
     Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`,
-        `tf.VariableAggregation.ONLY_FIRST_REPLICA`.
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+        DEPRECATED but still accepted values:
+        `tf.VariableAggregation.SUM`,
+        `tf.VariableAggregation.MEAN`,
       value_destination_pairs: A sequence of (value, destinations)
-        pairs. See `reduce()` for a description.
+        pairs. See `reduce_to()` for a description.
 
     Returns:
       A list of mirrored values, one per pair in `value_destination_pairs`.
     """
     # TODO(josh11b): More docstring
-    _require_cross_replica_context(self)
-    assert aggregation in [
-        variable_scope.VariableAggregation.SUM,
-        variable_scope.VariableAggregation.MEAN,
-        variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
-    ]
-    return self._batch_reduce(aggregation, value_destination_pairs)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+    _require_cross_replica_context_extended(self)
+
+    # TODO(priyag): Remove this when all callers have been updated.
+    if isinstance(reduce_op, variable_scope.VariableAggregation):
+      assert reduce_op in [
+          variable_scope.VariableAggregation.SUM,
+          variable_scope.VariableAggregation.MEAN,
+      ]
+      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    return self._batch_reduce_to(reduce_op, value_destination_pairs)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
     return [
-        self.reduce(aggregation, t, destinations=v)
+        self.reduce_to(reduce_op, t, destinations=v)
         for t, v in value_destination_pairs
     ]
 
-  def update(self, var, fn, *args, **kwargs):
+  def update(self, var, fn, args=(), kwargs=None, group=True):
     """Run `fn` to update `var` using inputs mirrored to the same devices.
 
     If `var` is mirrored across multiple devices, then this implements
@@ -844,75 +1205,69 @@ class DistributionStrategy(object):
     results = {}
     for device, v in var:
       with tf.device(device):
-        # *args and **kwargs will be unwrapped if they are mirrored.
+        # args and kwargs will be unwrapped if they are mirrored.
         results[device] = fn(v, *args, **kwargs)
     return merged(results)
     ```
 
     Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.
 
-    Neither `*args` nor `**kwargs` may contain per-device values.
+    Neither `args` nor `kwargs` may contain per-replica values.
     If they contain mirrored values, they will be unwrapped before
     calling `fn`.
 
     Args:
       var: Variable, possibly mirrored to multiple devices, to operate on.
       fn: Function to call. Should take the variable as the first argument.
-      *args: Additional positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
-        specified, the return value will be unwrapped.
+      args: Tuple or list. Additional positional arguments to pass to `fn()`.
+      kwargs: Dict with keyword arguments to pass to `fn()`.
+      group: Boolean. Defaults to True. If False, the return value will be
+        unwrapped.
 
     Returns:
       By default, the merged return value of `fn` across all replicas.  The
       merged result has dependencies to make sure that if it is evaluated at
       all, the side effects (updates) will happen on every replica. If instead
-      "grouped=False" is specified, this function will return a nest of lists
+      "group=False" is specified, this function will return a nest of lists
       where each list has an element per replica, and the caller is responsible
       for ensuring all elements are executed.
     """
-    _require_cross_replica_context(self)
-    options = {"grouped": kwargs.pop("grouped", True)}
-    return self._update(var, options, fn, *args, **kwargs)
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._update(var, fn, args, kwargs, group)
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     raise NotImplementedError("must be implemented in descendants")
 
-  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
+  def update_non_slot(
+      self, colocate_with, fn, args=(), kwargs=None, group=True):
     """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
 
     Args:
       colocate_with: The return value of `non_slot_devices()`.
       fn: Function to execute.
-      *args: Positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
-        specified, the return value will be unwrapped and the caller is
-        responsible for ensuring all elements are executed.
+      args: Tuple or list. Positional arguments to pass to `fn()`.
+      kwargs: Dict with keyword arguments to pass to `fn()`.
+      group: Boolean. Defaults to True. If False, the return value will be
+        unwrapped.
 
     Returns:
       Return value of `fn`, possibly merged across devices.
     """
-    _require_cross_replica_context(self)
-    options = {"grouped": kwargs.pop("grouped", True)}
-    return self._update_non_slot(colocate_with, options, fn, *args, **kwargs)
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._update_non_slot(colocate_with, fn, args, kwargs, group)
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     raise NotImplementedError("must be implemented in descendants")
 
-  def unwrap(self, value):
-    """Returns the list of all per-device values contained in `value`.
-
-    Args:
-      value: A value returned by `call_for_each_replica()` or a variable
-        created in `scope()`.
-
-    Returns:
-      A list of values contained in `value`. If `value` represents a single
-      value, this returns `[value].`
-    """
-    return self._unwrap(value)
+  def _unwrap(self, distributed_value):
+    raise NotImplementedError("must be implemented in descendants")
 
   def value_container(self, value):
-    """Returns the container that this per-device `value` belongs to.
+    """Returns the container that this per-replica `value` belongs to.
 
     Args:
       value: A value returned by `call_for_each_replica()` or a variable
@@ -926,12 +1281,9 @@ class DistributionStrategy(object):
     """
     raise NotImplementedError("must be implemented in descendants")
 
-  def _unwrap(self, distributed_value):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def group(self, value, name=None):
+  def _group(self, value, name=None):
     """Shortcut for `tf.group(distribution.unwrap(value))`."""
-    value = nest.flatten(self.unwrap(value))
+    value = nest.flatten(self._unwrap(value))
 
     if len(value) != 1 or name is not None:
       return control_flow_ops.group(value, name=name)
@@ -942,24 +1294,11 @@ class DistributionStrategy(object):
     return v
 
   @property
-  def require_static_shapes(self):
+  def experimental_require_static_shapes(self):
     return self._require_static_shapes
 
   @property
-  def num_replicas(self):
-    """Returns number of replicas, for purposes of averaging across replicas."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def num_towers(self):
-    """Returns number of replicas, for purposes of averaging across replicas.
-
-    DEPRECATED: use `num_replicas` instead.
-    """
-    return self.num_replicas
-
-  @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     """Returns number of replicas over which gradients are aggregated."""
     raise NotImplementedError("must be implemented in descendants")
 
@@ -985,38 +1324,12 @@ class DistributionStrategy(object):
 
     Args:
       var_list: The list of variables being optimized, needed with the
-        default `DistributionStrategy`.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def worker_device_index(self):
-    """An object mapping worker device to an id.
-
-    This might be passed as an argument to `call_for_each_replica()`, as in:
-
-    ```
-    with distribution_strategy.scope():
-
-      def fn(device_id):
-        # device_id is an integer. `fn` is being executed on device:
-        #    distribution_strategy.worker_devices[device_id].
-
-      distribution_strategy.call_for_each_replica(
-          fn, distribution_strategy.worker_device_index)
-    ```
-
-    Returns:
-      An index object, or the integer 0 if there is only a single replica.
+        default `tf.distribute.Strategy`.
     """
-    _require_cross_replica_context(self)
-    return self._worker_device_index()
-
-  def _worker_device_index(self):
     raise NotImplementedError("must be implemented in descendants")
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
     """Whether the strategy uses between-graph replication or not.
 
       This is expected to return a constant value that will not be changed
@@ -1024,16 +1337,16 @@ class DistributionStrategy(object):
     """
     raise NotImplementedError("must be implemented in descendants")
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the strategy class."""
     del session_config, cluster_spec, task_type, task_id
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     """Whether initialization is needed."""
     raise NotImplementedError("must be implemented in descendants")
 
@@ -1065,22 +1378,21 @@ class DistributionStrategy(object):
 #   around their model creation and graph definition. There is no
 #   anticipated need to define descendants of _CurrentDistributionContext.
 #   It sets the current DistributionStrategy for purposes of
-#   `get_distribution_strategy()` and `has_distribution_strategy()`
+#   `get_strategy()` and `has_strategy()`
 #   and switches the thread mode to a "cross-replica context".
+@tf_export("distribute.ReplicaContext")
 class ReplicaContext(object):
-  """DistributionStrategy API inside a `call_for_each_replica()` call."""
-
-  def __init__(self, distribution_strategy, replica_id=None, tower_id=None):
-    """`tower_id` is deprecated, use `replica_id` instead."""
-    if tower_id is not None:
-      replica_id = tower_id
-    assert replica_id is not None
-    self._distribution_strategy = distribution_strategy
+  """`tf.distribute.Strategy` API when in a replica context.
+
+  To be used inside your replicated step function, such as in a
+  `tf.distribute.StrategyExtended.call_for_each_replica` call.
+  """
+
+  def __init__(self, strategy, replica_id_in_sync_group):
+    self._distribution_strategy = strategy
     self._thread_context = distribution_strategy_context._InReplicaThreadMode(  # pylint: disable=protected-access
         self)
-    self._replica_id = replica_id
-    # We keep a copy in _tower_id to ease the replica->tower transition.
-    self._tower_id = replica_id  # DEPRECATED
+    self._replica_id_in_sync_group = replica_id_in_sync_group
 
   def __enter__(self):
     _push_per_thread_mode(self._thread_context)
@@ -1088,38 +1400,41 @@ class ReplicaContext(object):
   def __exit__(self, exception_type, exception_value, traceback):
     _pop_per_thread_mode()
 
-  def merge_call(self, merge_fn, *args, **kwargs):
+  def merge_call(self, merge_fn, args=(), kwargs=None):
     """Merge args across replicas and run `merge_fn` in a cross-replica context.
 
     This allows communication and coordination when there are multiple calls
     to a model function triggered by a call to
-    `distribution.call_for_each_replica(model_fn, ...)`.
+    `strategy.extended.call_for_each_replica(model_fn, ...)`.
 
-    See `MirroredDistribution.call_for_each_replica()` for an explanation.
+    See `tf.distribute.StrategyExtended.call_for_each_replica` for an
+    explanation.
 
-    Otherwise, this is equivalent to:
+    If not inside a distributed scope, this is equivalent to:
 
     ```
-    distribution = get_distribution_strategy()
-    with cross-replica-context(distribution):
-      return merge_fn(distribution, *args, **kwargs)
+    strategy = tf.distribute.get_strategy()
+    with cross-replica-context(strategy):
+      return merge_fn(strategy, *args, **kwargs)
     ```
 
     Args:
       merge_fn: function that joins arguments from threads that are given as
-        PerDevice. It accepts `DistributionStrategy` object as the first
-        argument.
-      *args: positional per-thread arguments for `merge_fn`
-      **kwargs: keyword per-thread arguments for `merge_fn`.
+        PerReplica. It accepts `tf.distribute.Strategy` object as
+        the first argument.
+      args: List or tuple with positional per-thread arguments for `merge_fn`.
+      kwargs: Dict with keyword per-thread arguments for `merge_fn`.
 
     Returns:
-      The return value of `merge_fn`, except for `PerDevice` values which are
+      The return value of `merge_fn`, except for `PerReplica` values which are
       unpacked.
     """
     require_replica_context(self)
-    return self._merge_call(merge_fn, *args, **kwargs)
+    if kwargs is None:
+      kwargs = {}
+    return self._merge_call(merge_fn, args, kwargs)
 
-  def _merge_call(self, merge_fn, *args, **kwargs):
+  def _merge_call(self, merge_fn, args, kwargs):
     """Default implementation for single replica."""
     _push_per_thread_mode(  # thread-local, so not needed with multiple threads
         distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
@@ -1130,46 +1445,32 @@ class ReplicaContext(object):
       _pop_per_thread_mode()
 
   @property
-  def is_single_replica(self):
-    """Returns whether there is a single replica or multiple."""
-    require_replica_context(self)
-    return self._distribution_strategy.is_single_replica
-
-  @property
-  def num_towers(self):
-    """Returns number of replicas, for purposes of averaging across replicas.
-
-    DEPRECATED: use `num_replicas` instead.
-    """
-    return self._distribution_strategy.num_replicas
-
-  @property
-  def num_replicas(self):
-    """Returns number of replicas, for purposes of averaging across replicas."""
-    return self._distribution_strategy.num_replicas
+  def num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    return self._distribution_strategy.num_replicas_in_sync
 
   @property
-  def replica_id(self):
-    """Which replica is being defined, a number from 0 to `num_replicas - 1`."""
+  def replica_id_in_sync_group(self):
+    """Which replica is being defined, from 0 to `num_replicas_in_sync - 1`."""
     require_replica_context(self)
-    return self._replica_id
+    return self._replica_id_in_sync_group
 
   @property
-  def tower_id(self):
-    """DEPRECATED: Use `replica_id` instead."""
-    require_replica_context(self)
-    return self._replica_id
+  @doc_controls.do_not_generate_docs  # DEPRECATED, use `strategy`
+  def distribution_strategy(self):
+    """DEPRECATED: use `self.stratgey` instead."""
+    return self._distribution_strategy
 
   @property
-  def distribution_strategy(self):
-    """The current `DistributionStrategy` object."""
+  def strategy(self):
+    """The current `tf.distribute.Strategy` object."""
     return self._distribution_strategy
 
   @property
-  def device(self):
-    """The device this replica is to be executed on, as a string."""
+  def devices(self):
+    """The devices this replica is to be executed on, as a list of strings."""
     require_replica_context(self)
-    return device_util.current()
+    return [device_util.current()]
 
   # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
   # all-reduce. It would return a function returning the result of reducing `t`
@@ -1179,59 +1480,73 @@ class ReplicaContext(object):
   #   in the background, not blocking until the result was needed.
   # * When constructing a graph, it could batch up all reduction requests up
   #   to that point that the first result is needed. Most likely this can be
-  #   implemented in terms of `merge_call()` and `batch_reduce()`.
+  #   implemented in terms of `merge_call()` and `batch_reduce_to()`.
 
 # ------------------------------------------------------------------------------
 
 
 class _DefaultDistributionStrategy(DistributionStrategy):
-  """Default `DistributionStrategy` if none is explicitly selected."""
+  """Default `tf.distribute.Strategy` if none is explicitly selected."""
 
-  def scope(self):
+  def __init__(self):
+    super(_DefaultDistributionStrategy, self).__init__(
+        _DefaultDistributionExtended(self))
+
+
+class _DefaultDistributionExtended(DistributionStrategyExtended):
+  """Implementation of _DefaultDistributionStrategy."""
+
+  def _scope(self, strategy):
     """Context manager setting a variable creator and `self` as current."""
     if distribution_strategy_context.has_distribution_strategy():
-      raise RuntimeError("Must not nest DistributionStrategy scopes.")
+      raise RuntimeError("Must not nest tf.distribute.Strategy scopes.")
 
     def creator(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope(self)
+      _require_distribution_strategy_scope_strategy(strategy)
       return next_creator(*args, **kwargs)
 
     return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator))
+        strategy, variable_scope.variable_creator_scope(creator))
 
   def colocate_vars_with(self, colocate_with_variable):
     """Does not require `self.scope`."""
-    _require_distribution_strategy_scope(self)
+    _require_distribution_strategy_scope_extended(self)
     return ops.colocate_with(colocate_with_variable)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     return self._call_dataset_fn(dataset_fn)
 
-  def _broadcast(self, tensor, destinations):
+  def _make_dataset_iterator(self, dataset):
+    return _DefaultDistributionExtended.DefaultInputIterator(dataset)
+
+  def _make_input_fn_iterator(self,
+                              input_fn,
+                              replication_mode=InputReplicationMode.PER_WORKER):
+    return input_fn(InputContext()).make_initializable_iterator()
+
+  def _broadcast_to(self, tensor, destinations):
     if destinations is None:
       return tensor
     else:
       raise NotImplementedError("TODO")
 
-  def _call_for_each_replica(self, fn, *args, **kwargs):
-    # We don't run `fn` in multiple threads in _DefaultDistributionStrategy.
-    kwargs.pop("run_concurrently", None)
-    with ReplicaContext(self, replica_id=0):
+  def _call_for_each_replica(self, fn, args, kwargs):
+    with ReplicaContext(
+        self._container_strategy(),
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)):
       return fn(*args, **kwargs)
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     # TODO(josh11b): Use destinations?
-    del aggregation, destinations
+    del reduce_op, destinations
     return value
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     # The implementations of _update() and _update_non_slot() are identical
     # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, should_group):
     # TODO(josh11b): Figure out what we should be passing to UpdateContext()
     # once that value is used for something.
     with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
@@ -1251,29 +1566,48 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     return value
 
   @property
-  def num_replicas(self):
-    return 1
-
-  @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     return 1
 
   @property
   def worker_devices(self):
-    raise RuntimeError(
-        "worker_devices() method unsupported by _DefaultDistributionStrategy.")
+    raise RuntimeError("worker_devices() method unsupported by default "
+                       "tf.distribute.Strategy.")
 
   @property
   def parameter_devices(self):
-    raise RuntimeError("parameter_devices() method unsupported by "
-                       "_DefaultDistributionStrategy.")
+    raise RuntimeError("parameter_devices() method unsupported by default "
+                       "tf.distribute.Strategy.")
 
   def non_slot_devices(self, var_list):
     return min(var_list, key=lambda x: x.name)
 
-  def _worker_device_index(self):
-    raise RuntimeError("worker_device_index() method unsupported by "
-                       "_DefaultDistributionStrategy.")
+  # TODO(priyag): This should inherit from `InputIterator`, once dependency
+  # issues have been resolved.
+  class DefaultInputIterator(object):
+    """Default implementation of `InputIterator` for default strategy."""
+
+    def __init__(self, dataset):
+      self._dataset = dataset
+      if eager_context.executing_eagerly():
+        self._iterator = dataset.make_one_shot_iterator()
+      else:
+        self._iterator = dataset.make_initializable_iterator()
+
+    def get_next(self):
+      return self._iterator.get_next()
+
+    def initialize(self):
+      if eager_context.executing_eagerly():
+        self._iterator = self._dataset.make_one_shot_iterator()
+        return []
+      else:
+        return [self._iterator.initializer]
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
 
 
 # ------------------------------------------------------------------------------
@@ -1287,8 +1621,8 @@ _original_from_proto = resource_variable_ops._from_proto_fn
 def _from_proto_fn(v, import_scope=None):
   if distribution_strategy_context.has_distribution_strategy():
     raise NotImplementedError(
-        "Deserialization of variables is not yet supported when using"
-        "distributed strategies.")
+        "Deserialization of variables is not yet supported when using a "
+        "tf.distribute.Strategy.")
   else:
     return _original_from_proto(v, import_scope=import_scope)
 
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/training/distribute_test.py
index 44c486c99760d309f12e330173829d4f88c9dd4e..4758e3d3d4641e5c113647dbfd86f19c0768aed8 100644
--- a/tensorflow/python/training/distribute_test.py
+++ b/tensorflow/python/training/distribute_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training import distribute
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import distribution_strategy_context
 
 
-class _TestReplicaContext(distribute.ReplicaContext):
+class _TestReplicaContext(distribute_lib.ReplicaContext):
 
   def merge_call(self, fn, *args, **kwargs):
     return kwargs["test_arg"]
@@ -38,10 +40,18 @@ def _get_test_variable(name, synchronization, aggregation):
   }
 
 
-class _TestStrategy(distribute.DistributionStrategy):
+class _TestStrategy(distribute_lib.DistributionStrategy):
 
-  def _call_for_each_replica(self, fn, *args, **kwargs):
-    with _TestReplicaContext(self, replica_id=0):
+  def __init__(self):
+    super(_TestStrategy, self).__init__(_TestExtended(self))
+
+
+class _TestExtended(distribute_lib.DistributionStrategyExtended):
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    with _TestReplicaContext(
+        self._container_strategy(),
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)):
       return fn(*args, **kwargs)
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -53,6 +63,7 @@ def _assert_in_default_state(t):
   t.assertIs(distribution_strategy_context._get_default_replica_context(),
              distribution_strategy_context.get_replica_context())
   t.assertIs(None, distribution_strategy_context.get_cross_replica_context())
+  t.assertFalse(distribution_strategy_context.in_cross_replica_context())
   t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
              distribution_strategy_context.get_distribution_strategy())
   t.assertFalse(distribution_strategy_context.has_distribution_strategy())
@@ -69,6 +80,7 @@ class TestStrategyTest(test.TestCase):
       self.assertTrue(replica_context is not None)
       self.assertIs(None,
                     distribution_strategy_context.get_cross_replica_context())
+      self.assertFalse(distribution_strategy_context.in_cross_replica_context())
       self.assertTrue(distribution_strategy_context.has_distribution_strategy())
       self.assertIs(dist,
                     distribution_strategy_context.get_distribution_strategy())
@@ -80,9 +92,9 @@ class TestStrategyTest(test.TestCase):
                            variable_scope.variable(1.0, name="bar"))
 
     with self.assertRaises(RuntimeError):
-      dist.call_for_each_replica(run_fn)
+      dist.extended.call_for_each_replica(run_fn)
     with dist.scope():
-      dist.call_for_each_replica(run_fn)
+      dist.extended.call_for_each_replica(run_fn)
     _assert_in_default_state(self)
 
   def testScope(self):
@@ -92,6 +104,7 @@ class TestStrategyTest(test.TestCase):
       self.assertIs(None, distribution_strategy_context.get_replica_context())
       self.assertIs(dist,
                     distribution_strategy_context.get_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
       self.assertTrue(distribution_strategy_context.has_distribution_strategy())
       self.assertIs(dist,
                     distribution_strategy_context.get_distribution_strategy())
@@ -131,6 +144,7 @@ class DefaultDistributionStrategyTest(test.TestCase):
       self.assertIs(None, distribution_strategy_context.get_replica_context())
       self.assertIs(dist,
                     distribution_strategy_context.get_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
       self.assertIs(dist,
                     distribution_strategy_context.get_distribution_strategy())
       self.assertFalse(
@@ -140,9 +154,26 @@ class DefaultDistributionStrategyTest(test.TestCase):
     replica_ctx = distribution_strategy_context.get_replica_context()
     self.assertIs(distribution_strategy_context._get_default_replica_context(),
                   replica_ctx)
-    self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, "bar"))
+    self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
     _assert_in_default_state(self)
 
 
+class InputContextTest(test.TestCase):
+
+  def testProperties(self):
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=2, input_pipeline_id=1, num_replicas_in_sync=6)
+    self.assertEqual(6, input_context.num_replicas_in_sync)
+    self.assertEqual(1, input_context.input_pipeline_id)
+    self.assertEqual(2, input_context.num_input_pipelines)
+
+  def testPerReplicaBatchSize(self):
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=2, input_pipeline_id=1, num_replicas_in_sync=6)
+    self.assertEqual(2, input_context.get_per_replica_batch_size(12))
+    with self.assertRaises(ValueError):
+      input_context.get_per_replica_batch_size(13)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/distribution_strategy_context.py b/tensorflow/python/training/distribution_strategy_context.py
index d9559356f46af2e08a97334a68e53e88ee6204e8..0b3878de183610eaf064adf338ef39fdc50c196e 100644
--- a/tensorflow/python/training/distribution_strategy_context.py
+++ b/tensorflow/python/training/distribution_strategy_context.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.util.lazy_loader import LazyLoader
+from tensorflow.python.util.tf_export import tf_export
 
 
 # There is a circular dependency between this and `distribute` module. So we
@@ -85,91 +86,120 @@ def _get_per_thread_mode():
 # Public API for accessing the current thread mode
 
 
+@tf_export("distribute.get_replica_context")
 def get_replica_context():
-  """Returns the current ReplicaContext or None if in a cross-replica context.
+  """Returns the current `tf.distribute.ReplicaContext` or `None`.
+
+  Returns `None` if in a cross-replica context.
 
   Note that execution:
 
   1. starts in the default (single-replica) replica context (this function
-     will return the default ReplicaContext object);
+     will return the default `ReplicaContext` object);
   2. switches to cross-replica context (in which case this will return
-     None) when entering a `with DistributionStrategy.scope():` block;
+     `None`) when entering a `with tf.distribute.Strategy.scope():` block;
   3. switches to a (non-default) replica context inside
-     `call_for_each_replica(fn, ...)`;
-  4. if `fn` calls `get_replica_context()->merge_call(merge_fn, ...)`, then
+     `extended.call_for_each_replica(fn, ...)`;
+  4. if `fn` calls `get_replica_context().merge_call(merge_fn, ...)`, then
      inside `merge_fn` you are back in the cross-replica context (and again
-     this function will return None).
+     this function will return `None`).
 
   Note that you can also go directly from step 1 to 4 to switch to a
-  cross-replica context for the default `DistributionStrategy`. You may
+  cross-replica context for the default `tf.distribute.Strategy`. You may
   also switch from the cross-replica context of 4 to a replica context by
-  calling `call_for_each_replica()`, jumping back to step 3.
+  calling `extended.call_for_each_replica()`, jumping back to step 3.
 
-  Most `DistributionStrategy` methods may only be executed in
+  Most `tf.distribute.Strategy` methods may only be executed in
   a cross-replica context, in a replica context you should use the
   `ReplicaContext` API instead.
 
   Returns:
     The current `ReplicaContext` object when in a replica context scope,
-    else None.
+    else `None`.
 
-    Exactly one of `get_replica_context()` and `get_cross_replica_context()`
-    will return None in a particular block.
+    Within a particular block, exactly one of these two things will be true:
+
+    * `get_replica_context()` returns non-`None`, or
+    * `tf.distribute.is_cross_replica_context()` returns True.
   """
   return _get_per_thread_mode().replica_context
 
 
 def get_cross_replica_context():
-  """Returns the current DistributionStrategy if in a cross-replica context.
+  """Returns the current tf.distribute.Strategy if in a cross-replica context.
+
+  DEPRECATED: Please use `in_cross_replica_context()` and
+  `get_distribution_strategy()` instead.
 
   Note that execution:
 
   1. starts in the default (single-replica) replica context;
   2. switches to cross-replica context when entering a
-     `with DistributionStrategy.scope():` block;
+     `with tf.distribute.Strategy.scope():` block;
   3. switches to a (non-default) replica context inside
      `call_for_each_replica(fn, ...)`;
   4. if `fn` calls `get_replica_context()->merge_call(merge_fn, ...)`, then
      inside `merge_fn` you are back in the cross-replica context.
 
   Note that you can also go directly from step 1 to 4 to switch to a
-  cross-replica context for the default `DistributionStrategy`. You may
+  cross-replica context for the default `tf.distribute.Strategy`. You may
   also switch from the cross-replica context of 4 to a replica context by
   calling `call_for_each_replica()`, jumping back to step 3.
 
-  Most `DistributionStrategy` methods may only be executed in
+  Most `tf.distribute.Strategy` methods may only be executed in
   a cross-replica context.
 
   Returns:
-    Returns the current `DistributionStrategy` object in a cross-replica
-    context, or None.
+    Returns the current `tf.distribute.Strategy` object in a cross-replica
+    context, or `None`.
 
     Exactly one of `get_replica_context()` and `get_cross_replica_context()`
-    will return None in a particular block.
+    will return `None` in a particular block.
   """
   return _get_per_thread_mode().cross_replica_context
 
 
+@tf_export("distribute.in_cross_replica_context")
+def in_cross_replica_context():
+  """Returns True if in a cross-replica context.
+
+  See `tf.distribute.get_replica_context` for details.
+
+  Returns:
+    True if in a cross-replica context (`get_replica_context()` returns
+    `None`), or False if in a replica context (`get_replica_context()` returns
+    non-`None`).
+  """
+  return _get_per_thread_mode().cross_replica_context is not None
+
+
+@tf_export("distribute.get_strategy")
 def get_distribution_strategy():
-  """Returns the current `DistributionStrategy` object.
+  """Returns the current `tf.distribute.Strategy` object.
+
+  Typically only used in a cross-replica context:
 
-  Prefer to use `get_replica_context()` or `get_cross_replica_context()`
-  instead when possible.
+  ```
+  if tf.distribute.in_cross_replica_context():
+    strategy = tf.distribute.get_strategy()
+    ...
+  ```
 
   Returns:
-    A `DistributionStrategy` object. Inside a
+    A `tf.distribute.Strategy` object. Inside a
     `with distribution_strategy.scope()` block, it returns
     `distribution_strategy`, otherwise it returns the default
-    (single-replica) `DistributionStrategy` object.
+    (single-replica) `tf.distribute.Strategy` object.
   """
   return _get_per_thread_mode().distribution_strategy
 
 
+@tf_export("distribute.has_strategy")
 def has_distribution_strategy():
-  """Return if there is a current non-default `DistributionStrategy`.
+  """Return if there is a current non-default `tf.distribute.Strategy`.
 
   Returns:
-    True if inside a `with distribution_strategy.scope():`.
+    True if inside a `with strategy.scope():`.
   """
   return get_distribution_strategy() is not _get_default_distribution_strategy()
 
@@ -196,7 +226,7 @@ def _get_default_distribution_strategy():
 def _get_default_replica_context():
   if _defaults["replica_context"] is None:
     _defaults["replica_context"] = distribute_lib.ReplicaContext(
-        _get_default_distribution_strategy(), replica_id=0)
+        _get_default_distribution_strategy(), replica_id_in_sync_group=0)
   return _defaults["replica_context"]
 
 
@@ -204,14 +234,3 @@ def _get_default_replica_mode():
   if _defaults["replica_mode"] is None:
     _defaults["replica_mode"] = _DefaultReplicaThreadMode()
   return _defaults["replica_mode"]
-
-
-#-------------------------------------------------------------------------------
-# For compatibility during the tower -> replica transistion.
-_CrossTowerThreadMode = _CrossReplicaThreadMode
-_InTowerThreadMode = _InReplicaThreadMode
-_DefaultTowerThreadMode = _DefaultReplicaThreadMode
-get_tower_context = get_replica_context
-get_cross_tower_context = get_cross_replica_context
-_get_default_tower_context = _get_default_replica_context
-_get_default_tower_mode = _get_default_replica_mode
diff --git a/tensorflow/python/training/ftrl_test.py b/tensorflow/python/training/ftrl_test.py
index 15c50bc8788c3939a135920b8f917a2bb46f3ceb..a61132a96676530aec4bc6ccf1a5e3dfabc31799 100644
--- a/tensorflow/python/training/ftrl_test.py
+++ b/tensorflow/python/training/ftrl_test.py
@@ -113,11 +113,13 @@ class FtrlOptimizerTest(test.TestCase):
         sgd_op = ftrl.FtrlOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType([[0, 1]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
   def testFtrlWithL1(self):
     for dtype in [dtypes.half, dtypes.float32]:
diff --git a/tensorflow/python/training/gradient_descent_test.py b/tensorflow/python/training/gradient_descent_test.py
index 1ddea598e52b3b86b821553b0cc74674fe5389d5..2028e7b4b096928f729dd003e47d12a48398eb43 100644
--- a/tensorflow/python/training/gradient_descent_test.py
+++ b/tensorflow/python/training/gradient_descent_test.py
@@ -47,15 +47,15 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
         self.assertEqual(0, len(optimizer.variables()))
 
   def testBasicResourceVariable(self):
@@ -73,15 +73,15 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testBasicCallableParams(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -99,15 +99,15 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -124,16 +124,16 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -151,16 +151,16 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -174,15 +174,15 @@ class GradientDescentOptimizerTest(test.TestCase):
             lrate).apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -193,7 +193,7 @@ class GradientDescentOptimizerTest(test.TestCase):
         grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
         variables.global_variables_initializer().run()
         for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], grad.eval())
+          self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
 
   def testWithGlobalStep(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -207,16 +207,16 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params and global_step
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        self.assertAllCloseAccordingToType(1, global_step.eval())
+                                           self.evaluate(var1))
+        self.assertAllCloseAccordingToType(1, self.evaluate(global_step))
 
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -237,15 +237,15 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testCapturingInDefunWhileExecutingEagerly(self):
     with context.eager_mode():
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index 085b77d1d6aee7411d2e354d08518e7e9e17bcb9..31c2cc56c09f7d159b8beace46795240ae850ce3 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -58,9 +58,12 @@ class MatchFilenamesOnceTest(test_lib.TestCase):
       one = inp.match_filenames_once(additional[1])
       variables.global_variables_initializer().run()
       variables.local_variables_initializer().run()
-      self.assertItemsEqual(map(compat.as_bytes, filenames), star.eval())
-      self.assertItemsEqual(map(compat.as_bytes, additional), question.eval())
-      self.assertItemsEqual([compat.as_bytes(additional[1])], one.eval())
+      self.assertItemsEqual(
+          map(compat.as_bytes, filenames), self.evaluate(star))
+      self.assertItemsEqual(
+          map(compat.as_bytes, additional), self.evaluate(question))
+      self.assertItemsEqual([compat.as_bytes(additional[1])],
+                            self.evaluate(one))
 
 
 class LimitEpochsTest(test_lib.TestCase):
@@ -71,7 +74,7 @@ class LimitEpochsTest(test_lib.TestCase):
       seven_forever = inp.limit_epochs(seven)
       variables.local_variables_initializer().run()
       for _ in range(100):
-        self.assertEqual(7, seven_forever.eval())
+        self.assertEqual(7, self.evaluate(seven_forever))
 
   def testLimit(self):
     with self.cached_session():
@@ -79,10 +82,10 @@ class LimitEpochsTest(test_lib.TestCase):
       love_me_two_times = inp.limit_epochs(love_me, num_epochs=2)
       variables.global_variables_initializer().run()
       variables.local_variables_initializer().run()
-      self.assertEqual(b"Love Me", love_me_two_times.eval())
-      self.assertEqual(b"Love Me", love_me_two_times.eval())
+      self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
+      self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
       with self.assertRaises(errors_impl.OutOfRangeError):
-        love_me_two_times.eval()
+        self.evaluate(love_me_two_times)
 
 
 class InputProducerTest(test_lib.TestCase):
@@ -102,11 +105,12 @@ class InputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      self.assertAllEqual(input_tensor * num_epochs, dequeue_many.eval())
+      self.assertAllEqual(input_tensor * num_epochs,
+                          self.evaluate(dequeue_many))
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -127,11 +131,11 @@ class InputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      self.assertAllEqual(input_value * num_epochs, dequeue_many.eval())
+      self.assertAllEqual(input_value * num_epochs, self.evaluate(dequeue_many))
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -156,12 +160,12 @@ class StringInputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      output = dequeue_many.eval()
+      output = self.evaluate(dequeue_many)
       self.assertAllEqual(strings * num_epochs, output)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -184,7 +188,7 @@ class StringInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = dequeue_many.eval()
+        output = self.evaluate(dequeue_many)
         key = b"".join(output)
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -200,7 +204,7 @@ class StringInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -224,7 +228,7 @@ class StringInputProducerTest(test_lib.TestCase):
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       coord.request_stop()
       for thread in threads:
         thread.join()
@@ -272,12 +276,12 @@ class RangeInputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      output = dequeue_many.eval()
+      output = self.evaluate(dequeue_many)
       self.assertAllEqual(list(xrange(range_size)) * num_epochs, output)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -300,7 +304,7 @@ class RangeInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = dequeue_many.eval()
+        output = self.evaluate(dequeue_many)
         key = 10 * (output[0] + 1) + (output[1] + 1)
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -316,7 +320,7 @@ class RangeInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -470,7 +474,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -535,7 +539,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         expected_results = np.arange(i * batch_size, (i + 1) * batch_size)
         max_len = expected_results[-1]
         self.assertAllEqual(results[0], expected_results)
@@ -567,7 +571,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -606,7 +610,7 @@ class BatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertAllEqual(results[0], results[1].values)
@@ -647,7 +651,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -663,7 +667,7 @@ class BatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[0],
                           np.arange(num_batches * batch_size,
                                     num_batches * batch_size + extra_elements))
@@ -705,7 +709,7 @@ class BatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertAllEqual(results[0], results[1].values)
@@ -717,7 +721,7 @@ class BatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), extra_elements)
       self.assertAllEqual(results[0], results[1].values)
@@ -823,7 +827,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
@@ -938,7 +942,7 @@ class BatchTest(test_lib.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
-      batched_np = batched.eval()
+      batched_np = self.evaluate(batched)
 
       coord.request_stop()
       for thread in threads:
@@ -1016,7 +1020,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(3, len(results))
         self.assertEqual(batch_size, len(results[0]))
         self.assertEqual(batch_size, len(results[2]))
@@ -1112,7 +1116,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertEqual(2, len(results))
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[1]), batch_size)
@@ -1197,7 +1201,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -1217,7 +1221,7 @@ class BatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached the final batch with 2 * extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertEqual(len(results[2]), 2 * extra_elements)
@@ -1292,7 +1296,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[1]), batch_size)
@@ -1312,7 +1316,7 @@ class BatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached the final batch with 2 * extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertEqual(len(results[1]), 2 * extra_elements)
@@ -1406,7 +1410,7 @@ class BatchJoinTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(
             [0] * batch_size,
             np.mod(results[0], 2),)
@@ -1525,7 +1529,7 @@ class BatchJoinTest(test_lib.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
-      batched_np = batched.eval()
+      batched_np = self.evaluate(batched)
 
       coord.request_stop()
       for thread in threads:
@@ -1575,7 +1579,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
         self.assertAllEqual(
@@ -1630,7 +1634,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for _ in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
         self.assertAllEqual(
@@ -1641,7 +1645,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[1].dense_shape, [extra_elements, 1])
       self.assertAllEqual(results[2], [b"string"] * extra_elements)
       all_counts.extend(results[0])
@@ -1683,7 +1687,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
@@ -1733,7 +1737,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
@@ -1745,7 +1749,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[0].shape, [extra_elements])
       self.assertAllEqual(results[1].dense_shape, [extra_elements, 1])
       self.assertAllEqual(results[2], [b"string"] * extra_elements)
@@ -1813,7 +1817,7 @@ class ShuffleBatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
@@ -1986,7 +1990,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(3, len(results))
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -2078,7 +2082,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -2098,7 +2102,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached end with 2 * extra_elements left
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertAllEqual(results[1].dense_shape, [2 * extra_elements, 1])
       self.assertEqual(len(results[2]), 2 * extra_elements)
@@ -2199,7 +2203,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 8a21c39d32344d0027793f1eac3d4f9f43a8d920..b6cac6addfb19e186f177079b1b353d9ea02b5ae 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -183,8 +183,8 @@ class MomentumOptimizerTest(test.TestCase):
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
                                                                     accum1_np,
                                                                     3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
   def testSparseNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -224,8 +224,8 @@ class MomentumOptimizerTest(test.TestCase):
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
                                                                     accum1_np,
                                                                     3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
@@ -303,37 +303,43 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
   def _dbParamsMom01(self):
     """Return dist-belief momentum values.
@@ -445,7 +451,7 @@ class MomentumOptimizerTest(test.TestCase):
       variables.global_variables_initializer().run()
       for i in xrange(num_samples):
         mom_update.run(feed_dict={grads0: db_grad[i]})
-        self.assertAllClose(np.array(db_out[i]), var0.eval())
+        self.assertAllClose(np.array(db_out[i]), self.evaluate(var0))
 
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -476,45 +482,57 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertEquals(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([0, 0], var0.eval()[0])
-        self.assertAllClose([0, 0], var0.eval()[1])
-        self.assertAllClose([1, 1], var1.eval()[2])
+        self.assertAllClose([0, 0], self.evaluate(var0)[0])
+        self.assertAllClose([0, 0], self.evaluate(var0)[1])
+        self.assertAllClose([1, 1], self.evaluate(var1)[2])
 
         # Step 1: the momentum accumulators are 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
-        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
         self.assertAllCloseAccordingToType(
-            np.array([.01, .01]), slot1.eval()[2])
+            np.array([0, 0]),
+            self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([.1, .1]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]),
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
         self.assertAllCloseAccordingToType(
-            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+            np.array([0, 0]),
+            self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
+            self.evaluate(var1)[2])
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0)[1])
         self.assertAllCloseAccordingToType(
             np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            slot1.eval()[2])
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
             np.array([
-                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
-                    (0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval()[1])
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]),
+            self.evaluate(var0)[1])
         self.assertAllCloseAccordingToType(
             np.array([
-                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval()[2])
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]),
+            self.evaluate(var1)[2])
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -538,37 +556,43 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertEquals(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update1.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the second momentum accumulators contain the previous update.
         mom_update2.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 0687eb5d4bc6cbe59342c10be94263f5269e7ec6..162fef971db0aca468ae619d249972bc4110f825 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -346,7 +346,7 @@ def _create_monitored_session_with_worker_context(worker_context,  # pylint: dis
       stop_grace_period_secs=stop_grace_period_secs)
 
 
-@tf_export('train.MonitoredTrainingSession')
+@tf_export(v1=['train.MonitoredTrainingSession'])
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
                              checkpoint_dir=None,
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index c870d99de9e3daed2c167455e6ee6ab5efa33a7b..2ceb387ec34d548b9c8af090911e7be4159626d0 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -382,6 +382,16 @@ class MonitoredTrainingSessionTest(test.TestCase):
         self.assertEqual(0, session.run(gstep))
 
 
+class MockExtended(object):
+
+  def __init__(self, between_graph, should_init, should_checkpoint,
+               should_save_summary):
+    self.experimental_between_graph = between_graph
+    self.experimental_should_init = should_init
+    self.should_checkpoint = should_checkpoint
+    self.should_save_summary = should_save_summary
+
+
 class MockStrategy(object):
 
   def __init__(self,
@@ -389,26 +399,8 @@ class MockStrategy(object):
                should_init=True,
                should_checkpoint=None,
                should_save_summary=None):
-    self._between_graph = between_graph
-    self._should_init = should_init
-    self._should_checkpoint = should_checkpoint
-    self._should_save_summary = should_save_summary
-
-  @property
-  def between_graph(self):
-    return self._between_graph
-
-  @property
-  def should_init(self):
-    return self._should_init
-
-  @property
-  def should_checkpoint(self):
-    return self._should_checkpoint
-
-  @property
-  def should_save_summary(self):
-    return self._should_save_summary
+    self.extended = MockExtended(between_graph, should_init, should_checkpoint,
+                                 should_save_summary)
 
 
 class MonitoredTrainingSessionWithDistributeCoordinatorTest(test.TestCase):
@@ -1178,7 +1170,7 @@ class HookedSessionTest(test.TestCase):
       mock_run = FakeSession(sess)
       mon_sess = monitored_session._HookedSession(sess=mock_run, hooks=[])
       a_tensor = constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       output = mon_sess.run(fetches=a_tensor,
                             feed_dict='a_feed',
                             options='an_option',
@@ -1197,7 +1189,7 @@ class HookedSessionTest(test.TestCase):
       mon_sess = monitored_session._HookedSession(
           sess=sess, hooks=[mock_hook, mock_hook2])
       a_tensor = constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       mon_sess.run(a_tensor)
 
       for hook in [mock_hook, mock_hook2]:
@@ -1222,7 +1214,7 @@ class HookedSessionTest(test.TestCase):
       mon_sess = monitored_session._HookedSession(
           sess=sess, hooks=[mock_hook, mock_hook2])
       constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       mon_sess.run(fetches='a_tensor')
       self.assertFalse(mon_sess.should_stop())
@@ -1242,7 +1234,7 @@ class HookedSessionTest(test.TestCase):
       third_tensor = constant_op.constant([10], name='third_tensor')
       mock_hook.request = session_run_hook.SessionRunArgs([another_tensor])
       mock_hook2.request = session_run_hook.SessionRunArgs([third_tensor])
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       output = mon_sess.run(fetches=a_tensor)
       self.assertEqual(output, [0])
@@ -1262,7 +1254,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(mon_sess.run(fetches=add_tensor), [15])
 
@@ -1280,7 +1272,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       feed_dict = {c_tensor: [20]}
       self.assertEqual(
@@ -1301,7 +1293,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={a_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       with self.assertRaisesRegexp(RuntimeError, 'Same tensor is fed'):
         mon_sess.run(fetches=add_tensor)
@@ -1319,7 +1311,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       with self.assertRaisesRegexp(RuntimeError, 'Same tensor is fed'):
         mon_sess.run(fetches=add_tensor, feed_dict={b_tensor: [10]})
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index fc9eb479cc3a0c3fd3dba4de7269b7894d3ec84c..9b5449498b0e60bbdf4e9aa3a5d4a7e1301267f8 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
@@ -95,11 +96,10 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
       # In a replica context, we update variable using the mean of value across
       # replicas.
       def merge_fn(strategy, v, value):
-        value = strategy.reduce(
-            variable_scope.VariableAggregation.MEAN, value, v)
+        value = strategy.reduce(ds_reduce_util.ReduceOp.MEAN, value, v)
         return strategy.update(v, update_fn, value)
 
-      return replica_context.merge_call(merge_fn, variable, value)
+      return replica_context.merge_call(merge_fn, args=(variable, value))
     else:
       strategy = distribution_strategy_context.get_cross_replica_context()
       return strategy.update(variable, update_fn, value)
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index bb2fca66e3c1eed8f3143fa98fe0100a8eb71bbe..41e9dcea8421777269bdfb0b521db4bcec9703ea 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -43,11 +43,11 @@ class MovingAveragesTest(test.TestCase):
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
       variables.global_variables_initializer().run()
-      self.assertAllClose([10.0, 11.0], var.eval())
+      self.assertAllClose([10.0, 11.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
           [10.0 * 0.25 + 1.0 * (1.0 - 0.25), 11.0 * 0.25 + 2.0 * (1.0 - 0.25)],
-          var.eval())
+          self.evaluate(var))
 
   def testAssignMovingAverage(self):
     with self.cached_session():
@@ -56,11 +56,11 @@ class MovingAveragesTest(test.TestCase):
       decay = 0.25
       assign = moving_averages.assign_moving_average(var, val, decay)
       variables.global_variables_initializer().run()
-      self.assertAllClose([0.0, 0.0], var.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var))
       assign.op.run()
-      self.assertAllClose([
-          1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)
-      ], var.eval())
+      self.assertAllClose(
+          [1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)],
+          self.evaluate(var))
 
   def testAssignMovingAverageNewNamingMultipleCalls(self):
     with variable_scope.variable_scope("scope1") as vs1:
@@ -179,39 +179,39 @@ class ExponentialMovingAverageTest(test.TestCase):
     self.assertEqual("add/ExponentialMovingAverage:0", avg2.name)
 
     # Check initial values.
-    self.assertAllClose(tens, var0.eval())
-    self.assertAllClose(thirties, var1.eval())
-    self.assertAllClose(_Repeat(10.0 + 30.0, dim), tensor2.eval())
+    self.assertAllClose(tens, self.evaluate(var0))
+    self.assertAllClose(thirties, self.evaluate(var1))
+    self.assertAllClose(_Repeat(10.0 + 30.0, dim), self.evaluate(tensor2))
 
     # Check that averages are initialized correctly.
-    self.assertAllClose(tens, avg0.eval())
-    self.assertAllClose(thirties, avg1.eval())
+    self.assertAllClose(tens, self.evaluate(avg0))
+    self.assertAllClose(thirties, self.evaluate(avg1))
     # Note that averages of Tensor's initialize to zeros_like since no value
     # of the Tensor is known because the Op has not been run (yet).
-    self.assertAllClose(_Repeat(0.0, dim), avg2.eval())
+    self.assertAllClose(_Repeat(0.0, dim), self.evaluate(avg2))
 
     # Update the averages and check.
     update.run()
     dk = actual_decay
 
     expected = _Repeat(10.0 * dk + 10.0 * (1 - dk), dim)
-    self.assertAllClose(expected, avg0.eval())
+    self.assertAllClose(expected, self.evaluate(avg0))
     expected = _Repeat(30.0 * dk + 30.0 * (1 - dk), dim)
-    self.assertAllClose(expected, avg1.eval())
+    self.assertAllClose(expected, self.evaluate(avg1))
     expected = _Repeat(0.0 * dk + (10.0 + 30.0) * (1 - dk) / _Scale(dk, 1), dim)
-    self.assertAllClose(expected, avg2.eval())
+    self.assertAllClose(expected, self.evaluate(avg2))
 
     # Again, update the averages and check.
     update.run()
     expected = _Repeat((10.0 * dk + 10.0 * (1 - dk)) * dk + 10.0 * (1 - dk),
                        dim)
-    self.assertAllClose(expected, avg0.eval())
+    self.assertAllClose(expected, self.evaluate(avg0))
     expected = _Repeat((30.0 * dk + 30.0 * (1 - dk)) * dk + 30.0 * (1 - dk),
                        dim)
-    self.assertAllClose(expected, avg1.eval())
+    self.assertAllClose(expected, self.evaluate(avg1))
     expected = _Repeat(((0.0 * dk + (10.0 + 30.0) * (1 - dk)) * dk +
                         (10.0 + 30.0) * (1 - dk)) / _Scale(dk, 2), dim)
-    self.assertAllClose(expected, avg2.eval())
+    self.assertAllClose(expected, self.evaluate(avg2))
 
   def testAverageVariablesNoNumUpdates_Scalar(self):
     with self.cached_session():
@@ -274,14 +274,14 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual([], v1_avg.value().op.control_inputs)
       self.assertEqual([], v1_avg.value().op.control_inputs)
       # We should be able to initialize v1_avg before v0.
-      sess.run(v1_avg.initializer)
-      sess.run(v0.initializer)
-      self.assertEqual([10.0], sess.run(v1_avg))
+      self.evaluate(v1_avg.initializer)
+      self.evaluate(v0.initializer)
+      self.assertEqual([10.0], self.evaluate(v1_avg))
       # running ema_op should add to v0 (in addition to updating v1_avg)
       sess.run(assign_to_v1)
-      sess.run(ema_op)
-      self.assertEqual(1, sess.run(v0))
-      self.assertEqual([17.5], sess.run(v1_avg))
+      self.evaluate(ema_op)
+      self.assertEqual(1, self.evaluate(v0))
+      self.assertEqual([17.5], self.evaluate(v1_avg))
 
   @test_util.run_in_graph_and_eager_modes
   def testBasicEager(self):
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 8e400f2aebaeb63cea6636682373b3db6626d70c..8cd5311b3147c42d73fe44e68fa850dbedba7a8d 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -24,6 +24,7 @@ import abc
 
 import six
 
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -465,11 +466,7 @@ class Optimizer(
         # Have to be careful to call distribute_lib.get_loss_reduction()
         # *after* loss() is evaluated, so we know what loss reduction it uses.
         # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-        if (distribute_lib.get_loss_reduction() ==
-            variable_scope.VariableAggregation.MEAN):
-          num_replicas = distribute_ctx.get_distribution_strategy().num_replicas
-          if num_replicas > 1:
-            loss_value *= (1. / num_replicas)
+        loss_value = self._scale_loss(loss_value)
 
       if var_list is None:
         var_list = tape.watched_variables()
@@ -486,11 +483,7 @@ class Optimizer(
           "be a function when eager execution is enabled.")
 
     # Scale loss if using a "mean" loss reduction and multiple replicas.
-    if (distribute_lib.get_loss_reduction() ==
-        variable_scope.VariableAggregation.MEAN):
-      num_replicas = distribute_ctx.get_distribution_strategy().num_replicas
-      if num_replicas > 1:
-        loss *= (1. / num_replicas)
+    loss = self._scale_loss(loss)
 
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
@@ -526,6 +519,15 @@ class Optimizer(
          if g is not None and v.dtype != dtypes.resource])
     return grads_and_vars
 
+  @staticmethod
+  def _scale_loss(loss_value):
+    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= (1. / num_replicas)
+    return loss_value
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -563,7 +565,7 @@ class Optimizer(
     if distribute_ctx.has_distribution_strategy():
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
       return distribute_ctx.get_replica_context().merge_call(
-          self._distributed_apply, grads_and_vars, global_step, name)
+          self._distributed_apply, args=(grads_and_vars, global_step, name))
 
     # No DistributionStrategy case.
     grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
@@ -656,10 +658,10 @@ class Optimizer(
     Returns:
       An `Operation` that applies the specified gradients across all
       replicas. If `global_step` was not None, that operation also
-      increments `global_step`.
+      increments `global_step`
     """
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
     # Note that this is called in a cross-replica context.
@@ -693,21 +695,23 @@ class Optimizer(
       update_ops = [
           op
           for grad, var in grads_and_vars
-          for op in distribution.update(var, update, grad, grouped=False)
+          for op in distribution.extended.update(
+              var, update, args=(grad,), group=False)
       ]
 
       def finish(self, update_ops):
         return self._finish(update_ops, "update")
 
-      non_slot_devices = distribution.non_slot_devices(var_list)
-      finish_updates = distribution.update_non_slot(
-          non_slot_devices, finish, self, update_ops, grouped=False)
+      non_slot_devices = distribution.extended.non_slot_devices(var_list)
+      finish_updates = distribution.extended.update_non_slot(
+          non_slot_devices, finish, args=(self, update_ops), group=False)
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
         with ops.control_dependencies(finish_updates):
-          apply_updates = distribution.update(
-              global_step, state_ops.assign_add, 1, name=name)
+          apply_updates = distribution.extended.update(
+              global_step, state_ops.assign_add, args=(1,),
+              kwargs={"name": name})
 
       if not context.executing_eagerly():
         if isinstance(apply_updates, ops.Tensor):
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index 7a7d01d50e0b6dc639d0d511f03d121c3a9e5c73..5ed0a30285987050cbec9c49fe8093c977e3c818 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -79,13 +79,13 @@ class OptimizerTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd through optimizer
         opt_op.run()
         # Validate updated params
-        self.assertAllClose([-14., -13.], var0.eval())
-        self.assertAllClose([-6., -5.], var1.eval())
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))
 
   def testPrecomputedGradient(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -102,15 +102,15 @@ class OptimizerTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd through optimizer
         opt_op.run()
         # Validate updated params
         self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
-                            var0.eval())
+                            self.evaluate(var0))
         self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
-                            var1.eval())
+                            self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testNoVariables(self):
@@ -257,13 +257,13 @@ class OptimizerTest(test.TestCase):
 
       variables.global_variables_initializer().run()
       # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([3.0, 4.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
       # Run 1 step of sgd through optimizer
       opt_op.run()
       # Validate updated params
-      self.assertAllClose([-0.1, -0.1], var0.eval())
-      self.assertAllClose([0., 0.], var1.eval())
+      self.assertAllClose([-0.1, -0.1], self.evaluate(var0))
+      self.assertAllClose([0., 0.], self.evaluate(var1))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/training/proximal_adagrad_test.py b/tensorflow/python/training/proximal_adagrad_test.py
index 74e06a5e2e68adc1b214110c6fc2268e50b30879..272f9019e7d43ea0733037c5f0eea2c5f4fe4e90 100644
--- a/tensorflow/python/training/proximal_adagrad_test.py
+++ b/tensorflow/python/training/proximal_adagrad_test.py
@@ -106,12 +106,13 @@ class ProximalAdagradOptimizerTest(test.TestCase):
         sgd_op = proximal_adagrad.ProximalAdagradOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0, 1]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
   def testProximalAdagradWithL1(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/python/training/proximal_gradient_descent_test.py b/tensorflow/python/training/proximal_gradient_descent_test.py
index f77f68b23432a59f509e73158ee6893021bbc138..a9355f482462cc7f824c0a3038ffce8122879b84 100644
--- a/tensorflow/python/training/proximal_gradient_descent_test.py
+++ b/tensorflow/python/training/proximal_gradient_descent_test.py
@@ -103,12 +103,13 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
             1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
   def testProximalGradientDescentWithL1_L2(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 15fe42bbd851fec831ef2a84401c1c7f1cac1973..65c2c13d8bbd3be85c7f6986daa7948cb606ee3c 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -58,7 +58,7 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
   def testTwoOps(self):
     with self.cached_session() as sess:
@@ -80,8 +80,8 @@ class QueueRunnerTest(test.TestCase):
       for t in threads:
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
-      self.assertEqual(3, var0.eval())
-      self.assertEqual(30, var1.eval())
+      self.assertEqual(3, self.evaluate(var0))
+      self.assertEqual(30, self.evaluate(var1))
 
   def testExceptionsCaptured(self):
     with self.cached_session() as sess:
@@ -121,11 +121,11 @@ class QueueRunnerTest(test.TestCase):
       # It should have terminated cleanly.
       self.assertEqual(0, len(qr.exceptions_raised))
       # The 2 values should be in queue1.
-      self.assertEqual(10.0, dequeue1.eval())
-      self.assertEqual(10.0, dequeue1.eval())
+      self.assertEqual(10.0, self.evaluate(dequeue1))
+      self.assertEqual(10.0, self.evaluate(dequeue1))
       # And queue1 should now be closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError, "is closed"):
-        dequeue1.eval()
+        self.evaluate(dequeue1)
 
   def testRespectCoordShouldStop(self):
     with self.cached_session() as sess:
@@ -149,7 +149,7 @@ class QueueRunnerTest(test.TestCase):
       coord.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 0.
-      self.assertEqual(0, var.eval())
+      self.assertEqual(0, self.evaluate(var))
 
   def testRequestStopOnException(self):
     with self.cached_session() as sess:
@@ -263,7 +263,7 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
   def testStartQueueRunnersRaisesIfNotASession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
@@ -310,7 +310,7 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
   def testQueueRunnerSerializationRoundTrip(self):
     graph = ops.Graph()
diff --git a/tensorflow/python/training/rmsprop_test.py b/tensorflow/python/training/rmsprop_test.py
index b63abe0529515b570c420f53919d24b51c1e2665..a9b8954e39db49f36d101bf05678c3c1880041e0 100644
--- a/tensorflow/python/training/rmsprop_test.py
+++ b/tensorflow/python/training/rmsprop_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -92,7 +93,7 @@ class RMSPropOptimizerTest(test.TestCase):
     # TODO(yori): Use ParameterizedTest when available
     for (dtype, learning_rate, decay, momentum,
          epsilon, centered, use_resource) in _TESTPARAMS:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         # Initialize variables for numpy implementation.
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
@@ -115,7 +116,7 @@ class RMSPropOptimizerTest(test.TestCase):
             centered=centered)
 
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         mg0 = opt.get_slot(var0, "mg")
         self.assertEqual(mg0 is not None, centered)
@@ -138,12 +139,12 @@ class RMSPropOptimizerTest(test.TestCase):
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 4 steps of RMSProp
         for _ in range(1, 5):
-          update.run()
+          self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
               var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
@@ -154,14 +155,14 @@ class RMSPropOptimizerTest(test.TestCase):
 
           # Validate updated params
           if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -176,14 +177,15 @@ class RMSPropOptimizerTest(test.TestCase):
             momentum=0.0,
             epsilon=0.0,
             centered=False).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0., 1.]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0., 1.]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
   def testMinimizeSparseResourceVariableCentered(self):
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -198,20 +200,21 @@ class RMSPropOptimizerTest(test.TestCase):
             momentum=0.0,
             epsilon=1.0,
             centered=True).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
   def testSparse(self):
     # TODO(yori): Use ParameterizedTest when available
     for (dtype, learning_rate, decay,
          momentum, epsilon, centered, _) in _TESTPARAMS:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         # Initialize variables for numpy implementation.
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
@@ -235,7 +238,7 @@ class RMSPropOptimizerTest(test.TestCase):
             epsilon=epsilon,
             centered=centered)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         mg0 = opt.get_slot(var0, "mg")
         self.assertEqual(mg0 is not None, centered)
@@ -258,12 +261,12 @@ class RMSPropOptimizerTest(test.TestCase):
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 4 steps of RMSProp
         for _ in range(1, 5):
-          update.run()
+          self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
               var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
@@ -274,18 +277,18 @@ class RMSPropOptimizerTest(test.TestCase):
 
           # Validate updated params
           if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testWithoutMomentum(self):
     for dtype in [dtypes.half, dtypes.float32]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -293,7 +296,7 @@ class RMSPropOptimizerTest(test.TestCase):
         opt = rmsprop.RMSPropOptimizer(
             learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
@@ -305,34 +308,36 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertTrue(mom1 is not None)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the rms accumulators where 1. So we should see a normal
         # update: v -= grad * learning_rate
-        update.run()
+        self.evaluate(update)
         # Check the root mean square accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
+            np.array([0.901, 0.901]), self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
+            np.array([0.90001, 0.90001]), self.evaluate(rms1))
         # Check the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
         # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
+        self.evaluate(update)
         # Check the rms accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]),
+            self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]),
+            self.evaluate(rms1))
         # Check the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -340,18 +345,18 @@ class RMSPropOptimizerTest(test.TestCase):
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
   def testWithMomentum(self):
     for dtype in [dtypes.half, dtypes.float32]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -360,7 +365,7 @@ class RMSPropOptimizerTest(test.TestCase):
         opt = rmsprop.RMSPropOptimizer(
             learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
@@ -372,57 +377,61 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertTrue(mom1 is not None)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: rms = 1, mom = 0. So we should see a normal
         # update: v -= grad * learning_rate
-        update.run()
+        self.evaluate(update)
         # Check the root mean square accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
+            np.array([0.901, 0.901]), self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
+            np.array([0.90001, 0.90001]), self.evaluate(rms1))
         # Check the momentum accumulators
         self.assertAllCloseAccordingToType(
             np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]),
+            self.evaluate(mom0))
         self.assertAllCloseAccordingToType(
             np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]),
+            self.evaluate(mom1))
 
         # Check that the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
         # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
+        self.evaluate(update)
         # Check the rms accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]),
+            self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]),
+            self.evaluate(rms1))
         self.assertAllCloseAccordingToType(
             np.array([
                 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
                 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
-            ]), mom0.eval())
+            ]), self.evaluate(mom0))
         self.assertAllCloseAccordingToType(
             np.array([
                 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
                 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
-            ]), mom1.eval())
+            ]), self.evaluate(mom1))
 
         # Check the parameters.
         self.assertAllCloseAccordingToType(
@@ -433,7 +442,7 @@ class RMSPropOptimizerTest(test.TestCase):
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
                 (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                  (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
 
         self.assertAllCloseAccordingToType(
             np.array([
@@ -443,7 +452,7 @@ class RMSPropOptimizerTest(test.TestCase):
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
                 (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                  (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
   def testCallableParams(self):
     with context.eager_mode():
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index ccbb1c6b5740797446a95e6acc20250e79ad1979..a29926a57df847fd6553e0813a5e2dfeebb3885e 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -781,8 +781,12 @@ class BaseSaverBuilder(object):
 
     with ops.name_scope(name, "save",
                         [saveable.op for saveable in saveables]) as name:
-      # Add the Constant string tensor for the filename.
-      filename_tensor = constant_op.constant(filename or "model")
+      # Add a placeholder string tensor for the filename.
+      filename_tensor = array_ops.placeholder_with_default(
+          filename or "model", shape=(), name="filename")
+      # Keep the name "Const" for backwards compatibility.
+      filename_tensor = array_ops.placeholder_with_default(
+          filename_tensor, shape=(), name="Const")
 
       # Add the save ops.
       if sharded:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index efb464410bda69706ba22f1bc8368c43110939e4..7bc0a178a48504cf9542938210cae82fc8c2a550 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -227,7 +227,7 @@ class SaverTest(test.TestCase):
         w1 = resource_variable_ops.ResourceVariable(1.0, name="w1")
         w2 = resource_variable_ops.ResourceVariable(2.0, name="w2")
         graph_saver = saver_module.Saver([w1, w2])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         graph_saver.save(sess, graph_ckpt_prefix)
 
     with context.eager_mode():
@@ -260,7 +260,7 @@ class SaverTest(test.TestCase):
         w3 = resource_variable_ops.ResourceVariable(0.0, name="w3")
         w4 = resource_variable_ops.ResourceVariable(0.0, name="w4")
         graph_saver = saver_module.Saver([w3, w4])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         graph_saver.restore(sess, eager_ckpt_prefix)
         self.assertAllEqual(w3.eval(), 3.0)
         self.assertAllEqual(w4.eval(), 4.0)
@@ -326,7 +326,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initialize all variables
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
       self.assertEqual(10.0, v0.eval())
@@ -376,7 +376,7 @@ class SaverTest(test.TestCase):
     with self.cached_session() as sess:
       tensor = sess.graph.get_tensor_by_name(
           save.saver_def.filename_tensor_name)
-      self.assertEqual(sess.run(tensor), filename)
+      self.assertEqual(self.evaluate(tensor), filename)
 
   def testInvalidPath(self):
     v0 = variables.VariableV1(0, name="v0")
@@ -742,7 +742,7 @@ class SaverTest(test.TestCase):
       try:
         with self.cached_session() as sess:
           # Initialize all variables
-          sess.run(init_all_op)
+          self.evaluate(init_all_op)
 
           # Check that the parameter nodes have been initialized.
           self.assertEqual(10.0, v0.eval())
@@ -777,7 +777,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initialize all variables
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
       self.assertEqual(10.0, v0.eval())
@@ -824,11 +824,11 @@ class SaverTest(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph) as sess:
       orig_vars = _model()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       save = saver_module.Saver(max_to_keep=1)
       variables.global_variables_initializer().run()
       save.save(sess, save_dir)
-      orig_vals = sess.run(orig_vars)
+      orig_vals = self.evaluate(orig_vars)
 
     restore_graph = ops_lib.Graph()
     with restore_graph.as_default(), self.session(
@@ -836,7 +836,7 @@ class SaverTest(test.TestCase):
       restored_vars = _model()
       save = saver_module.Saver(max_to_keep=1)
       save.restore(sess, save_dir)
-      restored_vals = sess.run(restored_vars)
+      restored_vals = self.evaluate(restored_vars)
 
     for orig, restored in zip(orig_vals, restored_vals):
       self.assertAllEqual(orig, restored)
@@ -1832,8 +1832,8 @@ class MetaGraphTest(test.TestCase):
       self.assertEqual(1, len(savers.value))
 
       # Verifies that saver0 graph nodes are omitted from the saver1 export
-      self.assertEqual(29, len(meta_graph_def0.graph_def.node))
-      self.assertEqual(19, len(meta_graph_def1.graph_def.node))
+      self.assertEqual(33, len(meta_graph_def0.graph_def.node))
+      self.assertEqual(21, len(meta_graph_def1.graph_def.node))
 
   def testBinaryAndTextFormat(self):
     test_dir = self._get_test_dir("binary_and_text")
@@ -1949,7 +1949,7 @@ class MetaGraphTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initializes all the variables.
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
       # Runs to logit.
       sess.run(logits)
       # Creates a saver.
@@ -1991,7 +1991,7 @@ class MetaGraphTest(test.TestCase):
       ops_lib.add_to_collection("train_op", train_op)
 
       # Runs train_op.
-      sess.run(train_op)
+      self.evaluate(train_op)
 
       # Generates MetaGraphDef.
       saver_module.export_meta_graph(train_filename)
@@ -2005,7 +2005,7 @@ class MetaGraphTest(test.TestCase):
       # Restores from checkpoint.
       new_saver.restore(sess, saver0_ckpt)
       train_op = ops_lib.get_collection("train_op")[0]
-      sess.run(train_op)
+      self.evaluate(train_op)
 
   def testGraphExtension(self):
     test_dir = self._get_test_dir("graph_extension")
@@ -2037,7 +2037,7 @@ class MetaGraphTest(test.TestCase):
 
       # Generate a MetaGraphDef containing the while loop.
       with session.Session() as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         sess.run(output)
         saver = saver_module.Saver()
         saver.save(sess, saver_ckpt)
@@ -2053,8 +2053,8 @@ class MetaGraphTest(test.TestCase):
       no_constfold_config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
       with session.Session(config=no_constfold_config) as sess:
-        sess.run(init_op)
-        expected_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        expected_grad_value = self.evaluate(grad)
 
     # Restore the MetaGraphDef into a new Graph.
     with ops_lib.Graph().as_default():
@@ -2070,8 +2070,8 @@ class MetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with session.Session(config=no_constfold_config) as sess:
-        sess.run(init_op)
-        actual_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
   def _testWhileLoopAndGradientSerDes(self, outer_body_fn):
@@ -2140,13 +2140,14 @@ class MetaGraphTest(test.TestCase):
       ops = [o.name for o in meta_graph_def.meta_info_def.stripped_op_list.op]
       if save._write_version is saver_pb2.SaverDef.V1:
         self.assertEqual(ops, [
-            "Add", "Assign", "Const", "Identity", "NoOp", "RestoreV2",
-            "SaveSlices", "Sub", "VariableV2"
+            "Add", "Assign", "Const", "Identity", "NoOp",
+            "PlaceholderWithDefault", "RestoreV2", "SaveSlices", "Sub",
+            "VariableV2"
         ])
       else:
         self.assertEqual(ops, [
-            "Add", "Assign", "Const", "Identity", "NoOp", "RestoreV2", "SaveV2",
-            "Sub", "VariableV2"
+            "Add", "Assign", "Const", "Identity", "NoOp",
+            "PlaceholderWithDefault", "RestoreV2", "SaveV2", "Sub", "VariableV2"
         ])
 
       # Test calling stripped_op_list_for_graph directly
@@ -2208,7 +2209,7 @@ class MetaGraphTest(test.TestCase):
                                                       logits=logit, name="cost")
       adam.AdamOptimizer().minimize(cost, name="optimize")
       saver = saver_module.Saver()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver.save(sess, filename)
 
     graph = ops_lib.Graph()
@@ -2245,7 +2246,7 @@ class MetaGraphTest(test.TestCase):
 
       # Create a variable in graph_2 under scope "my_scope".
       variables.VariableV1(array_ops.zeros([10]), name="my_scope/my_var")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       # Restore the checkpoint into a different scope "subgraph_2".
       new_saver_2 = saver_module.import_meta_graph(
           filename + ".meta", graph=graph_2, import_scope="subgraph_2")
@@ -2278,7 +2279,7 @@ class MetaGraphTest(test.TestCase):
                                                       logits=logit, name="cost")
       adam.AdamOptimizer().minimize(cost, name="optimize")
       saver = saver_module.Saver()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver.save(sess, filename)
 
     graph = ops_lib.Graph()
@@ -2315,12 +2316,12 @@ class MetaGraphTest(test.TestCase):
           meta_graph_def, clear_devices=False, import_scope="new_model")
       # Device refers to GPU, which is not available here.
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(
           meta_graph_def, clear_devices=True, import_scope="new_model")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(["new_model/optimize"], {
           "new_model/image:0": np.random.random([1, 784]),
           "new_model/label:0": np.random.randint(
@@ -2347,7 +2348,7 @@ class MetaGraphTest(test.TestCase):
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(["new_model/optimize"], {
           "new_model/image:0": np.random.random([1, 784]),
           "new_model/label:0": np.random.randint(
@@ -2373,7 +2374,7 @@ class MetaGraphTest(test.TestCase):
                            meta_graph_def_from_graph_def]:
       with session.Session(graph=ops_lib.Graph()) as sess:
         saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in range(10):
           self.assertEqual(i * i, sess.run("new_model/output:0"))
         with self.assertRaises(errors.OutOfRangeError):
@@ -2399,7 +2400,7 @@ class CheckpointReaderTest(test.TestCase):
     save_path = os.path.join(self.get_temp_dir(),
                              "ckpt_for_debug_string" + str(self._WRITE_VERSION))
     with self.cached_session() as sess:
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
       # Saves a checkpoint.
       save.save(sess, save_path)
 
@@ -2545,7 +2546,7 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
 
     with self.session(graph=graph) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.save(sess, os.path.join(test_dir, ckpt_filename), write_state=False)
 
@@ -2610,7 +2611,7 @@ class ScopedGraphTest(test.TestCase):
       # Verify that we have restored weights1 and biases1.
       sess.run([weights1, biases1])
       # Initialize the rest of the variables and run logits.
-      sess.run(init_rest_op)
+      self.evaluate(init_rest_op)
       sess.run(logits)
 
   # Verifies that we can save the subgraph under "hidden1" and restore it
@@ -2639,7 +2640,7 @@ class ScopedGraphTest(test.TestCase):
 
     # Run the graph and save scoped checkpoint.
     with self.session(graph=graph1) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           export_scope="hidden1")
       saver = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
@@ -2695,7 +2696,7 @@ class ScopedGraphTest(test.TestCase):
 
     # Run the graph and save scoped checkpoint.
     with self.session(graph=graph1) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           graph_def=graph1.as_graph_def(), export_scope="hidden1")
       saver = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
@@ -2963,7 +2964,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     a_saver = saver_module.Saver([a])
     b_saver = saver_module.Saver([b])
     with self.cached_session() as sess:
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
       save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
       with self.assertRaisesRegexp(
           errors.NotFoundError, "Key b not found in checkpoint"):
@@ -2985,7 +2986,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       a_saver = saver_module.Saver([a])
 
       with self.session(graph=g) as sess:
-        sess.run(a.initializer)
+        self.evaluate(a.initializer)
         save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
 
     with ops_lib.Graph().as_default() as g:
diff --git a/tensorflow/python/training/server_lib.i b/tensorflow/python/training/server_lib.i
deleted file mode 100644
index 94250304f853ba1f942506bcfb3240a4adab3797..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/server_lib.i
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-%nothread tensorflow::ServerInterface::Join;
-
-%include "tensorflow/python/platform/base.i"
-
-//%newobject tensorflow::NewServer;
-
-%typemap(in) const ServerDef& (tensorflow::ServerDef temp) {
-  char* c_string;
-  Py_ssize_t py_size;
-  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
-    // Python has raised an error (likely TypeError or UnicodeEncodeError).
-    SWIG_fail;
-  }
-
-  if (!temp.ParseFromString(string(c_string, py_size))) {
-    PyErr_SetString(
-        PyExc_TypeError,
-        "The ServerDef could not be parsed as a valid protocol buffer");
-    SWIG_fail;
-  }
-  $1 = &temp;
-}
-
-%typemap(in, numinputs=0)
-    std::unique_ptr<tensorflow::ServerInterface>* out_server (
-        std::unique_ptr<tensorflow::ServerInterface> temp) {
-  $1 = &temp;
-}
-
-%typemap(argout) std::unique_ptr<tensorflow::ServerInterface>* out_server {
-  // TODO(mrry): Convert this to SWIG_POINTER_OWN when the issues with freeing
-  // a server are fixed.
-  $result = SWIG_NewPointerObj($1->release(),
-                               $descriptor(tensorflow::ServerInterface*),
-                               0);
-}
-
-%feature("except") tensorflow::ServerInterface::Join {
-  // Let other threads run while we wait for the server to shut down.
-  Py_BEGIN_ALLOW_THREADS
-  $action
-  Py_END_ALLOW_THREADS
-}
-
-%{
-#include "tensorflow/c/tf_status_helper.h"
-#include "tensorflow/core/distributed_runtime/server_lib.h"
-#include "tensorflow/core/lib/core/status.h"
-
-using tensorflow::ServerDef;
-
-static void PyServer_New(const ServerDef& server_def,
-                         std::unique_ptr<tensorflow::ServerInterface>* out_server,
-                         TF_Status* out_status) {
-  tensorflow::Status status =
-      tensorflow::NewServer(server_def, out_server);
-  tensorflow::Set_TF_Status_from_Status(out_status, status);
-}
-
-static void PyServer_Start(
-    tensorflow::ServerInterface* in_server,
-    TF_Status* out_status) {
-  tensorflow::Set_TF_Status_from_Status(out_status, in_server->Start());
-}
-
-static void PyServer_Stop(
-    tensorflow::ServerInterface* in_server,
-    TF_Status* out_status) {
-  tensorflow::Set_TF_Status_from_Status(out_status, in_server->Stop());
-}
-
-static void PyServer_Join(
-    tensorflow::ServerInterface* in_server,
-    TF_Status* out_status) {
-  tensorflow::Set_TF_Status_from_Status(out_status, in_server->Join());
-}
-%}
-
-// Wrap this function.
-void PyServer_New(const ServerDef& server_def,
-                  std::unique_ptr<tensorflow::ServerInterface>* out_server,
-                  TF_Status* out_status);
-void PyServer_Start(tensorflow::ServerInterface* in_server,
-                    TF_Status* out_status);
-void PyServer_Stop(tensorflow::ServerInterface* in_server,
-                   TF_Status* out_status);
-void PyServer_Join(tensorflow::ServerInterface* in_server,
-                   TF_Status* out_status);
-
-%ignoreall
-
-%unignore tensorflow;
-%unignore tensorflow::ServerDef;
-%unignore tensorflow::ServerInterface;
-%unignore tensorflow::ServerInterface::~ServerInterface;
-%unignore tensorflow::ServerInterface::target;
-
-%unignore PyServer_New;
-%unignore PyServer_Start;
-%unignore PyServer_Stop;
-%unignore PyServer_Join;
-
-%include "tensorflow/core/distributed_runtime/server_lib.h"
-
-%unignoreall
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 46543413e40a5a212b180b0cdeb2280148d606c5..302ca2dd44b99d2a5cfeffa163d95634513f9eaa 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
-from tensorflow.python import pywrap_tensorflow
+from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
@@ -143,12 +143,24 @@ class Server(object):
     """
     self._server_def = _make_server_def(server_or_cluster_def,
                                         job_name, task_index, protocol, config)
-    with errors.raise_exception_on_not_ok_status() as status:
-      self._server = pywrap_tensorflow.PyServer_New(
-          self._server_def.SerializeToString(), status)
+    self._server = c_api.TF_NewServer(self._server_def.SerializeToString())
     if start:
       self.start()
 
+  def __del__(self):
+    try:
+      c_api.TF_ServerStop(self._server)
+      # Clean shutdown of servers is not yet implemented, so
+      # we leak instead of calling c_api.TF_DeleteServer here.
+      # See:
+      # https://github.com/tensorflow/tensorflow/blob/0495317a6e9dd4cac577b9d5cf9525e62b571018/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h#L73
+    except errors.UnimplementedError:
+      pass
+    except AttributeError:
+      # At shutdown, `c_api` may have been garbage collected.
+      pass
+    self._server = None
+
   def start(self):
     """Starts this server.
 
@@ -156,8 +168,7 @@ class Server(object):
       tf.errors.OpError: Or one of its subclasses if an error occurs while
         starting the TensorFlow server.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.PyServer_Start(self._server, status)
+    c_api.TF_ServerStart(self._server)
 
   def join(self):
     """Blocks until the server has shut down.
@@ -168,8 +179,7 @@ class Server(object):
       tf.errors.OpError: Or one of its subclasses if an error occurs while
         joining the TensorFlow server.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.PyServer_Join(self._server, status)
+    c_api.TF_ServerJoin(self._server)
 
   @property
   def server_def(self):
@@ -198,7 +208,7 @@ class Server(object):
     Returns:
       A string containing a session target for this server.
     """
-    return self._server.target()
+    return c_api.TF_ServerTarget(self._server)
 
   @staticmethod
   def create_local_server(config=None, start=True):
diff --git a/tensorflow/python/training/server_lib_sparse_job_test.py b/tensorflow/python/training/server_lib_sparse_job_test.py
index 1a6b44b90e8d4d4c3faf9f0ac596942a7ff3d09f..8c2745b51aa5505e2b25191322bfac5ab8cac6c4 100644
--- a/tensorflow/python/training/server_lib_sparse_job_test.py
+++ b/tensorflow/python/training/server_lib_sparse_job_test.py
@@ -36,7 +36,7 @@ class SparseJobTest(test.TestCase):
       a = constant_op.constant(1.0)
 
     with session.Session(server.target) as sess:
-      self.assertEqual(1.0, sess.run(a))
+      self.assertEqual(1.0, self.evaluate(a))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index cf995707fc56448e7fe5354d162581947604f382..323e94c257c4116a6120e28b2355a42657d1bea8 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -174,7 +174,7 @@ class GrpcServerTest(test.TestCase):
     # is not supported, but it should successfully ignore it.
     sess = session.InteractiveSession(server.target)
     c = constant_op.constant(42.0)
-    self.assertEqual(42.0, c.eval())
+    self.assertEqual(42.0, self.evaluate(c))
     sess.close()
 
   def testSetConfiguration(self):
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index 6d6364169fd4b9afa6f64fb9aadc283aab261cbb..382c15bb55c4169258d5b3f3ba63d99de10854dd 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -41,7 +41,7 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([1.0, 2.5], slot.eval())
+      self.assertAllEqual([1.0, 2.5], self.evaluate(slot))
 
   def testCreateSlotFromTensor(self):
     with self.cached_session():
@@ -53,7 +53,7 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([2.0, 5.0], slot.eval())
+      self.assertAllEqual([2.0, 5.0], self.evaluate(slot))
 
   def testCreateZerosSlotFromVariable(self):
     with self.cached_session():
@@ -67,7 +67,7 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
   def testCreateZerosSlotFromDynamicShapedVariable(self):
     with self.cached_session():
@@ -88,7 +88,7 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
   def testCreateZerosSlotFromTensor(self):
     with self.cached_session():
@@ -101,7 +101,7 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
   def testCreateZerosSlotFromDynamicShapedTensor(self):
     with self.cached_session():
@@ -116,7 +116,7 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
   def testCreateSlotFromVariableRespectsScope(self):
     # See discussion on #2740.
diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py
index 7cd99d86801e659b369419796848babb49ac9ff4..9dc88d78ccc3073112450f9bbd6c1b5191464b06 100644
--- a/tensorflow/python/training/supervisor_test.py
+++ b/tensorflow/python/training/supervisor_test.py
@@ -100,7 +100,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir=logdir)
       sess = sv.prepare_or_wait_for_session("")
       for _ in xrange(10):
-        sess.run(my_op)
+        self.evaluate(my_op)
       sess.close()
       sv.stop()
 
@@ -111,7 +111,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir=logdir)
       with sv.managed_session("") as sess:
         for _ in xrange(10):
-          sess.run(my_op)
+          self.evaluate(my_op)
       # Supervisor has been stopped.
       self.assertTrue(sv.should_stop())
 
@@ -128,7 +128,7 @@ class SupervisorTest(test.TestCase):
             if step == 1:
               raise RuntimeError("failing here")
             else:
-              sess.run(my_op)
+              self.evaluate(my_op)
       # Supervisor has been stopped.
       self.assertTrue(sv.should_stop())
       self.assertEqual(1, last_step)
@@ -146,7 +146,7 @@ class SupervisorTest(test.TestCase):
             raise errors_impl.OutOfRangeError(my_op.op.node_def, my_op.op,
                                               "all done")
           else:
-            sess.run(my_op)
+            self.evaluate(my_op)
       # Supervisor has been stopped.  OutOfRangeError was not thrown.
       self.assertTrue(sv.should_stop())
       self.assertEqual(3, last_step)
@@ -335,7 +335,7 @@ class SupervisorTest(test.TestCase):
       sess = sv.prepare_or_wait_for_session(
           "", config=config_pb2.ConfigProto(device_count={"CPU": 2}))
       for _ in xrange(10):
-        sess.run(my_op)
+        self.evaluate(my_op)
       sess.close()
       sv.stop()
 
@@ -799,7 +799,7 @@ class SupervisorTest(test.TestCase):
       v = variables.VariableV1([10.10], name="foo")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
-      self.assertEqual(1.0, v.eval()[0])
+      self.assertEqual(1.0, self.evaluate(v)[0])
 
   # Same as testStandardServicesNoGlobalStep but with a global step.
   # We should get a summary about the step time.
@@ -863,7 +863,7 @@ class SupervisorTest(test.TestCase):
       v = variables.VariableV1([-12], name="global_step")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
-      self.assertEqual(123, v.eval()[0])
+      self.assertEqual(123, self.evaluate(v)[0])
 
   def testNoQueueRunners(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 6a3756fba9fd97b9f2916075f606119360342f5b..fbde8fe3c2a5ee720df4eef9659a1b9ebae9922c 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
 from tensorflow.python.training import session_manager
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -39,7 +40,7 @@ from tensorflow.python.util.tf_export import tf_export
 # rate according to the number of replicas. This change is introduced to be
 # consistent with how gradients are aggregated (averaged) within a batch in a
 # replica.
-@tf_export("train.SyncReplicasOptimizer")
+@tf_export(v1=["train.SyncReplicasOptimizer"])
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
@@ -139,6 +140,12 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   ```
   """
 
+  @deprecation.deprecated(
+      None,
+      "The `SyncReplicaOptimizer` is deprecated. For synchrononous training, "
+      "please use [Distribution Strategies](https://github.com/tensorflow/"
+      "tensorflow/tree/master/tensorflow/contrib/distribute).",
+      warn_once=True)
   def __init__(self,
                opt,
                replicas_to_aggregate,
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index 02164828250e786cae1f21d1a604863829a9f6eb..929dd74ac64c9124aac2f945d07b4aec7260b9dd 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -53,9 +53,9 @@ class TrainingOpsTest(TensorFlowTestCase):
     with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       variables.global_variables_initializer().run()
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
-      out = apply_sgd.eval()
+      out = self.evaluate(apply_sgd)
       self.assertShapeEqual(out, apply_sgd)
       self.assertAllCloseAccordingToType(x - alpha * delta, out)
 
@@ -74,13 +74,13 @@ class TrainingOpsTest(TensorFlowTestCase):
       accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_adagrad = training_ops.apply_adagrad(var, accum, lr, grad)
-      out = apply_adagrad.eval()
+      out = self.evaluate(apply_adagrad)
       self.assertShapeEqual(out, apply_adagrad)
       self.assertAllCloseAccordingToType(x - lr * grad * (y + grad * grad)**
                                          (-0.5), out)
-      self.assertAllCloseAccordingToType(y + grad * grad, accum.eval())
+      self.assertAllCloseAccordingToType(y + grad * grad, self.evaluate(accum))
 
   def _testTypesForFtrl(self,
                         x,
@@ -99,10 +99,10 @@ class TrainingOpsTest(TensorFlowTestCase):
       linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_ftrl = training_ops.apply_ftrl(var, accum, linear, grad, lr, l1, l2,
                                            lr_power)
-      out = apply_ftrl.eval()
+      out = self.evaluate(apply_ftrl)
       self.assertShapeEqual(out, apply_ftrl)
       accum_update = y + grad * grad
       linear_update = z + grad - (accum_update**(-lr_power) - y**
@@ -112,17 +112,19 @@ class TrainingOpsTest(TensorFlowTestCase):
           np.sign(linear_update[i]) * l1 - linear_update[i]) / (quadratic[i]) if
                                np.abs(linear_update[i]) > l1 else 0.0
                                for i in range(linear_update.size)])
-      self.assertAllCloseAccordingToType(accum_update, accum.eval())
+      self.assertAllCloseAccordingToType(accum_update, self.evaluate(accum))
       if x.dtype == np.float16:
         # The calculations here really are not very precise in float16.
-        self.assertAllClose(linear_update, linear.eval(), rtol=2e-2, atol=2e-2)
+        self.assertAllClose(
+            linear_update, self.evaluate(linear), rtol=2e-2, atol=2e-2)
         self.assertAllClose(expected_out, out, rtol=2e-2, atol=2e-2)
       elif x.dtype == np.float32:
         # The calculations here not sufficiently precise in float32.
-        self.assertAllClose(linear_update, linear.eval(), rtol=1e-5, atol=1e-5)
+        self.assertAllClose(
+            linear_update, self.evaluate(linear), rtol=1e-5, atol=1e-5)
         self.assertAllClose(expected_out, out, rtol=1e-5, atol=1e-5)
       else:
-        self.assertAllClose(linear_update, linear.eval())
+        self.assertAllClose(linear_update, self.evaluate(linear))
         self.assertAllClose(expected_out, out)
 
   def testApplyAdagrad(self):
@@ -152,19 +154,19 @@ class TrainingOpsTest(TensorFlowTestCase):
       accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_adagrad = training_ops.sparse_apply_adagrad(
           var, accum, lr, grad,
           constant_op.constant(indices, self._toType(indices.dtype)))
-      out = sparse_apply_adagrad.eval()
+      out = self.evaluate(sparse_apply_adagrad)
       self.assertShapeEqual(out, sparse_apply_adagrad)
 
       for (i, index) in enumerate(indices):
         self.assertAllCloseAccordingToType(
             x[index] - lr * grad[i] * (y[index] + grad[i] * grad[i])**(-0.5),
-            var.eval()[index])
+            self.evaluate(var)[index])
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
-                                           accum.eval()[index])
+                                           self.evaluate(accum)[index])
 
   def _testTypesForSparseFtrl(self,
                               x,
@@ -183,7 +185,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_ftrl = training_ops.sparse_apply_ftrl(
           var,
           accum,
@@ -194,15 +196,16 @@ class TrainingOpsTest(TensorFlowTestCase):
           l1,
           l2,
           lr_power=lr_power)
-      out = sparse_apply_ftrl.eval()
+      out = self.evaluate(sparse_apply_ftrl)
       self.assertShapeEqual(out, sparse_apply_ftrl)
 
       for (i, index) in enumerate(indices):
-        self.assertAllCloseAccordingToType(x[index] - lr * grad[i] *
-                                           (y[index] + grad[i] * grad[i])**
-                                           (lr_power), var.eval()[index])
+        self.assertAllCloseAccordingToType(
+            x[index] - lr * grad[i] * (y[index] + grad[i] * grad[i])**
+            (lr_power),
+            self.evaluate(var)[index])
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
-                                           accum.eval()[index])
+                                           self.evaluate(accum)[index])
 
   def testSparseApplyAdagrad(self):
     for (dtype, index_type) in itertools.product(
@@ -276,13 +279,13 @@ class TrainingOpsTest(TensorFlowTestCase):
       epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(var, var_t.eval())
+      self.assertAllCloseAccordingToType(var, self.evaluate(var_t))
       new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
                                             beta2, epsilon)
       apply_adam = training_ops.apply_adam(var_t, m_t, v_t, beta1_power_t,
                                            beta2_power_t, lr_t, beta1_t,
                                            beta2_t, epsilon_t, grad)
-      out = apply_adam.eval()
+      out = self.evaluate(apply_adam)
       self.assertShapeEqual(out, apply_adam)
       self.assertAllCloseAccordingToType(new_var, out)
 
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 77a420fb732c47226ebc53e1fb9655d48120eef9..86f1b4d5aae31bacfe34141866ee4e7156eaa57b 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -39,7 +39,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
 write_graph = graph_io.write_graph
 
 
-@tf_export('train.global_step')
+@tf_export(v1=['train.global_step'])
 def global_step(sess, global_step_tensor):
   """Small helper to get the global step.
 
@@ -69,7 +69,7 @@ def global_step(sess, global_step_tensor):
   return int(sess.run(global_step_tensor))
 
 
-@tf_export('train.get_global_step')
+@tf_export(v1=['train.get_global_step'])
 def get_global_step(graph=None):
   """Get the global step tensor.
 
@@ -104,7 +104,7 @@ def get_global_step(graph=None):
   return global_step_tensor
 
 
-@tf_export('train.create_global_step')
+@tf_export(v1=['train.create_global_step'])
 def create_global_step(graph=None):
   """Create global step tensor in graph.
 
@@ -145,7 +145,7 @@ def create_global_step(graph=None):
                      ops.GraphKeys.GLOBAL_STEP])
 
 
-@tf_export('train.get_or_create_global_step')
+@tf_export(v1=['train.get_or_create_global_step'])
 def get_or_create_global_step(graph=None):
   """Returns and create (if necessary) the global step tensor.
 
@@ -163,7 +163,7 @@ def get_or_create_global_step(graph=None):
   return global_step_tensor
 
 
-@tf_export('train.assert_global_step')
+@tf_export(v1=['train.assert_global_step'])
 def assert_global_step(global_step_tensor):
   """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
 
diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
index 91a0b53b3a8771d8a1d826c6c63df91c91eec954..f1e719e6dbe10c76390f66b4d2ab570ad166946c 100644
--- a/tensorflow/python/training/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -22,7 +22,7 @@ import os
 import numpy as np
 import six
 
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -49,7 +49,7 @@ class WarmStartingUtilTest(test.TestCase):
     return vocab_file
 
   def _write_checkpoint(self, sess):
-    sess.run(variables.global_variables_initializer())
+    self.evaluate(variables.global_variables_initializer())
     saver = saver_lib.Saver()
     ckpt_prefix = os.path.join(self.get_temp_dir(), "model")
     saver.save(sess, ckpt_prefix, global_step=0)
@@ -125,7 +125,7 @@ class WarmStartingUtilTest(test.TestCase):
         prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(prev_val, fruit_weights.eval(sess))
 
   def testWarmStartVarPrevVarPartitioned(self):
@@ -143,7 +143,7 @@ class WarmStartingUtilTest(test.TestCase):
         prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(prev_val, fruit_weights.eval(sess))
 
   def testWarmStartVarCurrentVarPartitioned(self):
@@ -162,7 +162,7 @@ class WarmStartingUtilTest(test.TestCase):
         prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
@@ -189,7 +189,7 @@ class WarmStartingUtilTest(test.TestCase):
             fruit_weights, prev_tensor_name="old_scope/fruit_weights")
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
@@ -211,7 +211,7 @@ class WarmStartingUtilTest(test.TestCase):
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
@@ -236,7 +236,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[0.3, 0.5, 0.], [0.8, 1.0, 0.], [1.2, 1.5, 0.],
                              [2.3, 2., 0.]], fruit_output_layer.eval(sess))
 
@@ -261,7 +261,7 @@ class WarmStartingUtilTest(test.TestCase):
             self.get_temp_dir(),
             prev_vocab_path,
             previous_vocab_size=2)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Old vocabulary limited to ['apple', 'banana'].
         self.assertAllClose([[0.], [0.], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
@@ -285,7 +285,7 @@ class WarmStartingUtilTest(test.TestCase):
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
@@ -312,7 +312,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[0.3, 0.5, 0.], [0.8, 1.0, 0.], [1.2, 1.5, 0.],
                              [2.3, 2., 0.]], fruit_output_layer.eval(sess))
 
@@ -340,7 +340,7 @@ class WarmStartingUtilTest(test.TestCase):
             self.get_temp_dir(),
             prev_vocab_path,
             current_oov_buckets=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
         fruit_weights_vars = fruit_weights._get_variable_list()
@@ -372,7 +372,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_output_layer, variables.PartitionedVariable))
         fruit_output_layer_vars = fruit_output_layer._get_variable_list()
@@ -404,7 +404,7 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 6,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
         fruit_weights_vars = fruit_weights._get_variable_list()
@@ -438,7 +438,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_output_layer, variables.PartitionedVariable))
         fruit_output_layer_vars = fruit_output_layer._get_variable_list()
@@ -463,7 +463,7 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[10, 1],
             initializer=zeros())
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=[var])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started (init overridden to ones).
         self.assertAllEqual(var.eval(), prev_int_val)
 
@@ -483,7 +483,7 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[10, 1],
             initializer=zeros())
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=["v1"])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started (init overridden to ones).
         self.assertAllEqual(var.eval(), prev_int_val)
 
@@ -519,7 +519,7 @@ class WarmStartingUtilTest(test.TestCase):
                            # This warm-starts both v1 and v1/Momentum, but only
                            # v2 (and not v2/Momentum).
                            vars_to_warm_start=["v1", "v2[^/]"])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify the selection of weights were correctly warm-started (init
         # overridden to ones).
         self.assertAllEqual(v1.eval(), prev_v1_val)
@@ -542,7 +542,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [np.zeros([10, 1])]},
@@ -553,7 +553,7 @@ class WarmStartingUtilTest(test.TestCase):
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=".*sc_int.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [prev_int_val]}, sess)
 
@@ -571,7 +571,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [np.zeros([15, 1])]},
@@ -583,7 +583,7 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
                                   sess)
@@ -605,7 +605,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
@@ -619,7 +619,7 @@ class WarmStartingUtilTest(test.TestCase):
         # vocab is assumed to be same as new vocab.
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_vocab.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
@@ -641,7 +641,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
@@ -657,7 +657,7 @@ class WarmStartingUtilTest(test.TestCase):
             # Explicitly provide the file prefix instead of just the dir.
             os.path.join(self.get_temp_dir(), "model-0"),
             vars_to_warm_start=".*sc_vocab.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
@@ -686,7 +686,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([2, 1])]},
@@ -708,7 +708,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  'banana' isn't in the
         # first two entries of the old vocabulary, so it's newly initialized.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [[[1], [0]]]}, sess)
@@ -729,7 +729,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars,
@@ -741,7 +741,7 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars,
                                   {real_bucket: [prev_bucket_val]}, sess)
@@ -800,7 +800,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, all weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {
@@ -826,7 +826,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {
             sc_int: [prev_int_val],
@@ -865,7 +865,7 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
-        prev_keys_val = sess.run(sc_keys_weights)
+        prev_keys_val = self.evaluate(sc_keys_weights)
 
     def _partitioner(shape, dtype):  # pylint:disable=unused-argument
       # Partition each var into 2 equal slices.
@@ -892,7 +892,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
         # should be correctly warm-started after vocab remapping.
@@ -933,7 +933,7 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
-        prev_keys_val = sess.run(sc_keys_weights)
+        prev_keys_val = self.evaluate(sc_keys_weights)
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
@@ -955,7 +955,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
         # should be correctly warm-started after vocab remapping.
@@ -1024,7 +1024,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_vocab should be correctly warm-started after vocab remapping,
         # and neither of the other two should be warm-started..
@@ -1091,7 +1091,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[emb_vocab_column]):
                     vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab_column should be correctly warm-started after vocab
         # remapping. Missing values are filled in with the EmbeddingColumn's
@@ -1163,7 +1163,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
diff --git a/tensorflow/python/user_ops/user_ops.py b/tensorflow/python/user_ops/user_ops.py
index 20ea3b0f621dc74bd3778d565f8897e47a881d42..3dbacd09e62b65c31266dca94dee5382664833fa 100644
--- a/tensorflow/python/user_ops/user_ops.py
+++ b/tensorflow/python/user_ops/user_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.ops.gen_user_ops import *  # pylint: disable=wildcard-imp
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('user_ops.my_fact')
+@tf_export(v1=['user_ops.my_fact'])
 def my_fact():
   """Example of overriding the generated code for an Op."""
   return _gen_user_ops.fact()
diff --git a/tensorflow/python/util/decorator_utils.py b/tensorflow/python/util/decorator_utils.py
index 7b4363c0e40802779cf47c75c5a5e5a901da37e2..ab9641d96bc28949d9dc81fa91357793dc8fd6ad 100644
--- a/tensorflow/python/util/decorator_utils.py
+++ b/tensorflow/python/util/decorator_utils.py
@@ -75,13 +75,31 @@ def _normalize_docstring(docstring):
 
 def add_notice_to_docstring(
     doc, instructions, no_doc_str, suffix_str, notice):
-  """Adds a deprecation notice to a docstring."""
+  """Adds a deprecation notice to a docstring.
+
+  Args:
+    doc: The original docstring.
+    instructions: A string, describing how to fix the problem.
+    no_doc_str: The default value to use for `doc` if `doc` is empty.
+    suffix_str: Is added to the end of the first line.
+    notice: A list of strings. The main notice warning body.
+
+  Returns:
+    A new docstring, with the notice attached.
+
+  Raises:
+    ValueError: If `notice` is empty.
+  """
   if not doc:
     lines = [no_doc_str]
   else:
     lines = _normalize_docstring(doc).splitlines()
     lines[0] += ' ' + suffix_str
 
+  if not notice:
+    raise ValueError('The `notice` arg must not be empty.')
+
+  notice[0] = 'Warning: ' + notice[0]
   notice = [''] + notice + ([instructions] if instructions else [])
 
   if len(lines) > 1:
diff --git a/tensorflow/python/util/decorator_utils_test.py b/tensorflow/python/util/decorator_utils_test.py
index 64e0cc7f57effe98756cb08d738dc198d982b473..440dcbb6df3ffbaeb0aed4668033750e44518374 100644
--- a/tensorflow/python/util/decorator_utils_test.py
+++ b/tensorflow/python/util/decorator_utils_test.py
@@ -55,8 +55,9 @@ class AddNoticeToDocstringTest(test.TestCase):
         expected)
 
   def test_regular(self):
-    expected = ("Brief (suffix)\n\nGo away\nInstructions\n\nDocstring\n\n"
-                "Args:\n  arg1: desc")
+    expected = (
+        "Brief (suffix)\n\nWarning: Go away\nInstructions\n\nDocstring\n\n"
+        "Args:\n  arg1: desc")
     # No indent for main docstring
     self._check("Brief\n\nDocstring\n\nArgs:\n  arg1: desc", expected)
     # 2 space indent for main docstring, blank lines not indented
@@ -71,7 +72,7 @@ class AddNoticeToDocstringTest(test.TestCase):
                 expected)
 
   def test_brief_only(self):
-    expected = "Brief (suffix)\n\nGo away\nInstructions"
+    expected = "Brief (suffix)\n\nWarning: Go away\nInstructions"
     self._check("Brief", expected)
     self._check("Brief\n", expected)
     self._check("Brief\n  ", expected)
@@ -79,12 +80,12 @@ class AddNoticeToDocstringTest(test.TestCase):
     self._check("\n  Brief\n  ", expected)
 
   def test_no_docstring(self):
-    expected = "Nothing here\n\nGo away\nInstructions"
+    expected = "Nothing here\n\nWarning: Go away\nInstructions"
     self._check(None, expected)
     self._check("", expected)
 
   def test_no_empty_line(self):
-    expected = "Brief (suffix)\n\nGo away\nInstructions\n\nDocstring"
+    expected = "Brief (suffix)\n\nWarning: Go away\nInstructions\n\nDocstring"
     # No second line indent
     self._check("Brief\nDocstring", expected)
     # 2 space second line indent
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index c43589f5c4555180442a1962e25f82e51d677d1b..4c68d1aaae3272ddae27bd44ab98c6c68dbaa9b6 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -54,16 +54,39 @@ def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
       '(deprecated)', main_text)
 
 
-def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
+def _add_deprecated_arg_notice_to_docstring(doc, date, instructions,
+                                            deprecated_names):
   """Adds a deprecation notice to a docstring for deprecated arguments."""
+
+  deprecation_string = ', '.join(sorted(deprecated_names))
+
   return decorator_utils.add_notice_to_docstring(
-      doc, instructions,
-      'DEPRECATED FUNCTION ARGUMENTS',
+      doc, instructions, 'DEPRECATED FUNCTION ARGUMENTS',
       '(deprecated arguments)', [
-          'SOME ARGUMENTS ARE DEPRECATED. '
-          'They will be removed %s.' % (
-              'in a future version' if date is None else ('after %s' % date)),
-          'Instructions for updating:'])
+          'SOME ARGUMENTS ARE DEPRECATED: `(%s)`. '
+          'They will be removed %s.' %
+          (deprecation_string, 'in a future version' if date is None else
+           ('after %s' % date)), 'Instructions for updating:'
+      ])
+
+
+def _add_deprecated_arg_value_notice_to_docstring(doc, date, instructions,
+                                                  deprecated_name_value_dict):
+  """Adds a deprecation notice to a docstring for deprecated arguments."""
+
+  deprecation_string = ', '.join(
+      '%s=%r' % (key, value)
+      for key, value in sorted(deprecated_name_value_dict.items()))
+
+  when = 'in a future version' if date is None else ('after %s' % date)
+
+  return decorator_utils.add_notice_to_docstring(
+      doc, instructions, 'DEPRECATED FUNCTION ARGUMENT VALUES',
+      '(deprecated argument values)', [
+          'SOME ARGUMENT VALUES ARE DEPRECATED: `(%s)`. '
+          'They will be removed %s.' % (deprecation_string, when),
+          'Instructions for updating:'
+      ])
 
 
 def _validate_deprecation_args(date, instructions):
@@ -403,10 +426,11 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
             pos, spec.has_ok_value, spec.ok_value)
     return deprecated_positional_args
 
+  deprecated_arg_names = _get_arg_names_to_ok_vals()
+
   def deprecated_wrapper(func):
     """Deprecation decorator."""
     decorator_utils.validate_callable(func, 'deprecated_args')
-    deprecated_arg_names = _get_arg_names_to_ok_vals()
 
     arg_spec = tf_inspect.getfullargspec(func)
     deprecated_positions = _get_deprecated_positional_arguments(
@@ -486,9 +510,11 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
                 'in a future version' if date is None else ('after %s' % date),
                 instructions)
       return func(*args, **kwargs)
-    return tf_decorator.make_decorator(func, new_func, 'deprecated',
-                                       _add_deprecated_arg_notice_to_docstring(
-                                           func.__doc__, date, instructions))
+
+    doc = _add_deprecated_arg_notice_to_docstring(
+        func.__doc__, date, instructions, sorted(deprecated_arg_names.keys()))
+    return tf_decorator.make_decorator(func, new_func, 'deprecated', doc)
+
   return deprecated_wrapper
 
 
@@ -551,9 +577,11 @@ def deprecated_arg_values(date, instructions, warn_once=True,
                   func.__module__, arg_name, arg_value, 'in a future version'
                   if date is None else ('after %s' % date), instructions)
       return func(*args, **kwargs)
-    return tf_decorator.make_decorator(func, new_func, 'deprecated',
-                                       _add_deprecated_arg_notice_to_docstring(
-                                           func.__doc__, date, instructions))
+
+    doc = _add_deprecated_arg_value_notice_to_docstring(
+        func.__doc__, date, instructions, deprecated_kwargs)
+    return tf_decorator.make_decorator(func, new_func, 'deprecated', doc)
+
   return deprecated_wrapper
 
 
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index 90c73a0a58d129af44cc051874acda37d5c78394..34cbca52a1b42869e6ef106328b85435ec2877be 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -153,7 +153,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed in a future version."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. "
+        "It will be removed in a future version."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -195,7 +196,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -227,7 +228,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__)
 
     # Assert calling new fn issues log warning.
@@ -251,7 +252,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
         "\n%s" % (date, instructions), _fn.__doc__)
 
@@ -289,7 +290,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -326,7 +327,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions),
         getattr(_Object, "_fn").__doc__)
 
@@ -355,9 +356,10 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
-        "\n%s" % (date, instructions), getattr(_Object, "_fn").__doc__)
+        "\n%s" % (date, instructions),
+        getattr(_Object, "_fn").__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual(3, _Object()._fn(1, 2))
@@ -406,12 +408,13 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "prop doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
         "\n%s"
         "\n"
         "\nReturns:"
-        "\n  String." % (date, instructions), getattr(_Object, "_prop").__doc__)
+        "\n  String." % (date, instructions),
+        getattr(_Object, "_prop").__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual("prop_with_doc", _Object()._prop)
@@ -439,9 +442,10 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
-        "\n%s" % (date, instructions), getattr(_Object, "_prop").__doc__)
+        "\n%s" % (date, instructions),
+        getattr(_Object, "_prop").__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual("prop_no_doc", _Object()._prop)
@@ -507,7 +511,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated arguments)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -544,7 +549,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated arguments)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__)
 
     # Assert calls without the deprecated argument log nothing.
@@ -572,7 +578,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION ARGUMENTS"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:"
         "\n%s" % (date, instructions), _fn.__doc__)
 
@@ -767,9 +774,10 @@ class DeprecatedArgValuesTest(test.TestCase):
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
     self.assertEqual(
-        "fn doc. (deprecated arguments)"
+        "fn doc. (deprecated argument values)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -809,9 +817,10 @@ class DeprecatedArgValuesTest(test.TestCase):
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
     self.assertEqual(
-        "fn doc. (deprecated arguments)"
+        "fn doc. (deprecated argument values)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__)
 
     # Assert calling new fn with non-deprecated value logs nothing.
@@ -842,9 +851,10 @@ class DeprecatedArgValuesTest(test.TestCase):
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
     self.assertEqual(
-        "DEPRECATED FUNCTION ARGUMENTS"
+        "DEPRECATED FUNCTION ARGUMENT VALUES"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:"
         "\n%s" % (date, instructions), _fn.__doc__)
 
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index d67dbde30473f8466f443d0180f62d85f54a848b..be8b0f1949ff7655d14c81ce29d643a919176fe6 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -503,7 +503,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
               "The two namedtuples don't have the same sequence type. Input "
               "structure has type %s, while shallow structure has type %s."
               % (type(input_tree), type(shallow_tree)))
-      else:
+      elif not (isinstance(shallow_tree, _collections.Mapping)
+                and isinstance(input_tree, _collections.Mapping)):
         raise TypeError(
             "The two structures don't have the same sequence type. Input "
             "structure has type %s, while shallow structure has type %s."
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index e03a8daaa19b4f2a39741cbc120f6317557e8474..997a3c5c36f083faf157d764afc583aa2e5ad1cf 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -706,6 +706,40 @@ class NestTest(parameterized.TestCase, test.TestCase):
         name_list, data_list)
     self.assertEqual(out, ["first_4_evens", ["first_5_odds", "first_3_primes"]])
 
+    # Dicts.
+    inp_val = dict(a=2, b=3)
+    inp_ops = dict(a=dict(add=1, mul=2), b=dict(add=2, mul=3))
+    out = nest.map_structure_up_to(
+        inp_val,
+        lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+    self.assertEqual(out["a"], 6)
+    self.assertEqual(out["b"], 15)
+
+    # Non-equal dicts.
+    inp_val = dict(a=2, b=3)
+    inp_ops = dict(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
+    with self.assertRaisesRegexp(ValueError, "same keys"):
+      nest.map_structure_up_to(
+          inp_val,
+          lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+
+    # Dict+custom mapping.
+    inp_val = dict(a=2, b=3)
+    inp_ops = _CustomMapping(a=dict(add=1, mul=2), b=dict(add=2, mul=3))
+    out = nest.map_structure_up_to(
+        inp_val,
+        lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+    self.assertEqual(out["a"], 6)
+    self.assertEqual(out["b"], 15)
+
+    # Non-equal dict/mapping.
+    inp_val = dict(a=2, b=3)
+    inp_ops = _CustomMapping(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
+    with self.assertRaisesRegexp(ValueError, "same keys"):
+      nest.map_structure_up_to(
+          inp_val,
+          lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+
   def testGetTraverseShallowStructure(self):
     scalar_traverse_input = [3, 4, (1, 2, [0]), [5, 6], {"a": (7,)}, []]
     scalar_traverse_r = nest.get_traverse_shallow_structure(
diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i
index 1c73f7f06f1937a8db0bd858421c2e884892e25b..a1b98a2a75991ee8555c3d3de3aca826fba07a7e 100644
--- a/tensorflow/python/util/py_checkpoint_reader.i
+++ b/tensorflow/python/util/py_checkpoint_reader.i
@@ -165,7 +165,6 @@ def NewCheckpointReader(filepattern):
     from tensorflow.python.util import compat
     return CheckpointReader(compat.as_bytes(filepattern), status)
 
-NewCheckpointReader._tf_api_names = ['train.NewCheckpointReader']
 NewCheckpointReader._tf_api_names_v1 = ['train.NewCheckpointReader']
 %}
 
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index 3d837a40449ece056c154e1b09636a8885047035..0cfc836246d2d885c28d168fe90b08a325cf6ded 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -101,6 +101,55 @@ def make_decorator(target,
   return decorator_func
 
 
+def rewrap(decorator_func, previous_target, new_target):
+  """Injects a new target into a function built by make_decorator.
+
+  This function allows replacing a function wrapped by `decorator_func`,
+  assuming the decorator that wraps the function is written as described below.
+
+  The decorator function must use `<decorator name>.__wrapped__` instead of the
+  wrapped function that is normally used:
+
+  Example:
+
+      # Instead of this:
+      def simple_parametrized_wrapper(*args, **kwds):
+        return wrapped_fn(*args, **kwds)
+
+      tf_decorator.make_decorator(simple_parametrized_wrapper, wrapped_fn)
+
+      # Write this:
+      def simple_parametrized_wrapper(*args, **kwds):
+        return simple_parametrized_wrapper.__wrapped__(*args, **kwds)
+
+      tf_decorator.make_decorator(simple_parametrized_wrapper, wrapped_fn)
+
+  Note that this process modifies decorator_func.
+
+  Args:
+    decorator_func: Callable returned by `wrap`.
+    previous_target: Callable that needs to be replaced.
+    new_target: Callable to replace previous_target with.
+  """
+  # Because the process mutates the decorator, we only need to alter the
+  # innermost function that wraps previous_target.
+  cur = decorator_func
+  innermost_decorator = None
+  target = None
+  while hasattr(cur, '_tf_decorator'):
+    innermost_decorator = cur
+    target = getattr(cur, '_tf_decorator')
+    if target.decorated_target is previous_target:
+      break
+    cur = target.decorated_target
+
+  if innermost_decorator is None:
+    return
+
+  target.decorated_target = new_target
+  innermost_decorator.__wrapped__ = new_target
+
+
 def unwrap(maybe_tf_decorator):
   """Unwraps an object into a list of TFDecorators and a final target.
 
@@ -163,6 +212,10 @@ class TFDecorator(object):
   def decorated_target(self):
     return self._decorated_target
 
+  @decorated_target.setter
+  def decorated_target(self, decorated_target):
+    self._decorated_target = decorated_target
+
   @property
   def decorator_name(self):
     return self._decorator_name
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
index 0f9712c987d442358ecb4f81f46ef0898e380b01..9198f0b3fad1590bedac71b30cf332e35cb489fe 100644
--- a/tensorflow/python/util/tf_decorator_test.py
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -52,6 +52,22 @@ def test_decorator_increment_first_int_arg(target):
   return tf_decorator.make_decorator(target, wrapper)
 
 
+def test_injectable_decorator_square(target):
+
+  def wrapper(x):
+    return wrapper.__wrapped__(x)**2
+
+  return tf_decorator.make_decorator(target, wrapper)
+
+
+def test_injectable_decorator_increment(target):
+
+  def wrapper(x):
+    return wrapper.__wrapped__(x) + 1
+
+  return tf_decorator.make_decorator(target, wrapper)
+
+
 def test_function(x):
   """Test Function Docstring."""
   return x + 1
@@ -65,6 +81,12 @@ def test_decorated_function(x):
   return x * 2
 
 
+@test_injectable_decorator_square
+@test_injectable_decorator_increment
+def test_rewrappable_decorated(x):
+  return x * 2
+
+
 @test_tfdecorator('decorator')
 class TestDecoratedClass(object):
   """Test Decorated Class."""
@@ -215,6 +237,30 @@ class TfMakeDecoratorTest(test.TestCase):
     _ = tf_decorator.make_decorator(partial, test_wrapper)
 
 
+class TfDecoratorRewrapTest(test.TestCase):
+
+  def testRewrapMutatesAffectedFunction(self):
+
+    def new_target(x):
+      return x * 3
+
+    self.assertEqual((1 * 2 + 1) ** 2, test_rewrappable_decorated(1))
+    prev_target, _ = tf_decorator.unwrap(test_rewrappable_decorated)
+    tf_decorator.rewrap(test_rewrappable_decorated, prev_target, new_target)
+    self.assertEqual((1 * 3 + 1) ** 2, test_rewrappable_decorated(1))
+
+  def testRewrapOfDecoratorFunction(self):
+
+    def new_target(x):
+      return x * 3
+
+    prev_target = test_rewrappable_decorated._tf_decorator._decorated_target
+    # In this case, only the outer decorator (test_injectable_decorator_square)
+    # should be preserved.
+    tf_decorator.rewrap(test_rewrappable_decorated, prev_target, new_target)
+    self.assertEqual((1 * 3) ** 2, test_rewrappable_decorated(1))
+
+
 class TfDecoratorUnwrapTest(test.TestCase):
 
   def testUnwrapReturnsEmptyArrayForUndecoratedFunction(self):
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index a7a07babfe12b3b8512637e18b00015c8018d5b7..ec70cae7d2fc00f793e8ffa0aec331e32e11115f 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -50,6 +50,10 @@ from tensorflow.python.util import tf_decorator
 ESTIMATOR_API_NAME = 'estimator'
 TENSORFLOW_API_NAME = 'tensorflow'
 
+# List of subpackage names used by TensorFlow components. Have to check that
+# TensorFlow core repo does not export any symbols under these names.
+SUBPACKAGE_NAMESPACES = [ESTIMATOR_API_NAME]
+
 _Attributes = collections.namedtuple(
     'ExportedApiAttributes', ['names', 'constants'])
 
@@ -78,14 +82,21 @@ class SymbolAlreadyExposedError(Exception):
   pass
 
 
-def get_canonical_name_for_symbol(symbol, api_name=TENSORFLOW_API_NAME):
-  """Get canonical name for the API symbol.
+class InvalidSymbolNameError(Exception):
+  """Raised when trying to export symbol as an invalid or unallowed name."""
+  pass
+
 
-  Canonical name is the first non-deprecated endpoint name.
+def get_canonical_name_for_symbol(
+    symbol, api_name=TENSORFLOW_API_NAME,
+    add_prefix_to_v1_names=False):
+  """Get canonical name for the API symbol.
 
   Args:
     symbol: API function or class.
     api_name: API name (tensorflow or estimator).
+    add_prefix_to_v1_names: Specifies whether a name available only in V1
+      should be prefixed with compat.v1.
 
   Returns:
     Canonical name for the API symbol (for e.g. initializers.zeros) if
@@ -98,26 +109,42 @@ def get_canonical_name_for_symbol(symbol, api_name=TENSORFLOW_API_NAME):
   if api_names_attr not in undecorated_symbol.__dict__:
     return None
   api_names = getattr(undecorated_symbol, api_names_attr)
-  # TODO(annarev): may be add a separate deprecated attribute
-  # for estimator names.
   deprecated_api_names = undecorated_symbol.__dict__.get(
       '_tf_deprecated_api_names', [])
-  return get_canonical_name(api_names, deprecated_api_names)
+
+  canonical_name = get_canonical_name(api_names, deprecated_api_names)
+  if canonical_name:
+    return canonical_name
+
+  # If there is no V2 canonical name, get V1 canonical name.
+  api_names_attr = API_ATTRS_V1[api_name].names
+  api_names = getattr(undecorated_symbol, api_names_attr)
+  v1_canonical_name = get_canonical_name(api_names, deprecated_api_names)
+  if add_prefix_to_v1_names:
+    return 'compat.v1.%s' % v1_canonical_name
+  return v1_canonical_name
 
 
 def get_canonical_name(api_names, deprecated_api_names):
-  """Get first non-deprecated endpoint name.
+  """Get preferred endpoint name.
 
   Args:
     api_names: API names iterable.
     deprecated_api_names: Deprecated API names iterable.
   Returns:
-    Canonical name if there is at least one non-deprecated endpoint.
-    Otherwise returns None.
+    Returns one of the following in decreasing preference:
+    - first non-deprecated endpoint
+    - first endpoint
+    - None
   """
-  return next(
+  non_deprecated_name = next(
       (name for name in api_names if name not in deprecated_api_names),
       None)
+  if non_deprecated_name:
+    return non_deprecated_name
+  if api_names:
+    return api_names[0]
+  return None
 
 
 class api_export(object):  # pylint: disable=invalid-name
@@ -145,6 +172,37 @@ class api_export(object):  # pylint: disable=invalid-name
     self._overrides = kwargs.get('overrides', [])
     self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
 
+    self._validate_symbol_names()
+
+  def _validate_symbol_names(self):
+    """Validate you are exporting symbols under an allowed package.
+
+    We need to ensure things exported by tf_export, estimator_export, etc.
+    export symbols under disjoint top-level package names.
+
+    For TensorFlow, we check that it does not export anything under subpackage
+    names used by components (estimator, keras, etc.).
+
+    For each component, we check that it exports everything under its own
+    subpackage.
+
+    Raises:
+      InvalidSymbolNameError: If you try to export symbol under disallowed name.
+    """
+    all_symbol_names = set(self._names) | set(self._names_v1)
+    if self._api_name == TENSORFLOW_API_NAME:
+      for subpackage in SUBPACKAGE_NAMESPACES:
+        if any(n.startswith(subpackage) for n in all_symbol_names):
+          raise InvalidSymbolNameError(
+              '@tf_export is not allowed to export symbols under %s.*' % (
+                  subpackage))
+    else:
+      if not all(n.startswith(self._api_name) for n in all_symbol_names):
+        raise InvalidSymbolNameError(
+            'Can only export symbols under package name of component. '
+            'e.g. tensorflow_estimator must export all symbols under '
+            'tf.estimator')
+
   def __call__(self, func):
     """Calls this decorator.
 
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index 4ae1dc55e06b434aeb4a95e2ca9aa68e4eef56de..a0fac8bf362627e6802821e3b33c0f107c5c97ce 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -130,6 +130,26 @@ class ValidateExportTest(test.TestCase):
     with self.assertRaises(tf_export.SymbolAlreadyExposedError):
       export_decorator(_test_function)
 
+  def testRaisesExceptionIfInvalidSymbolName(self):
+    # TensorFlow code is not allowed to export symbols under package
+    # tf.estimator
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.tf_export('estimator.invalid')
+
+    # All symbols exported by Estimator must be under tf.estimator package.
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('invalid')
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('Estimator.invalid')
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('invalid.estimator')
+
+  def testRaisesExceptionIfInvalidV1SymbolName(self):
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.tf_export('valid', v1=['estimator.invalid'])
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('estimator.valid', v1=['invalid'])
+
   def testOverridesFunction(self):
     _test_function2._tf_api_names = ['abc']
 
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 444e44eaf141dadab7c1fab5d6091ec632d4bcc3..5f1e776640df3e2b75e6a0b8accfce40098cf36c 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -352,6 +352,11 @@ def isfunction(object):  # pylint: disable=redefined-builtin
   return _inspect.isfunction(tf_decorator.unwrap(object)[1])
 
 
+def isframe(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.ismodule."""
+  return _inspect.isframe(tf_decorator.unwrap(object)[1])
+
+
 def isgenerator(object):  # pylint: disable=redefined-builtin
   """TFDecorator-aware replacement for inspect.isgenerator."""
   return _inspect.isgenerator(tf_decorator.unwrap(object)[1])
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index fedbe1dff6a7bd6e2524355e9946a99fa740f597..cde67c4e4f544311cca64dba22dbb2cb30f4007d 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -111,7 +111,7 @@ class TfShouldUseTest(test.TestCase):
         # Creating another op and executing it does not mark the
         # unused op as being "used".
         v = constant_op.constant(1.0, name='meh')
-        v.eval()
+        self.evaluate(v)
     msg = '\n'.join(error.call_args[0])
     self.assertIn('Object was never used', msg)
     self.assertIn('blah3:0', msg)
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index d4d97087ba48087acf2313ca16fa2144bca649be..2526e1adaa107565042d0dff9e12183bd022f9f1 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -1,6 +1,8 @@
 licenses(["restricted"])
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 
@@ -13,6 +15,14 @@ STREAM_EXECUTOR_HEADERS = glob([
     "platform/**/*.h",
 ])
 
+tf_proto_library(
+    name = "dnn_proto",
+    srcs = ["dnn.proto"],
+    cc_api_version = 2,
+    default_header = True,
+    protodeps = tf_additional_all_protos(),
+)
+
 cc_library(
     name = "stream_executor_impl",
     srcs = glob(
@@ -35,8 +45,13 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        ":dnn_proto_cc_impl",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
         "@local_config_cuda//cuda:cuda_headers",
     ],
     alwayslink = 1,
@@ -47,8 +62,10 @@ cc_library(
     hdrs = STREAM_EXECUTOR_HEADERS,
     visibility = ["//visibility:public"],
     deps = [
+        ":dnn_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
+        "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
 )
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 19397c7dbf21c35466ca04371b437879e7da2403..1f2e2f48bbddf5f638135129e502cfe233d5952f 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -132,43 +132,6 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
-template <typename T>
-cudnnDataType_t GetCudnnDataType(
-    dnn::DataLayout = dnn::DataLayout::kBatchDepthYX);
-
-template <>
-cudnnDataType_t GetCudnnDataType<double>(dnn::DataLayout) {
-  return CUDNN_DATA_DOUBLE;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<float>(dnn::DataLayout) {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<Eigen::half>(dnn::DataLayout) {
-  return CUDNN_DATA_HALF;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<int8>(dnn::DataLayout layout) {
-  switch (layout) {
-    case dnn::DataLayout::kYXDepthBatch:
-    case dnn::DataLayout::kYXBatchDepth:
-    case dnn::DataLayout::kBatchYXDepth:
-    case dnn::DataLayout::kBatchDepthYX:
-      return CUDNN_DATA_INT8;
-    case dnn::DataLayout::kBatchDepthYX4:
-      return CUDNN_DATA_INT8x4;
-  }
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<int32>(dnn::DataLayout) {
-  return CUDNN_DATA_INT32;
-}
-
 // RAII wrapper for all calls to cuDNN with a cuDNN handle argument.
 //
 // See CudnnAccess::GetHandle() for details.
@@ -685,10 +648,10 @@ class CudnnConvolutionDescriptor {
     CHECK_CUDNN_OK(cudnnSetConvolutionNdDescriptor(
         handle_.get(), convolution_descriptor.ndims(), padding.data(),
         strides.data(), dilations.data(),
-        // NOTE(keveman): cuDNN supports convolution and cross correlation.
-        // However, almost all the use cases do cross correlation, so just
-        // hard coding it here.
-        CUDNN_CROSS_CORRELATION, data_type));
+        convolution_descriptor.convolution_not_crosscorr()
+            ? CUDNN_CONVOLUTION
+            : CUDNN_CROSS_CORRELATION,
+        data_type));
 
     // NOTE(benbarsdell): This only applies if tensor op math is enabled
     //                      and algo selection is set to Default.
@@ -861,11 +824,19 @@ cudnnDataType_t ToCudnnDataType(
     case dnn::DataType::kInt8:
       return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
                                                             : CUDNN_DATA_INT8;
+    case dnn::DataType::kInt32:
+      return CUDNN_DATA_INT32;
     default:
       LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
   }
 }
 
+template <typename T>
+cudnnDataType_t GetCudnnDataType(
+    dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
+  return ToCudnnDataType(dnn::ToDataType<T>::value, data_layout);
+}
+
 cudnnRNNInputMode_t ToCudnnRnnInputMode(dnn::RnnInputMode input_mode) {
   switch (input_mode) {
     case dnn::RnnInputMode::kRnnLinearSkip:
@@ -2345,27 +2316,6 @@ struct ConvDoFP32ComputationFP16Input {
   static constexpr bool kDefaultFlag = true;
 };
 
-// A group of helper functions to return the internal compute type for
-// convolutions in cudnn.
-template <typename T>
-cudnnDataType_t GetConvComputeType() {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetConvComputeType<Eigen::half>() {
-  if (CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()) {
-    return CUDNN_DATA_FLOAT;
-  } else {
-    return CUDNN_DATA_HALF;
-  }
-}
-
-template <>
-cudnnDataType_t GetConvComputeType<double>() {
-  return CUDNN_DATA_DOUBLE;
-}
-
 // A helper struct to decide whether to use FP32 as the internal compute type
 // for rnn when the input data type is FP16. At present it is turned off,
 // users can explicitly control them through an env-var
@@ -2437,7 +2387,7 @@ port::Status CudnnSupport::DoConvolveImpl(
     const DeviceMemory<T>& filter_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    ScratchAllocator* scratch_allocator,
+    dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -2445,7 +2395,7 @@ port::Status CudnnSupport::DoConvolveImpl(
   CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
@@ -2536,8 +2486,7 @@ port::Status CudnnSupport::DoConvolveImpl(
   return port::Status::OK();
 }
 
-template <typename AccumulatorType, typename ElementType, typename BiasType,
-          typename ScaleType>
+template <typename ElementType, typename BiasType, typename ScaleType>
 port::Status CudnnSupport::DoFusedConvolveImpl(
     Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
     const DeviceMemory<ElementType>& conv_input_data,
@@ -2548,7 +2497,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
     ScaleType side_input_scale, const dnn::BatchDescriptor& bias_descriptor,
     const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<ElementType>* output_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<ElementType>* output_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   if (activation_mode != dnn::ActivationMode::kRelu &&
@@ -2569,7 +2519,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
   CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetCudnnDataType<AccumulatorType>());
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
@@ -2938,10 +2888,10 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl<float>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, dnn::DataType::kFloat, scratch_allocator,
+                     algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2956,10 +2906,10 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl<double>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, dnn::DataType::kDouble, scratch_allocator,
+                     algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2973,11 +2923,15 @@ bool CudnnSupport::DoConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveImpl<Eigen::half>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, acc_type, scratch_allocator, algorithm_config,
+                     output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2995,12 +2949,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl<double>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kDouble, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3018,12 +2973,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl<float>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kFloat, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3041,13 +2997,17 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoFusedConvolveImpl<float>(
+      DoFusedConvolveImpl(
           stream, conv_input_descriptor, conv_input_data, conv_input_scale,
           filter_descriptor, filter_data, convolution_descriptor,
           side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+          activation_mode, output_descriptor, output_data, acc_type,
+          scratch_allocator, algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3073,12 +3033,13 @@ bool CudnnSupport::DoFusedConvolve(
     return false;
   }
   return IsStatusOk(
-      DoFusedConvolveImpl<int32>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kInt32, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3112,7 +3073,8 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     DeviceMemory<T> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -3133,7 +3095,7 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
   CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3213,11 +3175,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kDouble, scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3233,11 +3195,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kFloat, scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3252,12 +3214,16 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, acc_type, scratch_allocator, algorithm_config,
+          output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3269,7 +3235,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     DeviceMemory<T> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<T>* backward_filter_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -3290,7 +3257,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3406,11 +3373,12 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, dnn::DataType::kDouble,
+
+          scratch_allocator, algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3425,13 +3393,14 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
+  return IsStatusOk(DoConvolveBackwardFilterImpl(
+                        stream, input_descriptor, input_data, output_descriptor,
+                        backward_output_data, convolution_descriptor,
+                        filter_descriptor, backward_filter_data,
+
+                        dnn::DataType::kFloat, scratch_allocator,
+                        algorithm_config, output_profile_result),
+                    /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3445,12 +3414,16 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, acc_type, scratch_allocator, algorithm_config,
+          output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 74f6f935b84cfbea27e1e9165b5f7241f74a9cbb..0641be140d2f19651696b0bcac498870a4db2960 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -670,12 +670,12 @@ class CudnnSupport : public dnn::DnnSupport {
       const DeviceMemory<T>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
+      ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
-  template <typename AccumulatorType, typename ElementType, typename BiasType,
-            typename ScaleType>
+  template <typename ElementType, typename BiasType, typename ScaleType>
   port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
       const DeviceMemory<ElementType>& conv_input_data,
@@ -687,7 +687,7 @@ class CudnnSupport : public dnn::DnnSupport {
       ScaleType side_input_scale, const dnn::BatchDescriptor& bias_descriptor,
       const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<ElementType>* output_data,
+      DeviceMemory<ElementType>* output_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
@@ -700,7 +700,8 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+      ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
@@ -712,7 +713,7 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<T>* backward_filter_data,
+      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 622a4a4edb1fe4163831e9429c1a7ab9262f2727..b342e71bdd94f6112d500d86f6ed4051821d2d54 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -209,3 +209,5 @@ REGISTER_MODULE_INITIALIZER(cuda_platform,
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(cuda_platform, multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     cuda_platform);
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index a38a6d527654049c43611783211664c070b375d5..faa662211ebb366b8e20cdc3e33ca651c64cf73a 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -23,7 +23,7 @@ namespace stream_executor {
 namespace dnn {
 
 uint64 AlgorithmDesc::hash() const {
-  return ::tensorflow::Hash64Combine(algo_, tensor_ops_enabled_);
+  return ::tensorflow::Hash64Combine(algo_id(), tensor_ops_enabled());
 }
 
 bool DnnSupport::GetConvolveAlgorithms(
@@ -187,6 +187,9 @@ std::tuple<int, int, int> GetDimIndices(const DataLayout& layout,
       batch_idx = 0;
       spatial_idx = 2;
       break;
+
+    default:
+      LOG(FATAL) << "Unknown layout " << layout;
   }
 
   return std::make_tuple(depth_idx, batch_idx, spatial_idx);
@@ -233,28 +236,27 @@ string AlgorithmConfig::ToString() const {
 // -- BatchDescriptor
 
 BatchDescriptor::BatchDescriptor(int ndims)
-    : count_(0),
-      feature_map_count_(0),
-      spatial_size_(ndims, 0),
-      value_max_(0.0),
+    : value_max_(0.0),
       value_min_(0.0),
-      layout_(DataLayout::kYXDepthBatch),
-      ndims_(ndims),
-      quantized_activation_mode_(QuantizedActivationMode::k8Bit) {}
+      quantized_activation_mode_(QuantizedActivationMode::k8Bit) {
+  tensor_.mutable_dimensions()->Resize(ndims + 2, 0);
+  set_layout(DataLayout::kYXDepthBatch);
+}
 
 BatchDescriptor::BatchDescriptor() : BatchDescriptor(/*ndims=*/2) {}
 
 std::vector<int64> BatchDescriptor::full_dims(const DataLayout& layout) const {
-  std::vector<int64> bdyx_dims(ndims_ + 2);
+  std::vector<int64> bdyx_dims(ndims() + 2);
   bdyx_dims[0] = count();
   bdyx_dims[1] = feature_map_count();
-  std::copy(spatial_size_.begin(), spatial_size_.end(), bdyx_dims.begin() + 2);
+  std::copy(spatial_size().begin(), spatial_size().end(),
+            bdyx_dims.begin() + 2);
   return ReorderDims(bdyx_dims, DataLayout::kBatchDepthYX, layout);
 }
 
 std::vector<int64> BatchDescriptor::full_strides(
     const DataLayout& layout) const {
-  if (layout_ == DataLayout::kBatchDepthYX4) {
+  if (this->layout() == DataLayout::kBatchDepthYX4) {
     LOG(FATAL)
         << "Cannot compute full strides for batch descriptor " << ToString()
         << ", because its layout is kBatchDepthYX4. In fact, "
@@ -262,36 +264,32 @@ std::vector<int64> BatchDescriptor::full_strides(
            "Use cudnnSetTensor4DDescriptor to set cudnnTensorDescriptor_t "
            "instead.";
   }
-  std::vector<int64> phys_dims = full_dims(layout_);
+  std::vector<int64> phys_dims = full_dims(this->layout());
   std::vector<int64> phys_strides(phys_dims.size());
-  phys_strides[ndims_ + 1] = 1;
-  for (int i = ndims_; i >= 0; i--) {
+  phys_strides[ndims() + 1] = 1;
+  for (int i = ndims(); i >= 0; i--) {
     phys_strides[i] = phys_strides[i + 1] * phys_dims[i + 1];
   }
-  return ReorderDims(phys_strides, layout_, layout);
+  return ReorderDims(phys_strides, this->layout(), layout);
 }
 
 void BatchDescriptor::CloneFrom(const BatchDescriptor& other) {
-  count_ = other.count_;
-  feature_map_count_ = other.feature_map_count_;
-  spatial_size_ = other.spatial_size_;
+  tensor_ = other.tensor_;
   value_max_ = other.value_max_;
   value_min_ = other.value_min_;
-  layout_ = other.layout_;
-  ndims_ = other.ndims_;
   quantized_activation_mode_ = other.quantized_activation_mode_;
 }
 
 string BatchDescriptor::ToString() const {
   string spatial;
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
   }
   return port::Printf(
       "{count: %lld feature_map_count: %lld spatial: %s "
       "value_min: %f value_max: %f layout: %s}",
-      count_, feature_map_count_, spatial.c_str(), value_min_, value_max_,
-      DataLayoutString(layout_).c_str());
+      count(), feature_map_count(), spatial.c_str(), value_min_, value_max_,
+      DataLayoutString(layout()).c_str());
 }
 
 string BatchDescriptor::ToShortString() const {
@@ -302,8 +300,8 @@ string BatchDescriptor::ToShortString() const {
   string batch = absl::StrCat("b", count());
 
   string spatial = "s";
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
   }
 
   string suffix;
@@ -333,18 +331,18 @@ string BatchDescriptor::ToShortString() const {
 
 int64 BatchDescriptor::NodesPerFeatureMap() const {
   int64 ret = 1;
-  for (int i = 0; i < ndims_; i++) {
-    ret *= spatial_size_[i];
+  for (int i = 0; i < ndims(); i++) {
+    ret *= spatial_size()[i];
   }
   return ret;
 }
 
 int64 BatchDescriptor::NodesAcrossFeatureMaps() const {
-  return NodesPerFeatureMap() * feature_map_count_;
+  return NodesPerFeatureMap() * feature_map_count();
 }
 
 int64 BatchDescriptor::ElementCount() const {
-  return count_ * feature_map_count_ * NodesPerFeatureMap();
+  return count() * feature_map_count() * NodesPerFeatureMap();
 }
 
 int64 BatchDescriptor::FullyConnectedWeightCount(
@@ -372,33 +370,27 @@ BatchDescriptor BatchDescriptor::DepthConcatenateOutputDescriptor(
 
 // -- FilterDescriptor
 
-FilterDescriptor::FilterDescriptor(int ndims)
-    : output_feature_map_count_(0),
-      input_feature_map_count_(0),
-      input_filter_dims_(ndims, 0),
-      ndims_(ndims),
-      layout_(FilterLayout::kOutputInputYX) {}
+FilterDescriptor::FilterDescriptor(int ndims) {
+  tensor_.mutable_dimensions()->Resize(ndims + 2, 0);
+  set_layout(FilterLayout::kOutputInputYX);
+}
 
 FilterDescriptor::FilterDescriptor() : FilterDescriptor(/*ndims=*/2) {}
 
 FilterDescriptor::~FilterDescriptor() {}
 
 void FilterDescriptor::CloneFrom(const FilterDescriptor& other) {
-  set_output_feature_map_count(other.output_feature_map_count())
-      .set_input_feature_map_count(other.input_feature_map_count())
-      .set_layout(other.layout());
-  input_filter_dims_ = other.input_filter_dims_;
-  ndims_ = other.ndims_;
+  tensor_ = other.tensor_;
 }
 
 string FilterDescriptor::ToString() const {
   string desc = port::Printf(
       "{output_feature_map_count: %lld input_feature_map_count: %lld "
       "layout: %s shape: ",
-      output_feature_map_count_, input_feature_map_count_,
-      FilterLayoutString(layout_).c_str());
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "%lld ", input_filter_dims_[i]);
+      output_feature_map_count(), input_feature_map_count(),
+      FilterLayoutString(layout()).c_str());
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "%lld ", input_filter_dims()[i]);
   }
   absl::StrAppend(&desc, "}");
 
@@ -409,15 +401,15 @@ string FilterDescriptor::ToShortString() const {
   // All the constituent strings are less than 15 characters, so the
   // small string optimization ensures that there will be at most one
   // heap memory allocation.
-  string od = absl::StrCat("od", output_feature_map_count_);
-  string id = absl::StrCat("id", input_feature_map_count_);
+  string od = absl::StrCat("od", output_feature_map_count());
+  string id = absl::StrCat("id", input_feature_map_count());
 
   string spatial = "s";
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", input_filter_dims_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", input_filter_dims()[i]);
   }
 
-  switch (layout_) {
+  switch (layout()) {
     case FilterLayout::kOutputInputYX:
       return absl::StrCat(od, id, spatial);
     case FilterLayout::kOutputYXInput:
@@ -429,28 +421,28 @@ string FilterDescriptor::ToShortString() const {
     case FilterLayout::kYXInputOutput:
       return absl::StrCat(spatial, id, od);
     default:
-      LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout_);
+      LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout());
       return "";  // Avoid return warning (unreachable)
   }
 }
 
 int64 FilterDescriptor::ComputeWeightCount() const {
-  int64 ret = output_feature_map_count_ * input_feature_map_count_;
-  for (int i = 0; i < ndims_; i++) {
-    ret *= input_filter_dims_[i];
+  int64 ret = output_feature_map_count() * input_feature_map_count();
+  for (int i = 0; i < ndims(); i++) {
+    ret *= input_filter_dims()[i];
   }
   return ret;
 }
 
 // -- ConvolutionDescriptor
 
-ConvolutionDescriptor::ConvolutionDescriptor(int ndims)
-    : zero_padding_(ndims, 0),
-      filter_strides_(ndims, 1),
-      dilation_rates_(ndims, 1),
-      pad_alignment_(PadAlignment::kDefault),
-      group_count_(1),
-      ndims_(ndims) {}
+ConvolutionDescriptor::ConvolutionDescriptor(int ndims) {
+  proto_.mutable_paddings()->Resize(ndims, 0);
+  proto_.mutable_strides()->Resize(ndims, 1);
+  proto_.mutable_dilations()->Resize(ndims, 1);
+  proto_.set_group_count(1);
+  proto_.set_convolution_mode(ConvolutionMode::CROSS_CORRELATION);
+}
 
 ConvolutionDescriptor::ConvolutionDescriptor()
     : ConvolutionDescriptor(/*ndims=*/2) {}
@@ -461,30 +453,30 @@ string ConvolutionDescriptor::ToString() const {
   string padding;
   string strides;
   string dilations;
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&padding, "%lld ", zero_padding_[i]);
-    port::Appendf(&strides, "%lld ", filter_strides_[i]);
-    port::Appendf(&dilations, "%lld ", dilation_rates_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&padding, "%lld ", this->padding()[i]);
+    port::Appendf(&strides, "%lld ", this->strides()[i]);
+    port::Appendf(&dilations, "%lld ", this->dilations()[i]);
   }
 
   return port::Printf(
       "{zero_padding: %s pad_alignment: %s filter_strides: %s dilation_rates: "
       "%s}",
-      padding.c_str(), PadAlignmentString(pad_alignment_).c_str(),
+      padding.c_str(), PadAlignmentString(pad_alignment()).c_str(),
       strides.c_str(), dilations.c_str());
 }
 
 string ConvolutionDescriptor::ToShortString() const {
   string desc;
-  for (int i = 0; i < ndims_; i++) {
+  for (int i = 0; i < ndims(); i++) {
     if (i > 0) port::Appendf(&desc, "_");
-    port::Appendf(&desc, "p%d:%lld", i, zero_padding_[i]);
+    port::Appendf(&desc, "p%d:%lld", i, padding()[i]);
   }
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "_s%d:%lld", i, filter_strides_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "_s%d:%lld", i, strides()[i]);
   }
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "_d%d:%lld", i, dilation_rates_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "_d%d:%lld", i, dilations()[i]);
   }
   return desc;
 }
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 558f3890da7e80a3d2658ccffd828fbc1a3400d5..c044a356efb38c333c3153f024092a22fbdf56db 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -29,7 +29,9 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dnn.pb.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
@@ -48,19 +50,6 @@ class ScratchAllocator;
 
 namespace dnn {
 
-// Describes how an input or output layer's data is formatted.
-// Specify int64 so there's no padding in BatchDescriptor.
-enum class DataLayout : int64 {
-  kYXDepthBatch = 0,  // Same as dist_belief::DF_DEPTH_MAJOR.
-  kYXBatchDepth,      // Same as dist_belief::DF_BATCH_MAJOR.
-  kBatchYXDepth,      // Same as run_brain output, and tensorflow's layout.
-  kBatchDepthYX,      // cuDNN's NCHW layout, data laid out as image, feature
-                      // maps, rows, columns.
-  kBatchDepthYX4,     // cuDNN's NCHW_VECT_C layout, data laid out the same as
-                      // kBatchDepthYX but each element is a vector of 4 feature
-                      // maps.
-};
-
 // Specifies an index to use when accessing specific spatial dimensions.
 enum class DimIndex : int {
   X = 0,
@@ -73,8 +62,27 @@ inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
   return data.rbegin()[static_cast<int64>(dim)];
 }
 
+inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
+  data.rbegin()[static_cast<int64>(dim)] = value;
+}
+
 inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
-  data->rbegin()[static_cast<int64>(dim)] = value;
+  return SetDim(absl::MakeSpan(*data), dim, value);
+}
+
+// tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
+// open-source. Wrapper function that gives an int64 array slice view of a
+// repeated int64 protobuf field.
+inline absl::Span<const int64> AsInt64Slice(
+    const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
+  return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
+                                 v.size());
+}
+
+inline absl::Span<int64> AsInt64Slice(
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
+  return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
+                           v->size());
 }
 
 // Returns a string representation of the given data layout.
@@ -87,14 +95,6 @@ enum class QuantizedActivationMode {
   k32Bit = 4,
 };
 
-// Specifies the data type used by an operation.
-enum class DataType {
-  kFloat = 0,
-  kDouble = 1,
-  kHalf = 2,
-  kInt8 = 3,
-};
-
 // A helper class to convert C/C++ types to the proper enums.
 template <typename T>
 struct ToDataType;
@@ -114,6 +114,10 @@ template <>
 struct ToDataType<int8> {
   static constexpr DataType value = DataType::kInt8;
 };
+template <>
+struct ToDataType<int32> {
+  static constexpr DataType value = DataType::kInt32;
+};
 
 // Specifies the types of a RNN model.
 enum class RnnMode {
@@ -245,15 +249,15 @@ class BatchDescriptor {
   string ToShortString() const;
 
   // Accessors.
-  int64 count() const { return count_; }
-  int64 feature_map_count() const { return feature_map_count_; }
-  int64 height() const { return GetDim(spatial_size_, DimIndex::Y); }
-  int64 width() const { return GetDim(spatial_size_, DimIndex::X); }
-  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size_, dim); }
-  int ndims() const { return ndims_; }
+  int64 count() const { return tensor_.dimensions(0); }
+  int64 feature_map_count() const { return tensor_.dimensions(1); }
+  int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
+  int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
+  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
+  int ndims() const { return spatial_size().size(); }
   float value_max() const { return value_max_; }
   float value_min() const { return value_min_; }
-  DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return tensor_.data_layout(); }
   QuantizedActivationMode quantized_activation_mode() const {
     return quantized_activation_mode_;
   }
@@ -267,23 +271,23 @@ class BatchDescriptor {
 
   // Named-argument helpers for avoiding user error during construction.
   BatchDescriptor& set_count(int64 value) {
-    count_ = value;
+    tensor_.set_dimensions(0, value);
     return *this;
   }
   BatchDescriptor& set_feature_map_count(int64 value) {
-    feature_map_count_ = value;
+    tensor_.set_dimensions(1, value);
     return *this;
   }
   BatchDescriptor& set_height(int64 value) {
-    SetDim(&spatial_size_, DimIndex::Y, value);
+    SetDim(spatial_size(), DimIndex::Y, value);
     return *this;
   }
   BatchDescriptor& set_width(int64 value) {
-    SetDim(&spatial_size_, DimIndex::X, value);
+    SetDim(spatial_size(), DimIndex::X, value);
     return *this;
   }
   BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
-    SetDim(&spatial_size_, dim, value);
+    SetDim(spatial_size(), dim, value);
     return *this;
   }
   BatchDescriptor& set_value_max(float value) {
@@ -295,7 +299,7 @@ class BatchDescriptor {
     return *this;
   }
   BatchDescriptor& set_layout(DataLayout layout) {
-    layout_ = layout;
+    tensor_.set_data_layout(layout);
     return *this;
   }
   BatchDescriptor& set_quantized_activation_mode(
@@ -334,31 +338,20 @@ class BatchDescriptor {
       port::ArraySlice<dnn::BatchDescriptor> inputs);
 
  private:
-  int64 count_;
-  int64 feature_map_count_;
-  // Stored as: ..., y, x.
-  std::vector<int64> spatial_size_;
+  absl::Span<const int64> spatial_size() const {
+    return AsInt64Slice(tensor_.dimensions()).subspan(2);
+  }
+
+  absl::Span<int64> spatial_size() {
+    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
+  }
+
+  TensorDescriptorProto tensor_;
   float value_max_;
   float value_min_;
-  DataLayout layout_;
-  int ndims_;
   QuantizedActivationMode quantized_activation_mode_;
 };
 
-// Describes how a filter is laid out in the memory.
-// Specify int64 so there's no padding in FilterDescriptor.
-enum class FilterLayout : int64 {
-  kOutputInputYX = 0,  // cuDNN's default filter layout, laid out as:
-                       // (major) output feature maps >> input feature maps >>
-                       // rows >> columns (minor).
-  kOutputYXInput,      // major to minor:
-                       //   (output features, row, columns, input features)
-  kOutputInputYX4,  // laid out the same as kOutputInputYX but each element is a
-                    // vector of 4 feature maps.
-  kInputYXOutput,   // Same as dist_belief's default filter layout.
-  kYXInputOutput,   // Same as tensorflow's default filter layout.
-};
-
 // Returns a string representation of the given filter layout.
 string FilterLayoutString(FilterLayout layout);
 
@@ -398,30 +391,30 @@ class FilterDescriptor {
 
   // Named-argument helpers for avoiding user error during construction.
   FilterDescriptor& set_output_feature_map_count(int64 value) {
-    output_feature_map_count_ = value;
+    tensor_.set_dimensions(0, value);
     return *this;
   }
   FilterDescriptor& set_input_feature_map_count(int64 value) {
-    input_feature_map_count_ = value;
+    tensor_.set_dimensions(1, value);
     return *this;
   }
   FilterDescriptor& set_input_filter_height(int64 value) {
-    SetDim(&input_filter_dims_, DimIndex::Y, value);
+    SetDim(input_filter_dims(), DimIndex::Y, value);
     return *this;
   }
   FilterDescriptor& set_input_filter_width(int64 value) {
-    SetDim(&input_filter_dims_, DimIndex::X, value);
+    SetDim(input_filter_dims(), DimIndex::X, value);
     return *this;
   }
   FilterDescriptor& set_layout(FilterLayout layout) {
-    layout_ = layout;
+    tensor_.set_filter_layout(layout);
     return *this;
   }
   FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
-    SetDim(&input_filter_dims_, dim, value);
+    SetDim(input_filter_dims(), dim, value);
     return *this;
   }
-  int ndims() const { return ndims_; }
+  int ndims() const { return input_filter_dims().size(); }
 
   void CloneFrom(const FilterDescriptor& other);
 
@@ -434,32 +427,32 @@ class FilterDescriptor {
 
   // Returns the number of biases required as parameters for a convolution
   // using this filter descriptor.
-  int64 bias_count() const { return output_feature_map_count_; }
+  int64 bias_count() const { return output_feature_map_count(); }
 
-  int64 output_feature_map_count() const { return output_feature_map_count_; }
-  int64 input_feature_map_count() const { return input_feature_map_count_; }
+  int64 output_feature_map_count() const { return tensor_.dimensions(0); }
+  int64 input_feature_map_count() const { return tensor_.dimensions(1); }
   int64 input_filter_height() const {
-    return GetDim(input_filter_dims_, DimIndex::Y);
+    return GetDim(input_filter_dims(), DimIndex::Y);
   }
   int64 input_filter_width() const {
-    return GetDim(input_filter_dims_, DimIndex::X);
+    return GetDim(input_filter_dims(), DimIndex::X);
   }
   int64 input_filter_dim(DimIndex dim) const {
-    return GetDim(input_filter_dims_, dim);
+    return GetDim(input_filter_dims(), dim);
   }
 
-  FilterLayout layout() const { return layout_; }
+  FilterLayout layout() const { return tensor_.filter_layout(); }
+
   absl::Span<const int64> input_filter_dims() const {
-    return input_filter_dims_;
+    return AsInt64Slice(tensor_.dimensions()).subspan(2);
   }
 
  private:
-  int64 output_feature_map_count_;
-  int64 input_feature_map_count_;
-  // Stored as: ..., y, x.
-  std::vector<int64> input_filter_dims_;
-  int ndims_;
-  FilterLayout layout_;
+  absl::Span<int64> input_filter_dims() {
+    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
+  }
+
+  TensorDescriptorProto tensor_;
 };
 
 // Describes how padding should be aligned when the total number of pad
@@ -500,6 +493,11 @@ std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);
 //   cells between each filter element in the "y dimension".
 // - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
 //   skipped cells between each filter element in the "x dimension".
+// - convolution_not_crosscor: By default (convolution_not_crosscor == false),
+//   we perform cross correlation rather than convolution. With the flag set,
+//   we perform convolution. Convolution and cross correlation are related by
+//   rotating the filter by 180 degrees (or equivalently flipping all spatial
+//   dimensions).
 class ConvolutionDescriptor {
  public:
   // By default construction, there is no zero-padding and the filter stride is
@@ -513,87 +511,102 @@ class ConvolutionDescriptor {
   string ToShortString() const;
 
   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
-    SetDim(&zero_padding_, DimIndex::Y, value);
+    SetDim(padding(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_zero_padding_width(int64 value) {
-    SetDim(&zero_padding_, DimIndex::X, value);
+    SetDim(padding(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
-    SetDim(&zero_padding_, dim, value);
+    SetDim(padding(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
-    SetDim(&filter_strides_, DimIndex::Y, value);
+    SetDim(strides(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
-    SetDim(&filter_strides_, DimIndex::X, value);
+    SetDim(strides(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
-    SetDim(&filter_strides_, dim, value);
+    SetDim(strides(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
-    SetDim(&dilation_rates_, DimIndex::Y, value);
+    SetDim(dilations(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
-    SetDim(&dilation_rates_, DimIndex::X, value);
+    SetDim(dilations(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
-    SetDim(&dilation_rates_, dim, value);
-    return *this;
-  }
-  ConvolutionDescriptor& set_pad_alignment(PadAlignment pad_alignment) {
-    pad_alignment_ = pad_alignment;
+    SetDim(dilations(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_group_count(int group_count) {
-    group_count_ = group_count;
+    proto_.set_group_count(group_count);
     return *this;
   }
-  int64 zero_padding_height() const {
-    return GetDim(zero_padding_, DimIndex::Y);
-  }
-  int64 zero_padding_width() const {
-    return GetDim(zero_padding_, DimIndex::X);
+  ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
+    proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
+                                     : ConvolutionMode::CROSS_CORRELATION);
+    return *this;
   }
+  int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
+  int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
   int64 vertical_filter_stride() const {
-    return GetDim(filter_strides_, DimIndex::Y);
+    return GetDim(strides(), DimIndex::Y);
   }
   int64 horizontal_filter_stride() const {
-    return GetDim(filter_strides_, DimIndex::X);
+    return GetDim(strides(), DimIndex::X);
   }
   int64 vertical_dilation_rate() const {
-    return GetDim(dilation_rates_, DimIndex::Y);
+    return GetDim(dilations(), DimIndex::Y);
   }
   int64 horizontal_dilation_rate() const {
-    return GetDim(dilation_rates_, DimIndex::X);
+    return GetDim(dilations(), DimIndex::X);
   }
 
-  int zero_padding(DimIndex dim) const { return GetDim(zero_padding_, dim); }
-  int filter_stride(DimIndex dim) const { return GetDim(filter_strides_, dim); }
-  int dilation_rate(DimIndex dim) const { return GetDim(dilation_rates_, dim); }
-  PadAlignment pad_alignment() const { return pad_alignment_; }
-  int group_count() const { return group_count_; }
-  int ndims() const { return ndims_; }
+  int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
+  int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
+  int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
+  // TODO(timshen): remove this function. No users of this class is setting a
+  // non-default pad alignment.
+  PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
+  int group_count() const { return proto_.group_count(); }
+  int ndims() const { return padding().size(); }
+  bool convolution_not_crosscorr() const {
+    return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
+  }
+
+  absl::Span<const int64> strides() const {
+    return AsInt64Slice(proto_.strides());
+  }
 
-  absl::Span<const int64> strides() const { return filter_strides_; }
-  absl::Span<const int64> dilations() const { return dilation_rates_; }
-  absl::Span<const int64> padding() const { return zero_padding_; }
+  absl::Span<const int64> dilations() const {
+    return AsInt64Slice(proto_.dilations());
+  }
+
+  absl::Span<const int64> padding() const {
+    return AsInt64Slice(proto_.paddings());
+  }
 
  private:
-  // Stored as: .. y, x.
-  std::vector<int64> zero_padding_;
-  std::vector<int64> filter_strides_;
-  std::vector<int64> dilation_rates_;
-  PadAlignment pad_alignment_;
-  int group_count_;
-  int ndims_;
+  absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }
+
+  absl::Span<int64> dilations() {
+    return AsInt64Slice(proto_.mutable_dilations());
+  }
+
+  absl::Span<int64> padding() {
+    return AsInt64Slice(proto_.mutable_paddings());
+  }
+
+  ConvolutionDescriptorProto proto_;
+
   // TODO(leary) cudnn provides these fields, but need to characterize what
   // their effect is -- they may be boolean rather than integral.
   // int64 upscale_input_x;
@@ -717,21 +730,23 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
-  AlgorithmDesc(Index a, bool use_tensor_ops)
-      : algo_(a), tensor_ops_enabled_(use_tensor_ops) {
-    DCHECK_NE(a, -1);
+  AlgorithmDesc(Index a, bool use_tensor_ops) {
+    proto_.set_algo_id(a);
+    proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
+                                        : AlgorithmProto::DEFAULT_MATH);
+  }
+  bool tensor_ops_enabled() const {
+    return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
   }
-  bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
-  Index algo_id() const { return algo_; }
+  Index algo_id() const { return proto_.algo_id(); }
   bool operator==(const AlgorithmDesc& other) const {
-    return this->algo_ == other.algo_ &&
-           this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
+    return algo_id() == other.algo_id() &&
+           tensor_ops_enabled() == other.tensor_ops_enabled();
   }
   uint64 hash() const;
 
  private:
-  Index algo_;
-  bool tensor_ops_enabled_;
+  AlgorithmProto proto_;
 };
 
 // Describes the result from a perf experiment.
@@ -875,24 +890,6 @@ class NormalizeDescriptor {
   int32 segment_size_;
 };
 
-// Describes a kind of non-linearity (threshold-like mathematical function).
-enum class ActivationMode {
-  kNone = 0,
-  kSigmoid,
-  // Rectified linear activation: f(x) = x < 0 ? 0 : x
-  kRelu,
-  // Rectified linear activation, where upper maximum is 6.0.
-  kRelu6,
-  // Rectified linear activation, where upper maximum specified by
-  // BatchDescriptor::value_max().
-  kReluX,
-  kTanh,
-  // Like ReluX, but passes all values in the range [-X,X].
-  kBandPass,
-
-  kNumActivationModes,  // Always in the end.
-};
-
 // Returns a string representation of the given activation mode.
 string ActivationModeString(ActivationMode mode);
 
@@ -921,6 +918,23 @@ class VersionInfo {
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
+//
+// TODO(b/118763918): this class (or rather dispatch table) has several
+// problems:
+// * Some overloads are missing. Ideally we want to have template virtual
+//   functions while the template arguments is a closed set. However, we don't
+//   get that from the language.
+// * The API is a union of cuDNN and another private backend. Only 10% of the
+//   functions are actually implemented by both backends, the rest are
+//   actually backend-specific. The massive interface creates extra mental
+//   burden.
+// * Poor error handling: the API should return Status objects.
+//
+// Things worth trying:
+// * Move functions that are not actually common back to the backends. Then,
+//   callers may use dynamic_cast to access specific backends. This may not be
+//   that hard, as many of the callers are Stream::ThenXxx functions.
+// * Change all the returned bools to Status.
 class DnnSupport {
  public:
   DnnSupport() {}
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
new file mode 100644
index 0000000000000000000000000000000000000000..56b079c3f5b962636e7c75b46449adca8e13a43e
--- /dev/null
+++ b/tensorflow/stream_executor/dnn.proto
@@ -0,0 +1,103 @@
+// LINT: LEGACY_NAMES
+syntax = "proto3";
+
+package stream_executor.dnn;
+
+// Specifies the data type used by an operation.
+enum DataType {
+  kFloat = 0;
+  kDouble = 1;
+  kHalf = 2;
+  kInt8 = 3;
+  kInt32 = 4;
+}
+
+// Describes how a convolution input or output layer's data is formatted.
+enum DataLayout {
+  // Naming convention:
+  // Y <-> row or height
+  // X <-> column or width
+  // Batch <-> batch, or N
+  // Depth <-> feature, or channel
+  // TODO(timshen): turn them into cuDNN names, e.g. kNCHW.
+  kYXDepthBatch = 0;
+  kYXBatchDepth = 1;
+  kBatchYXDepth = 2;   // cuDNN's NHWC layout
+  kBatchDepthYX = 3;   // cuDNN's NCHW layout
+  kBatchDepthYX4 = 4;  // cuDNN's NCHW_VECT_C layout
+}
+
+// Describes how a convolution filter is laid out in the memory.
+enum FilterLayout {
+  // Naming convention:
+  // Y <-> row or height
+  // X <-> column or width
+  // Output <-> output feature, or N
+  // Input <-> input feature, or N
+  // TODO(timshen): turn them into cuDNN names, e.g. kNCHW.
+  kOutputInputYX = 0;   // cuDNN's NCHW layout
+  kOutputYXInput = 1;   // cuDNN's NHWC layout
+  kOutputInputYX4 = 2;  // cuDNN's NCHW_VECT_C layout
+  kInputYXOutput = 3;
+  kYXInputOutput = 4;
+}
+
+// Describes a kind of non-linearity (threshold-like mathematical function).
+enum ActivationMode {
+  kNone = 0;
+  kSigmoid = 1;
+  // Rectified linear activation: f(x) = x < 0 ? 0 : x
+  kRelu = 2;
+  // Rectified linear activation; where upper maximum is 6.0.
+  kRelu6 = 3;
+  // Rectified linear activation; where upper maximum specified by
+  // BatchDescriptor::value_max().
+  kReluX = 4;
+  kTanh = 5;
+  // Like ReluX; but passes all values in the range [-X,X].
+  kBandPass = 6;
+}
+
+// Describe the math definition for the conv op. The popular behavior is
+// actually called cross-correlation in math, despite the operation is often
+// referred as convolution. See cuDNN cudnnConvolutionMode_t.
+enum ConvolutionMode {
+  CROSS_CORRELATION = 0;
+  CONVOLUTION = 1;
+}
+
+// Generic tensor representation.
+message TensorDescriptorProto {
+  repeated int64 dimensions = 1;
+  DataType data_type = 2;
+  oneof layout_oneof {
+    DataLayout data_layout = 3;
+    FilterLayout filter_layout = 4;
+  }
+}
+
+// Generic algorithm representation.
+message AlgorithmProto {
+  enum MathType {
+    DEFAULT_MATH = 0;
+    // The GPU may operate 4x4 matrix FMA.
+    // See cuDNN's documentation for CUDNN_TENSOR_OP_MATH.
+    TENSOR_OP_MATH = 1;
+  }
+  int64 algo_id = 1;
+  MathType math_type = 2;
+}
+
+// Convolution-specific parameters.
+message ConvolutionDescriptorProto {
+  repeated int64 paddings = 1;
+  repeated int64 strides = 2;
+  repeated int64 dilations = 3;
+  // The "accumulator" type. For example, use F32 as an accumulator for F16
+  // convolutions.
+  // See cuDNN's cudnnConvolutionMode_t.
+  DataType compute_mode = 4;
+  // See cuDNN's group count.
+  int32 group_count = 5;
+  ConvolutionMode convolution_mode = 6;
+}
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index 410dc9da899cc967b36c1738a6b7c128a98cf70c..d16cca8dcc041d1a78ed93c42aa14ba0ff692239 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -103,3 +103,5 @@ REGISTER_MODULE_INITIALIZER(host_platform,
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(host_platform, multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     host_platform);
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index 5b51398d8cab5df7c7514bc3bedf87f5c33c6e5a..bbb56071f49c74973f360040db7d126ffe346075 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -15,62 +15,86 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
+namespace {
 
-/* static */ mutex MultiPlatformManager::platforms_mutex_{LINKER_INITIALIZED};
+class MultiPlatformManagerImpl {
+ public:
+  port::Status RegisterPlatform(std::unique_ptr<Platform> platform)
+      LOCKS_EXCLUDED(mu_);
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::LookupByNameLocked(
-    const string& target) {
-  PlatformMap* platform_map = GetPlatformMap();
-  auto it = platform_map->find(port::Lowercase(target));
-  if (it == platform_map->end()) {
-    return port::Status(
-        port::error::NOT_FOUND,
-        "could not find registered platform with name: \"" + target + "\"");
-  }
-  return it->second;
-}
+  port::StatusOr<Platform*> PlatformWithName(absl::string_view target)
+      LOCKS_EXCLUDED(mu_);
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::LookupByIdLocked(
-    const Platform::Id& id) {
-  PlatformIdMap* platform_map = GetPlatformByIdMap();
-  auto it = platform_map->find(id);
-  if (it == platform_map->end()) {
-    return port::Status(
-        port::error::NOT_FOUND,
-        port::Printf("could not find registered platform with id: 0x%p", id));
-  }
-  return it->second;
-}
+  port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id)
+      LOCKS_EXCLUDED(mu_);
+
+  port::StatusOr<Platform*> InitializePlatformWithName(
+      absl::string_view target, const std::map<string, string>& options)
+      LOCKS_EXCLUDED(mu_);
+  port::StatusOr<Platform*> InitializePlatformWithId(
+      const Platform::Id& id, const std::map<string, string>& options)
+      LOCKS_EXCLUDED(mu_);
+
+  std::vector<Platform*> AllPlatforms() LOCKS_EXCLUDED(mu_);
+
+  using Listener = MultiPlatformManager::Listener;
+  port::Status RegisterListener(std::unique_ptr<Listener> listener)
+      LOCKS_EXCLUDED(mu_);
+
+ private:
+  // Looks up the platform object with the given name.  Assumes the Platforms
+  // mutex is held.
+  port::StatusOr<Platform*> LookupByNameLocked(absl::string_view target)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Looks up the platform object with the given id.  Assumes the Platforms
+  // mutex is held.
+  port::StatusOr<Platform*> LookupByIdLocked(const Platform::Id& id)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-/* static */ port::Status MultiPlatformManager::RegisterPlatform(
+  absl::Mutex mu_;
+  std::vector<std::unique_ptr<Listener>> listeners_ GUARDED_BY(mu_);
+  absl::flat_hash_map<Platform::Id, Platform*> id_map_ GUARDED_BY(mu_);
+  absl::flat_hash_map<string, Platform*> name_map_ GUARDED_BY(mu_);
+};
+
+port::Status MultiPlatformManagerImpl::RegisterPlatform(
     std::unique_ptr<Platform> platform) {
   CHECK(platform != nullptr);
   string key = port::Lowercase(platform->Name());
-  mutex_lock lock(platforms_mutex_);
-  if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) {
+  absl::MutexLock lock(&mu_);
+  if (name_map_.find(key) != name_map_.end()) {
     return port::Status(port::error::INTERNAL,
                         "platform is already registered with name: \"" +
                             platform->Name() + "\"");
   }
-  GetPlatformByIdMap()->insert(std::make_pair(platform->id(), platform.get()));
+  Platform* platform_ptr = platform.get();
+  CHECK(id_map_.emplace(platform->id(), platform_ptr).second);
   // Release ownership/uniqueness to prevent destruction on program exit.
   // This avoids Platforms "cleaning up" on program exit, because otherwise,
   // there are _very_ tricky races between StreamExecutor and underlying
   // platforms (CUDA, OpenCL) during exit. Since these are fixed-size and 1x per
   // program, these are deemed acceptable.
-  (*GetPlatformMap())[key] = platform.release();
+  name_map_[key] = platform.release();
+  for (const auto& listener : listeners_) {
+    listener->PlatformRegistered(platform_ptr);
+  }
   return port::Status::OK();
 }
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
-    const string& target) {
-  mutex_lock lock(platforms_mutex_);
+port::StatusOr<Platform*> MultiPlatformManagerImpl::PlatformWithName(
+    absl::string_view target) {
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target));
   if (!platform->Initialized()) {
@@ -80,9 +104,9 @@ namespace stream_executor {
   return platform;
 }
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
+port::StatusOr<Platform*> MultiPlatformManagerImpl::PlatformWithId(
     const Platform::Id& id) {
-  mutex_lock lock(platforms_mutex_);
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id));
   if (!platform->Initialized()) {
@@ -92,15 +116,15 @@ namespace stream_executor {
   return platform;
 }
 
-/* static */ port::StatusOr<Platform*>
-MultiPlatformManager::InitializePlatformWithName(
-    const string& target, const std::map<string, string>& options) {
-  mutex_lock lock(platforms_mutex_);
+port::StatusOr<Platform*> MultiPlatformManagerImpl::InitializePlatformWithName(
+    absl::string_view target, const std::map<string, string>& options) {
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target));
   if (platform->Initialized()) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "platform \"" + target + "\" is already initialized");
+    return port::Status(
+        port::error::FAILED_PRECONDITION,
+        absl::StrCat("platform \"", target, "\" is already initialized"));
   }
 
   SE_RETURN_IF_ERROR(platform->Initialize(options));
@@ -108,10 +132,9 @@ MultiPlatformManager::InitializePlatformWithName(
   return platform;
 }
 
-/* static */ port::StatusOr<Platform*>
-MultiPlatformManager::InitializePlatformWithId(
+port::StatusOr<Platform*> MultiPlatformManagerImpl::InitializePlatformWithId(
     const Platform::Id& id, const std::map<string, string>& options) {
-  mutex_lock lock(platforms_mutex_);
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id));
   if (platform->Initialized()) {
@@ -125,10 +148,90 @@ MultiPlatformManager::InitializePlatformWithId(
   return platform;
 }
 
-/* static */ void MultiPlatformManager::ClearPlatformRegistry() {
-  mutex_lock lock(platforms_mutex_);
-  GetPlatformMap()->clear();
-  GetPlatformByIdMap()->clear();
+port::Status MultiPlatformManagerImpl::RegisterListener(
+    std::unique_ptr<Listener> listener) {
+  absl::MutexLock lock(&mu_);
+  CHECK(id_map_.empty());
+  CHECK(name_map_.empty());
+  listeners_.push_back(std::move(listener));
+  return port::Status::OK();
+}
+
+std::vector<Platform*> MultiPlatformManagerImpl::AllPlatforms() {
+  absl::MutexLock lock(&mu_);
+  CHECK_EQ(id_map_.size(), name_map_.size());
+  std::vector<Platform*> platforms;
+  platforms.reserve(id_map_.size());
+  for (const auto& entry : id_map_) {
+    platforms.push_back(entry.second);
+  }
+  return platforms;
+}
+
+port::StatusOr<Platform*> MultiPlatformManagerImpl::LookupByNameLocked(
+    absl::string_view target) {
+  auto it = name_map_.find(port::Lowercase(target));
+  if (it == name_map_.end()) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        absl::StrCat("Could not find registered platform with name: \"", target,
+                     "\""));
+  }
+  return it->second;
+}
+
+port::StatusOr<Platform*> MultiPlatformManagerImpl::LookupByIdLocked(
+    const Platform::Id& id) {
+  auto it = id_map_.find(id);
+  if (it == id_map_.end()) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::Printf("could not find registered platform with id: 0x%p", id));
+  }
+  return it->second;
+}
+
+MultiPlatformManagerImpl& Impl() {
+  static MultiPlatformManagerImpl* impl = new MultiPlatformManagerImpl;
+  return *impl;
+}
+
+}  // namespace
+
+/*static*/ port::Status MultiPlatformManager::RegisterPlatform(
+    std::unique_ptr<Platform> platform) {
+  return Impl().RegisterPlatform(std::move(platform));
+}
+
+/*static*/ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
+    absl::string_view target) {
+  return Impl().PlatformWithName(target);
+}
+
+/*static*/ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
+    const Platform::Id& id) {
+  return Impl().PlatformWithId(id);
+}
+
+/*static*/ port::StatusOr<Platform*>
+MultiPlatformManager::InitializePlatformWithName(
+    absl::string_view target, const std::map<string, string>& options) {
+  return Impl().InitializePlatformWithName(target, options);
+}
+
+/*static*/ port::StatusOr<Platform*>
+MultiPlatformManager::InitializePlatformWithId(
+    const Platform::Id& id, const std::map<string, string>& options) {
+  return Impl().InitializePlatformWithId(id, options);
+}
+
+/*static*/ port::Status MultiPlatformManager::RegisterListener(
+    std::unique_ptr<Listener> listener) {
+  return Impl().RegisterListener(std::move(listener));
+}
+
+/*static*/ std::vector<Platform*> MultiPlatformManager::AllPlatforms() {
+  return Impl().AllPlatforms();
 }
 
 }  // namespace stream_executor
@@ -141,3 +244,15 @@ REGISTER_MODULE_INITIALIZER(
         // purposes from Platform subclasses that register
         // themselves with the MultiPlatformManager.
     });
+
+REGISTER_MODULE_INITIALIZER(
+    multi_platform_manager_listener,
+    {
+        // Nothing -- this is just a module initializer definition to reference
+        // for sequencing registration of listeners with the
+        // MultiPlatformManager.
+    });
+
+// Listener registration should happen before platform registration.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     multi_platform_manager);
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index 146a128e85cfe84a844aae0fd50d5a329df2723c..06f5ae2c2baaee0444697d4096da7bf36e5c217d 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -67,14 +67,14 @@ limitations under the License.
 #include <functional>
 #include <map>
 #include <memory>
+#include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
 
 namespace stream_executor {
 
@@ -84,9 +84,8 @@ class MultiPlatformManager {
   // Registers a platform object, returns an error status if the platform is
   // already registered. The associated listener, if not null, will be used to
   // trace events for ALL executors for that platform.
-  // Takes ownership of listener.
-  static port::Status RegisterPlatform(std::unique_ptr<Platform> platform)
-      LOCKS_EXCLUDED(platforms_mutex_);
+  // Takes ownership of platform.
+  static port::Status RegisterPlatform(std::unique_ptr<Platform> platform);
 
   // Retrieves the platform registered with the given platform name (e.g.
   // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the
@@ -98,10 +97,8 @@ class MultiPlatformManager {
   // If the requested platform is not registered, an error status is returned.
   // Ownership of the platform is NOT transferred to the caller --
   // the MultiPlatformManager owns the platforms in a singleton-like fashion.
-  static port::StatusOr<Platform*> PlatformWithName(const string& target)
-      LOCKS_EXCLUDED(platforms_mutex_);
-  static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id)
-      LOCKS_EXCLUDED(platforms_mutex_);
+  static port::StatusOr<Platform*> PlatformWithName(absl::string_view target);
+  static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id);
 
   // Retrieves the platform registered with the given platform name (e.g.
   // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the
@@ -114,14 +111,12 @@ class MultiPlatformManager {
   // Ownership of the platform is NOT transferred to the caller --
   // the MultiPlatformManager owns the platforms in a singleton-like fashion.
   static port::StatusOr<Platform*> InitializePlatformWithName(
-      const string& target, const std::map<string, string>& options)
-      LOCKS_EXCLUDED(platforms_mutex_);
+      absl::string_view target, const std::map<string, string>& options);
+
   static port::StatusOr<Platform*> InitializePlatformWithId(
-      const Platform::Id& id, const std::map<string, string>& options)
-      LOCKS_EXCLUDED(platforms_mutex_);
+      const Platform::Id& id, const std::map<string, string>& options);
 
-  // Clears the set of registered platforms, primarily used for testing.
-  static void ClearPlatformRegistry() LOCKS_EXCLUDED(platforms_mutex_);
+  static std::vector<Platform*> AllPlatforms();
 
   // Although the MultiPlatformManager "owns" its platforms, it holds them as
   // undecorated pointers to prevent races during program exit (between this
@@ -135,57 +130,32 @@ class MultiPlatformManager {
   // of any platforms registered with it, and leak checking should be disabled
   // during allocation of such Platforms, to avoid spurious reporting at program
   // exit.
-  using PlatformMap = std::map<string, Platform*>;
-
-  // Provides access to the available set of platforms under a lock.
-  static port::Status WithPlatforms(
-      std::function<port::Status(PlatformMap*)> callback)
-      LOCKS_EXCLUDED(platforms_mutex_) {
-    mutex_lock lock(platforms_mutex_);
-    return callback(GetPlatformMap());
-  }
-
- private:
-  using PlatformIdMap = std::map<Platform::Id, Platform*>;
-
-  static mutex platforms_mutex_;
-
-  // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely.
-  // TODO(b/22689637): Move this (whatever the final/"official" map is) to
-  // plugin_regstry.h, along with the associated functionality.
-  // Platform-name-to-object mapping. These platforms are registered via module
-  // initializers, and linkage determines which platforms are available to a
-  // given target.
-  static PlatformMap* GetPlatformMap() {
-    static PlatformMap* instance = new PlatformMap;
-    return instance;
-  }
-
-  // Holds a Platform::Id-to-object mapping.
-  // Unlike platforms_ above, this map does not own its contents.
-  static PlatformIdMap* GetPlatformByIdMap() {
-    static PlatformIdMap* instance = new PlatformIdMap;
-    return instance;
-  }
-
-  // Looks up the platform object with the given name.  Assumes the Platforms
-  // mutex is held.
-  static port::StatusOr<Platform*> LookupByNameLocked(const string& target)
-      EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_);
-
-  // Looks up the platform object with the given id.  Assumes the Platforms
-  // mutex is held.
-  static port::StatusOr<Platform*> LookupByIdLocked(const Platform::Id& id)
-      EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager);
+
+  // Interface for a listener that gets notfied at certain events.
+  class Listener {
+   public:
+    virtual ~Listener() = default;
+    // Callback that is invoked when a Platform is registered.
+    virtual void PlatformRegistered(Platform* platform) = 0;
+  };
+  // Registers a listeners to receive notifications about certain events.
+  // Precondition: No Platform has been registered yet.
+  static port::Status RegisterListener(std::unique_ptr<Listener> listener);
 };
 
 }  // namespace stream_executor
 
-// multi_platform_manager.cc will define this instance. Includers of this header
-// should use
+// multi_platform_manager.cc will define these instances.
+//
+// Registering a platform:
 // REGISTER_MODULE_INITIALIZER_SEQUENCE(my_platform, multi_platform_manager);
+// REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+// my_platform);
+//
+// Registering a listener:
+// REGISTER_MODULE_INITIALIZER_SEQUENCE(my_listener,
+// multi_platform_manager_listener);
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+DECLARE_MODULE_INITIALIZER(multi_platform_manager_listener);
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 5421e4f4a5edc933a9fdbffda81678fab458483a..3edc66cde8045d7f6ae53095e8136d1697fb1d23 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -191,8 +191,11 @@ string ToVlogString(dnn::DataType data_type) {
       return "dnn::DataType::kHalf";
     case dnn::DataType::kInt8:
       return "dnn::DataType::kInt8";
+    case dnn::DataType::kInt32:
+      return "dnn::DataType::kInt32";
+    default:
+      return "unknown DataType";
   }
-  return "unknown DataType";
 }
 
 // Used together with PARAM to VLOG calls made to the stream. Intended
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 74773629d299a609c4fa88da12fb00fed7f6833a..2d67d1f46614d20e7929f9e6706614114d55a894 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1167,6 +1167,11 @@ def tf_kernel_library(
         copts = []
     textual_hdrs = []
     copts = copts + tf_copts(is_external = is_external)
+
+    # Override EIGEN_STRONG_INLINE to inline when
+    # --define=override_eigen_strong_inline=true to avoid long compiling time.
+    # See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"])
     if prefix:
         if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
             if not gpu_srcs:
@@ -1302,13 +1307,13 @@ def _py_wrap_cc_impl(ctx):
         ctx.outputs.py_out.dirname,
     ]
     args += ["-l" + f.path for f in ctx.files.swig_includes]
-    args += ["-I" + i for i in swig_include_dirs]
+    args += ["-I" + i for i in swig_include_dirs.to_list()]
     args += [src.path]
     outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
     ctx.action(
         executable = ctx.executable._swig,
         arguments = args,
-        inputs = list(inputs),
+        inputs = inputs.to_list(),
         outputs = outputs,
         mnemonic = "PythonSwig",
         progress_message = "SWIGing " + src.path,
@@ -1488,7 +1493,7 @@ check_deps = rule(
     },
 )
 
-def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [], **kwargs):
+def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [], copts = [], **kwargs):
     """Helper to build a dynamic library (.so) from the sources containing implementations of custom ops and kernels.
     """
     cuda_deps = [
@@ -1500,12 +1505,18 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
         clean_dep("//tensorflow/core:stream_executor_headers_lib"),
     ]
     deps = deps + tf_custom_op_library_additional_deps()
+
+    # Override EIGEN_STRONG_INLINE to inline when
+    # --define=override_eigen_strong_inline=true to avoid long compiling time.
+    # See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"])
+
     if gpu_srcs:
         basename = name.split(".")[0]
         native.cc_library(
             name = basename + "_gpu",
             srcs = gpu_srcs,
-            copts = _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
+            copts = copts + _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
             features = if_cuda(["-use_header_modules"]),
             deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
             **kwargs
@@ -1526,7 +1537,7 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
         srcs = srcs,
         deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
         data = if_static([name + "_check_deps"]),
-        copts = tf_copts(is_external = True),
+        copts = copts + tf_copts(is_external = True),
         features = ["windows_export_all_symbols"],
         linkopts = linkopts + select({
             "//conditions:default": [
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index f7491649c22738c625e3f63944f2347358d2e525..a1083d732a1bb1b3212457f445323e5e868ef162 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -20,7 +20,13 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT32
     }
-    reserved_range {
+    field {
+      name: "use_numa_affinity"
+      number: 5
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+     reserved_range {
       start: 2
       end: 3
     }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index 53b532beab344db8cff9d1ccac4821b8f280af67..b505d813509c2049fa6e3f60df553492d6f66613 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -143,6 +143,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT32
       }
+      field {
+        name: "use_numa_affinity"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
index 0a16d6ab92faac1db63470f0aedadf69341be29b..50af42f4fcddfa8cac8bfd58458b9903e988fad2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
@@ -10,6 +10,10 @@ tf_class {
     name: "gradient"
     argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'UnconnectedGradients.NONE\'], "
   }
+  member_method {
+    name: "jacobian"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'unconnected_gradients\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'True\'], "
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493dcba8922d7f6c51a61d337f48e09d168e6bac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.TensorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_spec.TensorSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_spec"
+    argspec: "args=[\'cls\', \'spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'spec_or_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index 8b7f63e43e237864d4ef24d3b251b23199f9ee17..f59082baeb21f092783290657d083ca7ef0bbc7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.data.Dataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index a7bfa82c650e0a511cb6c8eaffceaf49fbfeaa39..d73168b070e374a749a00f74b24b77a715d2f37e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.FixedLengthRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -17,7 +19,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
index 9f4de74c393972ae31b1cbb790363ca7f1348af1..9d032d43de1094f212e5f749013f1fac5a898459 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
@@ -18,10 +18,6 @@ tf_class {
     name: "experimental_hoist_random_uniform"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "experimental_latency_all_edges"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "experimental_map_and_batch_fusion"
     mtype: "<type \'property\'>"
@@ -54,6 +50,10 @@ tf_class {
     name: "experimental_shuffle_and_repeat_fusion"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "experimental_stats"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 7b7a9ebaf08b1e9fdb5e4c5b7448175611a9b2c4..51224cd6b45f0a1efdfbb3ba6a3ca377d37fd00b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.TFRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index 2817f900e15ccf8df2ca71aa0218ba07eef682e2..a10add1b7e38f9875e699903b3e3c103d73e647e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.TextLineDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 2520e28a3c708f45942eb2e73911b7a5226646e5..71b597c19c512879b8f18b34843b160efecc6bec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.CsvDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 1dd53b1eabdf15b662a839a07176ba4eaf8bda37..20646e87b5fbe23d89ad31ca632a64bf958339f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.RandomDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8fdd9dc52e332abdeed039bd85d31f6318d013e9..86c5ff5b0bd7b42d61a92a44c8888852a48677be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.SqlDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
index 0bcc8cf3e87ea8b78f28130da60a1749e2848806..6536a698b50efc9daaa72d8ae589855e30fbc601 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.data.experimental.StatsAggregator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_ops.StatsAggregator\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_aggregator.StatsAggregator\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f423eed42cc2d7115fd50b3ad533f3790736a850
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.data.experimental.StatsOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_options.StatsOptions\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "aggregator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "counter_prefix"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latency_all_edges"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "prefix"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'aggregator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 116684e5d81da3b9b181727cca00b51c84360573..244b24519c8102ba973bffd62ac0df88658708c1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "StatsAggregator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "StatsOptions"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
@@ -86,7 +90,7 @@ tf_module {
   }
   member_method {
     name: "make_batched_features_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
   }
   member_method {
     name: "make_csv_dataset"
@@ -124,10 +128,6 @@ tf_module {
     name: "scan"
     argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "set_stats_aggregator"
-    argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], "
-  }
   member_method {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
index ab6287f8cd080621d76fc34e2cb437960a217800..8a7f1e9363b8211d83d39d31da11507cb4c805eb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'name\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c39ac5a20dec5d32f30115ea3cfe4bd0ab8e7d72
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.distribute.InputContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.InputContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "input_pipeline_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_input_pipelines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_input_pipelines\', \'input_pipeline_id\', \'num_replicas_in_sync\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'1\'], "
+  }
+  member_method {
+    name: "get_per_replica_batch_size"
+    argspec: "args=[\'self\', \'global_batch_size\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a3a97aa0927b81708311d4b8b28fced217c00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt
@@ -0,0 +1,8 @@
+path: "tensorflow.distribute.InputReplicationMode"
+tf_class {
+  is_instance: "<enum \'InputReplicationMode\'>"
+  member {
+    name: "PER_WORKER"
+    mtype: "<enum \'InputReplicationMode\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4899f38cad253167ce0b94f79388cb97fe534197
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.distribute.ReduceOp"
+tf_class {
+  is_instance: "<enum \'ReduceOp\'>"
+  member {
+    name: "MEAN"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3eda6c60366d8367ef95d4fcab769c6b102c0018
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.ReplicaContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.ReplicaContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "distribution_strategy"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica_id_in_sync_group"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strategy"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_call"
+    argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b502b534bec6ba5716c850af83a8519240513c9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
@@ -0,0 +1,81 @@
+path: "tensorflow.distribute.StrategyExtended"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.DistributionStrategyExtended\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'container_strategy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_steps_on_iterator"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fe035b474f02bb0fda5b8034b0e38f146d2a6b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -0,0 +1,133 @@
+path: "tensorflow.distribute.Strategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d833b54ba0950b6b2cf40c958829dc2eeb24795
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.distribute"
+tf_module {
+  member {
+    name: "InputContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InputReplicationMode"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReduceOp"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReplicaContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Strategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StrategyExtended"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "get_loss_reduction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "has_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "in_cross_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
index 32b84e90ce6ae99e80208905d701d690227a0cf7..af1659528b6a1cbf74bc4667463cefc438a9d108 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.BaselineClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
index 94933e7ffd6b0f4838f7b2e9254a4056c9cbf245..d218773dfc4a152bad17d6285dd82dd5a1aaa36d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.BaselineEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
index db7776b5bf67879cc806bed1b8463b99a082a50e..e5794252e47cdbabe16475e1d7f4a42fd604b542 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.BaselineRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index fa352907c0bfd482423a3e5a643a9135851af038..970abd8622faf950dfd4a5f1a766a70ec2b9881b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 154b35f3064ed1e3dca8c3b1e2bdfa7f13801dcc..b5bbad965e2e8934edf9dc771c34f72bbf99a836 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
index ce6040d0f279361ea789b54dee489996d9787ea7..77e60d426ef7033e2fb45a0a26ffb7d658db9877 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.DNNClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85ff5a4fb10f58690e6e1f8c44fb2f8ecf5b9355
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -0,0 +1,63 @@
+path: "tensorflow.estimator.DNNEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index e85007e16edcf5f7f59768c7c17b9340e15bc7b6..07aefed63dc3c2a9856d00d5658a77710fbfc807 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.DNNLinearCombinedClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
index a23f5daeac4d6690a599d0d92a2cb5ffdc4937c3..ac13dad2d46cf322e21fd312e70b898513fda85d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNLinearCombinedEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index 8a55bb835ff4b118a8b2ee45561f3e29639bab90..852e8d2f54f9305e1213db14011dca7bbb452278 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.DNNLinearCombinedRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 2c4128ec480cc8302e01aa56b61fdb8a7db35b0a..2779cbe90ea7a9ba4b27d995f0209bebe0f1b982 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.DNNRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
index 9d270a87ab8fe9788988f9277ad0e652f2b4860a..eee57462fb56231ee563752124e6faf6dd3f71e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.estimator.Estimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
index 4b5de2e245090a3dd265a3ab9d062bf8e43169d7..6569e92c6af0ec8bfb605b85e9ec84dc91a6cee7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.LinearClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..023edec819c22daa1ea4e16e6f0839cf05ed37e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
@@ -0,0 +1,63 @@
+path: "tensorflow.estimator.LinearEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
index 0d1510e9ab1371d9cead321cfb6def4fcb417b16..d74bf4f197c777ee4abf0e37be1137aa9377d4f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.estimator.LinearRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index 862761a96c723d157a2e983151a742aa90e83e17..cabca3e883fbceecb399e048e09722acd4efcad4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -16,4 +16,12 @@ tf_module {
     name: "linear_logit_fn_builder"
     argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], "
   }
+  member_method {
+    name: "make_early_stopping_hook"
+    argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_higher_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index ec3216ae705709e39d9afb18545476213d529ddd..c5b0085b8d3ec58b4215d4a756957e1509501841 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -28,6 +28,10 @@ tf_module {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DNNEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNLinearCombinedClassifier"
     mtype: "<type \'type\'>"
@@ -72,6 +76,10 @@ tf_module {
     name: "LinearClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c3f04e468c4c817cd474deb42149aee3021aa43
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.experimental"
+tf_module {
+  member_method {
+    name: "function_executor_type"
+    argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 0a231f1b65155b8662bb38943bfd97c5283b9385..15d0e099bab3052553671d52d396239b27383a8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -172,6 +172,10 @@ tf_module {
     name: "random_saturation"
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "resize"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+  }
   member_method {
     name: "resize_area"
     argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -240,6 +244,10 @@ tf_module {
     name: "total_variation"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "transpose"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
index dccf136788da44073160931707167b7d8baa0add..64b63ed1a4a5611d369cd4aa01589aee2076b24f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
@@ -112,6 +112,10 @@ tf_module {
     name: "serialize_sparse"
     argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
+  member_method {
+    name: "serialize_tensor"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tf_record_iterator"
     argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 4eb7f8f2a7171abbcb19de220d2231e352cd14e5..9dc8daea5c4e8e6293b2427add50ad4ebfbc264e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -109,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 524455be8bbfc4b51719e5634cfdde695cb715ee..a357a825153528bdff75e9f73ec8d99545d71120 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -114,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index 5f0dfd7ae7d7743455d051e10bd3c4ef767a6948..8cd0c6ea5f027fa1f30b60a742450b651242d406 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -182,7 +182,7 @@ tf_module {
   }
   member_method {
     name: "function"
-    argspec: "args=[\'inputs\', \'outputs\', \'updates\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'outputs\', \'updates\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "gather"
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\', \'zero_output_for_mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "round"
@@ -512,6 +512,10 @@ tf_module {
     name: "temporal_padding"
     argspec: "args=[\'x\', \'padding\'], varargs=None, keywords=None, defaults=[\'(1, 1)\'], "
   }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'x\', \'n\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "to_dense"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
index 9eee9b378964a9947b067b7ec495ef6556ab6d0c..7d298e95135ebf41230d72ff488fef30be682edb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 5bb949c5bb650acee91b14a4d6bf95b36029edf7..133205ab88b47afad32fc70ceca93513768a3b19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
index a5340d52c1af6d69da30fd710bcee9d832917574..d766c09ac5efaa9d0e4ffba4e495385130c7e770 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
@@ -22,6 +22,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
index ed0f37647f4ae7ae56466844f8e71dcc6f1e6ce4..605f74e5602a63f5a18c31cb26113d300ec76e7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -27,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
index ee400b31c43829efba156298d5ee807cdafc8a98..cd893e67269164781d6a6b6294a199014d40fed8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index df8d7b0ef7afca17338a26388c38827b5b306f95..50f2054cabb1b8f6c46a9537ea923a18f87e5c80 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index ce1a9b694d8708720e0eb677afd25607c6262e9c..9ed9db0a89b49b88098e15baca414ff78b6f10e6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 48bb24a05274addca03f11acef99607f78b92e51..3d8d1363bb4e4de818788efbf3c997594350006a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index d8bb8b2a7d0f491c7ec2b30096a1acaf04681a56..5012f1517d57dd646d82ab669cb279b6363dd6ec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index dc27af9552a88650261b4f0694ea0265e6bda05c..73652c2b61259f768eca76b995ae4592df868392 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -27,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 5a3b791c0adc0d61129d38b2995ee9077cf0988b..24db71de1182d58b78fec0419aa9cb48a2e315d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
index e9d53b7225a6d5edcc1275db8ed364017b8a12e3..c5503c69a5f3cb6765c984778c0e3626369ee815 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index 5c2d336353aee7fc98b45620adac4f4bcda05ea0..de6e8ef072558e6d926ea125aa5056e3c229d37f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index db69e25c5b70a0cdd76dba6aa570d0c634a31279..1d814b2c8b553f1b2a07f9d9b97dc70ec0674969 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 5510465d7b015e4989472b06c9d00ec9772373cf..b84629540e700f242f885064c92309c294693a11 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index 38ec8a0aff0b9321f3a7ab2cfd9e6b75a8228e4a..5918a13ad8629582829049485e896688ecad9579 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 41cb8e30bfb57068ebe787f14f69ccc467047f26..599da06427dfe4f28e757a7aac8d8a14856a4556 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 9a7aaa8e961528aa750248e02f44403cab10a413..f9ff1538c8134d96051ad81d35c73e59c6a8cc57 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 014f5828fad1152b19a0b0e3d2ffee7cee4c999b..723fc9cdb0d0ad93470e22fd8c147d3ecc92af91 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index cc303bf7b98bb81cb0646fc18df0a4c5c70f1917..957ce2f0ce86f8df3eb8b57606229fb661eb52f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 628447ce3555628b651536d6c5b2a7716d59085c..a52c0af68175420dc2a1993d1f025d36705538e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index f03c986c22210906ad7bdc8b880753469b31aa1b..a004db62ddcaaae02a411d8db51f4026ece1384d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index a6e4856de9b63c946b77b745a6ede28dedf44afc..44f83d1387cb2ec681f50f7b1f0297f3f74594ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a01eaf8a12626257e97d135f50c06c7ea32fca27..8378faf7188ec594865d4b68c8ea8cae284183ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 0d6698f2ef4c674bf8a4dfc026eb209a83dcb8e7..9d5655c9644e3a2394a346bed78fc478cf60ba8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index f1b23be48f7fec2051f1985381058d769eb8c2f8..820034564ffdc6d94dcc3a39659a55dd8a9d0070 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 0672cd5b7b8fdb1967e39c9163635372f73459b7..d37a6b47105225d7b83b6a264b944ceeb583a6c4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -97,6 +97,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index b25ae1e82e8a1f315553337a261a2d8a46301fa0..1ad7a91be0ba48d0dbab19da8c7cd9ca89095918 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index bb1918eba65659d9ede888400c24b3a5121d6052..cb9abc25396bb63a3c40de5cc52f9df7ed20071e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -178,6 +178,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index 16e0fd5a3131723b3ba3ef3ae6d93fa6426dbd47..47dba1d81f8f97a60fe72ec521f82a78ee5f3505 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 381839d6deb8355a54fd883596f94e38fa1356d4..fd649418961301f150aac3dabc1bdf0ade4a9c28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 543bae6fa96fa3ae51775e865bf95ea6f79c8e94..1b1425d53197db8b59abf51fe93c0b0c45299956 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 2933f9f4b3ab854a9bff6a200da2c2b912bfd4d8..1741063fe8b09acf3865e0a135e96bb715dcdcfa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 072943dc2c709a7cee26c3439e02e11455187282..50feb4f458ad1a9cb2b2bfe5d67997b7551eed74 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 222a1ef4fc5d19afe2c111c169c2f0bd38c331d6..faaa535df9fe03ad07862f0793f8ebea67b405ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 9c9c7461c8b8b43acfc7b7db94fd961a15d57817..4079329d1ee2a61270fee38426bb8a0859c38ce3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index f93906717814d4df7dfbf983d6cdbef358e9a55c..32e56696e1617f7810792e3416a2ebb2037d23c2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 44ca598724a5c7b6d40ba460dd866675971015c3..381abe73401fa3a588873d643324fc020c159e30 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 471b18ef8500a279fb07bc893e2c8100d76d7bf1..b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f250a09b7eb69871e7e89d30da817aeb1d896fc..7aeff8003c322e8a8168dd70481a8b30b08762a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index f52128483c67321e4f0e5f0cf5a9fd3c65794561..a1728d9d4f9a1e677646db04c4d0df9572e21208 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index 98daf3bab128357ffdde2e8ffa4f61fd5c6493f7..8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index b207c6800050a6a3f8b9525315fffa14341758cc..7758209adf8fe7a1306fa5ef125935dafd925c3e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 2d7a09ceda90fa8564de4410bf9b553cf1594c97..7c463ff1257599366be049edce6cc06140906286 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 3ac3825759391b7ea21fd6e3b3b149bb9e731479..4960d0264e96e872ea5c49a8841cef20bd5eb37c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 280ec8c25fabe1be63c9aa9a2c7f168315c219d7..8fad7535f882718462a11e27e75732e3097cb87d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index 560f66f9c7a1f7e42e27c739a6c71671f8bd147b..5b425f2d4d7a8a897280490e26922766d8bf7065 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index c0543529c3884f20383911f32ea04c07fec4a050..f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index 04eb2824b9b14cf45eaef263282ffc6778bf709d..82b761fc1761bb3e7638f7a80bc80c6433162d04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index f400432915f8ce892a3297a23078f140eb96db7b..c9ff323877e06b6dff274644744d425e3a9b7932 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index ab176b441a246d93b88c00cd6decb34af175ad86..9b4165d4cbf88fefd2bb684dae70ea8afc01357b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index c3895a0ac127bc663f2a323661c1371a428159b0..f225f7c4309615919fb05df05f2ae664bde80097 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index 9e24bb8ae6a1dc2e438038d4ac9225a7f17c4598..855d001700179fb634d1dff78585d340420abe7f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 55e0d7ef023ac4ca5e89f640c5ebb79199c31afa..2c404c99cd2175cdc8b60b229e4410bf280ebcb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 38fbff5e4a3d2c892b0601c54e52690dae5760bd..6f109d59d0f6fcd2b4650719e3b4f653baec7d23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index a8094c0bde3e46b49cd253d7861922c90f1ffbd1..69f8a9031d32eb73bb44291cdf330d738d745cf9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 929f48df23180a2c5e21c110e0e1d343596ecd76..4299f765e525b136e289bba169becec06e19ffb1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 2e6d59337f1df94e327b506248eb74ab11bd6013..9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 3ebe162f573f630400a17d9a7ccad1615b8e9c78..625e81fd2322ceba153fa65c138948ce43843089 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4e3e258430cdacaf55aed5d46411d2b74c9bdf2e..2fc769742c70c5665c9cb77ad246fcdb49366d5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index fb9166316f6a641eb12a5664100e31d652148a84..e307a65c7c565660e1f2b6b6b74dc5970425eaa4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index c0a53b847b4afc1fb098fa06eb8e8e27a96c3459..4394ad0364e89fd3531d6625e52540991cadf973 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 87b7f6797a0d5bef8c5a4ff582c30433eaced2d4..050ed39fe98dc7cfdf6febe45e235d3ae7cbf486 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 98bf96fa0c251c5f6de8878d48e651ac3346ff38..436191821ef4689351b6124cf2a20afad917e4ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index ff6c6f3ec4d501a87a858c7c9bf365590cfb4fdc..4ba540aa6adc72b572aa9340f89967d69ab78a3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index c9d4158d1c434655abb11b92269e6e70ad2d1f91..a2e9322cb3fd4e56af708d5c4e17b660f7bc2247 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 9953102ff991bfd4f0568120dd7aef07f75ea208..5d16a57fc1aeff9939220de8043fcae39e3d953e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 2617f5a95fa631cf0b92e1fd2feef7457f96fd80..9dd29c1251ef2eacaf535a3f10f3d42dc36624a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
index 5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a..bc3ceb67a4e7506b42fccd6b227891b9eef8147f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index e9f6ef45aaf1c775ea1b8dd157737f65c87e232f..0045d5775e2c19df21428bd4420b6e5612c8002b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 1b1ccbe118069e5fa5c3acd4ebea2d7fc14395b2..529c750f98715ec30313ed34c9023a845061a3df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 2e0b6bac24fd63988b28c1099d40581989b783df..d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index 1e93d1118a4d306d5427d9b6873de1746d93b764..e1f5491180903f7d6931cc09755cabb715bbf233 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -87,6 +87,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bfd36012a7edb8a74198a87a86577278be3fdcd4..9b69d9a9447f42907236b5cc8c7672012f96c38a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 5ad5990d7e624c4f6b1dde92b4608c65aeb19db1..fd52259432577ac94dc702d4411ad5c0eed1ff10 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 40d03369a5235f394832e3e2f48710bb069e9aac..5fc8af0d03564c649dff6e9df70d10731319de40 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 86666b51bb8c8dc22deb95f05cb9edfb10688015..7f8932270e63bc02852c5b64e53694e7e26be08b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index d26da270e747898412b05f6e07e3bbc23f287e0b..4723b99cb0792e1ce0bdc45e46908da8c2b5359c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 85f23df671d2772995ec01bb09e191237d60e6a7..173c5d4a8b149c4e23683cf375e8d793db7faa5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 235806b96500473fe95dd1b25aafe7f091bdb36b..14e1899e145224e411d65cbf481060a3b2cec0f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 524c5fd69e508bac55b05502e62650b92cf2be5c..a708e652bf0e82dea0f58034a81a040a39550dc9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index fda2562fc8c51623f5c4b33e23319ed35229905e..e6706b5cf9f32bda78adc4e2db5916a5750cc82e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 71d2d09a8d1d7addf91d7dc4ca109f8c2d45aed9..a73c082d1bba0453b742f76bacf0ad6116ba79a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index 12949b39a6f7affa657d1dccdc49ad0dc37e9c2f..f3f195554bbf4a43efaf2af0fd278a23bf270994 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index ab16d0021e627e6a2a821a0185ad71eb5bef1835..f345d1d67b2ce0200c64b1aeea5f39821d070bac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 61ccbf5962791ee1c0b35cc4aba422ff5cacd456..31cb8bc177c7a9e365101e75108a29900fbda124 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index ce2320d7030d05ba1e065f5bbcf8a18014891b5e..44cccc92bd2f1ff0335c22f2967865dc88a96ff7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index 69848af8cf876ad1232a0bf7c419f52ed68af9f0..b55e191ff1ad6997550966bbb6154a81a489575d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 3358f26aebf8a8f845278d54b4524f4405751f5d..e9575436e5b14ac8c52a0b59c86937886eab5f40 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index 413f45f018ae0ce9ccf0e459b24d544c456e4c7c..98223b207f2ecfd5b7af8a53390166e53a7d4f73 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9c61ff602744c00f9105a3f297151b49a8a3dead..2df918b16b2552323d75083bfa80e328c0639cfe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index baa91804c49f86a31093aed0c0a56613f7c1afee..ce5f9e21290eeddc0052257191ac4a6d068c1366 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 15a5d6ac9ea6e087dc0d76a2ab48b08448bfb6ee..a0bb917775fd9edb5d909bf850310e0596a88209 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index be43bd5b3c13632711a49cbbe6c85527d46d46ec..d7942f201bdbfa8d1577813be461a5905b5c6c90 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 6105992c7a3a92d00718fe3287412af3c752db1d..f7ac9042d46f46ab35d18c62e5d8841679a18ca9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 1b6cf1e9ecb08a789212da141971434bd63988a6..e5a92688220f6e227b317d71a70fde01df4c432b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 29488a37f8f29f953d2b8b7e447c331df3244c84..0fe2c974a762784a82a6b97e116357be2a61d84f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 3d70cf8b65972bda1f2bf4d78e126c0946a7c2a5..2ee5873f0f11688019dec3a6cd69db06d99b9caa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -149,6 +149,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index d29731ecf9d5387a324104865af5f563d287c60b..5b8f64aa35725d0ea44fc5c5b81952fd839503e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index a6d7494ca7d2230298a442b86766f46bc58a6d54..240cb6e562f77467d94ef95db2374150e318bc04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c36e802693df564702100a652f3ccc2e95e4c40d..6226c469f8a534f96f6ea991fa5e7d2cf0019e3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9c46cfe40fd6959b526d6ca271bda3182daa1188..34dabce6d8dd0b1b6fe50a008a981e1f06a77edf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 8982f787940dd65291580781b5dc95941d804071..0ddf628ace582db259ebe0b211aba6e6362b5d5b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index ec2cc502984d302b243803b04b4f9d60cee43d05..12eb35ad154a514afd9c900cb2dbece8af28c49f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index d7bc1980f32e523781a68e80312905bc355f0509..c41020c2b45cc88c9b63f3b7a45c35066794dfe2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index fec2de6b49ec1ffaf45b9ee9048bcce37425e919..479f89cf6ae93e8d6ae02e304a51a145164df7de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -93,6 +93,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 3d285e7f17db3e8cdfbacf0056a4c56ffa7e67cb..233363ce02614f184b43a059889c7475b6a8c50b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index b05e5ec84de1eb4899b6fae437dc0d4bd1ab402d..cb6228ac446bd236df88f94eb6e9e717ea38463d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 728eca415a80842291d5684e55632689ceea4099..03bad3ccb613a225ad56e128ea680fc9312151e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index da64e77c39c0e116ff725bb05526882541dd6056..158996792a47fab0e7aa26d21d4bb7f281ca76d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 2f505f9293f429490543ba2c569668f4b2ba3ca4..63a56cd3eebe271f66258c9a0acb974764555b34 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index f82c77072e6969dd57f89f4a971e59e28b4bfc63..965a4cca04651e123c5bd93484200a58b39918ba 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 54e01a99177cde5fbfaf5e1e0ac310bef3ea8eae..1a624308878a68f1b48cb0f8b5e08dafbbfa0333 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index dbd88939b7f8d0df502374fa8e16dd037cfeafc4..41d8b2fc950d02ace5f0efc7f790aa0a9c022f5f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -109,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index 3c36f0702f8da5301b67435d0010374ffe878dfc..2cf107a5cd89805d590e0fc5a372ed3d18c914c0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -114,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index c82e67526b21696a7d56517dc2cb6998882dc7a5..059c91f724aae187055f8323c7748dc99f153302 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 1d031cb5f8461145127b0f13d77e6b8774f5a0b3..d06c8e81ee5d2a8b487d7c3c3714a1f4ed2c8e80 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index a8dda6655df1d06ca77b74f0a992c8fd7e7a357d..6be8e7c210f3f0a28ed8ad8a6672bc4323eb7f9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index 97f65ed89436bd0b4027bb0cbeb80b6f1419269c..b132bd43c48cf0d76728d1119b241e183a1d8f5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index ccd9578f0d62bd70ea252ddeac587d59c926b018..21c695935ce7751df67e09091c961e9e0cfbbf7c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 9cbb58d721bb49bde562a57728a9ee46968e611e..f24d0307207588610c1f764bf43912b64c3ea2c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index c75ea3911e17bc879d140068ef54521effd2824e..0a510ece355435d8e75e39d5f7cdc6cebefe32cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index 5dc834e5141e58d255357e02d7446a06e6e2aa45..d0ee44bed3c739da27cc83f0e643e1ea9dd98078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 96ab209874ac14d6acf2e8115e7f04fc35c4b2bd..546de3cdab3aa0519450f74c6c6d0fe74ddc000c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 7e9656b3525c1d53940b869607616ff414a466cf..3ad311581eba815c2d1b0155a1380db80dd61c5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index e9a2269a6e8de1f9a12f1b54d2e6dced3d4f8902..9b83271350cf90a2d430303dfecfd28facad272b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 7d2eaaab2a8cb9159214a16ba65473d0b6870ac4..87a7fb3d843e3e8e3e2fe5a56ec0b181355a6d7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
index fd02c919aeb5a536bd052324618983af699e7c47..80834e08f7ada08f02c660017ae0b735bb31e20e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 8bc3eb26e9ca0bf0f129db336b7ca23466fd036f..32b17e90ade7aa0054a390256e3abadfc7011cbe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index 6a0dcce56ac0184ffe995662fd62b89e16257a29..643c469717c258207046ddd93a318f47753de46b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index b6c84edf2a2f86240369b4053cd7351d0b59442d..434e25adc12c2f2f704b07087b8552781ac2d024 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 062a02fa590537b9efbf540a874eeaa6d36697f3..089fc6f9243c85937500b6275da034eb0748ecd4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index eaad0fb23ef7501c8c5b7acee6a9677665b7057f..bc3d58b9ca9789b43bc91f9283a81811f2b6a4e9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index ece28a8ce962d8fafb3f7a397a814b903e915d48..fe7d71af3a4a46bed4ea9e62cbd7ad17987517c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 973705dae2fabbef0eafb38ad12e96c747aeee27..773c74e64d13ca4a840b7f599fc2cbe9c161cd03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index de917706d55214cc59f3205f0778d600a356a5b1..533544d21f2753f785113a30518f4fcbcff96cd7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f..e3926eb6d4714731d09ff9c5b75a89830c06e7c1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 2e085a8e289e21173789041efb9254e992bd723b..ba209df7824a9cc076499458e35acd7dcf1eaf35 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 42d22bce42d8850a784afae3f67771ef1cfe5403..081fb0e08bcd1b35ab44459d1c8eb0857dd14956 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index d6749fdcec69425e83a044409ec695d2661f782e..2014a04301618c20af5cf6f1144eb4dbda2479e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index d9f363d1336210623536e8293a6290d9ebfc2fe1..9a87ae9687741090485bd8d4d0d07d359a2015e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index aac7ee31ed62c22b2e86d287d48c68c7e905fd00..33afb835ce1d524991c0024bfb87c29a72aac08e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -76,6 +76,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index c11d39082939eda4520b3955b767022bd485b5be..a9078c8ab5cca078237a29febabdbbd4a8b6c89c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 3ee800269e617390c25248a2c847cbe259b18e79..4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index 63a1bc2321e35645700778c5906d1b8659eb4a32..a87649133fd207ad59f2124c6b0b5aa44916e5a5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index e2c5a505a7d2f9abbee5b3bb4f92ee8843198c51..32656467840fbbc0c8708ea68aac5aa75c11a540 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index a1b0e06b4753488bc9fcbe9aeb0d260092745f9c..49d8890c8942bc0021886ee6c9bc4e7625452655 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index 6d849dc040f61b498b100820bf7be3d4bc264bb4..c89dc067b331603e227d9d578147e2dd1ee4a900 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index cbab7ce6314320068a40392cb7f430fd47845af5..1a4098d121b71d25fc0aaa9c7e6e4f096b01e033 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -136,6 +136,10 @@ tf_module {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'adjoint_a\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "norm"
     argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.losses.-reduction.pbtxt
index 258ad5047eb6e82eeb9c0941b0acf0573e5ca61d..b2adb52660fb14792df569a7f1dcb69184be23dc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.losses.-reduction.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.Reduction\'>"
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "MEAN"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index a45c0a3259e174e1f15e86a2982fe4556f483d3b..67f348be218e7fe18b94cbdeaed1e666046b926c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -176,6 +176,26 @@ tf_module {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "l2_normalize"
     argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
@@ -296,10 +316,18 @@ tf_module {
     name: "reduce_prod"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_std"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "reduce_sum"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_variance"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 93f2fda2acf8f6566f6099a60d1ee4287b4d2ae6..48501e1b581336558c7e62d4b27d21d8c701878e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "bidirectional_dynamic_rnn"
     argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "collapse_repeated"
+    argspec: "args=[\'labels\', \'seq_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_accidental_hits"
     argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -72,6 +76,10 @@ tf_module {
     name: "conv3d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
+  member_method {
+    name: "conv3d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
   member_method {
     name: "conv3d_backprop_filter_v2"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
@@ -104,6 +112,14 @@ tf_module {
     name: "ctc_loss"
     argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
   }
+  member_method {
+    name: "ctc_loss_v2"
+    argspec: "args=[\'labels\', \'logits\', \'label_length\', \'logit_length\', \'logits_time_major\', \'unique\', \'blank_index\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ctc_unique_labels"
+    argspec: "args=[\'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "depth_to_space"
     argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
@@ -112,6 +128,14 @@ tf_module {
     name: "depthwise_conv2d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "depthwise_conv2d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d_backprop_input"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
   member_method {
     name: "depthwise_conv2d_native"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 88b8f37c4ff0cfaf562293c845e505f06119e227..f7f9978c063ceae89c7228b476f54694e25bc249 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index a4483fefa279957ce503857021c063254a9abf83..f9e898484b9813373a49e6f117578f822cdeb156 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 381c4975d7d778599ce34a9023d0e46b20753cba..9e52a4252619ffc19b287fc1818fa6f772847335 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 912365a28b1277962f648b2b0655d280bca1427c..9836433d08cba809107f9bb5dbccf2e971865b8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -110,6 +110,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index a4bb3219c792708cd02a8345541d8685485c8d05..5fd9b329bdeb40b5a57fe68564977f61b5349ae5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 715bfd5fc7c18993d4997caeefe3188ba88f741c..76c8cff22b1e65e65d0ac3d6705541dc3f16f80c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index b66c0f89cc904c1318787651a3e8e629319c14fb..f53567af52f7ed6baa78bcc75bfc0e38de02e548 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index faeb4f3513362919fca8f0c2ef7c491d7938cb92..d3b68e4f2976912ed65ba7916284c951fda03b05 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -105,6 +105,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index caa2e600800178e4b2d36ae263da23d0b4608dd2..1f7840ab919baeeb0077904592ba8dcc1d4c91fb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index c9eb136f54e54864760c814389d0727cc745cc58..367f506b2179e065546feba7287dea90413cb866 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -244,6 +244,10 @@ tf_module {
     name: "TensorShape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorSpec"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TextLineReader"
     mtype: "<type \'type\'>"
@@ -320,6 +324,10 @@ tf_module {
     name: "debugging"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "distribute"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "distributions"
     mtype: "<type \'module\'>"
@@ -340,6 +348,10 @@ tf_module {
     name: "estimator"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "feature_column"
     mtype: "<type \'module\'>"
@@ -528,6 +540,10 @@ tf_module {
     name: "sets"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "signal"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "sparse"
     mtype: "<type \'module\'>"
@@ -684,6 +700,10 @@ tf_module {
     name: "argmin"
     argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
+  member_method {
+    name: "argsort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'stable\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'False\', \'None\'], "
+  }
   member_method {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
@@ -770,7 +790,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'name\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
@@ -1556,6 +1576,10 @@ tf_module {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "no_gradient"
+    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "no_op"
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1636,6 +1660,10 @@ tf_module {
     name: "py_func"
     argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
+  member_method {
+    name: "py_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "qr"
     argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -1932,6 +1960,10 @@ tf_module {
     name: "slice"
     argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
+  }
   member_method {
     name: "space_to_batch"
     argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2280,6 +2312,10 @@ tf_module {
     name: "while_loop"
     argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\', \'return_same_structure\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\', \'False\'], "
   }
+  member_method {
+    name: "wrap_function"
+    argspec: "args=[\'fn\', \'signature\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "write_file"
     argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
index 160c09798d02653ba0c090db53124450b956ef05..d788f6dfca277ee9f76db66ef7bf214289fa1527 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "shuffle"
     argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "stateless_categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
   member_method {
     name: "stateless_multinomial"
     argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
index 67457de070830d45a48230835fc4827e36f70058..e4cc0061a953c81729d8499530e43f5b43a2210e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.saved_model.Builder"
 tf_class {
   is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl._SavedModelBuilder\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
index 83bd7035409534abf036c7e2b0d66fcc060ada3a..44860b11720e1af87d8baa3aec5f4f3169410d82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.saved_model.builder.SavedModelBuilder"
 tf_class {
   is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl._SavedModelBuilder\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34343e7c039a373e704d0feb1df2564896fd319f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.saved_model.experimental"
+tf_module {
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index 5b28f7b9b1824ef4875a277a28cd51870090e423..3929003fa1ff0902b55adcdca1274b1c1b1de2e8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -108,6 +108,10 @@ tf_module {
     name: "constants"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "loader"
     mtype: "<type \'module\'>"
@@ -144,6 +148,10 @@ tf_module {
     name: "classification_signature_def"
     argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "contains_saved_model"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_tensor_from_tensor_info"
     argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
index 8a196b1a556e283671cc75af28df3eaa62532975..09d6f1424b785e266854ede48b26ebbdf571288b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.sets"
 tf_module {
+  member_method {
+    name: "difference"
+    argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "intersection"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
   member_method {
     name: "set_difference"
     argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
@@ -16,4 +24,12 @@ tf_module {
     name: "set_union"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
+  member_method {
+    name: "size"
+    argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "union"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
similarity index 51%
rename from tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
index 6a421ef12d58dc047905ec916cbe777b4ce19b9a..ea717b4d719d6709e05182faca964ae544abc39c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.spectral"
+path: "tensorflow.signal"
 tf_module {
   member_method {
     name: "dct"
@@ -16,6 +16,18 @@ tf_module {
     name: "fft3d"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "frame"
+    argspec: "args=[\'signal\', \'frame_length\', \'frame_step\', \'pad_end\', \'pad_value\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "hamming_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "hann_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "idct"
     argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
@@ -32,6 +44,14 @@ tf_module {
     name: "ifft3d"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "inverse_stft"
+    argspec: "args=[\'stfts\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "inverse_stft_window_fn"
+    argspec: "args=[\'frame_step\', \'forward_window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'<function hann_window instance>\', \'None\'], "
+  }
   member_method {
     name: "irfft"
     argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -44,6 +64,18 @@ tf_module {
     name: "irfft3d"
     argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "linear_to_mel_weight_matrix"
+    argspec: "args=[\'num_mel_bins\', \'num_spectrogram_bins\', \'sample_rate\', \'lower_edge_hertz\', \'upper_edge_hertz\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'20\', \'129\', \'8000\', \'125.0\', \'3800.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "mfccs_from_log_mel_spectrograms"
+    argspec: "args=[\'log_mel_spectrograms\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "overlap_and_add"
+    argspec: "args=[\'signal\', \'frame_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "rfft"
     argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -56,4 +88,8 @@ tf_module {
     name: "rfft3d"
     argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "stft"
+    argspec: "args=[\'signals\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'pad_end\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index 32bd8d5f8edb24ee1f5a5672487499337bd1c0dd..ee4f31774ee9cd494d32ca8ab2a8366a9dcd0027 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -112,6 +112,10 @@ tf_module {
     name: "softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "split"
     argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2538de661b357245ad18d9e1c4fc88d2e80eaeb0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.train.CheckpointManager"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpoint_management.CheckpointManager\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "checkpoints"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latest_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'checkpoint_number\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index 9f3539528435f0487492deb10fa2cfb63f8f58ae..877c55c6b3820294bcadf249304bd67f82bcaee6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -24,6 +24,10 @@ tf_module {
     name: "Checkpoint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointManager"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CheckpointSaverHook"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
deleted file mode 100644
index c9a32c16b34a78bd5a182b7c0635a559bddc611d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-path: "tensorflow.ConditionalAccumulatorBase"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'accumulator_ref\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
deleted file mode 100644
index 15e0ab76b6fd97b83019589e79ac290bbce11053..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.ConditionalAccumulator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulator\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'conditional_accumulator\', \'MEAN\'], "
-  }
-  member_method {
-    name: "apply_grad"
-    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
index f7491649c22738c625e3f63944f2347358d2e525..caa72fe5a61aa9a13bc51ae5ab70048d309f6b62 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
@@ -20,6 +20,12 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT32
     }
+    field {
+      name: "use_numa_affinity"
+      number: 5
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
     reserved_range {
       start: 2
       end: 3
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
index 53b532beab344db8cff9d1ccac4821b8f280af67..b505d813509c2049fa6e3f60df553492d6f66613 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
@@ -143,6 +143,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT32
       }
+      field {
+        name: "use_numa_affinity"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
deleted file mode 100644
index 92e535c341447628a50d8941998a4065e78d12a5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
+++ /dev/null
@@ -1,37 +0,0 @@
-path: "tensorflow.DeviceSpec"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.device.DeviceSpec\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "job"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "replica"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "task"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'job\', \'replica\', \'task\', \'device_type\', \'device_index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_string"
-    argspec: "args=[\'spec\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "merge_from"
-    argspec: "args=[\'self\', \'dev\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "parse_from_string"
-    argspec: "args=[\'self\', \'spec\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "to_string"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
deleted file mode 100644
index a9ab27719b4d71f3d7ed10963ad896ccafa82f15..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-path: "tensorflow.Dimension"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.Dimension\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "value"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "assert_is_compatible_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_compatible_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "merge_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt
deleted file mode 100644
index 6933814a7b68f775e694fe940a7c65a8e31b9398..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.FixedLenFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "default_value"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt
deleted file mode 100644
index c53878795190924e205a1e7efe1672f216869c41..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt
+++ /dev/null
@@ -1,31 +0,0 @@
-path: "tensorflow.FixedLenSequenceFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "allow_missing"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "default_value"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
index 0a16d6ab92faac1db63470f0aedadf69341be29b..50af42f4fcddfa8cac8bfd58458b9903e988fad2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
@@ -10,6 +10,10 @@ tf_class {
     name: "gradient"
     argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'UnconnectedGradients.NONE\'], "
   }
+  member_method {
+    name: "jacobian"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'unconnected_gradients\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'True\'], "
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
deleted file mode 100644
index ffe479093397a9bf98d10aa4e054c643e64d5f5d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
+++ /dev/null
@@ -1,140 +0,0 @@
-path: "tensorflow.GraphKeys"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.ops.GraphKeys\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "ACTIVATIONS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "ASSET_FILEPATHS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "BIASES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CONCATENATED_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "COND_CONTEXT"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "EVAL_STEP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GLOBAL_STEP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GLOBAL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_RESOURCES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOSSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "METRIC_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MODEL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MOVING_AVERAGE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "QUEUE_RUNNERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "READY_FOR_LOCAL_INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "READY_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "REGULARIZATION_LOSSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "RESOURCES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVEABLE_OBJECTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SUMMARIES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SUMMARY_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TABLE_INITIALIZERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINABLE_RESOURCE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINABLE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAIN_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "UPDATE_OPS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "WHILE_CONTEXT"
-    mtype: "<type \'str\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt
deleted file mode 100644
index d875394fb5de73f67629b77c902a2ed2a03dd982..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt
+++ /dev/null
@@ -1,35 +0,0 @@
-path: "tensorflow.SparseFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "already_sorted"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "index_key"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "value_key"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
deleted file mode 100644
index 0064c8460cb374f1e3f108085a2efed4131dd205..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.TensorInfo.CooSparse"
-tf_proto {
-  descriptor {
-    name: "CooSparse"
-    field {
-      name: "values_tensor_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "indices_tensor_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "dense_shape_tensor_name"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
deleted file mode 100644
index 63566c808e55cb4d3b630f0a017fa3a2c8a30de3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
+++ /dev/null
@@ -1,59 +0,0 @@
-path: "tensorflow.TensorInfo"
-tf_proto {
-  descriptor {
-    name: "TensorInfo"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "coo_sparse"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorInfo.CooSparse"
-      oneof_index: 0
-    }
-    field {
-      name: "dtype"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-    }
-    field {
-      name: "tensor_shape"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    nested_type {
-      name: "CooSparse"
-      field {
-        name: "values_tensor_name"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "indices_tensor_name"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "dense_shape_tensor_name"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    oneof_decl {
-      name: "encoding"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493dcba8922d7f6c51a61d337f48e09d168e6bac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.TensorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_spec.TensorSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_spec"
+    argspec: "args=[\'cls\', \'spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'spec_or_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt
deleted file mode 100644
index 54b66f43f8e7d714e82ae9d68b37ac348c476c97..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.VarLenFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
deleted file mode 100644
index 67e1b76caba8a278eabe4a54e5c2fe85c5c2e099..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
+++ /dev/null
@@ -1,7 +0,0 @@
-path: "tensorflow.app"
-tf_module {
-  member_method {
-    name: "run"
-    argspec: "args=[\'main\', \'argv\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 8b7f63e43e237864d4ef24d3b251b23199f9ee17..9394f4b767b069246d93b6f80336d7d4ee19224e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.data.Dataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -16,7 +16,6 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "apply"
@@ -46,10 +45,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index a7bfa82c650e0a511cb6c8eaffceaf49fbfeaa39..8c32c773b7cf1025f8f16378ed7cdb0611c04c80 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.data.FixedLengthRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -17,7 +18,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -47,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
index 9f4de74c393972ae31b1cbb790363ca7f1348af1..9d032d43de1094f212e5f749013f1fac5a898459 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
@@ -18,10 +18,6 @@ tf_class {
     name: "experimental_hoist_random_uniform"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "experimental_latency_all_edges"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "experimental_map_and_batch_fusion"
     mtype: "<type \'property\'>"
@@ -54,6 +50,10 @@ tf_class {
     name: "experimental_shuffle_and_repeat_fusion"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "experimental_stats"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 7b7a9ebaf08b1e9fdb5e4c5b7448175611a9b2c4..9f32bce10933547db374aad62306bb429fbe7e77 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.TFRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -47,10 +47,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index 2817f900e15ccf8df2ca71aa0218ba07eef682e2..0eedfdbfe13200034008cae3053af8468c949511 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.data.TextLineDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -47,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 2520e28a3c708f45942eb2e73911b7a5226646e5..08214ec3cf7d3077a96d2d0b6beaca88ee5678c0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.CsvDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index 1dd53b1eabdf15b662a839a07176ba4eaf8bda37..608253298ee0d8ab93f6f526affe9fd14b2b67b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.RandomDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8fdd9dc52e332abdeed039bd85d31f6318d013e9..3335eb1dc799b9fca57526410293d7727598ed16 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.SqlDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
index 0bcc8cf3e87ea8b78f28130da60a1749e2848806..6536a698b50efc9daaa72d8ae589855e30fbc601 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.data.experimental.StatsAggregator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_ops.StatsAggregator\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_aggregator.StatsAggregator\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f423eed42cc2d7115fd50b3ad533f3790736a850
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.data.experimental.StatsOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_options.StatsOptions\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "aggregator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "counter_prefix"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latency_all_edges"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "prefix"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'aggregator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 116684e5d81da3b9b181727cca00b51c84360573..244b24519c8102ba973bffd62ac0df88658708c1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "StatsAggregator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "StatsOptions"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
@@ -86,7 +90,7 @@ tf_module {
   }
   member_method {
     name: "make_batched_features_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
   }
   member_method {
     name: "make_csv_dataset"
@@ -124,10 +128,6 @@ tf_module {
     name: "scan"
     argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "set_stats_aggregator"
-    argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], "
-  }
   member_method {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
index ab6287f8cd080621d76fc34e2cb437960a217800..314aedda909cda8b1d8a209333b85a7792c19bd5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
@@ -6,19 +6,19 @@ tf_module {
   }
   member_method {
     name: "assert_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "assert_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_integer"
@@ -26,35 +26,35 @@ tf_module {
   }
   member_method {
     name: "assert_less"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_less_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_near"
-    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_non_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_non_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_none_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_proper_iterable"
@@ -62,15 +62,15 @@ tf_module {
   }
   member_method {
     name: "assert_rank"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank_at_least"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank_in"
-    argspec: "args=[\'x\', \'ranks\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'ranks\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_same_float_dtype"
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
@@ -88,28 +88,8 @@ tf_module {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "is_finite"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_inf"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_nan"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_non_decreasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "is_numeric_tensor"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "is_strictly_increasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c39ac5a20dec5d32f30115ea3cfe4bd0ab8e7d72
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.distribute.InputContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.InputContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "input_pipeline_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_input_pipelines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_input_pipelines\', \'input_pipeline_id\', \'num_replicas_in_sync\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'1\'], "
+  }
+  member_method {
+    name: "get_per_replica_batch_size"
+    argspec: "args=[\'self\', \'global_batch_size\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a3a97aa0927b81708311d4b8b28fced217c00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt
@@ -0,0 +1,8 @@
+path: "tensorflow.distribute.InputReplicationMode"
+tf_class {
+  is_instance: "<enum \'InputReplicationMode\'>"
+  member {
+    name: "PER_WORKER"
+    mtype: "<enum \'InputReplicationMode\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4899f38cad253167ce0b94f79388cb97fe534197
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.distribute.ReduceOp"
+tf_class {
+  is_instance: "<enum \'ReduceOp\'>"
+  member {
+    name: "MEAN"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3eda6c60366d8367ef95d4fcab769c6b102c0018
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.ReplicaContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.ReplicaContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "distribution_strategy"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica_id_in_sync_group"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strategy"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_call"
+    argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b502b534bec6ba5716c850af83a8519240513c9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
@@ -0,0 +1,81 @@
+path: "tensorflow.distribute.StrategyExtended"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.DistributionStrategyExtended\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'container_strategy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_steps_on_iterator"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fe035b474f02bb0fda5b8034b0e38f146d2a6b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -0,0 +1,133 @@
+path: "tensorflow.distribute.Strategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.distribute.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d833b54ba0950b6b2cf40c958829dc2eeb24795
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.distribute"
+tf_module {
+  member {
+    name: "InputContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InputReplicationMode"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReduceOp"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReplicaContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Strategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StrategyExtended"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "get_loss_reduction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "has_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "in_cross_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt
deleted file mode 100644
index ca96f4eaece0020235d24901f51306a65676c1c9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Bernoulli"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.bernoulli.Bernoulli\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "logits"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "probs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'False\', \'True\', \'Bernoulli\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt
deleted file mode 100644
index d0508acd9f4f6c190b205301223599cf5b027955..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Beta"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.beta.Beta\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration0"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration1"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'concentration1\', \'concentration0\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Beta\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt
deleted file mode 100644
index ff0fbb56cd4b9e4c288a168a7c3d9e83c552b0e2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Categorical"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.categorical.Categorical\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "logits"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "probs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'False\', \'True\', \'Categorical\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt
deleted file mode 100644
index d75e4a2f88b29ff7f638d72f98876a230b191dce..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.DirichletMultinomial"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.dirichlet_multinomial.DirichletMultinomial\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_count"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'total_count\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'DirichletMultinomial\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt
deleted file mode 100644
index b838b9ae21decba0323211f08d09fe373ababf23..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Dirichlet"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.dirichlet.Dirichlet\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Dirichlet\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt
deleted file mode 100644
index 6f06b7d50dd9f5f405673d572503ff549f148f33..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt
+++ /dev/null
@@ -1,134 +0,0 @@
-path: "tensorflow.distributions.Distribution"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'reparameterization_type\', \'validate_args\', \'allow_nan_stats\', \'parameters\', \'graph_parents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt
deleted file mode 100644
index d34f9cde5d4d4161883f6d1b4646f22f054d16ad..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.distributions.Exponential"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.exponential.Exponential\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.gamma.Gamma\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "rate"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Exponential\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt
deleted file mode 100644
index df268b8d99eb6bf22264ddb63231074413686efa..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Gamma"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.gamma.Gamma\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "rate"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'concentration\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Gamma\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt
deleted file mode 100644
index 303dcb4ed3bf8416b822bb010c2e87e8ef03b7c9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Laplace"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.laplace.Laplace\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "loc"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scale"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Laplace\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt
deleted file mode 100644
index ecda8acb15c49c390eaae203a0082e78e53499bd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Multinomial"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.multinomial.Multinomial\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "logits"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "probs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_count"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'total_count\', \'logits\', \'probs\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Multinomial\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt
deleted file mode 100644
index 92b9eeea223b488cda1ebcabd31ec808e78fcf70..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.normal.Normal\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "loc"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scale"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Normal\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt
deleted file mode 100644
index e3db443c2bdaa70f7651126a30caf2062a3c6f67..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-path: "tensorflow.distributions.RegisterKL"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.kullback_leibler.RegisterKL\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dist_cls_a\', \'dist_cls_b\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt
deleted file mode 100644
index 02e8d576ddd00aa21005fa39cd323a92392bf75a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-path: "tensorflow.distributions.ReparameterizationType"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'rep_type\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt
deleted file mode 100644
index 9aa7f9a63465c78f79ae4a8a11bc63d92d027dab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.StudentT"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.student_t.StudentT\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "df"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "loc"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scale"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'df\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'StudentT\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt
deleted file mode 100644
index d1b9d3069629c552d6c6048642934f422a13dce7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.uniform.Uniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "high"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "low"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'low\', \'high\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'False\', \'True\', \'Uniform\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "range"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt
deleted file mode 100644
index 90b60ef074dd2eaf911291e6c725b98e2891e728..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt
+++ /dev/null
@@ -1,75 +0,0 @@
-path: "tensorflow.distributions"
-tf_module {
-  member {
-    name: "Bernoulli"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Beta"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Categorical"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Dirichlet"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "DirichletMultinomial"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Distribution"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Exponential"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "FULLY_REPARAMETERIZED"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
-  }
-  member {
-    name: "Gamma"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Laplace"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Multinomial"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "NOT_REPARAMETERIZED"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
-  }
-  member {
-    name: "Normal"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "RegisterKL"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ReparameterizationType"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StudentT"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Uniform"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
index 32b84e90ce6ae99e80208905d701d690227a0cf7..07483df83e3b87e4aa6e9ed4cd99dafdafcd102e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.BaselineClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
index 94933e7ffd6b0f4838f7b2e9254a4056c9cbf245..d218773dfc4a152bad17d6285dd82dd5a1aaa36d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.BaselineEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
index db7776b5bf67879cc806bed1b8463b99a082a50e..292b5f32d8dabb77e56386679556332611d234cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.BaselineRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index fa352907c0bfd482423a3e5a643a9135851af038..970abd8622faf950dfd4a5f1a766a70ec2b9881b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 154b35f3064ed1e3dca8c3b1e2bdfa7f13801dcc..b5bbad965e2e8934edf9dc771c34f72bbf99a836 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
index ce6040d0f279361ea789b54dee489996d9787ea7..c542edf64d467a79c96d9e39f1f7422547bca4bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85ff5a4fb10f58690e6e1f8c44fb2f8ecf5b9355
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -0,0 +1,63 @@
+path: "tensorflow.estimator.DNNEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index e85007e16edcf5f7f59768c7c17b9340e15bc7b6..623cbc3648e6bb0cd5e4972c72d787f579479948 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNLinearCombinedClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
index a23f5daeac4d6690a599d0d92a2cb5ffdc4937c3..ac13dad2d46cf322e21fd312e70b898513fda85d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNLinearCombinedEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index 8a55bb835ff4b118a8b2ee45561f3e29639bab90..f45e76537aa8afee4d755d3e0e72c5cbcfdbbafe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNLinearCombinedRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 2c4128ec480cc8302e01aa56b61fdb8a7db35b0a..8db21965129d1bd7ec0ff8982b24c50acafacb08 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
index 9d270a87ab8fe9788988f9277ad0e652f2b4860a..71531fd217ec9c5e691b681d3de0e55ded6962e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.Estimator"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -34,10 +34,6 @@ tf_class {
     name: "export_saved_model"
     argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
-  member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
-  }
   member_method {
     name: "get_variable_names"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
index 4b5de2e245090a3dd265a3ab9d062bf8e43169d7..72c226b25d902224121945aebc31e01397e32adb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.LinearClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..023edec819c22daa1ea4e16e6f0839cf05ed37e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
@@ -0,0 +1,63 @@
+path: "tensorflow.estimator.LinearEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
index 0d1510e9ab1371d9cead321cfb6def4fcb417b16..c4bb19612a67576334b8ba2db5f0286823d7645e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.LinearRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressorV2\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +22,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index 862761a96c723d157a2e983151a742aa90e83e17..cabca3e883fbceecb399e048e09722acd4efcad4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -16,4 +16,12 @@ tf_module {
     name: "linear_logit_fn_builder"
     argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], "
   }
+  member_method {
+    name: "make_early_stopping_hook"
+    argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_higher_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index ec3216ae705709e39d9afb18545476213d529ddd..c5b0085b8d3ec58b4215d4a756957e1509501841 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -28,6 +28,10 @@ tf_module {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DNNEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNLinearCombinedClassifier"
     mtype: "<type \'type\'>"
@@ -72,6 +76,10 @@ tf_module {
     name: "LinearClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c3f04e468c4c817cd474deb42149aee3021aa43
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.experimental"
+tf_module {
+  member_method {
+    name: "function_executor_type"
+    argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
index f06e7989537eef2b0e6fa4b720e90614366b41ee..3aadd7dc341ae97fdbfa83cd3fc96fc75249a4c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "categorical_column_with_vocabulary_file"
-    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\", \'None\', \'0\'], "
   }
   member_method {
     name: "categorical_column_with_vocabulary_list"
@@ -32,14 +32,6 @@ tf_module {
     name: "indicator_column"
     argspec: "args=[\'categorical_column\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_layer"
-    argspec: "args=[\'features\', \'feature_columns\', \'weight_collections\', \'trainable\', \'cols_to_vars\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "linear_model"
-    argspec: "args=[\'features\', \'feature_columns\', \'units\', \'sparse_combiner\', \'weight_collections\', \'trainable\', \'cols_to_vars\'], varargs=None, keywords=None, defaults=[\'1\', \'sum\', \'None\', \'True\', \'None\'], "
-  }
   member_method {
     name: "make_parse_example_spec"
     argspec: "args=[\'feature_columns\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
deleted file mode 100644
index eecfaffd0a6f6e611eba8bf3f5bb709bc9e0157f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.FastGFile"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.FastGFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
deleted file mode 100644
index 305251059d90b52aa2e76e99a4ec65e68b73fb79..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.GFile"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
deleted file mode 100644
index 6e8894180a4a685d5a35ba02df53c6e054db01b9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.Open"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
index 65b55a8b7c4e30e349c1ea256664002b19191c82..74d0a0579ea529b8eb9f4c6d0b7581ace2b76dc2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
@@ -1,17 +1,5 @@
 path: "tensorflow.gfile"
 tf_module {
-  member {
-    name: "FastGFile"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GFile"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Open"
-    mtype: "<type \'type\'>"
-  }
   member_method {
     name: "Copy"
     argspec: "args=[\'oldpath\', \'newpath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -20,10 +8,6 @@ tf_module {
     name: "DeleteRecursively"
     argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "Exists"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "Glob"
     argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
index 162ee76ee7f900d266498c297873177ada35b542..d0facad3809f48763a6827b9fc1c66ab16d8dce6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
@@ -1,27 +1,7 @@
 path: "tensorflow.graph_util"
 tf_module {
-  member_method {
-    name: "convert_variables_to_constants"
-    argspec: "args=[\'sess\', \'input_graph_def\', \'output_node_names\', \'variable_names_whitelist\', \'variable_names_blacklist\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "extract_sub_graph"
-    argspec: "args=[\'graph_def\', \'dest_nodes\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "import_graph_def"
     argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "must_run_on_cpu"
-    argspec: "args=[\'node\', \'pin_variables_on_cpu\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "remove_training_nodes"
-    argspec: "args=[\'input_graph\', \'protected_nodes\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "tensor_shape_from_node_def_name"
-    argspec: "args=[\'graph\', \'input_name\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 0a231f1b65155b8662bb38943bfd97c5283b9385..3c6ed1cfb8340b6e8f2599360e3c321c562e37ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -38,7 +38,7 @@ tf_module {
   }
   member_method {
     name: "crop_and_resize"
-    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+    argspec: "args=[\'image\', \'boxes\', \'box_indices\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
   }
   member_method {
     name: "crop_to_bounding_box"
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'sizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "extract_jpeg_shape"
@@ -173,16 +173,8 @@ tf_module {
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "resize_area"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bicubic"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bilinear"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    name: "resize"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
   member_method {
     name: "resize_image_with_crop_or_pad"
@@ -192,14 +184,6 @@ tf_module {
     name: "resize_image_with_pad"
     argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
-  member_method {
-    name: "resize_images"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "resize_nearest_neighbor"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "rgb_to_grayscale"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -222,7 +206,7 @@ tf_module {
   }
   member_method {
     name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sobel_edges"
@@ -241,8 +225,8 @@ tf_module {
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "transpose_image"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+    name: "transpose"
+    argspec: "args=[\'image\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "yiq_to_rgb"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
index d49181714fe44cb6e27fb149948ce6eedd5e8ec5..e3c63fe737ee655169c00c7c0b2882c84f566244 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
@@ -64,8 +64,4 @@ tf_module {
     name: "lecun_uniform"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "tables_initializer"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59652cb0639eec1bf9b9fd06cbf9912f4f7f3c9c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.io.gfile"
+tf_module {
+  member_method {
+    name: "exists"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index dccf136788da44073160931707167b7d8baa0add..caa207b02260a3a269358062c3305205affc01da 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "VarLenFeature"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "gfile"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -113,8 +117,8 @@ tf_module {
     argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
-    name: "tf_record_iterator"
-    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "serialize_tensor"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "write_file"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 4eb7f8f2a7171abbcb19de220d2231e352cd14e5..9dc8daea5c4e8e6293b2427add50ad4ebfbc264e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -109,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 524455be8bbfc4b51719e5634cfdde695cb715ee..a357a825153528bdff75e9f73ec8d99545d71120 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -114,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index 13a5d25e06ffa05a1ec723c7d30b95efc75e210c..d200d3d26d7c1b7d54eda596a8056a66e29be0b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -182,7 +182,7 @@ tf_module {
   }
   member_method {
     name: "function"
-    argspec: "args=[\'inputs\', \'outputs\', \'updates\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'outputs\', \'updates\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "gather"
@@ -394,7 +394,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\', \'zero_output_for_mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "round"
@@ -508,6 +508,10 @@ tf_module {
     name: "temporal_padding"
     argspec: "args=[\'x\', \'padding\'], varargs=None, keywords=None, defaults=[\'(1, 1)\'], "
   }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'x\', \'n\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "to_dense"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
index 9eee9b378964a9947b067b7ec495ef6556ab6d0c..7d298e95135ebf41230d72ff488fef30be682edb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 5bb949c5bb650acee91b14a4d6bf95b36029edf7..133205ab88b47afad32fc70ceca93513768a3b19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
index a5340d52c1af6d69da30fd710bcee9d832917574..d766c09ac5efaa9d0e4ffba4e495385130c7e770 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
@@ -22,6 +22,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
index ed0f37647f4ae7ae56466844f8e71dcc6f1e6ce4..605f74e5602a63f5a18c31cb26113d300ec76e7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -27,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
index ee400b31c43829efba156298d5ee807cdafc8a98..cd893e67269164781d6a6b6294a199014d40fed8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index df8d7b0ef7afca17338a26388c38827b5b306f95..50f2054cabb1b8f6c46a9537ea923a18f87e5c80 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index ce1a9b694d8708720e0eb677afd25607c6262e9c..9ed9db0a89b49b88098e15baca414ff78b6f10e6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 48bb24a05274addca03f11acef99607f78b92e51..3d8d1363bb4e4de818788efbf3c997594350006a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index d8bb8b2a7d0f491c7ec2b30096a1acaf04681a56..5012f1517d57dd646d82ab669cb279b6363dd6ec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index dc27af9552a88650261b4f0694ea0265e6bda05c..73652c2b61259f768eca76b995ae4592df868392 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -27,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 5a3b791c0adc0d61129d38b2995ee9077cf0988b..24db71de1182d58b78fec0419aa9cb48a2e315d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index e9d53b7225a6d5edcc1275db8ed364017b8a12e3..c5503c69a5f3cb6765c984778c0e3626369ee815 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index 5c2d336353aee7fc98b45620adac4f4bcda05ea0..de6e8ef072558e6d926ea125aa5056e3c229d37f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index db69e25c5b70a0cdd76dba6aa570d0c634a31279..1d814b2c8b553f1b2a07f9d9b97dc70ec0674969 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 5510465d7b015e4989472b06c9d00ec9772373cf..b84629540e700f242f885064c92309c294693a11 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index 38ec8a0aff0b9321f3a7ab2cfd9e6b75a8228e4a..5918a13ad8629582829049485e896688ecad9579 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 41cb8e30bfb57068ebe787f14f69ccc467047f26..599da06427dfe4f28e757a7aac8d8a14856a4556 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 9a7aaa8e961528aa750248e02f44403cab10a413..f9ff1538c8134d96051ad81d35c73e59c6a8cc57 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 014f5828fad1152b19a0b0e3d2ffee7cee4c999b..723fc9cdb0d0ad93470e22fd8c147d3ecc92af91 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index cc303bf7b98bb81cb0646fc18df0a4c5c70f1917..957ce2f0ce86f8df3eb8b57606229fb661eb52f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 628447ce3555628b651536d6c5b2a7716d59085c..a52c0af68175420dc2a1993d1f025d36705538e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index f03c986c22210906ad7bdc8b880753469b31aa1b..a004db62ddcaaae02a411d8db51f4026ece1384d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index a6e4856de9b63c946b77b745a6ede28dedf44afc..44f83d1387cb2ec681f50f7b1f0297f3f74594ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a01eaf8a12626257e97d135f50c06c7ea32fca27..8378faf7188ec594865d4b68c8ea8cae284183ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 0d6698f2ef4c674bf8a4dfc026eb209a83dcb8e7..9d5655c9644e3a2394a346bed78fc478cf60ba8d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index f1b23be48f7fec2051f1985381058d769eb8c2f8..820034564ffdc6d94dcc3a39659a55dd8a9d0070 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 0672cd5b7b8fdb1967e39c9163635372f73459b7..d37a6b47105225d7b83b6a264b944ceeb583a6c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -97,6 +97,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index b25ae1e82e8a1f315553337a261a2d8a46301fa0..1ad7a91be0ba48d0dbab19da8c7cd9ca89095918 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index bb1918eba65659d9ede888400c24b3a5121d6052..cb9abc25396bb63a3c40de5cc52f9df7ed20071e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -178,6 +178,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index 16e0fd5a3131723b3ba3ef3ae6d93fa6426dbd47..47dba1d81f8f97a60fe72ec521f82a78ee5f3505 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 381839d6deb8355a54fd883596f94e38fa1356d4..fd649418961301f150aac3dabc1bdf0ade4a9c28 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 543bae6fa96fa3ae51775e865bf95ea6f79c8e94..1b1425d53197db8b59abf51fe93c0b0c45299956 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 2933f9f4b3ab854a9bff6a200da2c2b912bfd4d8..1741063fe8b09acf3865e0a135e96bb715dcdcfa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 072943dc2c709a7cee26c3439e02e11455187282..50feb4f458ad1a9cb2b2bfe5d67997b7551eed74 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 222a1ef4fc5d19afe2c111c169c2f0bd38c331d6..faaa535df9fe03ad07862f0793f8ebea67b405ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 9c9c7461c8b8b43acfc7b7db94fd961a15d57817..4079329d1ee2a61270fee38426bb8a0859c38ce3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index f93906717814d4df7dfbf983d6cdbef358e9a55c..32e56696e1617f7810792e3416a2ebb2037d23c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 44ca598724a5c7b6d40ba460dd866675971015c3..381abe73401fa3a588873d643324fc020c159e30 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 471b18ef8500a279fb07bc893e2c8100d76d7bf1..b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f250a09b7eb69871e7e89d30da817aeb1d896fc..7aeff8003c322e8a8168dd70481a8b30b08762a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index f52128483c67321e4f0e5f0cf5a9fd3c65794561..a1728d9d4f9a1e677646db04c4d0df9572e21208 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index 98daf3bab128357ffdde2e8ffa4f61fd5c6493f7..8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index b207c6800050a6a3f8b9525315fffa14341758cc..7758209adf8fe7a1306fa5ef125935dafd925c3e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 2d7a09ceda90fa8564de4410bf9b553cf1594c97..7c463ff1257599366be049edce6cc06140906286 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
similarity index 77%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index b66c0f89cc904c1318787651a3e8e629319c14fb..0781a93bd56c5ebc77e1fb650497621e49d7ee1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -1,8 +1,6 @@
-path: "tensorflow.nn.rnn_cell.MultiRNNCell"
+path: "tensorflow.keras.layers.DenseFeatures"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,18 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -100,12 +82,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cells\', \'state_is_tuple\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -116,7 +102,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -128,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -150,10 +136,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -194,8 +176,4 @@ tf_class {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 3ac3825759391b7ea21fd6e3b3b149bb9e731479..4960d0264e96e872ea5c49a8841cef20bd5eb37c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 280ec8c25fabe1be63c9aa9a2c7f168315c219d7..8fad7535f882718462a11e27e75732e3097cb87d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index 560f66f9c7a1f7e42e27c739a6c71671f8bd147b..5b425f2d4d7a8a897280490e26922766d8bf7065 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index c0543529c3884f20383911f32ea04c07fec4a050..f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index 04eb2824b9b14cf45eaef263282ffc6778bf709d..82b761fc1761bb3e7638f7a80bc80c6433162d04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index f400432915f8ce892a3297a23078f140eb96db7b..c9ff323877e06b6dff274644744d425e3a9b7932 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index ab176b441a246d93b88c00cd6decb34af175ad86..9b4165d4cbf88fefd2bb684dae70ea8afc01357b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index c3895a0ac127bc663f2a323661c1371a428159b0..f225f7c4309615919fb05df05f2ae664bde80097 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index 9e24bb8ae6a1dc2e438038d4ac9225a7f17c4598..855d001700179fb634d1dff78585d340420abe7f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 55e0d7ef023ac4ca5e89f640c5ebb79199c31afa..2c404c99cd2175cdc8b60b229e4410bf280ebcb7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 38fbff5e4a3d2c892b0601c54e52690dae5760bd..6f109d59d0f6fcd2b4650719e3b4f653baec7d23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index a8094c0bde3e46b49cd253d7861922c90f1ffbd1..69f8a9031d32eb73bb44291cdf330d738d745cf9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 929f48df23180a2c5e21c110e0e1d343596ecd76..4299f765e525b136e289bba169becec06e19ffb1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 2e6d59337f1df94e327b506248eb74ab11bd6013..9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 3ebe162f573f630400a17d9a7ccad1615b8e9c78..625e81fd2322ceba153fa65c138948ce43843089 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4e3e258430cdacaf55aed5d46411d2b74c9bdf2e..2fc769742c70c5665c9cb77ad246fcdb49366d5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index fb9166316f6a641eb12a5664100e31d652148a84..e307a65c7c565660e1f2b6b6b74dc5970425eaa4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index c0a53b847b4afc1fb098fa06eb8e8e27a96c3459..4394ad0364e89fd3531d6625e52540991cadf973 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 87b7f6797a0d5bef8c5a4ff582c30433eaced2d4..050ed39fe98dc7cfdf6febe45e235d3ae7cbf486 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 98bf96fa0c251c5f6de8878d48e651ac3346ff38..436191821ef4689351b6124cf2a20afad917e4ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index ff6c6f3ec4d501a87a858c7c9bf365590cfb4fdc..4ba540aa6adc72b572aa9340f89967d69ab78a3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index c9d4158d1c434655abb11b92269e6e70ad2d1f91..a2e9322cb3fd4e56af708d5c4e17b660f7bc2247 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 9953102ff991bfd4f0568120dd7aef07f75ea208..5d16a57fc1aeff9939220de8043fcae39e3d953e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 2617f5a95fa631cf0b92e1fd2feef7457f96fd80..9dd29c1251ef2eacaf535a3f10f3d42dc36624a2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
index 5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a..bc3ceb67a4e7506b42fccd6b227891b9eef8147f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index e9f6ef45aaf1c775ea1b8dd157737f65c87e232f..0045d5775e2c19df21428bd4420b6e5612c8002b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 1b1ccbe118069e5fa5c3acd4ebea2d7fc14395b2..529c750f98715ec30313ed34c9023a845061a3df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 2e0b6bac24fd63988b28c1099d40581989b783df..d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index 1e93d1118a4d306d5427d9b6873de1746d93b764..e1f5491180903f7d6931cc09755cabb715bbf233 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -87,6 +87,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bfd36012a7edb8a74198a87a86577278be3fdcd4..9b69d9a9447f42907236b5cc8c7672012f96c38a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a94bd5930f4d9af9aaf9ec9b5cda9d678e698a19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -0,0 +1,285 @@
+path: "tensorflow.keras.layers.LinearModel"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.LinearModel\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'units\', \'sparse_combiner\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'sum\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 5ad5990d7e624c4f6b1dde92b4608c65aeb19db1..fd52259432577ac94dc702d4411ad5c0eed1ff10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 40d03369a5235f394832e3e2f48710bb069e9aac..5fc8af0d03564c649dff6e9df70d10731319de40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 86666b51bb8c8dc22deb95f05cb9edfb10688015..7f8932270e63bc02852c5b64e53694e7e26be08b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index d26da270e747898412b05f6e07e3bbc23f287e0b..4723b99cb0792e1ce0bdc45e46908da8c2b5359c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 85f23df671d2772995ec01bb09e191237d60e6a7..173c5d4a8b149c4e23683cf375e8d793db7faa5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 235806b96500473fe95dd1b25aafe7f091bdb36b..14e1899e145224e411d65cbf481060a3b2cec0f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 524c5fd69e508bac55b05502e62650b92cf2be5c..a708e652bf0e82dea0f58034a81a040a39550dc9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index fda2562fc8c51623f5c4b33e23319ed35229905e..e6706b5cf9f32bda78adc4e2db5916a5750cc82e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 71d2d09a8d1d7addf91d7dc4ca109f8c2d45aed9..a73c082d1bba0453b742f76bacf0ad6116ba79a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index 12949b39a6f7affa657d1dccdc49ad0dc37e9c2f..f3f195554bbf4a43efaf2af0fd278a23bf270994 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index ab16d0021e627e6a2a821a0185ad71eb5bef1835..f345d1d67b2ce0200c64b1aeea5f39821d070bac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 61ccbf5962791ee1c0b35cc4aba422ff5cacd456..31cb8bc177c7a9e365101e75108a29900fbda124 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index ce2320d7030d05ba1e065f5bbcf8a18014891b5e..44cccc92bd2f1ff0335c22f2967865dc88a96ff7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index 69848af8cf876ad1232a0bf7c419f52ed68af9f0..b55e191ff1ad6997550966bbb6154a81a489575d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 3358f26aebf8a8f845278d54b4524f4405751f5d..e9575436e5b14ac8c52a0b59c86937886eab5f40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index 413f45f018ae0ce9ccf0e459b24d544c456e4c7c..98223b207f2ecfd5b7af8a53390166e53a7d4f73 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9c61ff602744c00f9105a3f297151b49a8a3dead..2df918b16b2552323d75083bfa80e328c0639cfe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index baa91804c49f86a31093aed0c0a56613f7c1afee..ce5f9e21290eeddc0052257191ac4a6d068c1366 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 15a5d6ac9ea6e087dc0d76a2ab48b08448bfb6ee..a0bb917775fd9edb5d909bf850310e0596a88209 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index be43bd5b3c13632711a49cbbe6c85527d46d46ec..d7942f201bdbfa8d1577813be461a5905b5c6c90 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 6105992c7a3a92d00718fe3287412af3c752db1d..f7ac9042d46f46ab35d18c62e5d8841679a18ca9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 1b6cf1e9ecb08a789212da141971434bd63988a6..e5a92688220f6e227b317d71a70fde01df4c432b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 29488a37f8f29f953d2b8b7e447c331df3244c84..0fe2c974a762784a82a6b97e116357be2a61d84f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 3d70cf8b65972bda1f2bf4d78e126c0946a7c2a5..2ee5873f0f11688019dec3a6cd69db06d99b9caa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -149,6 +149,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index d29731ecf9d5387a324104865af5f563d287c60b..5b8f64aa35725d0ea44fc5c5b81952fd839503e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index a6d7494ca7d2230298a442b86766f46bc58a6d54..240cb6e562f77467d94ef95db2374150e318bc04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c36e802693df564702100a652f3ccc2e95e4c40d..6226c469f8a534f96f6ea991fa5e7d2cf0019e3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9c46cfe40fd6959b526d6ca271bda3182daa1188..34dabce6d8dd0b1b6fe50a008a981e1f06a77edf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 8982f787940dd65291580781b5dc95941d804071..0ddf628ace582db259ebe0b211aba6e6362b5d5b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index ec2cc502984d302b243803b04b4f9d60cee43d05..12eb35ad154a514afd9c900cb2dbece8af28c49f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index d7bc1980f32e523781a68e80312905bc355f0509..c41020c2b45cc88c9b63f3b7a45c35066794dfe2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index fec2de6b49ec1ffaf45b9ee9048bcce37425e919..479f89cf6ae93e8d6ae02e304a51a145164df7de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -93,6 +93,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 3d285e7f17db3e8cdfbacf0056a4c56ffa7e67cb..233363ce02614f184b43a059889c7475b6a8c50b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index b05e5ec84de1eb4899b6fae437dc0d4bd1ab402d..cb6228ac446bd236df88f94eb6e9e717ea38463d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 728eca415a80842291d5684e55632689ceea4099..03bad3ccb613a225ad56e128ea680fc9312151e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index da64e77c39c0e116ff725bb05526882541dd6056..158996792a47fab0e7aa26d21d4bb7f281ca76d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 2f505f9293f429490543ba2c569668f4b2ba3ca4..63a56cd3eebe271f66258c9a0acb974764555b34 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index f82c77072e6969dd57f89f4a971e59e28b4bfc63..965a4cca04651e123c5bd93484200a58b39918ba 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 54e01a99177cde5fbfaf5e1e0ac310bef3ea8eae..1a624308878a68f1b48cb0f8b5e08dafbbfa0333 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index 9d7e5bb8c7808689bedd8abb835e61c1f38fdb1d..3b4724ef104878df0caada75b0ba68740dc93f8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DenseFeatures"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DepthwiseConv2D"
     mtype: "<type \'type\'>"
@@ -240,6 +244,10 @@ tf_module {
     name: "LeakyReLU"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearModel"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LocallyConnected1D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
similarity index 72%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index a4bb3219c792708cd02a8345541d8685485c8d05..2db07df5235e150f691a12d6b332c6d0d241ac19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -1,9 +1,9 @@
-path: "tensorflow.nn.rnn_cell.GRUCell"
+path: "tensorflow.keras.metrics.Accuracy"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.GRUCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -15,10 +15,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,18 +63,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -101,12 +85,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -117,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -125,11 +113,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "compute_mask"
@@ -151,10 +139,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -191,12 +175,20 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
similarity index 72%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 88b8f37c4ff0cfaf562293c845e505f06119e227..904ad3a21a05895b23e30dab82a89a31c74dcfca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -1,9 +1,9 @@
-path: "tensorflow.nn.rnn_cell.BasicLSTMCell"
+path: "tensorflow.keras.metrics.BinaryAccuracy"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -15,10 +15,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,18 +63,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -101,12 +85,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -117,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -125,11 +113,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "compute_mask"
@@ -151,10 +139,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -191,12 +175,20 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
similarity index 70%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 715bfd5fc7c18993d4997caeefe3188ba88f741c..17b74924fab4f596a010d6b9731b474433a8153e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -1,9 +1,9 @@
-path: "tensorflow.nn.rnn_cell.LSTMCell"
+path: "tensorflow.keras.metrics.CategoricalAccuracy"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -15,10 +15,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,18 +63,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -101,12 +85,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -117,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -125,11 +113,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "compute_mask"
@@ -151,10 +139,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -191,12 +175,20 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40fe64bbd2cec45b9a8c4e9b041d3fa858af1327
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c17452292a031d42f3da0d5844e99d1272dad25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index a296e131586504a3fadc9e6fe54079ee0f8270ba..8a8fb97b96e0bce6a132b2b581ea8d066efa90ac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,25 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index dbd88939b7f8d0df502374fa8e16dd037cfeafc4..41d8b2fc950d02ace5f0efc7f790aa0a9c022f5f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -109,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 3c36f0702f8da5301b67435d0010374ffe878dfc..2cf107a5cd89805d590e0fc5a372ed3d18c914c0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -114,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 973705dae2fabbef0eafb38ad12e96c747aeee27..773c74e64d13ca4a840b7f599fc2cbe9c161cd03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index de917706d55214cc59f3205f0778d600a356a5b1..533544d21f2753f785113a30518f4fcbcff96cd7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f..e3926eb6d4714731d09ff9c5b75a89830c06e7c1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 2e085a8e289e21173789041efb9254e992bd723b..ba209df7824a9cc076499458e35acd7dcf1eaf35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 42d22bce42d8850a784afae3f67771ef1cfe5403..081fb0e08bcd1b35ab44459d1c8eb0857dd14956 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index d6749fdcec69425e83a044409ec695d2661f782e..2014a04301618c20af5cf6f1144eb4dbda2479e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index d9f363d1336210623536e8293a6290d9ebfc2fe1..9a87ae9687741090485bd8d4d0d07d359a2015e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index aac7ee31ed62c22b2e86d287d48c68c7e905fd00..33afb835ce1d524991c0024bfb87c29a72aac08e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -76,6 +76,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index c11d39082939eda4520b3955b767022bd485b5be..a9078c8ab5cca078237a29febabdbbd4a8b6c89c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 3ee800269e617390c25248a2c847cbe259b18e79..4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index 63a1bc2321e35645700778c5906d1b8659eb4a32..a87649133fd207ad59f2124c6b0b5aa44916e5a5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index e2c5a505a7d2f9abbee5b3bb4f92ee8843198c51..32656467840fbbc0c8708ea68aac5aa75c11a540 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index a1b0e06b4753488bc9fcbe9aeb0d260092745f9c..49d8890c8942bc0021886ee6c9bc4e7625452655 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index 6d849dc040f61b498b100820bf7be3d4bc264bb4..c89dc067b331603e227d9d578147e2dd1ee4a900 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index cbab7ce6314320068a40392cb7f430fd47845af5..d8259aa77571a108a1b5f2e0e5835ecc2c013bfc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -136,9 +136,13 @@ tf_module {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'adjoint_a\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "qr"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
deleted file mode 100644
index 85bb15455da624962744a0cc856e79e0a6d57d7c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
+++ /dev/null
@@ -1,83 +0,0 @@
-path: "tensorflow.logging"
-tf_module {
-  member {
-    name: "DEBUG"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "ERROR"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "FATAL"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "INFO"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "WARN"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "TaskLevelStatusMessage"
-    argspec: "args=[\'msg\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "debug"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "error"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "fatal"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_verbosity"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "info"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "log"
-    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "log_every_n"
-    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "log_first_n"
-    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "log_if"
-    argspec: "args=[\'level\', \'msg\', \'condition\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_verbosity"
-    argspec: "args=[\'v\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "vlog"
-    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "warn"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "warning"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 258ad5047eb6e82eeb9c0941b0acf0573e5ca61d..6a44e4ce66c9dfcb9912c96d0106e4f4fd9fdcff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -1,11 +1,7 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.Reduction\'>"
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "MEAN"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "NONE"
     mtype: "<type \'str\'>"
@@ -14,18 +10,10 @@ tf_class {
     name: "SUM"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "SUM_BY_NONZERO_WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "SUM_OVER_BATCH_SIZE"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "SUM_OVER_NONZERO_WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index a45c0a3259e174e1f15e86a2982fe4556f483d3b..86df97051441e858dfacee3e7c9a5b442160fa08 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -30,11 +30,11 @@ tf_module {
   }
   member_method {
     name: "argmax"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "argmin"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "asin"
@@ -102,7 +102,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'keepdims\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "cumprod"
@@ -176,6 +176,26 @@ tf_module {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "l2_normalize"
     argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
@@ -210,7 +230,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "logical_and"
@@ -296,10 +316,18 @@ tf_module {
     name: "reduce_prod"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_std"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "reduce_sum"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_variance"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -354,7 +382,7 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f8e12f8817356477fe09b9efb4e1aef8b0469ec6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9bc6a716a1d114330fce2521e238897bdae56d0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ef75d8756f8b8f50c281f12e664f9989df951d6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fe6d6fda9685e3f9f0ce29b81f260f3e41a7ef3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bce43fbdeb13591ab5a25b50a0d880702173d98
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
index e9b996c9f53e9062dcdd39ef22f99eef5175eb35..d82ce8b38a92f4b09c9e1d0a997e447eb7ac68ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
@@ -1,8 +1,24 @@
 path: "tensorflow.metrics"
 tf_module {
-  member_method {
-    name: "accuracy"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
   }
   member_method {
     name: "auc"
@@ -28,10 +44,6 @@ tf_module {
     name: "false_positives_at_thresholds"
     argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "mean_absolute_error"
     argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index 8e9994e54a991a614a26533c20ba098215f4cbc5..e550b2d75490785307a1195909e717e5501a86c0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "batch_norm_with_global_normalization"
-    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'mean\', \'variance\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "batch_normalization"
@@ -41,8 +41,8 @@ tf_module {
     argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "bidirectional_dynamic_rnn"
-    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
+    name: "collapse_repeated"
+    argspec: "args=[\'labels\', \'seq_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_accidental_hits"
@@ -50,43 +50,43 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filters\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_backprop_filter_v2"
+    name: "conv3d_backprop_filter"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
   }
   member_method {
     name: "convolution"
-    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'VALID\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "crelu"
-    argspec: "args=[\'features\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+    argspec: "args=[\'features\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
   }
   member_method {
     name: "ctc_beam_search_decoder"
@@ -98,7 +98,11 @@ tf_module {
   }
   member_method {
     name: "ctc_loss"
-    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
+    argspec: "args=[\'labels\', \'logits\', \'label_length\', \'logit_length\', \'logits_time_major\', \'unique\', \'blank_index\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ctc_unique_labels"
+    argspec: "args=[\'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "depth_to_space"
@@ -106,18 +110,14 @@ tf_module {
   }
   member_method {
     name: "depthwise_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "depthwise_conv2d_native"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native_backprop_filter"
+    name: "depthwise_conv2d_backprop_filter"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native_backprop_input"
+    name: "depthwise_conv2d_backprop_input"
     argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
@@ -128,10 +128,6 @@ tf_module {
     name: "dropout"
     argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "dynamic_rnn"
-    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
-  }
   member_method {
     name: "elu"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -154,11 +150,11 @@ tf_module {
   }
   member_method {
     name: "fractional_avg_pool"
-    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'0\', \'None\'], "
   }
   member_method {
     name: "fractional_max_pool"
-    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'0\', \'None\'], "
   }
   member_method {
     name: "fused_batch_norm"
@@ -194,7 +190,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "lrn"
@@ -226,27 +222,7 @@ tf_module {
   }
   member_method {
     name: "pool"
-    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "quantized_avg_pool"
-    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "quantized_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
-  }
-  member_method {
-    name: "quantized_max_pool"
-    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "quantized_relu_x"
-    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
-  }
-  member_method {
-    name: "raw_rnn"
-    argspec: "args=[\'cell\', \'loop_fn\', \'parallel_iterations\', \'swap_memory\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'VALID\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "relu"
@@ -274,7 +250,7 @@ tf_module {
   }
   member_method {
     name: "separable_conv2d"
-    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sigmoid"
@@ -286,7 +262,7 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
@@ -312,21 +288,13 @@ tf_module {
     name: "sparse_softmax_cross_entropy_with_logits"
     argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "static_bidirectional_rnn"
-    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "static_rnn"
-    argspec: "args=[\'cell\', \'inputs\', \'initial_state\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "static_state_saving_rnn"
     argspec: "args=[\'cell\', \'inputs\', \'state_saver\', \'state_name\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sufficient_statistics"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "tanh"
@@ -342,16 +310,12 @@ tf_module {
   }
   member_method {
     name: "weighted_moments"
-    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "with_space_to_batch"
     argspec: "args=[\'input\', \'dilation_rate\', \'padding\', \'op\', \'filter_shape\', \'spatial_dims\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "xw_plus_b"
-    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "zero_fraction"
     argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 381c4975d7d778599ce34a9023d0e46b20753cba..9e52a4252619ffc19b287fc1818fa6f772847335 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 912365a28b1277962f648b2b0655d280bca1427c..9836433d08cba809107f9bb5dbccf2e971865b8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -110,6 +110,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index faeb4f3513362919fca8f0c2ef7c491d7938cb92..d3b68e4f2976912ed65ba7916284c951fda03b05 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -105,6 +105,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index caa2e600800178e4b2d36ae263da23d0b4608dd2..1f7840ab919baeeb0077904592ba8dcc1d4c91fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
index 24767e250f96da37ff078e9bf1b9b94fe0b1ed66..b1f687f52964e20a6dfa6f81f68e61d2a67513c9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.nn.rnn_cell"
 tf_module {
-  member {
-    name: "BasicLSTMCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "DeviceWrapper"
     mtype: "<type \'type\'>"
@@ -12,22 +8,10 @@ tf_module {
     name: "DropoutWrapper"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "GRUCell"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "LSTMCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "LSTMStateTuple"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "MultiRNNCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "RNNCell"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 97bdd4d813a52238000d93de5198d9f147f54286..c91aa43671a10a75f27b8aea59b04a08a58c3a81 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -8,14 +8,6 @@ tf_module {
     name: "AttrValue"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "ConditionalAccumulator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ConditionalAccumulatorBase"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ConfigProto"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -24,14 +16,6 @@ tf_module {
     name: "DType"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "DeviceSpec"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Dimension"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Event"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -40,14 +24,6 @@ tf_module {
     name: "FIFOQueue"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "FixedLenFeature"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FixedLenSequenceFeature"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "GPUOptions"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -64,10 +40,6 @@ tf_module {
     name: "GraphDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "GraphKeys"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "GraphOptions"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -104,10 +76,6 @@ tf_module {
     name: "OptimizerOptions"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "QUANTIZED_DTYPES"
-    mtype: "<type \'frozenset\'>"
-  }
   member {
     name: "RegisterGradient"
     mtype: "<type \'type\'>"
@@ -124,10 +92,6 @@ tf_module {
     name: "SessionLog"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "SparseFeature"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
@@ -153,21 +117,17 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "TensorInfo"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+    name: "TensorShape"
+    mtype: "<type \'type\'>"
   }
   member {
-    name: "TensorShape"
+    name: "TensorSpec"
     mtype: "<type \'type\'>"
   }
   member {
     name: "UnconnectedGradients"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "VarLenFeature"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Variable"
     mtype: "<class \'tensorflow.python.ops.variables.VariableMetaclass\'>"
@@ -180,10 +140,6 @@ tf_module {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "app"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -221,7 +177,7 @@ tf_module {
     mtype: "<type \'module\'>"
   }
   member {
-    name: "distributions"
+    name: "distribute"
     mtype: "<type \'module\'>"
   }
   member {
@@ -240,6 +196,10 @@ tf_module {
     name: "estimator"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "feature_column"
     mtype: "<type \'module\'>"
@@ -316,10 +276,6 @@ tf_module {
     name: "lite"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "logging"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "losses"
     mtype: "<type \'module\'>"
@@ -348,10 +304,6 @@ tf_module {
     name: "ones_initializer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "profiler"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "pywrap_tensorflow"
     mtype: "<type \'module\'>"
@@ -396,10 +348,6 @@ tf_module {
     name: "resource"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "resource_loader"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "saved_model"
     mtype: "<type \'module\'>"
@@ -409,11 +357,11 @@ tf_module {
     mtype: "<type \'module\'>"
   }
   member {
-    name: "sparse"
+    name: "signal"
     mtype: "<type \'module\'>"
   }
   member {
-    name: "spectral"
+    name: "sparse"
     mtype: "<type \'module\'>"
   }
   member {
@@ -460,10 +408,6 @@ tf_module {
     name: "uint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "user_ops"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "variant"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -480,14 +424,6 @@ tf_module {
     name: "Assert"
     argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "NoGradient"
-    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "NotDifferentiable"
-    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "abs"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -504,37 +440,21 @@ tf_module {
     name: "add"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "add_check_numerics_ops"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "add_n"
     argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "add_to_collection"
-    argspec: "args=[\'name\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_to_collections"
-    argspec: "args=[\'names\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "arg_max"
-    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
-  }
-  member_method {
-    name: "arg_min"
-    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
-  }
   member_method {
     name: "argmax"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "argmin"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "argsort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'stable\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'False\', \'None\'], "
   }
   member_method {
     name: "as_dtype"
@@ -554,19 +474,19 @@ tf_module {
   }
   member_method {
     name: "assert_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_less"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "atan"
@@ -602,7 +522,7 @@ tf_module {
   }
   member_method {
     name: "boolean_mask"
-    argspec: "args=[\'tensor\', \'mask\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'boolean_mask\', \'None\'], "
+    argspec: "args=[\'tensor\', \'mask\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'boolean_mask\'], "
   }
   member_method {
     name: "broadcast_dynamic_shape"
@@ -650,31 +570,19 @@ tf_module {
   }
   member_method {
     name: "cond"
-    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'strict\', \'name\', \'fn1\', \'fn2\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "constant"
     argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\', \'False\'], "
   }
-  member_method {
-    name: "container"
-    argspec: "args=[\'container_name\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "control_dependencies"
     argspec: "args=[\'control_inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "convert_to_tensor"
-    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "convert_to_tensor_or_indexed_slices"
-    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "convert_to_tensor_or_sparse_tensor"
-    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'value\', \'dtype\', \'dtype_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "cos"
@@ -684,10 +592,6 @@ tf_module {
     name: "cosh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "create_partitioned_variables"
     argspec: "args=[\'shape\', \'slicing\', \'initializer\', \'dtype\', \'trainable\', \'collections\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'True\', \'None\', \'None\', \'None\'], "
@@ -702,11 +606,7 @@ tf_module {
   }
   member_method {
     name: "device"
-    argspec: "args=[\'device_name_or_function\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "div"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'device_name\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "div_no_nan"
@@ -754,7 +654,7 @@ tf_module {
   }
   member_method {
     name: "expand_dims"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "extract_volume_patches"
@@ -764,14 +664,6 @@ tf_module {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
-  member_method {
-    name: "fft2d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "fft3d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "fill"
     argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -800,6 +692,10 @@ tf_module {
     name: "foldr"
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "function"
+    argspec: "args=[\'func\', \'input_signature\', \'autograph\', \'experimental_autograph_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\'], "
+  }
   member_method {
     name: "gather"
     argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
@@ -808,18 +704,6 @@ tf_module {
     name: "gather_nd"
     argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "get_collection"
-    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_collection_ref"
-    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_default_graph"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "gradients"
     argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
@@ -860,14 +744,6 @@ tf_module {
     name: "identity_n"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "ifft2d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ifft3d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "import_graph_def"
     argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
@@ -876,10 +752,6 @@ tf_module {
     name: "init_scope"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "initialize_all_tables"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
-  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -896,10 +768,6 @@ tf_module {
     name: "linspace"
     argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "load_file_system_library"
-    argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "load_library"
     argspec: "args=[\'library_location\'], varargs=None, keywords=None, defaults=None"
@@ -932,14 +800,6 @@ tf_module {
     name: "make_ndarray"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "make_template"
-    argspec: "args=[\'name_\', \'func_\', \'create_scope_now_\', \'unique_name_\', \'custom_getter_\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "make_tensor_proto"
-    argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
-  }
   member_method {
     name: "map_fn"
     argspec: "args=[\'fn\', \'elems\', \'dtype\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\', \'True\', \'None\'], "
@@ -960,10 +820,6 @@ tf_module {
     name: "meshgrid"
     argspec: "args=[], varargs=args, keywords=kwargs, defaults=None"
   }
-  member_method {
-    name: "min_max_variable_partitioner"
-    argspec: "args=[\'max_partitions\', \'axis\', \'min_slice_size\', \'bytes_per_string_element\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'262144\', \'16\'], "
-  }
   member_method {
     name: "minimum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -972,10 +828,6 @@ tf_module {
     name: "mod"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -984,6 +836,10 @@ tf_module {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "no_gradient"
+    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "no_op"
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -994,7 +850,7 @@ tf_module {
   }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "not_equal"
@@ -1010,40 +866,16 @@ tf_module {
   }
   member_method {
     name: "ones_like"
-    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "op_scope"
-    argspec: "args=[\'values\', \'name\', \'default_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "pad"
-    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\', \'constant_values\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\', \'0\'], "
+    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'constant_values\', \'name\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'0\', \'None\'], "
   }
   member_method {
     name: "parallel_stack"
     argspec: "args=[\'values\', \'name\'], varargs=None, keywords=None, defaults=[\'parallel_stack\'], "
   }
-  member_method {
-    name: "parse_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "parse_single_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "parse_single_sequence_example"
-    argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "placeholder"
-    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "placeholder_with_default"
-    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "pow"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1053,28 +885,8 @@ tf_module {
     argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None"
   }
   member_method {
-    name: "py_func"
-    argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "quantize_v2"
-    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'name\', \'round_mode\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'None\', \'HALF_AWAY_FROM_ZERO\'], "
-  }
-  member_method {
-    name: "random_crop"
-    argspec: "args=[\'value\', \'size\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_normal"
-    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_shuffle"
-    argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_uniform"
-    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+    name: "py_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "range"
@@ -1142,7 +954,7 @@ tf_module {
   }
   member_method {
     name: "reverse_sequence"
-    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "roll"
@@ -1192,21 +1004,9 @@ tf_module {
     name: "sequence_mask"
     argspec: "args=[\'lengths\', \'maxlen\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'bool\'>\", \'None\'], "
   }
-  member_method {
-    name: "serialize_tensor"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_random_seed"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "setdiff1d"
-    argspec: "args=[\'x\', \'y\', \'index_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
-  }
   member_method {
     name: "shape"
-    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "shape_n"
@@ -1230,43 +1030,23 @@ tf_module {
   }
   member_method {
     name: "size"
-    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "slice"
     argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
+  }
   member_method {
     name: "space_to_batch_nd"
     argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "sparse_concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_matmul"
-    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_split"
-    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'expand_nonconcat_dim\', \'concat_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_to_dense"
@@ -1312,10 +1092,6 @@ tf_module {
     name: "subtract"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "tables_initializer"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
-  }
   member_method {
     name: "tan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1336,37 +1112,9 @@ tf_module {
     name: "timestamp"
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "to_bfloat16"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToBFloat16\'], "
-  }
-  member_method {
-    name: "to_complex128"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToComplex128\'], "
-  }
-  member_method {
-    name: "to_complex64"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToComplex64\'], "
-  }
-  member_method {
-    name: "to_double"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToDouble\'], "
-  }
-  member_method {
-    name: "to_float"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToFloat\'], "
-  }
-  member_method {
-    name: "to_int32"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt32\'], "
-  }
-  member_method {
-    name: "to_int64"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt64\'], "
-  }
   member_method {
     name: "transpose"
-    argspec: "args=[\'a\', \'perm\', \'name\', \'conjugate\'], varargs=None, keywords=None, defaults=[\'None\', \'transpose\', \'False\'], "
+    argspec: "args=[\'a\', \'perm\', \'conjugate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'transpose\'], "
   }
   member_method {
     name: "truediv"
@@ -1386,7 +1134,7 @@ tf_module {
   }
   member_method {
     name: "tuple"
-    argspec: "args=[\'tensors\', \'name\', \'control_inputs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'tensors\', \'control_inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "unique"
@@ -1418,7 +1166,7 @@ tf_module {
   }
   member_method {
     name: "while_loop"
-    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\', \'return_same_structure\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'maximum_iterations\', \'return_same_structure\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "zeros"
@@ -1426,6 +1174,6 @@ tf_module {
   }
   member_method {
     name: "zeros_like"
-    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+    argspec: "args=[\'input\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
deleted file mode 100644
index e09c44cc9ce71305692740ba2d63b0940b2e0573..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.profiler.AdviceProto.Checker"
-tf_proto {
-  descriptor {
-    name: "Checker"
-    field {
-      name: "reports"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
deleted file mode 100644
index 87462435496fd2eedeb0bc8d92e8a833671b6531..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.profiler.AdviceProto.CheckersEntry"
-tf_proto {
-  descriptor {
-    name: "CheckersEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.AdviceProto.Checker"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
deleted file mode 100644
index a8a8858ccd5af3fb3dac612eef44e5cb450df914..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
+++ /dev/null
@@ -1,41 +0,0 @@
-path: "tensorflow.profiler.AdviceProto"
-tf_proto {
-  descriptor {
-    name: "AdviceProto"
-    field {
-      name: "checkers"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.AdviceProto.CheckersEntry"
-    }
-    nested_type {
-      name: "CheckersEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.tfprof.AdviceProto.Checker"
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Checker"
-      field {
-        name: "reports"
-        number: 2
-        label: LABEL_REPEATED
-        type: TYPE_STRING
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
deleted file mode 100644
index afec73f537aadd5d1a274db8d57e37b8c6fa3e74..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.profiler.GraphNodeProto.InputShapesEntry"
-tf_proto {
-  descriptor {
-    name: "InputShapesEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
deleted file mode 100644
index 3c83177005323a277f929d8c769cd7b1eeff4d51..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
+++ /dev/null
@@ -1,191 +0,0 @@
-path: "tensorflow.profiler.GraphNodeProto"
-tf_proto {
-  descriptor {
-    name: "GraphNodeProto"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensor_value"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.TFProfTensorProto"
-    }
-    field {
-      name: "run_count"
-      number: 21
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "exec_micros"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "accelerator_exec_micros"
-      number: 17
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "cpu_exec_micros"
-      number: 18
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "requested_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "peak_bytes"
-      number: 24
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "residual_bytes"
-      number: 25
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "output_bytes"
-      number: 26
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "parameters"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "float_ops"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "devices"
-      number: 10
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "total_definition_count"
-      number: 23
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_run_count"
-      number: 22
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_exec_micros"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_accelerator_exec_micros"
-      number: 19
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_cpu_exec_micros"
-      number: 20
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_requested_bytes"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_peak_bytes"
-      number: 27
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_residual_bytes"
-      number: 28
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_output_bytes"
-      number: 29
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_parameters"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_float_ops"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "shapes"
-      number: 11
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    field {
-      name: "input_shapes"
-      number: 16
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto.InputShapesEntry"
-    }
-    field {
-      name: "children"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto"
-    }
-    nested_type {
-      name: "InputShapesEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorShapeProto"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
deleted file mode 100644
index 2b08a05437f90b91160fc08e670b2466ae163149..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
+++ /dev/null
@@ -1,134 +0,0 @@
-path: "tensorflow.profiler.MultiGraphNodeProto"
-tf_proto {
-  descriptor {
-    name: "MultiGraphNodeProto"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "exec_micros"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "accelerator_exec_micros"
-      number: 12
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "cpu_exec_micros"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "requested_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "peak_bytes"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "residual_bytes"
-      number: 17
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "output_bytes"
-      number: 18
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "parameters"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "float_ops"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_exec_micros"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_accelerator_exec_micros"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_cpu_exec_micros"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_requested_bytes"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_peak_bytes"
-      number: 19
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_residual_bytes"
-      number: 20
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_output_bytes"
-      number: 21
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_parameters"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_float_ops"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "graph_nodes"
-      number: 10
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto"
-    }
-    field {
-      name: "children"
-      number: 11
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.MultiGraphNodeProto"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
deleted file mode 100644
index b3adc50c7e14152a81a148df9deccc5272189aad..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.profiler.OpLogProto.IdToStringEntry"
-tf_proto {
-  descriptor {
-    name: "IdToStringEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
deleted file mode 100644
index 7510c566ba574e9370f5e54c29023ef4fb5ee804..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.profiler.OpLogProto"
-tf_proto {
-  descriptor {
-    name: "OpLogProto"
-    field {
-      name: "log_entries"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.OpLogEntry"
-    }
-    field {
-      name: "id_to_string"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.OpLogProto.IdToStringEntry"
-    }
-    nested_type {
-      name: "IdToStringEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
deleted file mode 100644
index 19ff38a3900c2d358faaa40e7316cc3a9da73040..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
+++ /dev/null
@@ -1,93 +0,0 @@
-path: "tensorflow.profiler.ProfileOptionBuilder"
-tf_class {
-  is_instance: "<class \'tensorflow.python.profiler.option_builder.ProfileOptionBuilder\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "account_displayed_op_only"
-    argspec: "args=[\'self\', \'is_true\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "float_operation"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "order_by"
-    argspec: "args=[\'self\', \'attribute\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "select"
-    argspec: "args=[\'self\', \'attributes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "time_and_memory"
-    argspec: "args=[\'min_micros\', \'min_bytes\', \'min_accelerator_micros\', \'min_cpu_micros\', \'min_peak_bytes\', \'min_residual_bytes\', \'min_output_bytes\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'0\', \'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "trainable_variables_parameter"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_accounted_types"
-    argspec: "args=[\'self\', \'account_type_regexes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_empty_output"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_file_output"
-    argspec: "args=[\'self\', \'outfile\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_max_depth"
-    argspec: "args=[\'self\', \'max_depth\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_execution_time"
-    argspec: "args=[\'self\', \'min_micros\', \'min_accelerator_micros\', \'min_cpu_micros\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "with_min_float_operations"
-    argspec: "args=[\'self\', \'min_float_ops\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_memory"
-    argspec: "args=[\'self\', \'min_bytes\', \'min_peak_bytes\', \'min_residual_bytes\', \'min_output_bytes\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "with_min_occurrence"
-    argspec: "args=[\'self\', \'min_occurrence\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_parameters"
-    argspec: "args=[\'self\', \'min_params\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_node_names"
-    argspec: "args=[\'self\', \'start_name_regexes\', \'show_name_regexes\', \'hide_name_regexes\', \'trim_name_regexes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "with_pprof_output"
-    argspec: "args=[\'self\', \'pprof_file\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_stdout_output"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_step"
-    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_timeline_output"
-    argspec: "args=[\'self\', \'timeline_file\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
deleted file mode 100644
index acb61dae9f0d184ba998aa820ec40de5bc38c3eb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
+++ /dev/null
@@ -1,37 +0,0 @@
-path: "tensorflow.profiler.Profiler"
-tf_class {
-  is_instance: "<class \'tensorflow.python.profiler.model_analyzer.Profiler\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'graph\', \'op_log\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_step"
-    argspec: "args=[\'self\', \'step\', \'run_meta\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "advise"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_graph"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_name_scope"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_operations"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_python"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "serialize_to_string"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
deleted file mode 100644
index 7b4d3ac522abc4229c5623da25c4ec818d86f829..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-path: "tensorflow.profiler"
-tf_module {
-  member {
-    name: "AdviceProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GraphNodeProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "MultiGraphNodeProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "OpLogProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "ProfileOptionBuilder"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Profiler"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "advise"
-    argspec: "args=[\'graph\', \'run_meta\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
-  }
-  member_method {
-    name: "profile"
-    argspec: "args=[\'graph\', \'run_meta\', \'op_log\', \'cmd\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'scope\', \'0\'], "
-  }
-  member_method {
-    name: "write_op_log"
-    argspec: "args=[\'graph\', \'log_dir\', \'op_log\', \'run_meta\', \'add_trace\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
index 160c09798d02653ba0c090db53124450b956ef05..de5cb6b7172af32e3e246798c8d748c272dae097 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
@@ -1,20 +1,24 @@
 path: "tensorflow.random"
 tf_module {
   member_method {
-    name: "gamma"
-    argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "get_seed"
-    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
+    name: "categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "log_uniform_candidate_sampler"
-    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "gamma"
+    argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
-    name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    name: "log_uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "normal"
@@ -22,10 +26,10 @@ tf_module {
   }
   member_method {
     name: "poisson"
-    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
+    argspec: "args=[\'shape\', \'lam\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
-    name: "set_random_seed"
+    name: "set_seed"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
@@ -33,8 +37,8 @@ tf_module {
     argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "stateless_multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    name: "stateless_categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "stateless_normal"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt
deleted file mode 100644
index 288b78b4cd0ad3f5d5bc1f9c773977d50a6db086..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.resource_loader"
-tf_module {
-  member_method {
-    name: "get_data_files_path"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_path_to_datafile"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_root_dir_with_all_resources"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load_resource"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readahead_file_path"
-    argspec: "args=[\'path\', \'readahead\'], varargs=None, keywords=None, defaults=[\'128M\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index 6b86d4b49f5ea6169fdb1107378e78cd4c2018ef..d946c666c2fb91e63ddc6dcd1ee2d2d3ab4c46ea 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -32,10 +32,6 @@ tf_module {
     name: "GPU"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "LEGACY_INIT_OP_KEY"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "MAIN_OP_KEY"
     mtype: "<type \'str\'>"
@@ -105,12 +101,12 @@ tf_module {
     argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "is_valid_signature"
-    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
+    name: "contains_saved_model"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "maybe_saved_model_directory"
-    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+    name: "is_valid_signature"
+    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "predict_signature_def"
@@ -120,4 +116,8 @@ tf_module {
     name: "regression_signature_def"
     argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
index 8a196b1a556e283671cc75af28df3eaa62532975..900d08ff47ca062fdda4f0f2f6ac20ee9822d1df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
@@ -1,19 +1,19 @@
 path: "tensorflow.sets"
 tf_module {
   member_method {
-    name: "set_difference"
+    name: "difference"
     argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
   }
   member_method {
-    name: "set_intersection"
+    name: "intersection"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "set_size"
+    name: "size"
     argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "set_union"
+    name: "union"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea717b4d719d6709e05182faca964ae544abc39c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
@@ -0,0 +1,95 @@
+path: "tensorflow.signal"
+tf_module {
+  member_method {
+    name: "dct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "frame"
+    argspec: "args=[\'signal\', \'frame_length\', \'frame_step\', \'pad_end\', \'pad_value\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "hamming_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "hann_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "idct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ifft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "inverse_stft"
+    argspec: "args=[\'stfts\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "inverse_stft_window_fn"
+    argspec: "args=[\'frame_step\', \'forward_window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "irfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "linear_to_mel_weight_matrix"
+    argspec: "args=[\'num_mel_bins\', \'num_spectrogram_bins\', \'sample_rate\', \'lower_edge_hertz\', \'upper_edge_hertz\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'20\', \'129\', \'8000\', \'125.0\', \'3800.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "mfccs_from_log_mel_spectrograms"
+    argspec: "args=[\'log_mel_spectrograms\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "overlap_and_add"
+    argspec: "args=[\'signal\', \'frame_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "stft"
+    argspec: "args=[\'signals\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'pad_end\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'False\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index 32bd8d5f8edb24ee1f5a5672487499337bd1c0dd..4ad94568b24240454da8e0f349173af25ffd4859 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -10,11 +10,11 @@ tf_module {
   }
   member_method {
     name: "add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'expand_nonconcat_dim\', \'concat_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "cross"
@@ -40,41 +40,21 @@ tf_module {
     name: "mask"
     argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "matmul"
-    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
-  }
   member_method {
     name: "maximum"
     argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "merge"
-    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
-  }
   member_method {
     name: "minimum"
     argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "placeholder"
-    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'output_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'output_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reorder"
@@ -94,15 +74,15 @@ tf_module {
   }
   member_method {
     name: "segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "slice"
@@ -112,9 +92,13 @@ tf_module {
     name: "softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "split"
-    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'num_split\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "to_dense"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 03144cbe709fe59afc3a818ea7c157ace72b713d..16b7f14ab2b38bb42bda840923994b8a99fc5779 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -10,7 +10,7 @@ tf_module {
   }
   member_method {
     name: "length"
-    argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
+    argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
     name: "reduce_join"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6715c14e168d6a30ce8aa35470525521069de40a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.summary.SummaryWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.summary_ops_v2.SummaryWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'resource\', \'init_op_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "init"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 7ed9cd77a01c2eadb5ea43a02306d60d505127a0..26c979c0c620ab2f4afbbbb1d95324ab4436f118 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -24,44 +24,32 @@ tf_module {
     name: "SummaryDescription"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "SummaryWriter"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TaggedRunMetadata"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
   member_method {
-    name: "audio"
-    argspec: "args=[\'name\', \'tensor\', \'sample_rate\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_summary_description"
-    argspec: "args=[\'node_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "histogram"
-    argspec: "args=[\'name\', \'values\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "image"
-    argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "merge"
-    argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "create_file_writer"
+    argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "merge_all"
-    argspec: "args=[\'key\', \'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\', \'None\'], "
+    name: "flush"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "scalar"
-    argspec: "args=[\'name\', \'tensor\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "import_event"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "tensor_summary"
-    argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\', \'summary_metadata\', \'family\', \'display_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    name: "record_summaries"
+    argspec: "args=[\'boolean\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "text"
-    argspec: "args=[\'name\', \'tensor\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "should_record_summaries"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index dee536cd830ed12c941c1cc0e37d60082ac0c834..72ce7330445a9e9b94402cf06438c4284676d9dd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -12,34 +12,18 @@ tf_module {
     name: "TestCase"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "mock"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "assert_equal_graph_def"
-    argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'actual\', \'expected\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "benchmark_config"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "compute_gradient"
-    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradient_error"
-    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\'], "
-  }
   member_method {
     name: "create_local_cluster"
     argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "get_temp_dir"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "gpu_device_name"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -56,8 +40,4 @@ tf_module {
     name: "main"
     argspec: "args=[\'argv\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "test_src_dir_path"
-    argspec: "args=[\'relative_path\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2538de661b357245ad18d9e1c4fc88d2e80eaeb0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.train.CheckpointManager"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpoint_management.CheckpointManager\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "checkpoints"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latest_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'checkpoint_number\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
deleted file mode 100644
index 4df6c4156a8bfe6d3bc0fb6746512cb3025c2604..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.ProfilerHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt
deleted file mode 100644
index 2c0fda3c72b7e1f02265827b9dc1929500935cd1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ /dev/null
@@ -1,63 +0,0 @@
-path: "tensorflow.train.SyncReplicasOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'opt\', \'replicas_to_aggregate\', \'total_num_replicas\', \'variable_averages\', \'variables_to_average\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'sync_replicas\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "get_chief_queue_runner"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_init_tokens_op"
-    argspec: "args=[\'self\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "make_session_run_hook"
-    argspec: "args=[\'self\', \'is_chief\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index c2dc4140e8ebe184013b1ed7f4b5c51c85721c8e..89d9270b276b9ac21f5f0de2c6a114bce287cef7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -24,6 +24,10 @@ tf_module {
     name: "Checkpoint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointManager"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CheckpointSaverHook"
     mtype: "<type \'type\'>"
@@ -132,10 +136,6 @@ tf_module {
     name: "Optimizer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "ProfilerHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ProximalAdagradOptimizer"
     mtype: "<type \'type\'>"
@@ -212,10 +212,6 @@ tf_module {
     name: "Supervisor"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SyncReplicasOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "VocabInfo"
     mtype: "<type \'type\'>"
@@ -224,26 +220,6 @@ tf_module {
     name: "WorkerSessionCreator"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "MonitoredTrainingSession"
-    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\', \'save_checkpoint_steps\', \'summary_dir\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'<object object instance>\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\', \'<object object instance>\', \'None\'], "
-  }
-  member_method {
-    name: "NewCheckpointReader"
-    argspec: "args=[\'filepattern\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "assert_global_step"
-    argspec: "args=[\'global_step_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "basic_train_loop"
-    argspec: "args=[\'supervisor\', \'train_step_fn\', \'args\', \'kwargs\', \'master\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\'], "
-  }
-  member_method {
-    name: "checkpoint_exists"
-    argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
@@ -252,42 +228,14 @@ tf_module {
     name: "cosine_decay_restarts"
     argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
   }
-  member_method {
-    name: "create_global_step"
-    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "exponential_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
-  member_method {
-    name: "generate_checkpoint_state_proto"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_checkpoint_mtimes"
-    argspec: "args=[\'checkpoint_prefixes\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_checkpoint_state"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "get_global_step"
-    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_or_create_global_step"
-    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "global_step"
-    argspec: "args=[\'sess\', \'global_step_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "init_from_checkpoint"
-    argspec: "args=[\'ckpt_dir_or_file\', \'assignment_map\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "inverse_time_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -328,14 +276,6 @@ tf_module {
     name: "polynomial_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
   }
-  member_method {
-    name: "remove_checkpoint"
-    argspec: "args=[\'checkpoint_prefix\', \'checkpoint_format_version\', \'meta_graph_suffix\'], varargs=None, keywords=None, defaults=[\'2\', \'meta\'], "
-  }
-  member_method {
-    name: "replica_device_setter"
-    argspec: "args=[\'ps_tasks\', \'ps_device\', \'worker_device\', \'merge_devices\', \'cluster\', \'ps_ops\', \'ps_strategy\'], varargs=None, keywords=None, defaults=[\'0\', \'/job:ps\', \'/job:worker\', \'True\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "sdca_fprint"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -352,16 +292,8 @@ tf_module {
     name: "summary_iterator"
     argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_checkpoint_state"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "warm_start"
     argspec: "args=[\'ckpt_to_initialize_from\', \'vars_to_warm_start\', \'var_name_to_vocab_info\', \'var_name_to_prev_var_name\'], varargs=None, keywords=None, defaults=[\'.*\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "write_graph"
-    argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], "
-  }
 }
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index fb489ea80fbdad0612f5ae0af9d91fa0df534115..b0f3742af1a9ab03512aee521e927eb71f2810b0 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -33,7 +33,7 @@ import re
 import sys
 
 import tensorflow as tf
-from tensorflow._api import v2 as tf_v2
+from tensorflow._api.v2 import v2 as tf_v2
 
 from google.protobuf import message
 from google.protobuf import text_format
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
new file mode 100644
index 0000000000000000000000000000000000000000..85b9d943131749b446db8e4cba50c7557abd8933
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -0,0 +1,75 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 \
+#       --tag "gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04
+
+FROM ubuntu:14.04
+LABEL maintainer="Manuel Klimek <klimek@google.com>"
+
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
+    rm -rf /var/lib/apt/lists/* && \
+    NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
+    NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
+    apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +2 > cudasign.pub && \
+    echo "$NVIDIA_GPGKEY_SUM  cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
+
+ENV CUDA_VERSION 10.0.130
+ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
+ENV CUDNN_VERSION 7.3.1.20
+ENV NCCL_VERSION 2.3.5
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV PATH /usr/local/cuda/bin:${PATH}
+
+# TODO(b/110903506): /usr/loca/cuda/lib64/stubs should not be needed in
+# LD_LIBRARY_PATH. The stubs/libcuda.so is not meant to used at runtime. The
+# correct way to pass the path to bfd-ld is to pass
+# -Wl,-rpath-link=/usr/local/cuda/lib64/stubs to all binaries transitively
+# depending on libcuda. Optimally, builds targeting cuda would do that
+# internally.
+ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
+
+LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cuda-command-line-tools-$CUDA_PKG_VERSION \
+        cuda-compat-10-0=410.48-1 \
+        cuda-cudart-$CUDA_PKG_VERSION \
+        cuda-libraries-$CUDA_PKG_VERSION \
+        cuda-libraries-dev-$CUDA_PKG_VERSION \
+        cuda-minimal-build-$CUDA_PKG_VERSION \
+        cuda-nvml-dev-$CUDA_PKG_VERSION \
+        cuda-nvtx-$CUDA_PKG_VERSION \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
+        libnccl2=$NCCL_VERSION-2+cuda10.0 \
+        libnccl-dev=$NCCL_VERSION-2+cuda10.0 && \
+    ln -s cuda-10.0 /usr/local/cuda && \
+    apt-mark hold libcudnn7 && \
+    apt-mark hold libnccl2 && \
+    rm -rf /var/lib/apt/lists/*
+
+# TODO(b/110903506): Provide a link to the SONAME of libcuda.so.
+# https://github.com/NVIDIA/nvidia-docker/issues/775
+RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
+# libnccl is resolved, delete this block.
+RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
+ && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_golang.sh
+
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index dd8d705331a58af8ec8cd4474bbedce47bba727f..eb6ca7c8f0fe27bd8bb9e5b11cf14e98ad67e530 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -6,7 +6,7 @@
 # TODO(klimek): Include clang in this image so we can also target clang
 # builds.
 
-FROM ubuntu:14.04
+FROM gcr.io/clang-docker-builder/clang-ubuntu14_04
 LABEL maintainer="Manuel Klimek <klimek@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
@@ -71,6 +71,15 @@ RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
  && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
 
+# Install a newer version of libstdc++, as new clang versions do not work
+# with the stock ubuntu 14.04 libstdc++.
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
+    apt-get update && \
+    apt-get install -y libstdc++-7-dev && \
+    rm -rf /var/lib/apt/lists/*
+
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 9b3ff0cba7dcacc0f68a417299c31f7a0f413430..44abcc309b9ff238059d6f298c42c7edb3fecd32 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -55,6 +55,7 @@ function build_libtensorflow_tarball() {
   export CC_OPT_FLAGS='-mavx'
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
+    export TF_NEED_ROCM=0
   fi
   bazel clean --expunge
   yes "" | ./configure
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 5f619c4e62ab76047966316c227c1ca9e7a10ba7..f46e36bf321b38dab2f2ebe9574725f29f7c755e 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -14,6 +14,18 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_test(
+    name = "ast_edits_test",
+    srcs = ["ast_edits_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ast_edits",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "@six_archive//:six",
+    ],
+)
+
 py_binary(
     name = "tf_upgrade",
     srcs = ["tf_upgrade.py"],
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index a5b9fbdae8be9ac0607ef387e79a2f0a42d6ab1c..56c67b8356524e9169f59d24af6d455e2cd82706 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -34,7 +34,7 @@ class APIChangeSpec(object):
 
   * `function_keyword_renames`: maps function names to a map of old -> new
     argument names
-  * `function_renames`: maps function names to new function names
+  * `symbol_renames`: maps function names to new function names
   * `change_to_function`: a set of function names that have changed (for
     notifications)
   * `function_reorders`: maps functions whose argument order has changed to the
@@ -176,9 +176,9 @@ class _ASTCallVisitor(ast.NodeVisitor):
     ast.NodeVisitor.generic_visit(self, node)
 
   def _rename_functions(self, node, full_name):
-    function_renames = self._api_change_spec.function_renames
+    symbol_renames = self._api_change_spec.symbol_renames
     try:
-      new_name = function_renames[full_name]
+      new_name = symbol_renames[full_name]
       self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
                           node.lineno, node.col_offset, full_name, new_name)
     except KeyError:
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..08f4ae3fccf53dddf4e49a8ce8c31c5142d75e28
--- /dev/null
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -0,0 +1,396 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ast_edits which is used in tf upgraders.
+
+All of the tests assume that we want to change from an API containing
+
+    def f(a, b, kw1, kw2): ...
+    def g(a, b, kw1, c, kw1_alias): ...
+    def g2(a, b, kw1, c, d, kw1_alias): ...
+    def h(a, kw1, kw2, kw1_alias, kw2_alias): ...
+
+and the changes to the API consist of renaming, reordering, and/or removing
+arguments. Thus, we want to be able to generate changes to produce each of the
+following new APIs:
+
+    def f(a, b, kw1, kw3): ...
+    def f(a, b, kw2, kw1): ...
+    def f(a, b, kw3, kw1): ...
+    def g(a, b, kw1, c): ...
+    def g(a, b, c, kw1): ...
+    def g2(a, b, kw1, c, d): ...
+    def g2(a, b, c, d, kw1): ...
+    def h(a, kw1, kw2): ...
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import six
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
+
+
+class NoUpdateSpec(ast_edits.APIChangeSpec):
+  """A specification of an API change which doesn't change anything."""
+
+  def __init__(self):
+    self.function_handle = {}
+    self.function_reorders = {}
+    self.function_keyword_renames = {}
+
+
+class RenameKeywordSpec(NoUpdateSpec):
+  """A specification where kw2 gets renamed to kw3.
+
+  The new API is
+
+    def f(a, b, kw1, kw3): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.update_renames()
+
+  def update_renames(self):
+    self.function_keyword_renames["f"] = {"kw2": "kw3"}
+
+
+class ReorderKeywordSpec(NoUpdateSpec):
+  """A specification where kw2 gets moved in front of kw1.
+
+  The new API is
+
+    def f(a, b, kw2, kw1): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.update_reorders()
+
+  def update_reorders(self):
+    # Note that these should be in the old order.
+    self.function_reorders["f"] = ["a", "b", "kw1", "kw2"]
+
+
+class ReorderAndRenameKeywordSpec(ReorderKeywordSpec, RenameKeywordSpec):
+  """A specification where kw2 gets moved in front of kw1 and is changed to kw3.
+
+  The new API is
+
+    def f(a, b, kw3, kw1): ...
+
+  """
+
+  def __init__(self):
+    ReorderKeywordSpec.__init__(self)
+    RenameKeywordSpec.__init__(self)
+    self.update_renames()
+    self.update_reorders()
+
+
+class RemoveDeprecatedAliasKeyword(NoUpdateSpec):
+  """A specification where kw1_alias is removed in g.
+
+  The new API is
+
+    def g(a, b, kw1, c): ...
+    def g2(a, b, kw1, c, d): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.function_keyword_renames["g"] = {"kw1_alias": "kw1"}
+    self.function_keyword_renames["g2"] = {"kw1_alias": "kw1"}
+
+
+class RemoveDeprecatedAliasAndReorderRest(RemoveDeprecatedAliasKeyword):
+  """A specification where kw1_alias is removed in g.
+
+  The new API is
+
+    def g(a, b, c, kw1): ...
+    def g2(a, b, c, d, kw1): ...
+
+  """
+
+  def __init__(self):
+    RemoveDeprecatedAliasKeyword.__init__(self)
+    # Note that these should be in the old order.
+    self.function_reorders["g"] = ["a", "b", "kw1", "c"]
+    self.function_reorders["g2"] = ["a", "b", "kw1", "c", "d"]
+
+
+class RemoveMultipleKeywordArguments(NoUpdateSpec):
+  """A specification where both keyword aliases are removed from h.
+
+  The new API is
+
+    def h(a, kw1, kw2): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.function_keyword_renames["h"] = {
+        "kw1_alias": "kw1",
+        "kw2_alias": "kw2",
+    }
+
+
+class TestAstEdits(test_util.TensorFlowTestCase):
+
+  def _upgrade(self, spec, old_file_text):
+    in_file = six.StringIO(old_file_text)
+    out_file = six.StringIO()
+    upgrader = ast_edits.ASTCodeUpgrader(spec)
+    count, report, errors = (
+        upgrader.process_opened_file("test.py", in_file,
+                                     "test_out.py", out_file))
+    return (count, report, errors), out_file.getvalue()
+
+  def testNoTransformIfNothingIsSupplied(self):
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    _, new_text = self._upgrade(NoUpdateSpec(), text)
+    self.assertEqual(new_text, text)
+
+    text = "f(a, b, c, d)\n"
+    _, new_text = self._upgrade(NoUpdateSpec(), text)
+    self.assertEqual(new_text, text)
+
+  def testKeywordRename(self):
+    """Test that we get the expected result if renaming kw2 to kw3."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    expected = "f(a, b, kw1=c, kw3=d)\n"
+    _, new_text = self._upgrade(RenameKeywordSpec(), text)
+    self.assertEqual(new_text, expected)
+
+    # No keywords specified, no reordering, so we should get input as output
+    text = "f(a, b, c, d)\n"
+    _, new_text = self._upgrade(RenameKeywordSpec(), text)
+    self.assertEqual(new_text, text)
+
+  def testKeywordReorder(self):
+    """Test that we get the expected result if kw2 is now before kw1."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    acceptable_outputs = [
+        # No change is a valid output
+        text,
+        # Just reordering the kw.. args is also ok
+        "f(a, b, kw2=d, kw1=c)\n",
+        # Also cases where all arguments are fully specified are allowed
+        "f(a=a, b=b, kw1=c, kw2=d)\n",
+        "f(a=a, b=b, kw2=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "f(a, b, c, d)\n"
+    acceptable_outputs = [
+        "f(a, b, d, c)\n",
+        "f(a=a, b=b, kw1=c, kw2=d)\n",
+        "f(a=a, b=b, kw2=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testKeywordReorderAndRename(self):
+    """Test that we get the expected result if kw2 is renamed and moved."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    acceptable_outputs = [
+        "f(a, b, kw3=d, kw1=c)\n",
+        "f(a=a, b=b, kw1=c, kw3=d)\n",
+        "f(a=a, b=b, kw3=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderAndRenameKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "f(a, b, c, d)\n"
+    acceptable_outputs = [
+        "f(a, b, d, c)\n",
+        "f(a=a, b=b, kw1=c, kw3=d)\n",
+        "f(a=a, b=b, kw3=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderAndRenameKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAlias(self):
+    """Test that we get the expected result if a keyword alias is removed."""
+    text = "g(a, b, kw1=x, c=c)\n"
+    acceptable_outputs = [
+        # Not using deprecated alias, so original is ok
+        text,
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # No keyword used, should be no change
+    text = "g(a, b, x, c)\n"
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertEqual(new_text, text)
+
+    # If we used the alias, it should get renamed
+    text = "g(a, b, kw1_alias=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed even if it's last
+    text = "g(a, b, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAndReorder(self):
+    """Test for when a keyword alias is removed and args are reordered."""
+    text = "g(a, b, kw1=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "g(a, b, x, c)\n"
+    # Don't accept an output which doesn't reorder c and d
+    acceptable_outputs = [
+        "g(a, b, c, x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # If we used the alias, it should get renamed
+    text = "g(a, b, kw1_alias=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed and reordered even if it's last
+    text = "g(a, b, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAndReorder2(self):
+    """Same as testRemoveDeprecatedKeywordAndReorder but on g2 (more args)."""
+    text = "g2(a, b, kw1=x, c=c, d=d)\n"
+    acceptable_outputs = [
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "g2(a, b, x, c, d)\n"
+    # Don't accept an output which doesn't reorder c and d
+    acceptable_outputs = [
+        "g2(a, b, c, d, x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # If we used the alias, it should get renamed
+    text = "g2(a, b, kw1_alias=x, c=c, d=d)\n"
+    acceptable_outputs = [
+        "g2(a, b, kw1=x, c=c, d=d)\n",
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+        "g2(a=a, b=b, c=c, d=d, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed and reordered even if it's not in order
+    text = "g2(a, b, d=d, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g2(a, b, kw1=x, c=c, d=d)\n",
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a, b, d=d, c=c, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+        "g2(a=a, b=b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, d=d, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveMultipleKeywords(self):
+    """Remove multiple keywords at once."""
+    # Not using deprecated keywords -> no rename
+    text = "h(a, kw1=x, kw2=y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertEqual(new_text, text)
+
+    # Using positional arguments (in proper order) -> no change
+    text = "h(a, x, y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertEqual(new_text, text)
+
+    # Use only the old names, in order
+    text = "h(a, kw1_alias=x, kw2_alias=y)\n"
+    acceptable_outputs = [
+        "h(a, x, y)\n",
+        "h(a, kw1=x, kw2=y)\n",
+        "h(a=a, kw1=x, kw2=y)\n",
+        "h(a, kw2=y, kw1=x)\n",
+        "h(a=a, kw2=y, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Use only the old names, in reverse order, should give one of same outputs
+    text = "h(a, kw2_alias=y, kw1_alias=x)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Mix old and new names
+    text = "h(a, kw1=x, kw2_alias=y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index 260278878fa8d920e63e7111c46e3f046c01fdb4..088611dc6cb14a382a42ae3187b6d09bddcbaee0 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -25,50 +25,131 @@ from __future__ import division
 from __future__ import print_function
 
 renames = {
+    'tf.AUTO_REUSE': 'tf.compat.v1.AUTO_REUSE',
+    'tf.COMPILER_VERSION': 'tf.version.COMPILER_VERSION',
+    'tf.CXX11_ABI_FLAG': 'tf.sysconfig.CXX11_ABI_FLAG',
+    'tf.ConditionalAccumulator': 'tf.compat.v1.ConditionalAccumulator',
+    'tf.ConditionalAccumulatorBase': 'tf.compat.v1.ConditionalAccumulatorBase',
+    'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
+    'tf.Dimension': 'tf.compat.v1.Dimension',
+    'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
+    'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
+    'tf.FixedLengthRecordReader': 'tf.compat.v1.FixedLengthRecordReader',
+    'tf.GIT_VERSION': 'tf.version.GIT_VERSION',
+    'tf.GRAPH_DEF_VERSION': 'tf.version.GRAPH_DEF_VERSION',
+    'tf.GRAPH_DEF_VERSION_MIN_CONSUMER': 'tf.version.GRAPH_DEF_VERSION_MIN_CONSUMER',
+    'tf.GRAPH_DEF_VERSION_MIN_PRODUCER': 'tf.version.GRAPH_DEF_VERSION_MIN_PRODUCER',
+    'tf.GraphKeys': 'tf.compat.v1.GraphKeys',
+    'tf.IdentityReader': 'tf.compat.v1.IdentityReader',
+    'tf.InteractiveSession': 'tf.compat.v1.InteractiveSession',
+    'tf.LMDBReader': 'tf.compat.v1.LMDBReader',
+    'tf.MONOLITHIC_BUILD': 'tf.sysconfig.MONOLITHIC_BUILD',
+    'tf.NoGradient': 'tf.no_gradient',
+    'tf.NotDifferentiable': 'tf.no_gradient',
     'tf.OpError': 'tf.errors.OpError',
     'tf.PaddingFIFOQueue': 'tf.io.PaddingFIFOQueue',
+    'tf.Print': 'tf.compat.v1.Print',
     'tf.PriorityQueue': 'tf.io.PriorityQueue',
+    'tf.QUANTIZED_DTYPES': 'tf.dtypes.QUANTIZED_DTYPES',
     'tf.QueueBase': 'tf.io.QueueBase',
     'tf.RandomShuffleQueue': 'tf.io.RandomShuffleQueue',
+    'tf.ReaderBase': 'tf.compat.v1.ReaderBase',
+    'tf.Session': 'tf.compat.v1.Session',
     'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
+    'tf.SparseFeature': 'tf.io.SparseFeature',
+    'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
+    'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
+    'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
+    'tf.VERSION': 'tf.version.VERSION',
+    'tf.VarLenFeature': 'tf.io.VarLenFeature',
+    'tf.VariableScope': 'tf.compat.v1.VariableScope',
+    'tf.WholeFileReader': 'tf.compat.v1.WholeFileReader',
     'tf.accumulate_n': 'tf.math.accumulate_n',
+    'tf.add_check_numerics_ops': 'tf.compat.v1.add_check_numerics_ops',
+    'tf.add_to_collection': 'tf.compat.v1.add_to_collection',
+    'tf.add_to_collections': 'tf.compat.v1.add_to_collections',
+    'tf.all_variables': 'tf.compat.v1.all_variables',
     'tf.angle': 'tf.math.angle',
-    'tf.assert_greater_equal': 'tf.debugging.assert_greater_equal',
-    'tf.assert_integer': 'tf.debugging.assert_integer',
-    'tf.assert_less_equal': 'tf.debugging.assert_less_equal',
-    'tf.assert_near': 'tf.debugging.assert_near',
-    'tf.assert_negative': 'tf.debugging.assert_negative',
-    'tf.assert_non_negative': 'tf.debugging.assert_non_negative',
-    'tf.assert_non_positive': 'tf.debugging.assert_non_positive',
-    'tf.assert_none_equal': 'tf.debugging.assert_none_equal',
-    'tf.assert_positive': 'tf.debugging.assert_positive',
+    'tf.app.run': 'tf.compat.v1.app.run',
+    'tf.arg_max': 'tf.compat.v1.arg_max',
+    'tf.arg_min': 'tf.compat.v1.arg_min',
+    'tf.assert_greater_equal': 'tf.compat.v1.assert_greater_equal',
+    'tf.assert_integer': 'tf.compat.v1.assert_integer',
+    'tf.assert_less_equal': 'tf.compat.v1.assert_less_equal',
+    'tf.assert_near': 'tf.compat.v1.assert_near',
+    'tf.assert_negative': 'tf.compat.v1.assert_negative',
+    'tf.assert_non_negative': 'tf.compat.v1.assert_non_negative',
+    'tf.assert_non_positive': 'tf.compat.v1.assert_non_positive',
+    'tf.assert_none_equal': 'tf.compat.v1.assert_none_equal',
+    'tf.assert_positive': 'tf.compat.v1.assert_positive',
     'tf.assert_proper_iterable': 'tf.debugging.assert_proper_iterable',
-    'tf.assert_rank_at_least': 'tf.debugging.assert_rank_at_least',
-    'tf.assert_rank_in': 'tf.debugging.assert_rank_in',
+    'tf.assert_rank_at_least': 'tf.compat.v1.assert_rank_at_least',
+    'tf.assert_rank_in': 'tf.compat.v1.assert_rank_in',
     'tf.assert_same_float_dtype': 'tf.debugging.assert_same_float_dtype',
-    'tf.assert_scalar': 'tf.debugging.assert_scalar',
-    'tf.assert_type': 'tf.debugging.assert_type',
+    'tf.assert_scalar': 'tf.compat.v1.assert_scalar',
+    'tf.assert_type': 'tf.compat.v1.assert_type',
+    'tf.assert_variables_initialized': 'tf.compat.v1.assert_variables_initialized',
+    'tf.assign': 'tf.compat.v1.assign',
+    'tf.assign_add': 'tf.compat.v1.assign_add',
+    'tf.assign_sub': 'tf.compat.v1.assign_sub',
     'tf.betainc': 'tf.math.betainc',
     'tf.bincount': 'tf.math.bincount',
     'tf.ceil': 'tf.math.ceil',
     'tf.check_numerics': 'tf.debugging.check_numerics',
     'tf.cholesky': 'tf.linalg.cholesky',
     'tf.cholesky_solve': 'tf.linalg.cholesky_solve',
+    'tf.colocate_with': 'tf.compat.v1.colocate_with',
     'tf.confusion_matrix': 'tf.math.confusion_matrix',
     'tf.conj': 'tf.math.conj',
+    'tf.container': 'tf.compat.v1.container',
+    'tf.convert_to_tensor_or_indexed_slices': 'tf.compat.v1.convert_to_tensor_or_indexed_slices',
+    'tf.convert_to_tensor_or_sparse_tensor': 'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
+    'tf.count_nonzero': 'tf.compat.v1.count_nonzero',
+    'tf.count_up_to': 'tf.compat.v1.count_up_to',
     'tf.cross': 'tf.linalg.cross',
     'tf.cumprod': 'tf.math.cumprod',
+    'tf.debugging.is_finite': 'tf.math.is_finite',
+    'tf.debugging.is_inf': 'tf.math.is_inf',
+    'tf.debugging.is_nan': 'tf.math.is_nan',
+    'tf.debugging.is_non_decreasing': 'tf.math.is_non_decreasing',
+    'tf.debugging.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.decode_base64': 'tf.io.decode_base64',
     'tf.decode_compressed': 'tf.io.decode_compressed',
     'tf.decode_csv': 'tf.io.decode_csv',
     'tf.decode_json_example': 'tf.io.decode_json_example',
     'tf.decode_raw': 'tf.io.decode_raw',
+    'tf.delete_session_tensor': 'tf.compat.v1.delete_session_tensor',
     'tf.depth_to_space': 'tf.nn.depth_to_space',
     'tf.dequantize': 'tf.quantization.dequantize',
     'tf.deserialize_many_sparse': 'tf.io.deserialize_many_sparse',
     'tf.diag': 'tf.linalg.tensor_diag',
     'tf.diag_part': 'tf.linalg.tensor_diag_part',
     'tf.digamma': 'tf.math.digamma',
+    'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
+    'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
+    'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
+    'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
+    'tf.distributions.Categorical': 'tf.compat.v1.distributions.Categorical',
+    'tf.distributions.Dirichlet': 'tf.compat.v1.distributions.Dirichlet',
+    'tf.distributions.DirichletMultinomial': 'tf.compat.v1.distributions.DirichletMultinomial',
+    'tf.distributions.Distribution': 'tf.compat.v1.distributions.Distribution',
+    'tf.distributions.Exponential': 'tf.compat.v1.distributions.Exponential',
+    'tf.distributions.FULLY_REPARAMETERIZED': 'tf.compat.v1.distributions.FULLY_REPARAMETERIZED',
+    'tf.distributions.Gamma': 'tf.compat.v1.distributions.Gamma',
+    'tf.distributions.Laplace': 'tf.compat.v1.distributions.Laplace',
+    'tf.distributions.Multinomial': 'tf.compat.v1.distributions.Multinomial',
+    'tf.distributions.NOT_REPARAMETERIZED': 'tf.compat.v1.distributions.NOT_REPARAMETERIZED',
+    'tf.distributions.Normal': 'tf.compat.v1.distributions.Normal',
+    'tf.distributions.RegisterKL': 'tf.compat.v1.distributions.RegisterKL',
+    'tf.distributions.ReparameterizationType': 'tf.compat.v1.distributions.ReparameterizationType',
+    'tf.distributions.StudentT': 'tf.compat.v1.distributions.StudentT',
+    'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
+    'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
+    'tf.div': 'tf.compat.v1.div',
+    'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
     'tf.erf': 'tf.math.erf',
     'tf.erfc': 'tf.math.erfc',
@@ -80,26 +161,132 @@ renames = {
     'tf.fake_quant_with_min_max_vars_gradient': 'tf.quantization.fake_quant_with_min_max_vars_gradient',
     'tf.fake_quant_with_min_max_vars_per_channel': 'tf.quantization.fake_quant_with_min_max_vars_per_channel',
     'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
-    'tf.fft': 'tf.spectral.fft',
+    'tf.feature_column.input_layer': 'tf.compat.v1.feature_column.input_layer',
+    'tf.feature_column.linear_model': 'tf.compat.v1.feature_column.linear_model',
+    'tf.fft': 'tf.signal.fft',
+    'tf.fft2d': 'tf.signal.fft2d',
+    'tf.fft3d': 'tf.signal.fft3d',
     'tf.floordiv': 'tf.math.floordiv',
-    'tf.get_seed': 'tf.random.get_seed',
+    'tf.get_collection': 'tf.compat.v1.get_collection',
+    'tf.get_collection_ref': 'tf.compat.v1.get_collection_ref',
+    'tf.get_default_graph': 'tf.compat.v1.get_default_graph',
+    'tf.get_default_session': 'tf.compat.v1.get_default_session',
+    'tf.get_local_variable': 'tf.compat.v1.get_local_variable',
+    'tf.get_seed': 'tf.compat.v1.get_seed',
+    'tf.get_session_handle': 'tf.compat.v1.get_session_handle',
+    'tf.get_session_tensor': 'tf.compat.v1.get_session_tensor',
+    'tf.get_variable': 'tf.compat.v1.get_variable',
+    'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
+    'tf.gfile.Exists': 'tf.compat.v1.gfile.Exists',
+    'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
+    'tf.gfile.GFile': 'tf.compat.v1.gfile.GFile',
+    'tf.gfile.Open': 'tf.compat.v1.gfile.Open',
     'tf.global_norm': 'tf.linalg.global_norm',
+    'tf.global_variables': 'tf.compat.v1.global_variables',
+    'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
     'tf.glorot_normal_initializer': 'tf.keras.initializers.glorot_normal',
-    'tf.ifft': 'tf.spectral.ifft',
+    'tf.graph_util.convert_variables_to_constants': 'tf.compat.v1.graph_util.convert_variables_to_constants',
+    'tf.graph_util.extract_sub_graph': 'tf.compat.v1.graph_util.extract_sub_graph',
+    'tf.graph_util.must_run_on_cpu': 'tf.compat.v1.graph_util.must_run_on_cpu',
+    'tf.graph_util.remove_training_nodes': 'tf.compat.v1.graph_util.remove_training_nodes',
+    'tf.graph_util.tensor_shape_from_node_def_name': 'tf.compat.v1.graph_util.tensor_shape_from_node_def_name',
+    'tf.ifft': 'tf.signal.ifft',
+    'tf.ifft2d': 'tf.signal.ifft2d',
+    'tf.ifft3d': 'tf.signal.ifft3d',
     'tf.igamma': 'tf.math.igamma',
     'tf.igammac': 'tf.math.igammac',
     'tf.imag': 'tf.math.imag',
+    'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
+    'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
+    'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
+    'tf.image.resize_images': 'tf.compat.v1.image.resize_images',
+    'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
+    'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
+    'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
+    'tf.initialize_all_variables': 'tf.compat.v1.initialize_all_variables',
+    'tf.initialize_local_variables': 'tf.compat.v1.initialize_local_variables',
+    'tf.initialize_variables': 'tf.compat.v1.initialize_variables',
+    'tf.initializers.global_variables': 'tf.compat.v1.initializers.global_variables',
+    'tf.initializers.local_variables': 'tf.compat.v1.initializers.local_variables',
+    'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
+    'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
     'tf.invert_permutation': 'tf.math.invert_permutation',
-    'tf.is_finite': 'tf.debugging.is_finite',
-    'tf.is_inf': 'tf.debugging.is_inf',
-    'tf.is_nan': 'tf.debugging.is_nan',
-    'tf.is_non_decreasing': 'tf.debugging.is_non_decreasing',
+    'tf.io.tf_record_iterator': 'tf.compat.v1.io.tf_record_iterator',
+    'tf.is_finite': 'tf.math.is_finite',
+    'tf.is_inf': 'tf.math.is_inf',
+    'tf.is_nan': 'tf.math.is_nan',
+    'tf.is_non_decreasing': 'tf.math.is_non_decreasing',
     'tf.is_numeric_tensor': 'tf.debugging.is_numeric_tensor',
-    'tf.is_strictly_increasing': 'tf.debugging.is_strictly_increasing',
+    'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
+    'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
+    'tf.keras.backend.get_session': 'tf.compat.v1.keras.backend.get_session',
+    'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
+    'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
+    'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
+    'tf.layers.BatchNormalization': 'tf.compat.v1.layers.BatchNormalization',
+    'tf.layers.Conv1D': 'tf.compat.v1.layers.Conv1D',
+    'tf.layers.Conv2D': 'tf.compat.v1.layers.Conv2D',
+    'tf.layers.Conv2DTranspose': 'tf.compat.v1.layers.Conv2DTranspose',
+    'tf.layers.Conv3D': 'tf.compat.v1.layers.Conv3D',
+    'tf.layers.Conv3DTranspose': 'tf.compat.v1.layers.Conv3DTranspose',
+    'tf.layers.Dense': 'tf.compat.v1.layers.Dense',
+    'tf.layers.Dropout': 'tf.compat.v1.layers.Dropout',
+    'tf.layers.Flatten': 'tf.compat.v1.layers.Flatten',
+    'tf.layers.InputSpec': 'tf.keras.layers.InputSpec',
+    'tf.layers.Layer': 'tf.compat.v1.layers.Layer',
+    'tf.layers.MaxPooling1D': 'tf.compat.v1.layers.MaxPooling1D',
+    'tf.layers.MaxPooling2D': 'tf.compat.v1.layers.MaxPooling2D',
+    'tf.layers.MaxPooling3D': 'tf.compat.v1.layers.MaxPooling3D',
+    'tf.layers.SeparableConv1D': 'tf.compat.v1.layers.SeparableConv1D',
+    'tf.layers.SeparableConv2D': 'tf.compat.v1.layers.SeparableConv2D',
+    'tf.layers.average_pooling1d': 'tf.compat.v1.layers.average_pooling1d',
+    'tf.layers.average_pooling2d': 'tf.compat.v1.layers.average_pooling2d',
+    'tf.layers.average_pooling3d': 'tf.compat.v1.layers.average_pooling3d',
+    'tf.layers.batch_normalization': 'tf.compat.v1.layers.batch_normalization',
+    'tf.layers.conv1d': 'tf.compat.v1.layers.conv1d',
+    'tf.layers.conv2d': 'tf.compat.v1.layers.conv2d',
+    'tf.layers.conv2d_transpose': 'tf.compat.v1.layers.conv2d_transpose',
+    'tf.layers.conv3d': 'tf.compat.v1.layers.conv3d',
+    'tf.layers.conv3d_transpose': 'tf.compat.v1.layers.conv3d_transpose',
+    'tf.layers.dense': 'tf.compat.v1.layers.dense',
+    'tf.layers.dropout': 'tf.compat.v1.layers.dropout',
+    'tf.layers.experimental.keras_style_scope': 'tf.compat.v1.layers.experimental.keras_style_scope',
+    'tf.layers.experimental.set_keras_style': 'tf.compat.v1.layers.experimental.set_keras_style',
+    'tf.layers.flatten': 'tf.compat.v1.layers.flatten',
+    'tf.layers.max_pooling1d': 'tf.compat.v1.layers.max_pooling1d',
+    'tf.layers.max_pooling2d': 'tf.compat.v1.layers.max_pooling2d',
+    'tf.layers.max_pooling3d': 'tf.compat.v1.layers.max_pooling3d',
+    'tf.layers.separable_conv1d': 'tf.compat.v1.layers.separable_conv1d',
+    'tf.layers.separable_conv2d': 'tf.compat.v1.layers.separable_conv2d',
     'tf.lbeta': 'tf.math.lbeta',
     'tf.lgamma': 'tf.math.lgamma',
+    'tf.load_file_system_library': 'tf.compat.v1.load_file_system_library',
+    'tf.local_variables': 'tf.compat.v1.local_variables',
+    'tf.local_variables_initializer': 'tf.compat.v1.local_variables_initializer',
     'tf.log_sigmoid': 'tf.math.log_sigmoid',
+    'tf.logging.DEBUG': 'tf.compat.v1.logging.DEBUG',
+    'tf.logging.ERROR': 'tf.compat.v1.logging.ERROR',
+    'tf.logging.FATAL': 'tf.compat.v1.logging.FATAL',
+    'tf.logging.INFO': 'tf.compat.v1.logging.INFO',
+    'tf.logging.TaskLevelStatusMessage': 'tf.compat.v1.logging.TaskLevelStatusMessage',
+    'tf.logging.WARN': 'tf.compat.v1.logging.WARN',
+    'tf.logging.debug': 'tf.compat.v1.logging.debug',
+    'tf.logging.error': 'tf.compat.v1.logging.error',
+    'tf.logging.fatal': 'tf.compat.v1.logging.fatal',
+    'tf.logging.flush': 'tf.compat.v1.logging.flush',
+    'tf.logging.get_verbosity': 'tf.compat.v1.logging.get_verbosity',
+    'tf.logging.info': 'tf.compat.v1.logging.info',
+    'tf.logging.log': 'tf.compat.v1.logging.log',
+    'tf.logging.log_every_n': 'tf.compat.v1.logging.log_every_n',
+    'tf.logging.log_first_n': 'tf.compat.v1.logging.log_first_n',
+    'tf.logging.log_if': 'tf.compat.v1.logging.log_if',
+    'tf.logging.set_verbosity': 'tf.compat.v1.logging.set_verbosity',
+    'tf.logging.vlog': 'tf.compat.v1.logging.vlog',
+    'tf.logging.warn': 'tf.compat.v1.logging.warn',
+    'tf.logging.warning': 'tf.compat.v1.logging.warning',
     'tf.logical_xor': 'tf.math.logical_xor',
+    'tf.make_template': 'tf.compat.v1.make_template',
+    'tf.make_tensor_proto': 'tf.compat.v1.make_tensor_proto',
     'tf.manip.batch_to_space_nd': 'tf.batch_to_space_nd',
     'tf.manip.gather_nd': 'tf.gather_nd',
     'tf.manip.reshape': 'tf.reshape',
@@ -119,34 +306,133 @@ renames = {
     'tf.matrix_solve_ls': 'tf.linalg.lstsq',
     'tf.matrix_transpose': 'tf.linalg.transpose',
     'tf.matrix_triangular_solve': 'tf.linalg.triangular_solve',
+    'tf.min_max_variable_partitioner': 'tf.compat.v1.min_max_variable_partitioner',
+    'tf.model_variables': 'tf.compat.v1.model_variables',
+    'tf.moving_average_variables': 'tf.compat.v1.moving_average_variables',
+    'tf.multinomial': 'tf.compat.v1.multinomial',
+    'tf.nn.conv3d_backprop_filter_v2': 'tf.nn.conv3d_backprop_filter',
+    'tf.nn.ctc_beam_search_decoder_v2': 'tf.nn.ctc_beam_search_decoder',
+    'tf.nn.depthwise_conv2d_native': 'tf.compat.v1.nn.depthwise_conv2d_native',
+    'tf.nn.depthwise_conv2d_native_backprop_filter': 'tf.nn.depthwise_conv2d_backprop_filter',
+    'tf.nn.depthwise_conv2d_native_backprop_input': 'tf.nn.depthwise_conv2d_backprop_input',
+    'tf.nn.dynamic_rnn': 'tf.compat.v1.nn.dynamic_rnn',
     'tf.nn.log_uniform_candidate_sampler': 'tf.random.log_uniform_candidate_sampler',
+    'tf.nn.quantized_avg_pool': 'tf.compat.v1.nn.quantized_avg_pool',
+    'tf.nn.quantized_conv2d': 'tf.compat.v1.nn.quantized_conv2d',
+    'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
+    'tf.nn.quantized_relu_x': 'tf.compat.v1.nn.quantized_relu_x',
+    'tf.nn.raw_rnn': 'tf.compat.v1.nn.raw_rnn',
+    'tf.nn.rnn_cell.BasicLSTMCell': 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
+    'tf.nn.rnn_cell.BasicRNNCell': 'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
+    'tf.nn.rnn_cell.GRUCell': 'tf.compat.v1.nn.rnn_cell.GRUCell',
+    'tf.nn.rnn_cell.LSTMCell': 'tf.compat.v1.nn.rnn_cell.LSTMCell',
+    'tf.nn.softmax_cross_entropy_with_logits_v2': 'tf.nn.softmax_cross_entropy_with_logits',
+    'tf.nn.static_rnn': 'tf.compat.v1.nn.static_rnn',
     'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
+    'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
+    'tf.op_scope': 'tf.compat.v1.op_scope',
     'tf.orthogonal_initializer': 'tf.keras.initializers.Orthogonal',
+    'tf.parse_example': 'tf.io.parse_example',
+    'tf.parse_single_example': 'tf.io.parse_single_example',
+    'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
     'tf.parse_tensor': 'tf.io.parse_tensor',
+    'tf.placeholder': 'tf.compat.v1.placeholder',
+    'tf.placeholder_with_default': 'tf.compat.v1.placeholder_with_default',
     'tf.polygamma': 'tf.math.polygamma',
+    'tf.profiler.AdviceProto': 'tf.compat.v1.profiler.AdviceProto',
+    'tf.profiler.GraphNodeProto': 'tf.compat.v1.profiler.GraphNodeProto',
+    'tf.profiler.MultiGraphNodeProto': 'tf.compat.v1.profiler.MultiGraphNodeProto',
+    'tf.profiler.OpLogProto': 'tf.compat.v1.profiler.OpLogProto',
+    'tf.profiler.ProfileOptionBuilder': 'tf.compat.v1.profiler.ProfileOptionBuilder',
+    'tf.profiler.Profiler': 'tf.compat.v1.profiler.Profiler',
+    'tf.profiler.advise': 'tf.compat.v1.profiler.advise',
+    'tf.profiler.profile': 'tf.compat.v1.profiler.profile',
+    'tf.profiler.write_op_log': 'tf.compat.v1.profiler.write_op_log',
+    'tf.py_func': 'tf.compat.v1.py_func',
     'tf.python_io.TFRecordCompressionType': 'tf.io.TFRecordCompressionType',
     'tf.python_io.TFRecordOptions': 'tf.io.TFRecordOptions',
     'tf.python_io.TFRecordWriter': 'tf.io.TFRecordWriter',
-    'tf.python_io.tf_record_iterator': 'tf.io.tf_record_iterator',
+    'tf.python_io.tf_record_iterator': 'tf.compat.v1.python_io.tf_record_iterator',
     'tf.qr': 'tf.linalg.qr',
     'tf.quantize': 'tf.quantization.quantize',
+    'tf.quantize_v2': 'tf.compat.v1.quantize_v2',
     'tf.quantized_concat': 'tf.quantization.quantized_concat',
+    'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
+    'tf.random.multinomial': 'tf.compat.v1.random.multinomial',
+    'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
+    'tf.random.stateless_multinomial': 'tf.compat.v1.random.stateless_multinomial',
+    'tf.random_crop': 'tf.image.random_crop',
     'tf.random_gamma': 'tf.random.gamma',
-    'tf.random_poisson': 'tf.random.poisson',
+    'tf.random_normal': 'tf.random.normal',
+    'tf.random_poisson': 'tf.compat.v1.random_poisson',
+    'tf.random_shuffle': 'tf.random.shuffle',
+    'tf.random_uniform': 'tf.random.uniform',
     'tf.read_file': 'tf.io.read_file',
     'tf.real': 'tf.math.real',
     'tf.reciprocal': 'tf.math.reciprocal',
     'tf.reduce_join': 'tf.strings.reduce_join',
     'tf.regex_replace': 'tf.strings.regex_replace',
+    'tf.report_uninitialized_variables': 'tf.compat.v1.report_uninitialized_variables',
+    'tf.resource_loader.get_data_files_path': 'tf.compat.v1.resource_loader.get_data_files_path',
+    'tf.resource_loader.get_path_to_datafile': 'tf.compat.v1.resource_loader.get_path_to_datafile',
+    'tf.resource_loader.get_root_dir_with_all_resources': 'tf.compat.v1.resource_loader.get_root_dir_with_all_resources',
+    'tf.resource_loader.load_resource': 'tf.compat.v1.resource_loader.load_resource',
+    'tf.resource_loader.readahead_file_path': 'tf.compat.v1.resource_loader.readahead_file_path',
     'tf.reverse_v2': 'tf.reverse',
     'tf.rint': 'tf.math.rint',
     'tf.rsqrt': 'tf.math.rsqrt',
-    'tf.saved_model.loader.maybe_saved_model_directory': 'tf.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
+    'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
+    'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
+    'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
+    'tf.saved_model.constants.ASSETS_KEY': 'tf.saved_model.ASSETS_KEY',
+    'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.constants.MAIN_OP_KEY': 'tf.saved_model.MAIN_OP_KEY',
+    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PB': 'tf.saved_model.SAVED_MODEL_FILENAME_PB',
+    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PBTXT': 'tf.saved_model.SAVED_MODEL_FILENAME_PBTXT',
+    'tf.saved_model.constants.SAVED_MODEL_SCHEMA_VERSION': 'tf.saved_model.SAVED_MODEL_SCHEMA_VERSION',
+    'tf.saved_model.constants.VARIABLES_DIRECTORY': 'tf.saved_model.VARIABLES_DIRECTORY',
+    'tf.saved_model.constants.VARIABLES_FILENAME': 'tf.saved_model.VARIABLES_FILENAME',
+    'tf.saved_model.experimental.save': 'tf.saved_model.save',
+    'tf.saved_model.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
+    'tf.saved_model.load': 'tf.compat.v1.saved_model.load',
+    'tf.saved_model.loader.load': 'tf.compat.v1.saved_model.loader.load',
+    'tf.saved_model.loader.maybe_saved_model_directory': 'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
+    'tf.saved_model.main_op.main_op': 'tf.compat.v1.saved_model.main_op.main_op',
+    'tf.saved_model.main_op.main_op_with_restore': 'tf.compat.v1.saved_model.main_op.main_op_with_restore',
+    'tf.saved_model.main_op_with_restore': 'tf.compat.v1.saved_model.main_op_with_restore',
+    'tf.saved_model.maybe_saved_model_directory': 'tf.compat.v1.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.signature_constants.CLASSIFY_INPUTS': 'tf.saved_model.CLASSIFY_INPUTS',
+    'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME': 'tf.saved_model.CLASSIFY_METHOD_NAME',
+    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES': 'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
+    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES': 'tf.saved_model.CLASSIFY_OUTPUT_SCORES',
+    'tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY': 'tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY',
+    'tf.saved_model.signature_constants.PREDICT_INPUTS': 'tf.saved_model.PREDICT_INPUTS',
+    'tf.saved_model.signature_constants.PREDICT_METHOD_NAME': 'tf.saved_model.PREDICT_METHOD_NAME',
+    'tf.saved_model.signature_constants.PREDICT_OUTPUTS': 'tf.saved_model.PREDICT_OUTPUTS',
+    'tf.saved_model.signature_constants.REGRESS_INPUTS': 'tf.saved_model.REGRESS_INPUTS',
+    'tf.saved_model.signature_constants.REGRESS_METHOD_NAME': 'tf.saved_model.REGRESS_METHOD_NAME',
+    'tf.saved_model.signature_constants.REGRESS_OUTPUTS': 'tf.saved_model.REGRESS_OUTPUTS',
     'tf.saved_model.signature_def_utils.build_signature_def': 'tf.saved_model.build_signature_def',
     'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.saved_model.classification_signature_def',
     'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.saved_model.is_valid_signature',
     'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.saved_model.predict_signature_def',
     'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.saved_model.regression_signature_def',
+    'tf.saved_model.simple_save': 'tf.compat.v1.saved_model.simple_save',
+    'tf.saved_model.tag_constants.GPU': 'tf.saved_model.GPU',
+    'tf.saved_model.tag_constants.SERVING': 'tf.saved_model.SERVING',
+    'tf.saved_model.tag_constants.TPU': 'tf.saved_model.TPU',
+    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.utils.build_tensor_info': 'tf.compat.v1.saved_model.utils.build_tensor_info',
+    'tf.saved_model.utils.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info',
+    'tf.scatter_add': 'tf.compat.v1.scatter_add',
+    'tf.scatter_nd_add': 'tf.compat.v1.scatter_nd_add',
+    'tf.scatter_nd_sub': 'tf.compat.v1.scatter_nd_sub',
+    'tf.scatter_nd_update': 'tf.compat.v1.scatter_nd_update',
+    'tf.scatter_sub': 'tf.compat.v1.scatter_sub',
+    'tf.scatter_update': 'tf.compat.v1.scatter_update',
     'tf.segment_max': 'tf.math.segment_max',
     'tf.segment_mean': 'tf.math.segment_mean',
     'tf.segment_min': 'tf.math.segment_min',
@@ -156,28 +442,60 @@ renames = {
     'tf.self_adjoint_eigvals': 'tf.linalg.eigvalsh',
     'tf.serialize_many_sparse': 'tf.io.serialize_many_sparse',
     'tf.serialize_sparse': 'tf.io.serialize_sparse',
+    'tf.serialize_tensor': 'tf.io.serialize_tensor',
+    'tf.set_random_seed': 'tf.compat.v1.set_random_seed',
+    'tf.setdiff1d': 'tf.compat.v1.setdiff1d',
+    'tf.sets.set_difference': 'tf.sets.difference',
+    'tf.sets.set_intersection': 'tf.sets.intersection',
+    'tf.sets.set_size': 'tf.sets.size',
+    'tf.sets.set_union': 'tf.sets.union',
     'tf.space_to_batch': 'tf.nn.space_to_batch',
     'tf.space_to_depth': 'tf.nn.space_to_depth',
-    'tf.sparse_add': 'tf.sparse.add',
+    'tf.sparse.matmul': 'tf.sparse.sparse_dense_matmul',
+    'tf.sparse.merge': 'tf.compat.v1.sparse.merge',
+    'tf.sparse.placeholder': 'tf.compat.v1.sparse.placeholder',
+    'tf.sparse.reduce_max_sparse': 'tf.compat.v1.sparse.reduce_max_sparse',
+    'tf.sparse.reduce_sum_sparse': 'tf.compat.v1.sparse.reduce_sum_sparse',
+    'tf.sparse_add': 'tf.compat.v1.sparse_add',
     'tf.sparse_fill_empty_rows': 'tf.sparse.fill_empty_rows',
     'tf.sparse_mask': 'tf.sparse.mask',
+    'tf.sparse_matmul': 'tf.compat.v1.sparse_matmul',
     'tf.sparse_maximum': 'tf.sparse.maximum',
-    'tf.sparse_merge': 'tf.sparse.merge',
+    'tf.sparse_merge': 'tf.compat.v1.sparse_merge',
     'tf.sparse_minimum': 'tf.sparse.minimum',
-    'tf.sparse_placeholder': 'tf.sparse.placeholder',
+    'tf.sparse_placeholder': 'tf.compat.v1.sparse_placeholder',
+    'tf.sparse_reduce_max': 'tf.compat.v1.sparse_reduce_max',
+    'tf.sparse_reduce_max_sparse': 'tf.compat.v1.sparse_reduce_max_sparse',
+    'tf.sparse_reduce_sum': 'tf.compat.v1.sparse_reduce_sum',
+    'tf.sparse_reduce_sum_sparse': 'tf.compat.v1.sparse_reduce_sum_sparse',
     'tf.sparse_reorder': 'tf.sparse.reorder',
     'tf.sparse_reset_shape': 'tf.sparse.reset_shape',
     'tf.sparse_reshape': 'tf.sparse.reshape',
     'tf.sparse_retain': 'tf.sparse.retain',
-    'tf.sparse_segment_mean': 'tf.sparse.segment_mean',
-    'tf.sparse_segment_sqrt_n': 'tf.sparse.segment_sqrt_n',
-    'tf.sparse_segment_sum': 'tf.sparse.segment_sum',
+    'tf.sparse_segment_mean': 'tf.compat.v1.sparse_segment_mean',
+    'tf.sparse_segment_sqrt_n': 'tf.compat.v1.sparse_segment_sqrt_n',
+    'tf.sparse_segment_sum': 'tf.compat.v1.sparse_segment_sum',
     'tf.sparse_slice': 'tf.sparse.slice',
     'tf.sparse_softmax': 'tf.sparse.softmax',
-    'tf.sparse_tensor_dense_matmul': 'tf.sparse.matmul',
+    'tf.sparse_split': 'tf.compat.v1.sparse_split',
+    'tf.sparse_tensor_dense_matmul': 'tf.sparse.sparse_dense_matmul',
     'tf.sparse_tensor_to_dense': 'tf.sparse.to_dense',
     'tf.sparse_to_indicator': 'tf.sparse.to_indicator',
     'tf.sparse_transpose': 'tf.sparse.transpose',
+    'tf.spectral.dct': 'tf.signal.dct',
+    'tf.spectral.fft': 'tf.signal.fft',
+    'tf.spectral.fft2d': 'tf.signal.fft2d',
+    'tf.spectral.fft3d': 'tf.signal.fft3d',
+    'tf.spectral.idct': 'tf.signal.idct',
+    'tf.spectral.ifft': 'tf.signal.ifft',
+    'tf.spectral.ifft2d': 'tf.signal.ifft2d',
+    'tf.spectral.ifft3d': 'tf.signal.ifft3d',
+    'tf.spectral.irfft': 'tf.signal.irfft',
+    'tf.spectral.irfft2d': 'tf.signal.irfft2d',
+    'tf.spectral.irfft3d': 'tf.signal.irfft3d',
+    'tf.spectral.rfft': 'tf.signal.rfft',
+    'tf.spectral.rfft2d': 'tf.signal.rfft2d',
+    'tf.spectral.rfft3d': 'tf.signal.rfft3d',
     'tf.squared_difference': 'tf.math.squared_difference',
     'tf.string_join': 'tf.strings.join',
     'tf.string_strip': 'tf.strings.strip',
@@ -185,10 +503,74 @@ renames = {
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
     'tf.string_to_number': 'tf.strings.to_number',
+    'tf.summary.audio': 'tf.compat.v1.summary.audio',
+    'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
+    'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
+    'tf.summary.image': 'tf.compat.v1.summary.image',
+    'tf.summary.merge': 'tf.compat.v1.summary.merge',
+    'tf.summary.merge_all': 'tf.compat.v1.summary.merge_all',
+    'tf.summary.scalar': 'tf.compat.v1.summary.scalar',
+    'tf.summary.tensor_summary': 'tf.compat.v1.summary.tensor_summary',
+    'tf.summary.text': 'tf.compat.v1.summary.text',
     'tf.svd': 'tf.linalg.svd',
+    'tf.tables_initializer': 'tf.compat.v1.tables_initializer',
+    'tf.test.compute_gradient': 'tf.compat.v1.test.compute_gradient',
+    'tf.test.compute_gradient_error': 'tf.compat.v1.test.compute_gradient_error',
+    'tf.test.get_temp_dir': 'tf.compat.v1.test.get_temp_dir',
+    'tf.test.mock': 'tf.compat.v1.test.mock',
+    'tf.test.test_src_dir_path': 'tf.compat.v1.test.test_src_dir_path',
+    'tf.to_bfloat16': 'tf.compat.v1.to_bfloat16',
+    'tf.to_complex128': 'tf.compat.v1.to_complex128',
+    'tf.to_complex64': 'tf.compat.v1.to_complex64',
+    'tf.to_double': 'tf.compat.v1.to_double',
+    'tf.to_float': 'tf.compat.v1.to_float',
+    'tf.to_int32': 'tf.compat.v1.to_int32',
+    'tf.to_int64': 'tf.compat.v1.to_int64',
     'tf.trace': 'tf.linalg.trace',
-    'tf.train.confusion_matrix': 'tf.math.confusion_matrix',
+    'tf.train.MonitoredTrainingSession': 'tf.compat.v1.train.MonitoredTrainingSession',
+    'tf.train.NewCheckpointReader': 'tf.compat.v1.train.NewCheckpointReader',
+    'tf.train.ProfilerHook': 'tf.compat.v1.train.ProfilerHook',
+    'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
+    'tf.train.Saver': 'tf.compat.v1.train.Saver',
+    'tf.train.SaverDef': 'tf.compat.v1.train.SaverDef',
+    'tf.train.SyncReplicasOptimizer': 'tf.compat.v1.train.SyncReplicasOptimizer',
+    'tf.train.add_queue_runner': 'tf.compat.v1.train.add_queue_runner',
+    'tf.train.assert_global_step': 'tf.compat.v1.train.assert_global_step',
+    'tf.train.basic_train_loop': 'tf.compat.v1.train.basic_train_loop',
+    'tf.train.batch': 'tf.compat.v1.train.batch',
+    'tf.train.batch_join': 'tf.compat.v1.train.batch_join',
+    'tf.train.checkpoint_exists': 'tf.compat.v1.train.checkpoint_exists',
+    'tf.train.create_global_step': 'tf.compat.v1.train.create_global_step',
+    'tf.train.do_quantize_training_on_graphdef': 'tf.compat.v1.train.do_quantize_training_on_graphdef',
+    'tf.train.export_meta_graph': 'tf.compat.v1.train.export_meta_graph',
+    'tf.train.generate_checkpoint_state_proto': 'tf.compat.v1.train.generate_checkpoint_state_proto',
+    'tf.train.get_checkpoint_mtimes': 'tf.compat.v1.train.get_checkpoint_mtimes',
+    'tf.train.get_global_step': 'tf.compat.v1.train.get_global_step',
+    'tf.train.get_or_create_global_step': 'tf.compat.v1.train.get_or_create_global_step',
+    'tf.train.global_step': 'tf.compat.v1.train.global_step',
+    'tf.train.import_meta_graph': 'tf.compat.v1.train.import_meta_graph',
+    'tf.train.init_from_checkpoint': 'tf.compat.v1.train.init_from_checkpoint',
+    'tf.train.input_producer': 'tf.compat.v1.train.input_producer',
+    'tf.train.limit_epochs': 'tf.compat.v1.train.limit_epochs',
     'tf.train.match_filenames_once': 'tf.io.match_filenames_once',
+    'tf.train.maybe_batch': 'tf.compat.v1.train.maybe_batch',
+    'tf.train.maybe_batch_join': 'tf.compat.v1.train.maybe_batch_join',
+    'tf.train.maybe_shuffle_batch': 'tf.compat.v1.train.maybe_shuffle_batch',
+    'tf.train.maybe_shuffle_batch_join': 'tf.compat.v1.train.maybe_shuffle_batch_join',
+    'tf.train.queue_runner.QueueRunner': 'tf.compat.v1.train.queue_runner.QueueRunner',
+    'tf.train.queue_runner.add_queue_runner': 'tf.compat.v1.train.queue_runner.add_queue_runner',
+    'tf.train.queue_runner.start_queue_runners': 'tf.compat.v1.train.queue_runner.start_queue_runners',
+    'tf.train.range_input_producer': 'tf.compat.v1.train.range_input_producer',
+    'tf.train.remove_checkpoint': 'tf.compat.v1.train.remove_checkpoint',
+    'tf.train.replica_device_setter': 'tf.compat.v1.train.replica_device_setter',
+    'tf.train.shuffle_batch': 'tf.compat.v1.train.shuffle_batch',
+    'tf.train.shuffle_batch_join': 'tf.compat.v1.train.shuffle_batch_join',
+    'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
+    'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
+    'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
+    'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
+    'tf.train.write_graph': 'tf.io.write_graph',
+    'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
     'tf.uniform_unit_scaling_initializer': 'tf.initializers.uniform_unit_scaling',
     'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
     'tf.unsorted_segment_mean': 'tf.math.unsorted_segment_mean',
@@ -196,8 +578,12 @@ renames = {
     'tf.unsorted_segment_prod': 'tf.math.unsorted_segment_prod',
     'tf.unsorted_segment_sqrt_n': 'tf.math.unsorted_segment_sqrt_n',
     'tf.unsorted_segment_sum': 'tf.math.unsorted_segment_sum',
+    'tf.variable_op_scope': 'tf.compat.v1.variable_op_scope',
+    'tf.variable_scope': 'tf.compat.v1.variable_scope',
+    'tf.variables_initializer': 'tf.compat.v1.variables_initializer',
     'tf.variance_scaling_initializer': 'tf.keras.initializers.VarianceScaling',
-    'tf.verify_tensor_all_finite': 'tf.debugging.assert_all_finite',
+    'tf.verify_tensor_all_finite': 'tf.compat.v1.verify_tensor_all_finite',
+    'tf.wrap_function': 'tf.compat.v1.wrap_function',
     'tf.write_file': 'tf.io.write_file',
     'tf.zeta': 'tf.math.zeta'
 }
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 2dabf7834dad62c118441896154cc5a042daaeca..287d1a5483c32379da1dc651aba62a86a3f6d0f9 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -102,7 +102,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     }
 
     # Mapping from function to the new name of the function
-    self.function_renames = {
+    self.symbol_renames = {
         "tf.inv": "tf.reciprocal",
         "tf.contrib.deprecated.scalar_summary": "tf.summary.scalar",
         "tf.contrib.deprecated.histogram_summary": "tf.summary.histogram",
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index df84f7ec63ac179cb586fee0a7ced854e7b56a2d..a4d307032d81ab1ec178e177f5615685731d9b56 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -30,29 +30,214 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   def __init__(self):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
-    self.function_keyword_renames = {}
+    self.function_keyword_renames = {
+        "tf.image.crop_and_resize": {
+            "box_ind": "box_indices",
+        },
+        "tf.image.extract_image_patches": {
+            "ksizes": "sizes",
+        },
+        "tf.extract_image_patches": {
+            "ksizes": "sizes",
+        },
+        "tf.expand_dims": {
+            "dim": "axis",
+        },
+        "tf.convert_to_tensor": {
+            "preferred_dtype": "dtype_hint"
+        },
+        "tf.math.count_nonzero": {
+            "input_tensor": "input",
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis",
+        },
+        "tf.nn.pool": {
+            "dilation_rate": "dilations"
+        },
+        "tf.nn.separable_conv2d": {
+            "rate": "dilations"
+        },
+        "tf.nn.sufficient_statistics": {
+            "keep_dims": "keepdims"
+        },
+        "tf.nn.log_softmax": {
+            "dim": "axis",
+        },
+        "tf.nn.softmax": {
+            "dim": "axis",
+        },
+        "tf.debugging.assert_all_finite": {
+            "t": "x",
+            "msg": "message",
+        },
+        "tf.sparse.split": {
+            "split_dim": "axis",
+        },
+        "tf.multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.random.multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.nn.batch_norm_with_global_normalization": {
+            "t": "input",
+            "m": "mean",
+            "v": "variance",
+        },
+        "tf.nn.conv3d": {
+            "filter": "filters"
+        },
+        "tf.zeros_like": {
+            "tensor": "input",
+        },
+        "tf.ones_like": {
+            "tensor": "input",
+        },
+        "tf.nn.conv3d_transpose": {
+            "value": "input",
+            "filter": "filters",
+        },
+        "tf.nn.convolution": {
+            "filter": "filters",
+            "dilation_rate": "dilations",
+        },
+        "tf.gfile.Exists": {
+            "filename": "path",
+        },
+        "tf.random.stateless_multinomial": {
+            "output_dtype": "dtype",
+        },
+    }
 
     # Mapping from function to the new name of the function
-    self.function_renames = renames_v2.renames
+    self.symbol_renames = renames_v2.renames
     # pylint: disable=line-too-long
-    self.function_renames.update({
-        "tf.saved_model.builder.SavedModelBuilder": "tf.compat.v1.saved_model.Builder",
-        "tf.saved_model.loader.load": "tf.compat.v1.saved_model.load",
-        "tf.saved_model.main_op.main_op": "tf.compat.v1.saved_model.main_op",
-        "tf.saved_model.main_op.main_op_with_restore": "tf.compat.v1.saved_model.main_op_with_restore",
-        "tf.saved_model.simple_save": "tf.compat.v1.saved_model.simple_save",
-        "tf.saved_model.utils.build_tensor_info": "tf.compat.v1.saved_model.build_tensor_info",
-        "tf.saved_model.utils.get_tensor_from_tensor_info": "tf.compat.v1.saved_model.get_tensor_from_tensor_info",
+    # Add additional renames not in renames_v2.py here.
+    # IMPORTANT: For the renames in here, if you also need to add to
+    # function_reorders or function_keyword_renames, use the OLD function name.
+    # These renames happen after the arguments have been processed.
+    self.symbol_renames.update({
+        "tf.contrib.data.AUTOTUNE": "tf.data.experimental.AUTOTUNE",
+        "tf.contrib.data.Counter": "tf.data.experimental.Counter",
+        "tf.contrib.data.CheckpointInputPipelineHook": "tf.data.experimental.CheckpointInputPipelineHook",
+        "tf.contrib.data.CsvDataset": "tf.data.experimental.CsvDataset",
+        "tf.contrib.data.Optional": "tf.data.experimental.Optional",
+        "tf.contrib.data.RandomDataset": "tf.data.experimental.RandomDataset",
+        "tf.contrib.data.Reducer": "tf.data.experimental.Reducer",
+        "tf.contrib.data.SqlDataset": "tf.data.experimental.SqlDataset",
+        "tf.contrib.data.StatsAggregator": "tf.data.experimental.StatsAggregator",
+        "tf.contrib.data.TFRecordWriter": "tf.data.experimental.TFRecordWriter",
+        "tf.contrib.data.assert_element_shape": "tf.data.experimental.assert_element_shape",
+        "tf.contrib.data.batch_and_drop_remainder": "tf.compat.v1.contrib.data.batch_and_drop_remainder",
+        "tf.contrib.data.bucket_by_sequence_length": "tf.data.experimental.bucket_by_sequence_length",
+        "tf.contrib.data.choose_from_datasets": "tf.data.experimental.choose_from_datasets",
+        "tf.contrib.data.copy_to_device": "tf.data.experimental.copy_to_device",
+        "tf.contrib.data.dense_to_sparse_batch": "tf.data.experimental.dense_to_sparse_batch",
+        "tf.contrib.data.enumerate_dataset": "tf.data.experimental.enumerate_dataset",
+        "tf.contrib.data.get_next_as_optional": "tf.data.experimental.get_next_as_optional",
+        "tf.contrib.data.get_single_element": "tf.data.experimental.get_single_element",
+        "tf.contrib.data.group_by_reducer": "tf.data.experimental.group_by_reducer",
+        "tf.contrib.data.group_by_window": "tf.data.experimental.group_by_window",
+        "tf.contrib.data.ignore_errors": "tf.data.experimental.ignore_errors",
+        "tf.contrib.data.latency_stats": "tf.data.experimental.latency_stats",
+        "tf.contrib.data.make_batched_features_dataset": "tf.data.experimental.make_batched_features_dataset",
+        "tf.contrib.data.make_csv_dataset": "tf.data.experimental.make_csv_dataset",
+        "tf.contrib.data.make_saveable_from_iterator": "tf.data.experimental.make_saveable_from_iterator",
+        "tf.contrib.data.map_and_batch": "tf.data.experimental.map_and_batch",
+        "tf.contrib.data.padded_batch_and_drop_remainder": "tf.compat.v1.contrib.data.padded_batch_and_drop_remainder",
+        "tf.contrib.data.parallel_interleave": "tf.data.experimental.parallel_interleave",
+        "tf.contrib.data.parse_example_dataset": "tf.data.experimental.parse_example_dataset",
+        "tf.contrib.data.prefetch_to_device": "tf.data.experimental.prefetch_to_device",
+        "tf.contrib.data.read_batch_features": "tf.compat.v1.contrib.data.read_batch_features",
+        "tf.contrib.data.reduce_dataset": "tf.compat.v1.contrib.data.reduce_dataset",
+        "tf.contrib.data.rejection_resample": "tf.data.experimental.rejection_resample",
+        "tf.contrib.data.sample_from_datasets": "tf.data.experimental.sample_from_datasets",
+        "tf.contrib.data.scan": "tf.data.experimental.scan",
+        "tf.contrib.data.set_stats_aggregator": "tf.data.experimental.set_stats_aggregator",
+        "tf.contrib.data.shuffle_and_repeat": "tf.data.experimental.shuffle_and_repeat",
+        "tf.contrib.data.sliding_window_batch": "tf.compat.v1.contrib.data.sliding_window_batch",
+        "tf.contrib.data.sloppy_interleave": "tf.compat.v1.contrib.data.sloppy_interleave",
+        "tf.contrib.data.unbatch": "tf.data.experimental.unbatch",
+        "tf.contrib.data.unique": "tf.data.experimental.unique",
+        "tf.contrib.framework.sort": "tf.sort",
+        "tf.contrib.framework.argsort": "tf.argsort",
+        "tf.quantize_v2": "tf.quantization.quantize",
+        "tf.sparse_concat": "tf.sparse.concat",
+        "tf.sparse_split": "tf.sparse.split",
+        "tf.multinomial": "tf.random.categorical",
+        "tf.random.multinomial": "tf.random.categorical",
+        "tf.load_file_system_library": "tf.load_library",
     })
     # pylint: enable=line-too-long
 
+    # For custom behavior and if auto-generate rename in renames_v2.py
+    # is incorrect, add the op name here to exclude it from renames_v2.py.
+    excluded_renames = [
+    ]
+
     # Variables that should be changed to functions.
     self.change_to_function = {}
 
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    self.function_reorders = {}
+    # IMPORTANT: order here should correspond to OLD argument order.
+    # We just prepend "arg_name=" to all arguments in function calls.
+    self.function_reorders = {
+        "tf.argmax": ["input", "axis", "name", "dimension", "output_type"],
+        "tf.argmin": ["input", "axis", "name", "dimension", "output_type"],
+        "tf.boolean_mask": ["tensor", "mask", "name", "axis"],
+        "tf.convert_to_tensor": ["value", "dtype", "name", "preferred_dtype"],
+        "tf.nn.convolution": [
+            "input", "filter", "padding", "strides", "dilation_rate", "name",
+            "data_format"],
+        "tf.nn.crelu": ["features", "name", "axis"],
+        "tf.nn.pool": [
+            "input", "window_shape", "pooling_type", "padding", "dilation_rate",
+            "strides", "name", "data_format"
+        ],
+        "tf.nn.depthwise_conv2d": [
+            "input", "filter", "strides", "padding", "rate", "name",
+            "data_format"
+        ],
+        "tf.multinomial": [
+            "logits", "num_samples", "seed", "name", "output_dtype"
+        ],
+        "tf.random.multinomial": [
+            "logits", "num_samples", "seed", "name", "output_dtype"
+        ],
+        "tf.pad": ["tensor", "paddings", "mode", "name", "constant_values"],
+        "tf.quantize_v2": [
+            "input", "min_range", "max_range", "T", "mode", "name",
+            "round_mode"
+        ],
+        "tf.feature_column.categorical_column_with_vocabulary_file": [
+            "key", "vocabulary_file", "vocabulary_size",
+            "num_oov_buckets", "default_value", "dtype"
+        ],
+        "tf.shape": ["input", "name", "out_type"],
+        "tf.size": ["input", "name", "out_type"],
+        "tf.sparse.concat": [
+            "axis", "sp_inputs", "name", "expand_nonconcat_dim", "concat_dim"
+        ],
+        "tf.random.poisson": ["lam", "shape", "dtype", "seed", "name"],
+        "tf.sparse.segment_mean": [
+            "data", "indices", "segment_ids", "name", "num_segments"
+        ],
+        "tf.sparse.segment_sqrt_n": [
+            "data", "indices", "segment_ids", "name", "num_segments"
+        ],
+        "tf.sparse.segment_sum": [
+            "data", "indices", "segment_ids", "name", "num_segments"
+        ],
+        "tf.strings.length": ["input", "name", "unit"],
+        "tf.transpose": ["a", "perm", "name", "conjugate"],
+        "tf.tuple": ["tensors", "name", "control_inputs"],
+        "tf.while_loop": ["cond", "body", "loop_vars", "shape_invariants",
+                          "parallel_iterations", "back_prop", "swap_memory",
+                          "name", "maximum_iterations",
+                          "return_same_structure"],
+    }
 
     # Specially handled functions.
     self.function_handle = {}
@@ -65,18 +250,103 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "you need to inspect this usage manually.\n"
     )
 
+    # TODO(b/118888586): add default value change to update script.
+    default_loss_reduction_changed = (
+        "WARNING: default value of loss_reduction has been changed to "
+        "SUM_OVER_BATCH_SIZE.\n"
+    )
+
+    assert_return_type_comment = (
+        "WARNING: assert_* functions have been changed to return None, the "
+        "data argument has been removed, and arguments have been reordered."
+    )
+
+    assert_rank_comment = (
+        "WARNING: assert_rank_* functions have been changed to return None, and"
+        " the data and summarize arguments have been removed."
+    )
+
+    tf_01s_like_no_optimize_comment = (
+        "WARNING: tf.zeros_like and tf.ones_like no longer have the optimize "
+        "argument in TF 2.0 or after (also, `tensor' argument is renamed to "
+        "`input')."
+    )
+
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
     self.function_warnings = {
-        "tf.train.exponential_decay": decay_function_comment,
-        "tf.train.piecewise_constant": decay_function_comment,
-        "tf.train.polynomial_decay": decay_function_comment,
-        "tf.train.natural_exp_decay": decay_function_comment,
-        "tf.train.inverse_time_decay": decay_function_comment,
-        "tf.train.cosine_decay": decay_function_comment,
-        "tf.train.cosine_decay_restarts": decay_function_comment,
-        "tf.train.linear_cosine_decay": decay_function_comment,
-        "tf.train.noisy_linear_cosine_decay": decay_function_comment,
+        "tf.assert_greater": assert_return_type_comment,
+        "tf.assert_equal": assert_return_type_comment,
+        "tf.assert_less": assert_return_type_comment,
+        "tf.assert_rank": assert_rank_comment,
+        "tf.debugging.assert_equal": assert_return_type_comment,
+        "tf.debugging.assert_greater": assert_return_type_comment,
+        "tf.debugging.assert_greater_equal": assert_return_type_comment,
+        "tf.debugging.assert_integer": assert_return_type_comment,
+        "tf.debugging.assert_less": assert_return_type_comment,
+        "tf.debugging.assert_less_equal": assert_return_type_comment,
+        "tf.debugging.assert_near": assert_return_type_comment,
+        "tf.debugging.assert_negative": assert_return_type_comment,
+        "tf.debugging.assert_non_negative": assert_return_type_comment,
+        "tf.debugging.assert_non_positive": assert_return_type_comment,
+        "tf.debugging.assert_none_equal": assert_return_type_comment,
+        "tf.debugging.assert_positive": assert_return_type_comment,
+        "tf.debugging.assert_rank": assert_rank_comment,
+        "tf.debugging.assert_rank_at_least": assert_rank_comment,
+        "tf.debugging.assert_rank_in": assert_rank_comment,
+        "tf.train.exponential_decay":
+            decay_function_comment,
+        "tf.train.piecewise_constant":
+            decay_function_comment,
+        "tf.train.polynomial_decay":
+            decay_function_comment,
+        "tf.train.natural_exp_decay":
+            decay_function_comment,
+        "tf.train.inverse_time_decay":
+            decay_function_comment,
+        "tf.train.cosine_decay":
+            decay_function_comment,
+        "tf.train.cosine_decay_restarts":
+            decay_function_comment,
+        "tf.train.linear_cosine_decay":
+            decay_function_comment,
+        "tf.train.noisy_linear_cosine_decay":
+            decay_function_comment,
+        "tf.estimator.LinearClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.LinearRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNLinearCombinedClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNLinearCombinedRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.BaselineClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.BaselineRegressor":
+            default_loss_reduction_changed,
+        "tf.nn.conv1d":
+        "WARNING: use_cudnn_on_gpu argument has been removed and \"value\" was "
+        "renamed to \"input\"",
+        "tf.nn.conv2d":
+        "WARNING: use_cudnn_on_gpu argument has been removed and \"filter\" "
+        "was renamed to \"filters\"",
+        "tf.nn.conv2d_backprop_filter":
+        "WARNING: use_cudnn_on_gpu argument has been removed",
+        "tf.nn.conv2d_backprop_input":
+        "WARNING: use_cudnn_on_gpu argument has been removed and \"filter\" "
+        "was renamed to \"filters\"",
+        "tf.zeros_like": tf_01s_like_no_optimize_comment,
+        "tf.ones_like": tf_01s_like_no_optimize_comment,
+    }
+    # Right now we can't have both a rename and a warning.
+    self.symbol_renames = {
+        name: new_name
+        for name, new_name in self.symbol_renames.items()
+        if name not in self.function_warnings and name not in excluded_renames
     }
 
 
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 6a0c3a787dafdfc307c7b5e0531fe0bac09ca37b..7baa1cafdd08731c4c24725606f52c519435266e 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -64,6 +64,30 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log_sigmoid(3.8))\n")
 
+  def testRenameConstant(self):
+    text = "tf.MONOLITHIC_BUILD\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "tf.sysconfig.MONOLITHIC_BUILD\n")
+    text = "some_call(tf.MONOLITHIC_BUILD)\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "some_call(tf.sysconfig.MONOLITHIC_BUILD)\n")
+
+  def testRenameArgs(self):
+    text = ("tf.nn.pool(input_a, window_shape_a, pooling_type_a, padding_a, "
+            "dilation_rate_a, strides_a, name_a, data_format_a)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text,
+                     ("tf.nn.pool(input=input_a, window_shape=window_shape_a,"
+                      " pooling_type=pooling_type_a, padding=padding_a, "
+                      "dilations=dilation_rate_a, strides=strides_a, "
+                      "name=name_a, data_format=data_format_a)\n"))
+
+  def testReorder(self):
+    text = "tf.boolean_mask(a, b, c, d)\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text,
+                     "tf.boolean_mask(tensor=a, mask=b, name=c, axis=d)\n")
+
   def testLearningRateDecay(self):
     for decay in ["tf.train.exponential_decay", "tf.train.piecewise_constant",
                   "tf.train.polynomial_decay", "tf.train.natural_exp_decay",
@@ -78,6 +102,66 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
       self.assertIn("%s has been changed" % decay, report)
 
+  def testEstimatorLossReductionChange(self):
+    classes = [
+        "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
+        "DNNLinearCombinedRegressor", "DNNRegressor", "DNNClassifier",
+        "BaselineClassifier", "BaselineRegressor"
+    ]
+    for c in classes:
+      ns = "tf.estimator." + c
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual(text, new_text)
+      self.assertEqual(errors, ["test.py:1: %s requires manual check." % ns])
+      self.assertIn("loss_reduction has been changed", report)
+
+  def testCountNonZeroChanges(self):
+    text = (
+        "tf.math.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
+        "reduction_indices=axis, keep_dims=keepdims)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.math.count_nonzero(input=input, dtype=dtype, name=name, "
+        "axis=axis, keepdims=keepdims)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+  def testRandomMultinomialToRandomCategorical(self):
+    text = (
+        "tf.random.multinomial(logits, samples, seed, name, output_dtype)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.random.categorical(logits=logits, num_samples=samples, seed=seed, "
+        "name=name, dtype=output_dtype)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.multinomial(logits, samples, seed, name, output_dtype)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.random.categorical(logits=logits, num_samples=samples, seed=seed, "
+        "name=name, dtype=output_dtype)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+  def testConvolutionOpUpdate(self):
+    text = (
+        "tf.nn.convolution(input, filter, padding, strides, dilation_rate, "
+        "name, data_format)"
+    )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.nn.convolution(input=input, filters=filter, padding=padding, "
+        "strides=strides, dilations=dilation_rate, name=name, "
+        "data_format=data_format)"
+    )
+    self.assertEqual(new_text, expected_text)
+
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
index feb37c902ec3359e6221937f4334ab2504394fa3..0ee4550815568dececd4e88ca520743b8f81948f 100644
--- a/tensorflow/tools/compatibility/update/BUILD
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -9,6 +9,7 @@ py_binary(
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:lib",
+        "//tensorflow/python:no_contrib",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
     ],
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
index 7d6beca3586a79062a9143cde77c978bdabcd440..949946c8276f1822ad428804dd77be0fdbddf892 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -20,10 +20,14 @@ To update renames_v2.py, run:
   bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
 """
 # pylint: enable=line-too-long
+import sys
 
 import tensorflow as tf
 
+# This import is needed so that TensorFlow python modules are in sys.modules.
+from tensorflow import python as tf_python  # pylint: disable=unused-import
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import app
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
 from tensorflow.tools.common import public_api
@@ -59,19 +63,83 @@ from __future__ import print_function
 
 """
 
+_TENSORFLOW_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
+_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+_TENSORFLOW_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
+_TENSORFLOW_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
 
-def update_renames_v2(output_file_path):
-  """Writes a Python dictionary mapping deprecated to canonical API names.
 
-  Args:
-    output_file_path: File path to write output to. Any existing contents
-      would be replaced.
+def get_canonical_name(v2_names, v1_name):
+  if v2_names:
+    return v2_names[0]
+  return 'compat.v1.%s' % v1_name
+
+
+def get_all_v2_names():
+  """Get a set of function/class names available in TensorFlow 2.0."""
+  v2_names = set()  # All op names in TensorFlow 2.0
+
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects TF 2.0 names."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      if not hasattr(attr, '__dict__'):
+        continue
+      api_names_v2 = attr.__dict__.get(_TENSORFLOW_API_ATTR, [])
+      for name in api_names_v2:
+        v2_names.add(name)
+
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  traverse.traverse(tf.compat.v2, visitor)
+  return v2_names
+
+
+def collect_constant_renames():
+  """Looks for constants that need to be renamed in TF 2.0.
+
+  Returns:
+    List of tuples of the form (current name, new name).
+  """
+  renames = set()
+  for module in sys.modules.values():
+    if not hasattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1):
+      continue
+    constants_v1_list = getattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1)
+    constants_v2_list = getattr(module, _TENSORFLOW_CONSTANTS_ATTR)
+
+    # _tf_api_constants attribute contains a list of tuples:
+    # (api_names_list, constant_name)
+    # We want to find API names that are in V1 but not in V2 for the same
+    # constant_names.
+
+    # First, we convert constants_v1_list and constants_v2_list to
+    # dictionaries for easier lookup.
+    constants_v1 = {constant_name: api_names
+                    for api_names, constant_name in constants_v1_list}
+    constants_v2 = {constant_name: api_names
+                    for api_names, constant_name in constants_v2_list}
+    # Second, we look for names that are in V1 but not in V2.
+    for constant_name, api_names_v1 in constants_v1.items():
+      api_names_v2 = constants_v2[constant_name]
+      for name in api_names_v1:
+        if name not in api_names_v2:
+          renames.add((name, get_canonical_name(api_names_v2, name)))
+  return renames
+
+
+def collect_function_renames():
+  """Looks for functions/classes that need to be renamed in TF 2.0.
+
+  Returns:
+    List of tuples of the form (current name, new name).
   """
   # Set of rename lines to write to output file in the form:
   #   'tf.deprecated_name': 'tf.canonical_name'
-  rename_line_set = set()
-  # _tf_api_names attribute name
-  tensorflow_api_attr = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+  renames = set()
 
   def visit(unused_path, unused_parent, children):
     """Visitor that collects rename strings to add to rename_line_set."""
@@ -79,20 +147,48 @@ def update_renames_v2(output_file_path):
       _, attr = tf_decorator.unwrap(child[1])
       if not hasattr(attr, '__dict__'):
         continue
-      api_names = attr.__dict__.get(tensorflow_api_attr, [])
-      deprecated_api_names = attr.__dict__.get('_tf_deprecated_api_names', [])
-      canonical_name = tf_export.get_canonical_name(
-          api_names, deprecated_api_names)
+      api_names_v1 = attr.__dict__.get(_TENSORFLOW_API_ATTR_V1, [])
+      api_names_v2 = attr.__dict__.get(_TENSORFLOW_API_ATTR, [])
+      deprecated_api_names = set(api_names_v1) - set(api_names_v2)
       for name in deprecated_api_names:
-        rename_line_set.add('    \'tf.%s\': \'tf.%s\'' % (name, canonical_name))
+        renames.add((name, get_canonical_name(api_names_v2, name)))
 
   visitor = public_api.PublicAPIVisitor(visit)
   visitor.do_not_descend_map['tf'].append('contrib')
   visitor.do_not_descend_map['tf.compat'] = ['v1', 'v2']
   traverse.traverse(tf, visitor)
 
+  # It is possible that a different function is exported with the
+  # same name. For e.g. when creating a different function to
+  # rename arguments. Exclude it from renames in this case.
+  v2_names = get_all_v2_names()
+  renames = set((name, new_name) for name, new_name in renames
+                if name not in v2_names)
+  return renames
+
+
+def get_rename_line(name, canonical_name):
+  return '    \'tf.%s\': \'tf.%s\'' % (name, canonical_name)
+
+
+def update_renames_v2(output_file_path):
+  """Writes a Python dictionary mapping deprecated to canonical API names.
+
+  Args:
+    output_file_path: File path to write output to. Any existing contents
+      would be replaced.
+  """
+  function_renames = collect_function_renames()
+  constant_renames = collect_constant_renames()
+  all_renames = function_renames.union(constant_renames)
+
+  # List of rename lines to write to output file in the form:
+  #   'tf.deprecated_name': 'tf.canonical_name'
+  rename_lines = [
+      get_rename_line(name, canonical_name)
+      for name, canonical_name in all_renames]
   renames_file_text = '%srenames = {\n%s\n}\n' % (
-      _FILE_HEADER, ',\n'.join(sorted(rename_line_set)))
+      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
   file_io.write_string_to_file(output_file_path, renames_file_text)
 
 
@@ -101,4 +197,4 @@ def main(unused_argv):
 
 
 if __name__ == '__main__':
-  tf.app.run(main=main)
+  app.run(main=main)
diff --git a/tensorflow/tools/docker/LICENSE b/tensorflow/tools/docker/LICENSE
index 28711d7885dbc8013847e063fa6e1f922525388f..dea770e05eeb359ba155c1a207f80852ca7d27aa 100644
--- a/tensorflow/tools/docker/LICENSE
+++ b/tensorflow/tools/docker/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2015 The TensorFlow Authors.  All rights reserved.
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/tools/docker/notebooks/LICENSE b/tensorflow/tools/docker/notebooks/LICENSE
index 28711d7885dbc8013847e063fa6e1f922525388f..dea770e05eeb359ba155c1a207f80852ca7d27aa 100644
--- a/tensorflow/tools/docker/notebooks/LICENSE
+++ b/tensorflow/tools/docker/notebooks/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2015 The TensorFlow Authors.  All rights reserved.
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index aff26bf0fb39d82bc2f6e0e3efd9f80e2fbb1cde..1cac5ee138316cd2f9839d2c67648d7d0703a398 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -112,6 +112,7 @@ pkg_tar(
 genrule(
     name = "clicenses_generate",
     srcs = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
@@ -180,6 +181,7 @@ genrule(
 genrule(
     name = "jnilicenses_generate",
     srcs = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
@@ -204,6 +206,9 @@ genrule(
         "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
+        "@grpc//:LICENSE",
+        "@grpc//third_party/address_sorting:LICENSE",
+        "@grpc//third_party/nanopb:LICENSE.txt",
     ] + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index f9b0a1129b71ffbf83a9ab2ed17153158a2135ed..fa372dcd74b0557d6410feb111c60ef7e94007f5 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -78,7 +78,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
-    "//tensorflow/contrib/signal:test_util",
     "//tensorflow/contrib/slim:slim",
     "//tensorflow/contrib/slim/python/slim/data:data_pip",
     "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
@@ -109,6 +108,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
     "//tensorflow/python/saved_model:saved_model",
     "//tensorflow/python/tools:tools_pip",
@@ -132,6 +132,7 @@ py_binary(
 filegroup(
     name = "licenses",
     data = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 07475cc0c4de6b3cd71795575637a3c06da7c041..e164853428b2f1f70075a7631c7b2e01cb0f8ca6 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -87,7 +87,8 @@ if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
       REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.13.0a0, < 1.14.0a0'
-      break
+    if 'tensorflow_estimator' in pkg:
+      REQUIRED_PACKAGES[i] = 'tf-estimator-nightly'
 
 # weakref.finalize and enum were introduced in Python 3.4
 if sys.version_info < (3, 4):
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 256db46892e1fe2302b853159819d8a197cc556d..101d0e4c7a6bff0752ac36af854b72b12bc5b872 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -112,33 +112,33 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "mkl_dnn",
         build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
-        sha256 = "363cc9239eacf8e7917753c6d8c94f767e4cd049160d0654a61ef32d5e1b3049",
-        strip_prefix = "mkl-dnn-4e333787e0d66a1dca1218e99a891d493dbc8ef1",
+        sha256 = "b100f57af4a2b59a3a37a1ba38f77b644d2107d758a1a7f4e51310063cd21e73",
+        strip_prefix = "mkl-dnn-733fc908874c71a5285043931a1cf80aa923165c",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
-            "https://github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
+            "https://github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "e35082e88b9da04f4d68094c05ba112502a5063712f3021adfa465306d238c76",
-        strip_prefix = "abseil-cpp-cc8dcd307b76a575d2e3e0958a4fe4c7193c2f68",
+        sha256 = "3cf6132129ba87f0781c383bfaf381b7174b5818e81fffcc5d04bb451154f0f2",
+        strip_prefix = "abseil-cpp-f95179062eb65ce40895cc76f1398cce25394369",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/cc8dcd307b76a575d2e3e0958a4fe4c7193c2f68.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/cc8dcd307b76a575d2e3e0958a4fe4c7193c2f68.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "d956415d784fa4e42b6a2a45c32556d6aec9d0a3d8ef48baee2522ab762556a9",
-        strip_prefix = "eigen-eigen-fd6845384b86",
+        sha256 = "8fa7ba1af23f0320be05f4658061138d6eb8dd1f320669cbf305b3a034f9d1c2",
+        strip_prefix = "eigen-eigen-ea671884cc96",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/ea671884cc96.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/ea671884cc96.tar.gz",
         ],
     )
 
@@ -347,11 +347,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     PROTOBUF_URLS = [
-        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz",
-        "https://github.com/google/protobuf/archive/v3.6.0.tar.gz",
+        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.1.tar.gz",
+        "https://github.com/google/protobuf/archive/v3.6.1.tar.gz",
     ]
-    PROTOBUF_SHA256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4"
-    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.0"
+    PROTOBUF_SHA256 = "3d4e589d81b2006ca603c1ab712c9715a76227293032d05b26fca603f90b3f5b"
+    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1"
 
     tf_http_archive(
         name = "protobuf_archive",
@@ -472,11 +472,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "2342cb98083eb1191a8411542dcd57cb3efc28677be4412e166f40cf22bd2b8c",
-        strip_prefix = "llvm-3fe1b12fca949399a3334a072ee7f96e2b6f557e",
+        sha256 = "7b4f705c532ee2aafb6e8b9013ad22ec8bb1823a153cd2d6ddb6b7faef818874",
+        strip_prefix = "llvm-9ad322c7dfd4385be9a515d734f70700f192ebae",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3fe1b12fca949399a3334a072ee7f96e2b6f557e.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/3fe1b12fca949399a3334a072ee7f96e2b6f557e.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/9ad322c7dfd4385be9a515d734f70700f192ebae.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/9ad322c7dfd4385be9a515d734f70700f192ebae.tar.gz",
         ],
     )
 
@@ -805,11 +805,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph",
         build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
-        sha256 = "bf9dcc88e5c66021e3aac80491a231711211540d613bf9b6bd28db3f5bb86b62",
-        strip_prefix = "ngraph-0.8.1",
+        sha256 = "2b28f9c9f063b96825a96d56d7f7978c9a1c55c9b25175c20dd49a8a77cb0305",
+        strip_prefix = "ngraph-0.9.1",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
         ],
     )
 
@@ -827,11 +827,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph_tf",
         build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
-        sha256 = "402f84c748c113780a60f35f39aab118435285543aee4900d712b76fbf8a21ee",
-        strip_prefix = "ngraph-tf-0.6.1",
+        sha256 = "89accbc702e68a09775f1011a99dd16561038fd1ce59d566d64450176abaae5c",
+        strip_prefix = "ngraph-tf-0.7.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
         ],
     )
 
diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index 5fa459caf15c95df050815426e7b4b5ac315862b..a941ee1c998dae14febe2453184fb75d1afe8016 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder):
 
     # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
     # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-    CLANG_REVISION = "343880"
-    CLANG_SUB_REVISION = 1
+    CLANG_REVISION = "346388"
+    CLANG_SUB_REVISION = 3
 
     package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION)
 
     checksums = {
-        "Linux_x64": "3530f53516fd08799e2754601e53a19531e1db5bc73c9ad8d2d1d8efdd9c9c9b",
-        "Mac": "8761b47869089be216324af8c5a93cba2d539a1d252c9c8cad8f2cd6da21f9f4",
-        "Win": "06eb08aa0b1ff7ea65db375a7dc7151cde7c89a44044fb63e5b73ea2f96c6e65",
+        "Linux_x64": "d47b7ac4756c3f8e3bbfa0e81bf199ec8e9faa3a6b11573f0705e9c04af7ad51",
+        "Mac": "de2b0c701e19cda633ea02804866dd24d8506afb8cae51fbcce3415b76f4ded3",
+        "Win": "c7d27f13b41aa9eaaf9760903962e9b2b0f8261058df0d35170711dc60545a7d",
     }
 
     platform_folder = _get_platform_folder(repo_ctx.os.name)
diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD
index 759f8a9be92e14537d334c3ec37f036d369d8796..194a2272d5489c6e193dbae4b96e23ab3290c77a 100644
--- a/third_party/eigen.BUILD
+++ b/third_party/eigen.BUILD
@@ -65,6 +65,7 @@ cc_library(
         # code. We use it, but we do not rely on it, as evidenced above.
         "EIGEN_MPL2_ONLY",
         "EIGEN_MAX_ALIGN_BYTES=64",
+        "EIGEN_HAS_TYPE_TRAITS=0",
     ],
     includes = ["."],
     visibility = ["//visibility:public"],
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
index 5ab36649187a41507f1201804090a801d7f639f9..ff359cedced9610f423d899b3a95b2f8d5f8bba5 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
@@ -249,9 +249,7 @@ EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) {
   a.value /= b.value;
   return a;
 }
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) {
-  return -a.value;
-}
+EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { return -a.value; }
 
 // Scaling QInt32 by double. We do the arithmetic in double because
 // float only has 23 bits of mantissa, so casting QInt32 to float might reduce
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
index e6f4080ae127a93fc7830a8dcded1b74f581188f..8477933e1baebaddf209a9c6c07fa1100d6b10cc 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
@@ -15,11 +15,9 @@ namespace internal {
 
 // Accumulate the product of 2 QInt8 inputs on 32 bits to prevent
 // overflows
-template<> struct scalar_product_traits<QInt8, QInt8>
-{
-  enum {
-    Defined = 1
-  };
+template <>
+struct scalar_product_traits<QInt8, QInt8> {
+  enum { Defined = 1 };
   typedef QInt32 ReturnType;
 };
 
@@ -33,11 +31,9 @@ struct scalar_product_traits<QInt16, QInt16> {
 
 // Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits
 // to prevent overflows
-template<> struct scalar_product_traits<QInt8, QUInt8>
-{
-  enum {
-    Defined = 1
-  };
+template <>
+struct scalar_product_traits<QInt8, QUInt8> {
+  enum { Defined = 1 };
   typedef QInt32 ReturnType;
 };
 
@@ -47,14 +43,16 @@ template<> struct scalar_product_traits<QInt8, QUInt8>
 // signed 8bit integers
 #ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT
 
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
@@ -68,22 +66,24 @@ public:
 };
 
 // The signed 8bit Mat-Mat product itself.
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -113,18 +113,19 @@ void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjugat
 }
 #endif
 
-
 // This definition tackle the case where the lhs is encoded using signed 8bit
 // integers and the rhs using unsigned 8bit integers.
 #ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
@@ -138,22 +139,24 @@ public:
 };
 
 // Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -183,18 +186,19 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }
 #endif
 
-
 // This definition tackle the case where the khs is encoded using unsigned 8bit
 // integers and the rhs using signed 8bit integers.
 #ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QUInt8 LhsScalar;
   typedef QInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
@@ -207,24 +211,25 @@ public:
   };
 };
 
-
 // Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QUInt8* blockA,
+                  const QInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -263,6 +268,9 @@ class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
   typedef QInt16 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
index 66532fb60028789df7495bc54c833622187e79bf..8547dca1b32eb2d11b27b7854cb8ff77efe0a31e 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
@@ -28,6 +28,9 @@ class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
   typedef QInt16 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // Define register blocking scheme.
     nr = 16,
@@ -43,7 +46,7 @@ class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
 // Used by TensorContractionThreadPool, inputs must have dimensions that are
 // multiples of 32.
 template <typename Index, int ShardingType>
-class TensorContractionBlocking<QInt16, QInt16, Index, ShardingType> {
+class TensorContractionBlocking<QInt16, QInt16, QInt16, Index, ShardingType> {
  public:
   TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
       : kc_(((k + 15) / 16) * 16),
@@ -144,7 +147,7 @@ class gemm_blocking_space<ColMajor, QInt16, QInt16, MaxRows, MaxCols, MaxDepth,
 
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
+struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, QInt16, ColMajor,
                      Conjugate, PanelMode> {
   EIGEN_DONT_INLINE void operator()(QInt16* blockA, const DataMapper& lhs,
                                     Index depth, Index rows, Index stride = 0,
@@ -154,12 +157,14 @@ struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2,
-                                     ColMajor, Conjugate, PanelMode>::
+                                     QInt16, ColMajor, Conjugate, PanelMode>::
 operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
            Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt16>::type Packet;
+
   // Use alternate function for weird sizes
   if (rows % 16 != 0 || depth % 16 != 0) {
     assert(false &&
@@ -178,10 +183,10 @@ operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
     // Pack depth in sets of 4
     for (Index k = 0; k < depth; k += 4) {
       // Load vectors
-      __m256i L_A = lhs.loadPacket(m, k);
-      __m256i L_B = lhs.loadPacket(m, k + 1);
-      __m256i L_C = lhs.loadPacket(m, k + 2);
-      __m256i L_D = lhs.loadPacket(m, k + 3);
+      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
+      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
+      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
+      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
 
       // Rearrange the inputs as required by the kernel
       __m256i L_AB0_AB7 = _mm256_unpacklo_epi16(L_A, L_B);
@@ -236,13 +241,15 @@ struct gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
 
 template <typename Index, typename DataMapper, int nr, bool Conjugate,
           bool PanelMode>
-EIGEN_DONT_INLINE void
-gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::
+EIGEN_DONT_INLINE void gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor,
+                                     Conjugate, PanelMode>::
 operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
            Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt16>::type Packet;
+
   // Use alternate function for weird sizes
   if (cols % 16 != 0 || depth % 16 != 0) {
     assert(false &&
@@ -277,28 +284,28 @@ operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
   for (Index n = 0; n < cols; n += 16) {
     // Pack depth in sets of 16
     for (Index k = 0; k < depth; k += 16) {
-      __m256i R_A = rhs.loadPacket(k, n);
-      __m256i R_B = rhs.loadPacket(k, n + 1);
-      __m256i R_C = rhs.loadPacket(k, n + 2);
-      __m256i R_D = rhs.loadPacket(k, n + 3);
+      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
+      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
+      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
+      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 4);
-      R_B = rhs.loadPacket(k, n + 5);
-      R_C = rhs.loadPacket(k, n + 6);
-      R_D = rhs.loadPacket(k, n + 7);
+      R_A = rhs.template loadPacket<Packet>(k, n + 4);
+      R_B = rhs.template loadPacket<Packet>(k, n + 5);
+      R_C = rhs.template loadPacket<Packet>(k, n + 6);
+      R_D = rhs.template loadPacket<Packet>(k, n + 7);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 8);
-      R_B = rhs.loadPacket(k, n + 9);
-      R_C = rhs.loadPacket(k, n + 10);
-      R_D = rhs.loadPacket(k, n + 11);
+      R_A = rhs.template loadPacket<Packet>(k, n + 8);
+      R_B = rhs.template loadPacket<Packet>(k, n + 9);
+      R_C = rhs.template loadPacket<Packet>(k, n + 10);
+      R_D = rhs.template loadPacket<Packet>(k, n + 11);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 12);
-      R_B = rhs.loadPacket(k, n + 13);
-      R_C = rhs.loadPacket(k, n + 14);
-      R_D = rhs.loadPacket(k, n + 15);
+      R_A = rhs.template loadPacket<Packet>(k, n + 12);
+      R_B = rhs.template loadPacket<Packet>(k, n + 13);
+      R_C = rhs.template loadPacket<Packet>(k, n + 14);
+      R_D = rhs.template loadPacket<Packet>(k, n + 15);
       PACK_STEP;
 
       blockB_256 += 12;
@@ -476,9 +483,13 @@ operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
       for (Index j = n; j < n + 16; j++) {
         LinearMapper r0 = res.getLinearMapper(m, j);
         LinearMapper r1 = res.getLinearMapper(m + 8, j);
-
-        r0.storePacket(0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
-        r1.storePacket(0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
+        typedef typename packet_traits<QInt32>::type Packet;
+        r0.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r0.template loadPacket<Packet>(0)));
+        r1.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r1.template loadPacket<Packet>(0)));
       }
 
       // Zero the result block so it can be reused
@@ -496,14 +507,16 @@ operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
 #ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
 
 // Define quantized traits
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // Define register blocking scheme.
     nr = 32,
@@ -518,22 +531,28 @@ public:
 // Specialized blocking for quantized implementations.
 // Used by TensorContractionThreadPool, inputs must have dimensions that are
 // multiples of 32.
-template<typename Index,
-         typename LeftTensor,
-         typename left_nocontract_t, typename left_contract_t,
-         bool left_inner_dim_contiguous, bool left_inner_dim_reordered, int LeftAlignment,
-         typename RightTensor,
-         typename right_nocontract_t, typename right_contract_t,
-         bool right_inner_dim_contiguous, bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
-class TensorContractionBlocking<TensorContractionInputMapper<QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32, left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>, TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor, right_nocontract_t, right_contract_t, 32, right_inner_dim_contiguous, right_inner_dim_reordered, RightAlignment>, Index, ShardingType> {
+template <typename ResScalar, typename Index, typename LeftTensor,
+          typename left_nocontract_t, typename left_contract_t,
+          bool left_inner_dim_contiguous, bool left_inner_dim_reordered,
+          int LeftAlignment, typename RightTensor, typename right_nocontract_t,
+          typename right_contract_t, bool right_inner_dim_contiguous,
+          bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
+class TensorContractionBlocking<
+    ResScalar,
+    TensorContractionInputMapper<
+        QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32,
+        left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>,
+    TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor,
+                                 right_nocontract_t, right_contract_t, 32,
+                                 right_inner_dim_contiguous,
+                                 right_inner_dim_reordered, RightAlignment>,
+    Index, ShardingType> {
  public:
-
-  typedef QInt8  LhsScalar;
+  typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
 
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
-      kc_(k), mc_(m), nc_(n)
-  {
+  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
+      : kc_(k), mc_(m), nc_(n) {
     eigen_assert(m % 32 == 0);
     eigen_assert(k % 32 == 0);
     if (!k || !m || !n) {
@@ -543,8 +562,7 @@ class TensorContractionBlocking<TensorContractionInputMapper<QInt8, Index, Lhs,
     if (ShardingType == ShardByCol) {
       eigen_assert(n % 32 == 0);
       nc_ = (((n / num_threads) + 31) / 32) * 32;
-    }
-    else {
+    } else {
       eigen_assert(n % 32 == 0 || n == 1);
       // Special case to avoid breaking the unimplemented matrix-vector case
       if (n == 1) {
@@ -599,7 +617,6 @@ class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth,
   }
 };
 
-
 template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
 class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
                           KcFactor, false>
@@ -633,42 +650,60 @@ class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
 };
 
 // Alternate templates for any input sizes
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template <typename Scalar, typename Index, typename DataMapper, int Pack1,
+          int Pack2, int StorageOrder, bool Conjugate = false,
+          bool PanelMode = false>
 struct gemm_pack_lhs_any;
-template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()
-      (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2,
+          bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
+                         Conjugate, PanelMode> {
+  EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
+                                    Index depth, Index rows, Index stride = 0,
+                                    Index offset = 0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+template <typename Scalar, typename Index, typename DataMapper, int nr,
+          int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_rhs_any;
-template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()
-      (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate,
+          bool PanelMode>
+struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
+                         PanelMode> {
+  EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template <typename LhsScalar, typename RhsScalar, typename Index,
+          typename DataMapper, int mr, int nr, bool ConjugateLhs = false,
+          bool ConjugateRhs = false>
 struct gebp_kernel_any;
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                       ConjugateRhs> {
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
 // Alternate implementations for any input sizes
-template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>::
-operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+template <typename Index, typename DataMapper, int Pack1, int Pack2,
+          bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2,
+                                         ColMajor, Conjugate, PanelMode>::
+operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
+           Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt8>::type Packet;
+
   // Get vector pointer
   __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
 
@@ -690,15 +725,15 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
     // Pack depth in sets of 8
     for (Index k = 0; k < depth_8; k += 8) {
       // Load vectors
-      __m256i L_A = lhs.loadPacket(m, k);
-      __m256i L_B = lhs.loadPacket(m, k + 1);
+      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
+      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
 
       // Interleave 8-bit elements
       __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
       __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
 
-      __m256i L_C = lhs.loadPacket(m, k + 2);
-      __m256i L_D = lhs.loadPacket(m, k + 3);
+      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
+      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
       __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
       __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
 
@@ -719,12 +754,12 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
       _mm256_store_si256(blockA_256++, L_AD16);
       __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
       _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_E = lhs.loadPacket(m, k + 4);
-      __m256i L_F = lhs.loadPacket(m, k + 5);
+      __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
+      __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
       __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
       __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_G = lhs.loadPacket(m, k + 6);
-      __m256i L_H = lhs.loadPacket(m, k + 7);
+      __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
+      __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
       __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
       __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
       __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
@@ -745,76 +780,76 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
     if (depth_8 < depth) {
       __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
       switch (depth - depth_8) {
-      case 1:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 2:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 3:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 4:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 5:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = lhs.loadPacket(m, depth_8 + 4);
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 6:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = lhs.loadPacket(m, depth_8 + 4);
-        L_F = lhs.loadPacket(m, depth_8 + 5);
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 7:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = lhs.loadPacket(m, depth_8 + 4);
-        L_F = lhs.loadPacket(m, depth_8 + 5);
-        L_G = lhs.loadPacket(m, depth_8 + 6);
-        L_H = _mm256_setzero_si256();
-        break;
+        case 1:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 2:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 3:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 4:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 5:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 6:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
+          L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 7:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
+          L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
+          L_G = lhs.template loadPacket<Packet>(m, depth_8 + 6);
+          L_H = _mm256_setzero_si256();
+          break;
       }
 
       // Interleave 8-bit elements
@@ -875,21 +910,21 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
       __m256i L_G = _mm256_setzero_si256();
       __m256i L_H = _mm256_setzero_si256();
       for (Index m = 0; m < rows - rows_32; m++) {
-        QInt8* ptr = (QInt8*) &L_A;
+        QInt8* ptr = (QInt8*)&L_A;
         ptr[m] = lhs(rows_32 + m, k);
-        ptr = (QInt8*) &L_B;
+        ptr = (QInt8*)&L_B;
         ptr[m] = lhs(rows_32 + m, k + 1);
-        ptr = (QInt8*) &L_C;
+        ptr = (QInt8*)&L_C;
         ptr[m] = lhs(rows_32 + m, k + 2);
-        ptr = (QInt8*) &L_D;
+        ptr = (QInt8*)&L_D;
         ptr[m] = lhs(rows_32 + m, k + 3);
-        ptr = (QInt8*) &L_E;
+        ptr = (QInt8*)&L_E;
         ptr[m] = lhs(rows_32 + m, k + 4);
-        ptr = (QInt8*) &L_F;
+        ptr = (QInt8*)&L_F;
         ptr[m] = lhs(rows_32 + m, k + 5);
-        ptr = (QInt8*) &L_G;
+        ptr = (QInt8*)&L_G;
         ptr[m] = lhs(rows_32 + m, k + 6);
-        ptr = (QInt8*) &L_H;
+        ptr = (QInt8*)&L_H;
         ptr[m] = lhs(rows_32 + m, k + 7);
       }
 
@@ -939,146 +974,146 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
       __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
       QInt8* ptr;
       switch (depth - depth_8) {
-      case 1:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          QInt8* ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-        }
-        break;
-      case 2:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-        }
-        break;
-      case 3:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-        }
-        break;
-      case 4:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-        }
-        break;
-      case 5:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          ptr = (QInt8*) &L_E;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-        }
-        break;
-      case 6:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          ptr = (QInt8*) &L_E;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-          ptr = (QInt8*) &L_F;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 5);
-        }
-        break;
-      case 7:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          ptr = (QInt8*) &L_E;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-          ptr = (QInt8*) &L_F;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 5);
-          ptr = (QInt8*) &L_G;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 6);
-        }
-        break;
+        case 1:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            QInt8* ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+          }
+          break;
+        case 2:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+          }
+          break;
+        case 3:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+          }
+          break;
+        case 4:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+          }
+          break;
+        case 5:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+            ptr = (QInt8*)&L_E;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
+          }
+          break;
+        case 6:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+            ptr = (QInt8*)&L_E;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
+            ptr = (QInt8*)&L_F;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 5);
+          }
+          break;
+        case 7:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+            ptr = (QInt8*)&L_E;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
+            ptr = (QInt8*)&L_F;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 5);
+            ptr = (QInt8*)&L_G;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 6);
+          }
+          break;
       }
 
       // Interleave 8-bit elements
@@ -1124,12 +1159,17 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
   }
 }
 
-template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::
-operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+template <typename Index, typename DataMapper, int nr, bool Conjugate,
+          bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr,
+                                         ColMajor, Conjugate, PanelMode>::
+operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
+           Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QUInt8>::type Packet;
+
   // Get vector pointer
   __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
 
@@ -1158,52 +1198,52 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
   for (Index n = 0; n < cols_32; n += 32) {
     // Pack depth in sets of 32
     for (Index k = 0; k < depth_32; k += 32) {
-      __m256i R_A = rhs.loadPacket(k, n);
-      __m256i R_B = rhs.loadPacket(k, n + 1);
-      __m256i R_C = rhs.loadPacket(k, n + 2);
-      __m256i R_D = rhs.loadPacket(k, n + 3);
+      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
+      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
+      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
+      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 4);
-      R_B = rhs.loadPacket(k, n + 5);
-      R_C = rhs.loadPacket(k, n + 6);
-      R_D = rhs.loadPacket(k, n + 7);
+      R_A = rhs.template loadPacket<Packet>(k, n + 4);
+      R_B = rhs.template loadPacket<Packet>(k, n + 5);
+      R_C = rhs.template loadPacket<Packet>(k, n + 6);
+      R_D = rhs.template loadPacket<Packet>(k, n + 7);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 8);
-      R_B = rhs.loadPacket(k, n + 9);
-      R_C = rhs.loadPacket(k, n + 10);
-      R_D = rhs.loadPacket(k, n + 11);
+      R_A = rhs.template loadPacket<Packet>(k, n + 8);
+      R_B = rhs.template loadPacket<Packet>(k, n + 9);
+      R_C = rhs.template loadPacket<Packet>(k, n + 10);
+      R_D = rhs.template loadPacket<Packet>(k, n + 11);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 12);
-      R_B = rhs.loadPacket(k, n + 13);
-      R_C = rhs.loadPacket(k, n + 14);
-      R_D = rhs.loadPacket(k, n + 15);
+      R_A = rhs.template loadPacket<Packet>(k, n + 12);
+      R_B = rhs.template loadPacket<Packet>(k, n + 13);
+      R_C = rhs.template loadPacket<Packet>(k, n + 14);
+      R_D = rhs.template loadPacket<Packet>(k, n + 15);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 16);
-      R_B = rhs.loadPacket(k, n + 17);
-      R_C = rhs.loadPacket(k, n + 18);
-      R_D = rhs.loadPacket(k, n + 19);
+      R_A = rhs.template loadPacket<Packet>(k, n + 16);
+      R_B = rhs.template loadPacket<Packet>(k, n + 17);
+      R_C = rhs.template loadPacket<Packet>(k, n + 18);
+      R_D = rhs.template loadPacket<Packet>(k, n + 19);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 20);
-      R_B = rhs.loadPacket(k, n + 21);
-      R_C = rhs.loadPacket(k, n + 22);
-      R_D = rhs.loadPacket(k, n + 23);
+      R_A = rhs.template loadPacket<Packet>(k, n + 20);
+      R_B = rhs.template loadPacket<Packet>(k, n + 21);
+      R_C = rhs.template loadPacket<Packet>(k, n + 22);
+      R_D = rhs.template loadPacket<Packet>(k, n + 23);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 24);
-      R_B = rhs.loadPacket(k, n + 25);
-      R_C = rhs.loadPacket(k, n + 26);
-      R_D = rhs.loadPacket(k, n + 27);
+      R_A = rhs.template loadPacket<Packet>(k, n + 24);
+      R_B = rhs.template loadPacket<Packet>(k, n + 25);
+      R_C = rhs.template loadPacket<Packet>(k, n + 26);
+      R_D = rhs.template loadPacket<Packet>(k, n + 27);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 28);
-      R_B = rhs.loadPacket(k, n + 29);
-      R_C = rhs.loadPacket(k, n + 30);
-      R_D = rhs.loadPacket(k, n + 31);
+      R_A = rhs.template loadPacket<Packet>(k, n + 28);
+      R_B = rhs.template loadPacket<Packet>(k, n + 29);
+      R_C = rhs.template loadPacket<Packet>(k, n + 30);
+      R_D = rhs.template loadPacket<Packet>(k, n + 31);
       PACK_STEP;
 
       blockB_256 += 24;
@@ -1216,13 +1256,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       __m256i R_C = _mm256_setzero_si256();
       __m256i R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 1);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 2);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 3);
       }
       PACK_STEP;
@@ -1232,13 +1272,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 4);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 5);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 6);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 7);
       }
       PACK_STEP;
@@ -1248,13 +1288,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 8);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 9);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 10);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 11);
       }
       PACK_STEP;
@@ -1264,13 +1304,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 12);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 13);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 14);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 15);
       }
       PACK_STEP;
@@ -1280,13 +1320,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 16);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 17);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 18);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 19);
       }
       PACK_STEP;
@@ -1296,13 +1336,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 20);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 21);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 22);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 23);
       }
       PACK_STEP;
@@ -1312,13 +1352,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 24);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 25);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 26);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 27);
       }
       PACK_STEP;
@@ -1328,13 +1368,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 28);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 29);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 30);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 31);
       }
       PACK_STEP;
@@ -1350,34 +1390,34 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       Index n;
       for (n = cols_32; n < cols; n += 4) {
         switch (cols - n) {
-        case 1:
-          R_A = rhs.loadPacket(k, n);
-          R_B = _mm256_setzero_si256();
-          R_C = _mm256_setzero_si256();
-          R_D = _mm256_setzero_si256();
-          PACK_STEP;
-          break;
-        case 2:
-          R_A = rhs.loadPacket(k, n);
-          R_B = rhs.loadPacket(k, n + 1);
-          R_C = _mm256_setzero_si256();
-          R_D = _mm256_setzero_si256();
-          PACK_STEP;
-          break;
-        case 3:
-          R_A = rhs.loadPacket(k, n);
-          R_B = rhs.loadPacket(k, n + 1);
-          R_C = rhs.loadPacket(k, n + 2);
-          R_D = _mm256_setzero_si256();
-          PACK_STEP;
-          break;
-        default:
-          R_A = rhs.loadPacket(k, n);
-          R_B = rhs.loadPacket(k, n + 1);
-          R_C = rhs.loadPacket(k, n + 2);
-          R_D = rhs.loadPacket(k, n + 3);
-          PACK_STEP;
-          break;
+          case 1:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = _mm256_setzero_si256();
+            R_C = _mm256_setzero_si256();
+            R_D = _mm256_setzero_si256();
+            PACK_STEP;
+            break;
+          case 2:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = rhs.template loadPacket<Packet>(k, n + 1);
+            R_C = _mm256_setzero_si256();
+            R_D = _mm256_setzero_si256();
+            PACK_STEP;
+            break;
+          case 3:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = rhs.template loadPacket<Packet>(k, n + 1);
+            R_C = rhs.template loadPacket<Packet>(k, n + 2);
+            R_D = _mm256_setzero_si256();
+            PACK_STEP;
+            break;
+          default:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = rhs.template loadPacket<Packet>(k, n + 1);
+            R_C = rhs.template loadPacket<Packet>(k, n + 2);
+            R_D = rhs.template loadPacket<Packet>(k, n + 3);
+            PACK_STEP;
+            break;
         }
       }
 
@@ -1394,46 +1434,46 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
         __m256i R_C = _mm256_setzero_si256();
         __m256i R_D = _mm256_setzero_si256();
         switch (cols - n) {
-        case 1:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-          }
-          PACK_STEP;
-          break;
-        case 2:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-            ptr = (QUInt8*) &R_B;
-            ptr[k - depth_32] = rhs(k, n + 1);
-          }
-          PACK_STEP;
-          break;
-        case 3:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-            ptr = (QUInt8*) &R_B;
-            ptr[k - depth_32] = rhs(k, n + 1);
-            ptr = (QUInt8*) &R_C;
-            ptr[k - depth_32] = rhs(k, n + 2);
-          }
-          PACK_STEP;
-          break;
-        default:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-            ptr = (QUInt8*) &R_B;
-            ptr[k - depth_32] = rhs(k, n + 1);
-            ptr = (QUInt8*) &R_C;
-            ptr[k - depth_32] = rhs(k, n + 2);
-            ptr = (QUInt8*) &R_D;
-            ptr[k - depth_32] = rhs(k, n + 3);
-          }
-          PACK_STEP;
-          break;
+          case 1:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+            }
+            PACK_STEP;
+            break;
+          case 2:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+              ptr = (QUInt8*)&R_B;
+              ptr[k - depth_32] = rhs(k, n + 1);
+            }
+            PACK_STEP;
+            break;
+          case 3:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+              ptr = (QUInt8*)&R_B;
+              ptr[k - depth_32] = rhs(k, n + 1);
+              ptr = (QUInt8*)&R_C;
+              ptr[k - depth_32] = rhs(k, n + 2);
+            }
+            PACK_STEP;
+            break;
+          default:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+              ptr = (QUInt8*)&R_B;
+              ptr[k - depth_32] = rhs(k, n + 1);
+              ptr = (QUInt8*)&R_C;
+              ptr[k - depth_32] = rhs(k, n + 2);
+              ptr = (QUInt8*)&R_D;
+              ptr[k - depth_32] = rhs(k, n + 3);
+            }
+            PACK_STEP;
+            break;
         }
       }
     }
@@ -1441,13 +1481,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
 #undef PACK_STEP
 }
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                       ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   eigen_assert(alpha.value == 1);
@@ -1678,17 +1718,21 @@ void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Con
           LinearMapper r1 = res.getLinearMapper(m + 8, j);
           LinearMapper r2 = res.getLinearMapper(m + 16, j);
           LinearMapper r3 = res.getLinearMapper(m + 24, j);
-          r0.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
-          r1.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
-          r2.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0)));
-          r3.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0)));
+          typedef typename packet_traits<QInt32>::type Packet;
+          r0.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r0.template loadPacket<Packet>(0)));
+          r1.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r1.template loadPacket<Packet>(0)));
+          r2.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r2.template loadPacket<Packet>(0)));
+          r3.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r3.template loadPacket<Packet>(0)));
         }
-      }
-      else {
+      } else {
         for (Index j = n; j < cols; j++) {
           for (Index i = m; i < rows; i++) {
             res(i, j) = blockO[(j - n) * 32 + (i - m)];
@@ -1745,7 +1789,7 @@ void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Con
 // madd both perform an adjacent addition in the kernel.
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
+struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, QInt8, ColMajor,
                      Conjugate, PanelMode> {
   EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
                                     Index depth, Index rows, Index stride = 0,
@@ -1755,15 +1799,18 @@ struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2,
-                                     ColMajor, Conjugate, PanelMode>::
+                                     QInt8, ColMajor, Conjugate, PanelMode>::
 operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
            Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt8>::type Packet;
+
   // Use alternate function for weird sizes
   if (rows % 32 != 0 || depth % 32 != 0) {
-    gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> lhs_pack;
+    gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
+                      Conjugate, PanelMode> lhs_pack;
     return lhs_pack(blockA, lhs, depth, rows, stride, offset);
   }
 
@@ -1775,15 +1822,15 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
     // Pack depth in sets of 8
     for (Index k = 0; k < depth; k += 8) {
       // Load vectors
-      __m256i L_A = lhs.loadPacket(m, k);
-      __m256i L_B = lhs.loadPacket(m, k + 1);
+      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
+      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
 
       // Interleave 8-bit elements
       __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
       __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
 
-      __m256i L_C = lhs.loadPacket(m, k + 2);
-      __m256i L_D = lhs.loadPacket(m, k + 3);
+      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
+      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
       __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
       __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
 
@@ -1804,12 +1851,12 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
       _mm256_store_si256(blockA_256++, L_AD16);
       __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
       _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_E = lhs.loadPacket(m, k + 4);
-      __m256i L_F = lhs.loadPacket(m, k + 5);
+      __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
+      __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
       __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
       __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_G = lhs.loadPacket(m, k + 6);
-      __m256i L_H = lhs.loadPacket(m, k + 7);
+      __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
+      __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
       __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
       __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
       __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
@@ -1868,9 +1915,12 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QUInt8>::type Packet;
+
   // Use alternate function for weird sizes
   if (cols % 32 != 0 || depth % 32 != 0) {
-    gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> rhs_pack;
+    gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
+                      PanelMode> rhs_pack;
     return rhs_pack(blockB, rhs, depth, cols, stride, offset);
   }
 
@@ -1898,52 +1948,52 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
   for (Index n = 0; n < cols; n += 32) {
     // Pack depth in sets of 32
     for (Index k = 0; k < depth; k += 32) {
-      __m256i R_A = rhs.loadPacket(k, n);
-      __m256i R_B = rhs.loadPacket(k, n + 1);
-      __m256i R_C = rhs.loadPacket(k, n + 2);
-      __m256i R_D = rhs.loadPacket(k, n + 3);
+      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
+      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
+      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
+      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 4);
-      R_B = rhs.loadPacket(k, n + 5);
-      R_C = rhs.loadPacket(k, n + 6);
-      R_D = rhs.loadPacket(k, n + 7);
+      R_A = rhs.template loadPacket<Packet>(k, n + 4);
+      R_B = rhs.template loadPacket<Packet>(k, n + 5);
+      R_C = rhs.template loadPacket<Packet>(k, n + 6);
+      R_D = rhs.template loadPacket<Packet>(k, n + 7);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 8);
-      R_B = rhs.loadPacket(k, n + 9);
-      R_C = rhs.loadPacket(k, n + 10);
-      R_D = rhs.loadPacket(k, n + 11);
+      R_A = rhs.template loadPacket<Packet>(k, n + 8);
+      R_B = rhs.template loadPacket<Packet>(k, n + 9);
+      R_C = rhs.template loadPacket<Packet>(k, n + 10);
+      R_D = rhs.template loadPacket<Packet>(k, n + 11);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 12);
-      R_B = rhs.loadPacket(k, n + 13);
-      R_C = rhs.loadPacket(k, n + 14);
-      R_D = rhs.loadPacket(k, n + 15);
+      R_A = rhs.template loadPacket<Packet>(k, n + 12);
+      R_B = rhs.template loadPacket<Packet>(k, n + 13);
+      R_C = rhs.template loadPacket<Packet>(k, n + 14);
+      R_D = rhs.template loadPacket<Packet>(k, n + 15);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 16);
-      R_B = rhs.loadPacket(k, n + 17);
-      R_C = rhs.loadPacket(k, n + 18);
-      R_D = rhs.loadPacket(k, n + 19);
+      R_A = rhs.template loadPacket<Packet>(k, n + 16);
+      R_B = rhs.template loadPacket<Packet>(k, n + 17);
+      R_C = rhs.template loadPacket<Packet>(k, n + 18);
+      R_D = rhs.template loadPacket<Packet>(k, n + 19);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 20);
-      R_B = rhs.loadPacket(k, n + 21);
-      R_C = rhs.loadPacket(k, n + 22);
-      R_D = rhs.loadPacket(k, n + 23);
+      R_A = rhs.template loadPacket<Packet>(k, n + 20);
+      R_B = rhs.template loadPacket<Packet>(k, n + 21);
+      R_C = rhs.template loadPacket<Packet>(k, n + 22);
+      R_D = rhs.template loadPacket<Packet>(k, n + 23);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 24);
-      R_B = rhs.loadPacket(k, n + 25);
-      R_C = rhs.loadPacket(k, n + 26);
-      R_D = rhs.loadPacket(k, n + 27);
+      R_A = rhs.template loadPacket<Packet>(k, n + 24);
+      R_B = rhs.template loadPacket<Packet>(k, n + 25);
+      R_C = rhs.template loadPacket<Packet>(k, n + 26);
+      R_D = rhs.template loadPacket<Packet>(k, n + 27);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 28);
-      R_B = rhs.loadPacket(k, n + 29);
-      R_C = rhs.loadPacket(k, n + 30);
-      R_D = rhs.loadPacket(k, n + 31);
+      R_A = rhs.template loadPacket<Packet>(k, n + 28);
+      R_B = rhs.template loadPacket<Packet>(k, n + 29);
+      R_C = rhs.template loadPacket<Packet>(k, n + 30);
+      R_D = rhs.template loadPacket<Packet>(k, n + 31);
       PACK_STEP;
 
       blockB_256 += 24;
@@ -1953,24 +2003,26 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
 }
 
 // Perform the actual multiplication on packed inputs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   eigen_assert(alpha.value == 1);
@@ -1986,8 +2038,10 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 
   // Use alternate function for weird sizes
   if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) {
-    gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> gebp;
-    return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+    gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                    ConjugateRhs> gebp;
+    return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB,
+                offsetA, offsetB);
   }
 
   // Create result block
@@ -2205,14 +2259,19 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
         LinearMapper r1 = res.getLinearMapper(m + 8, j);
         LinearMapper r2 = res.getLinearMapper(m + 16, j);
         LinearMapper r3 = res.getLinearMapper(m + 24, j);
-        r0.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
-        r1.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
-        r2.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0)));
-        r3.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0)));
+        typedef typename packet_traits<QInt32>::type Packet;
+        r0.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r0.template loadPacket<Packet>(0)));
+        r1.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r1.template loadPacket<Packet>(0)));
+        r2.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r2.template loadPacket<Packet>(0)));
+        r3.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r3.template loadPacket<Packet>(0)));
       }
 
       // Zero the result block so it can be reused
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
index 9cd31570231173337ef0a7049171055bca897be4..9e0efae6c9b3516bbc130be44d87d18e62038237 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
@@ -14,15 +14,14 @@
 namespace Eigen {
 namespace internal {
 
-
-// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit
+// AVX2 optimized implementation of the case where the lhs is encoded using
+// signed 8bit
 // integers and the rhs using unsigned 8bit integers.
 #ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
 
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
   typedef QInt32 ResScalar;
@@ -40,22 +39,24 @@ public:
 };
 
 // Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -85,7 +86,6 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }
 #endif
 
-
 }  // namespace internal
 }  // namespace Eigen
 
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
index ad11d3d44b813830c87f2634a9234adfeac80329..f15200caba5d14e08c0bc3cc51f3f8bcc7f5debe 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
@@ -15,25 +15,23 @@ namespace internal {
 
 // Mat-Vec product
 // Both lhs and rhs are encoded as 8bit signed integers
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>
-{
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  QInt32* res, Index resIncr,
-  QInt8 alpha);
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
+                                     ConjugateLhs, QInt8, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QInt8 alpha);
 };
 
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run(
-    Index rows, Index cols,
-    const LhsMapper& lhs,
-    const RhsMapper& rhs,
-    QInt32* res, Index resIncr,
-    QInt8 alpha)
-{
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QInt8 alpha) {
   eigen_assert(alpha.value == 1);
   eigen_assert(resIncr == 1);
   eigen_assert(rows > 0);
@@ -78,26 +76,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<
 }
 
 // Mat-Vec product
-// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>
-{
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  QInt32* res, Index resIncr,
-  QUInt8 alpha);
+// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned
+// integers
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
+                                     ConjugateLhs, QUInt8, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QUInt8 alpha);
 };
 
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>::run(
-    Index rows, Index cols,
-    const LhsMapper& lhs,
-    const RhsMapper& rhs,
-    QInt32* res, Index resIncr,
-    QUInt8 alpha)
-{
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QUInt8, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QUInt8 alpha) {
   eigen_assert(alpha.value == 1);
   eigen_assert(resIncr == 1);
   eigen_assert(rows > 0);
@@ -110,28 +107,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMa
   }
 }
 
-
 // Mat-Vec product
-// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed integers
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>
-{
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  QInt32* res, Index resIncr,
-  QInt8 alpha);
+// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed
+// integers
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QUInt8, LhsMapper, ColMajor,
+                                     ConjugateLhs, QInt8, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QInt8 alpha);
 };
 
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run(
-    Index rows, Index cols,
-    const LhsMapper& lhs,
-    const RhsMapper& rhs,
-    QInt32* res, Index resIncr,
-    QInt8 alpha)
-{
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QUInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QInt8 alpha) {
   eigen_assert(alpha.value == 1);
   eigen_assert(resIncr == 1);
   eigen_assert(rows > 0);
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 3abd4ee49c2a6596ff9545faddedf926b4da857f..223ea4d58bf4c40b2790e2f5d73e2a4fc1a79eec 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -8,24 +8,20 @@
 
 #endif
 
-inline int _mm256_extract_epi16_N0(const __m256i X)
-{
-	return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
+inline int _mm256_extract_epi16_N0(const __m256i X) {
+  return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
 }
 
-inline int _mm256_extract_epi16_N1(const __m256i X)
-{
-	return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
+inline int _mm256_extract_epi16_N1(const __m256i X) {
+  return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
 }
 
-inline int _mm256_extract_epi8_N0(const __m256i X)
-{
-	return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
+inline int _mm256_extract_epi8_N0(const __m256i X) {
+  return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
 }
 
-inline int _mm256_extract_epi8_N1(const __m256i X)
-{
-	return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
+inline int _mm256_extract_epi8_N1(const __m256i X) {
+  return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
 }
 
 namespace Eigen {
@@ -34,56 +30,56 @@ namespace internal {
 typedef struct Packet32q8i {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet32q8i();
+  Packet32q8i() : val(_mm256_setzero_si256()){};
   Packet32q8i(__m256i val) : val(val) {}
 } Packet32q8i;
 
 typedef struct Packet16q16i {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet16q16i();
+  Packet16q16i() : val(_mm256_setzero_si256()){};
   Packet16q16i(__m256i val) : val(val) {}
 } Packet16q16i;
 
 typedef struct Packet32q8u {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet32q8u();
+  Packet32q8u() : val(_mm256_setzero_si256()){};
   Packet32q8u(__m256i val) : val(val) {}
 } Packet32q8u;
 
 typedef struct Packet16q8i {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet16q8i();
+  Packet16q8i() : val(_mm_setzero_si128()) {}
   Packet16q8i(__m128i val) : val(val) {}
 } Packet16q8i;
 
 typedef struct Packet16q8u {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet16q8u();
+  Packet16q8u() : val(_mm_setzero_si128()) {}
   Packet16q8u(__m128i val) : val(val) {}
 } Packet16q8u;
 
 typedef struct Packet8q16i {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet8q16i();
+  Packet8q16i() : val(_mm_setzero_si128()) {}
   Packet8q16i(__m128i val) : val(val) {}
 } Packet8q16i;
 
 typedef struct Packet8q32i {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet8q32i();
+  Packet8q32i() : val(_mm256_setzero_si256()){};
   Packet8q32i(__m256i val) : val(val) {}
 } Packet8q32i;
 
 typedef struct Packet4q32i {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet4q32i();
+  Packet4q32i() : val(_mm_setzero_si128()) {}
   Packet4q32i(__m128i val) : val(val) {}
 } Packet4q32i;
 
@@ -182,25 +178,25 @@ template <>
 struct unpacket_traits<Packet32q8i> {
   typedef QInt8 type;
   typedef Packet16q8i half;
-  enum { size = 32, alignment=Aligned32 };
+  enum { size = 32, alignment = Aligned32 };
 };
 template <>
 struct unpacket_traits<Packet16q16i> {
   typedef QInt16 type;
   typedef Packet8q16i half;
-  enum { size = 16, alignment=Aligned32 };
+  enum { size = 16, alignment = Aligned32 };
 };
 template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
-  enum { size = 32, alignment=Aligned32 };
+  enum { size = 32, alignment = Aligned32 };
 };
 template <>
 struct unpacket_traits<Packet8q32i> {
   typedef QInt32 type;
   typedef Packet4q32i half;
-  enum { size = 8, alignment=Aligned32 };
+  enum { size = 8, alignment = Aligned32 };
 };
 
 // Unaligned load
@@ -455,40 +451,47 @@ EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
   __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp =
+      _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_min_epi8(tmp,
+                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
   __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp =
+      _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_max_epi8(tmp,
+                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
 }
 
 // Vectorized scaling of Packet32q8i by float.
-template<>
+template <>
 struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
   typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
 #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
 #else
-  scalar_product_op() {
-    EIGEN_SCALAR_BINARY_OP_PLUGIN
-  }
+  scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
+  operator()(const QInt32& a, const double& b) const {
+    return a * b;
+  }
 
-  EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const {
+  EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
+                                                 const double& b) const {
     __m256d scale = _mm256_set1_pd(b);
     __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
     __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
     __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
     __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
-    return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
+                                   1);
   }
 };
 
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
index 2092ce1d4c92754ce52b78f6a6e5fe814d4b7aaa..84750c1945a6125bf6be92647106de535c90a21f 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -127,25 +127,25 @@ template <>
 struct unpacket_traits<Packet64q8i> {
   typedef QInt8 type;
   typedef Packet32q8i half;
-  enum { size = 64, alignment=Aligned64 };
+  enum { size = 64, alignment = Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet32q16i> {
   typedef QInt16 type;
   typedef Packet16q16i half;
-  enum { size = 32, alignment=Aligned64 };
+  enum { size = 32, alignment = Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet64q8u> {
   typedef QUInt8 type;
   typedef Packet32q8u half;
-  enum { size = 64, alignment=Aligned64 };
+  enum { size = 64, alignment = Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet16q32i> {
   typedef QInt32 type;
   typedef Packet8q32i half;
-  enum { size = 16, alignment=Aligned64 };
+  enum { size = 16, alignment = Aligned64 };
 };
 
 // Unaligned load
@@ -244,7 +244,7 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
 template <>
 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
   return static_cast<uint8_t>(
-           _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
+      _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
@@ -410,9 +410,7 @@ EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
       _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
   res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(
-           _mm_min_epi32(
-             res,
-             _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+      _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
 }
 template <>
 EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
@@ -424,9 +422,7 @@ EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
       _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
   res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(
-           _mm_max_epi32(
-             res,
-             _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+      _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
 }
 template <>
 EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
@@ -437,13 +433,10 @@ EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
   Packet4i res =
       _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
   res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::int16_t>(w >> 16),
-           static_cast<std::int16_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
@@ -454,13 +447,10 @@ EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
   Packet4i res =
       _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
   res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::max({
-           static_cast<std::int16_t>(w >> 16),
-           static_cast<std::int16_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::max(
+      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
@@ -471,15 +461,11 @@ EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
   Packet4i res =
       _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
   res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::uint8_t>(w >> 24),
-           static_cast<std::uint8_t>(w >> 16),
-           static_cast<std::uint8_t>(w >> 8),
-           static_cast<std::uint8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
+       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
@@ -490,15 +476,11 @@ EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
   Packet4i res =
       _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
   res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::max({
-           static_cast<std::uint8_t>(w >> 24),
-           static_cast<std::uint8_t>(w >> 16),
-           static_cast<std::uint8_t>(w >> 8),
-           static_cast<std::uint8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::max(
+      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
+       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
@@ -509,15 +491,11 @@ EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
   Packet4i res =
       _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
   res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::int8_t>(w >> 24),
-           static_cast<std::int8_t>(w >> 16),
-           static_cast<std::int8_t>(w >> 8),
-           static_cast<std::int8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
+       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
@@ -528,15 +506,11 @@ EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
   Packet4i res =
       _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
   res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::int8_t>(w >> 24),
-           static_cast<std::int8_t>(w >> 16),
-           static_cast<std::int8_t>(w >> 8),
-           static_cast<std::int8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
+       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
 }
 
 }  // end namespace internal
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
index a09eac67070477ad4b7ad7fd041800d1d815cac3..d3b02402971145f0bee4eec0f02dae24431a1da5 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
@@ -33,28 +33,23 @@ struct type_casting_traits<float, QInt16> {
 };
 
 template <>
-EIGEN_STRONG_INLINE Packet32q16i
-pcast<Packet16f>(const Packet16f& a, const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16f>(const Packet16f& a,
+                                                  const Packet16f& b) {
   Packet16i a_int = _mm512_cvtps_epi32(a);
   Packet16i b_int = _mm512_cvtps_epi32(b);
 #ifdef EIGEN_VECTORIZE_AVX512BW
   return _mm512_packs_epi32(a_int, b_int);
 #else
-  Packet8i ab_int16_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_castsi512_si256(a_int),
-          _mm512_castsi512_si256(b_int)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i ab_int16_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_extracti32x8_epi32(a_int, 1),
-          _mm512_extracti32x8_epi32(b_int, 1)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  return _mm512_inserti32x8(
-           _mm512_castsi256_si512(ab_int16_low),
-           ab_int16_high, 1);
+  Packet8i ab_int16_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
+                         _mm512_castsi512_si256(b_int)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i ab_int16_high = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
+                         _mm512_extracti32x8_epi32(b_int, 1)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high,
+                            1);
 #endif
 }
 
@@ -64,55 +59,41 @@ struct type_casting_traits<float, QInt8> {
 };
 
 template <>
-EIGEN_STRONG_INLINE Packet64q8i
-pcast<Packet16f>(const Packet16f& a,
-                 const Packet16f& b,
-                 const Packet16f& c,
-                 const Packet16f& d) {
+EIGEN_STRONG_INLINE Packet64q8i pcast<Packet16f>(const Packet16f& a,
+                                                 const Packet16f& b,
+                                                 const Packet16f& c,
+                                                 const Packet16f& d) {
   Packet16i a_int = _mm512_cvtps_epi32(a);
   Packet16i b_int = _mm512_cvtps_epi32(b);
   Packet16i c_int = _mm512_cvtps_epi32(c);
   Packet16i d_int = _mm512_cvtps_epi32(d);
 #ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_packs_epi16(
-           _mm512_packs_epi32(a_int, b_int),
-           _mm512_packs_epi32(c_int, d_int));
+  return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int),
+                            _mm512_packs_epi32(c_int, d_int));
 #else
-  Packet8i ab_int16_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_castsi512_si256(a_int),
-          _mm512_castsi512_si256(b_int)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i cd_int16_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_castsi512_si256(c_int),
-          _mm512_castsi512_si256(d_int)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i ab_int16_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_extracti32x8_epi32(a_int, 1),
-          _mm512_extracti32x8_epi32(b_int, 1)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i cd_int16_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_extracti32x8_epi32(c_int, 1),
-          _mm512_extracti32x8_epi32(d_int, 1)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i abcd_int8_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi16(ab_int16_low, cd_int16_low),
-        _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i ab_int16_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
+                         _mm512_castsi512_si256(b_int)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i cd_int16_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_castsi512_si256(c_int),
+                         _mm512_castsi512_si256(d_int)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i ab_int16_high = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
+                         _mm512_extracti32x8_epi32(b_int, 1)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i cd_int16_high = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1),
+                         _mm512_extracti32x8_epi32(d_int, 1)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i abcd_int8_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3));
   Packet8i abcd_int8_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi16(ab_int16_high, cd_int16_high),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  return _mm512_inserti32x8(
-           _mm512_castsi256_si512(abcd_int8_low),
-           abcd_int8_high, 1);
+      _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high),
+                               _MM_SHUFFLE(0, 2, 1, 3));
+  return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low),
+                            abcd_int8_high, 1);
 #endif
 }
 
@@ -128,10 +109,8 @@ struct type_casting_traits<QInt32, QInt16> {
 
 template <>
 EIGEN_STRONG_INLINE Packet64q8i
-pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a,
-                                 const Packet16q32i& b,
-                                 const Packet16q32i& c,
-                                 const Packet16q32i& d) {
+pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a, const Packet16q32i& b,
+                                 const Packet16q32i& c, const Packet16q32i& d) {
   __m128i a_part = _mm512_cvtsepi32_epi8(a);
   __m128i b_part = _mm512_cvtsepi32_epi8(b);
   __m128i c_part = _mm512_cvtsepi32_epi8(c);
@@ -145,9 +124,8 @@ pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a,
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32q16i
-pcast<Packet16q32i, Packet32q16i>(const Packet16q32i& a,
-                                  const Packet16q32i& b) {
+EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16q32i, Packet32q16i>(
+    const Packet16q32i& a, const Packet16q32i& b) {
   __m256i a_part = _mm512_cvtsepi32_epi16(a);
   __m256i b_part = _mm512_cvtsepi32_epi16(b);
   __m512i converted =
diff --git a/third_party/googleapis.BUILD b/third_party/googleapis.BUILD
index 95e999af1886576317aa59d133e8d5c88ba368d3..b8871eda7280becb7c3f53412120600d52c0fb54 100644
--- a/third_party/googleapis.BUILD
+++ b/third_party/googleapis.BUILD
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 package(default_visibility = ["//visibility:public"])
+
 licenses(["notice"])  # Apache 2.0
+
 exports_files(["LICENSE"])
 
 load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
@@ -21,6 +23,9 @@ load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
 cc_proto_library(
     name = "bigtable_protos",
     srcs = [
+        "google/api/annotations.proto",
+        "google/api/auth.proto",
+        "google/api/http.proto",
         "google/bigtable/admin/v2/bigtable_instance_admin.proto",
         "google/bigtable/admin/v2/bigtable_table_admin.proto",
         "google/bigtable/admin/v2/common.proto",
@@ -31,15 +36,12 @@ cc_proto_library(
         "google/iam/v1/iam_policy.proto",
         "google/iam/v1/policy.proto",
         "google/longrunning/operations.proto",
-        "google/rpc/status.proto",
         "google/rpc/error_details.proto",
-        "google/api/annotations.proto",
-        "google/api/auth.proto",
-        "google/api/http.proto",
+        "google/rpc/status.proto",
     ],
     include = ".",
-    protoc = "@protobuf_archive//:protoc",
     default_runtime = "@protobuf_archive//:protobuf",
-    deps = ["@protobuf_archive//:cc_wkt_protos"],
+    protoc = "@protobuf_archive//:protoc",
     use_grpc_plugin = True,
+    deps = ["@protobuf_archive//:cc_wkt_protos"],
 )
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
index 3189cf8e31610c432f03f8f3a30efc3ada4d9652..921188cbb431d925df69fbd0cc06aac07fe1a1a9 100644
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL.tpl
@@ -184,7 +184,8 @@ toolchain {
       action: "c++-link-dynamic-library"
       action: "c++-link-nodeps-dynamic-library"
       flag_group {
-        flag:"-no-canonical-prefixes"
+        flag: "-no-canonical-prefixes"
+        %{extra_no_canonical_prefixes_flags}
       }
     }
   }
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 831a3067b2413c2975a920dfa5edbf1838e9a5dc..03c67bcb3d75aca19bcad8b824d79283193dc115 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -1418,6 +1418,7 @@ def _create_local_cuda_repository(repository_ctx):
         flag: "-Wno-invalid-partial-specialization"
     """
     cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
+    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
     _tpl(repository_ctx, "crosstool:BUILD", {
         "%{linker_files}": ":empty",
         "%{win_linker_files}": ":empty"
@@ -1439,6 +1440,14 @@ def _create_local_cuda_repository(repository_ctx):
             repository_ctx, cuda_config) +
         "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
         "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
+
+    # For gcc, do not canonicalize system header paths; some versions of gcc
+    # pick the shortest possible path for system includes when creating the
+    # .d file - given that includes that are prefixed with "../" multiple
+    # time quickly grow longer than the root of the tree, this can lead to
+    # bazel's header check failing.
+    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
+        "flag: \"-fno-canonical-system-headers\"")
     nvcc_path = str(
         repository_ctx.path("%s/bin/nvcc%s" % (
             cuda_config.cuda_toolkit_path,
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 9108639b0bf74ab4b14468d77a0570ff8913f107..6df6799bd7696d5dbcc70345bf7b5e19f709b8d4 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -105,7 +105,7 @@ def get_cxx_inc_directories(repository_ctx, cc):
     return includes_cpp + [
         inc
         for inc in includes_c
-        if inc not in includes_cpp_set
+        if inc not in includes_cpp_set.to_list()
     ]
 
 def auto_configure_fail(msg):
diff --git a/third_party/icu/data/BUILD.bazel b/third_party/icu/data/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..7db21566e4e65960d59caa9584c944ef8375bd7e
--- /dev/null
+++ b/third_party/icu/data/BUILD.bazel
@@ -0,0 +1,46 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Data for core MIME/Unix/Windows encodings:
+# ISO 8859-2..9, 15; Windows-125x; EUC-CN; GBK (Windows cp936); GB 18030;
+# Big5 (Windows cp950); SJIS (Windows cp932); EUC-JP; EUC-KR, KS C 5601;
+# Windows cp949. Data is pre-processed for little-endian platforms. To replicate
+# this pre-processing (if you want additional encodings, for example), do the
+# following:
+#
+# First, download, build, and install ICU. This installs tools such as makeconv.
+# Then, run the following from your icu4c/source directory:
+#   $ cd data/mappings
+#   $ rm *.cnv  # there shouldn't be any .cnv files here to begin with
+#   $ grep \.ucm ucmcore.mk | \
+#     sed 's/\(UCM_SOURCE_CORE=\)\?\([^ ]\+\.ucm\)\\\?/\2/g' | \
+#     tr '\n' ' ' | xargs makeconv
+#   $ ls *.cnv > filelist.lst
+#   $ pkgdata -m common -p ucmcore filelist.lst
+#   $ genccode -f custom_conversion_data ucmcore.dat
+# This creates custom_conversion_data.c. You will need to change the target
+# :conversion_data to depend on your custom source instead of :conversion_data.c
+filegroup(
+    name = "conversion_files",
+    srcs = glob(["icu_conversion_data.c.gz.*"]),
+)
+
+# Data files are compressed and split to work around git performance degradation
+# around large files.
+genrule(
+    name = "merge_conversion_data",
+    srcs = [":conversion_files"],
+    outs = ["conversion_data.c"],
+    cmd = "cat $(locations :conversion_files) | gunzip > $@",
+)
+
+cc_library(
+    name = "conversion_data",
+    srcs = [":conversion_data.c"],
+    deps = ["@icu//:headers"],
+)
diff --git a/third_party/icu/data/LICENSE b/third_party/icu/data/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..25b6eb9d3415e62e99af6a349362349c091bc6c7
--- /dev/null
+++ b/third_party/icu/data/LICENSE
@@ -0,0 +1,414 @@
+COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
+
+Copyright © 1991-2018 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+---------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+1. ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
+
+2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
+
+ #     The Google Chrome software developed by Google is licensed under
+ # the BSD license. Other software included in this distribution is
+ # provided under other licenses, as set forth below.
+ #
+ #  The BSD License
+ #  http://opensource.org/licenses/bsd-license.php
+ #  Copyright (C) 2006-2008, Google Inc.
+ #
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ #  Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ #  Redistributions in binary form must reproduce the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided with
+ # the distribution.
+ #  Neither the name of  Google Inc. nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ #
+ #  The word list in cjdict.txt are generated by combining three word lists
+ # listed below with further processing for compound word breaking. The
+ # frequency is generated with an iterative training against Google web
+ # corpora.
+ #
+ #  * Libtabe (Chinese)
+ #    - https://sourceforge.net/project/?group_id=1519
+ #    - Its license terms and conditions are shown below.
+ #
+ #  * IPADIC (Japanese)
+ #    - http://chasen.aist-nara.ac.jp/chasen/distribution.html
+ #    - Its license terms and conditions are shown below.
+ #
+ #  ---------COPYING.libtabe ---- BEGIN--------------------
+ #
+ #  /*
+ #   * Copyright (c) 1999 TaBE Project.
+ #   * Copyright (c) 1999 Pai-Hsiang Hsiao.
+ #   * All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the TaBE Project nor the names of its
+ #   *   contributors may be used to endorse or promote products derived
+ #   *   from this software without specific prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  /*
+ #   * Copyright (c) 1999 Computer Systems and Communication Lab,
+ #   *                    Institute of Information Science, Academia
+ #       *                    Sinica. All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the Computer Systems and Communication Lab
+ #   *   nor the names of its contributors may be used to endorse or
+ #   *   promote products derived from this software without specific
+ #   *   prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
+ #      University of Illinois
+ #  c-tsai4@uiuc.edu  http://casper.beckman.uiuc.edu/~c-tsai4
+ #
+ #  ---------------COPYING.libtabe-----END--------------------------------
+ #
+ #
+ #  ---------------COPYING.ipadic-----BEGIN-------------------------------
+ #
+ #  Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+ #  and Technology.  All Rights Reserved.
+ #
+ #  Use, reproduction, and distribution of this software is permitted.
+ #  Any copy of this software, whether in its original form or modified,
+ #  must include both the above copyright notice and the following
+ #  paragraphs.
+ #
+ #  Nara Institute of Science and Technology (NAIST),
+ #  the copyright holders, disclaims all warranties with regard to this
+ #  software, including all implied warranties of merchantability and
+ #  fitness, in no event shall NAIST be liable for
+ #  any special, indirect or consequential damages or any damages
+ #  whatsoever resulting from loss of use, data or profits, whether in an
+ #  action of contract, negligence or other tortuous action, arising out
+ #  of or in connection with the use or performance of this software.
+ #
+ #  A large portion of the dictionary entries
+ #  originate from ICOT Free Software.  The following conditions for ICOT
+ #  Free Software applies to the current dictionary as well.
+ #
+ #  Each User may also freely distribute the Program, whether in its
+ #  original form or modified, to any third party or parties, PROVIDED
+ #  that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+ #  on, or be attached to, the Program, which is distributed substantially
+ #  in the same form as set out herein and that such intended
+ #  distribution, if actually made, will neither violate or otherwise
+ #  contravene any of the laws and regulations of the countries having
+ #  jurisdiction over the User or the intended distribution itself.
+ #
+ #  NO WARRANTY
+ #
+ #  The program was produced on an experimental basis in the course of the
+ #  research and development conducted during the project and is provided
+ #  to users as so produced on an experimental basis.  Accordingly, the
+ #  program is provided without any warranty whatsoever, whether express,
+ #  implied, statutory or otherwise.  The term "warranty" used herein
+ #  includes, but is not limited to, any warranty of the quality,
+ #  performance, merchantability and fitness for a particular purpose of
+ #  the program and the nonexistence of any infringement or violation of
+ #  any right of any third party.
+ #
+ #  Each user of the program will agree and understand, and be deemed to
+ #  have agreed and understood, that there is no warranty whatsoever for
+ #  the program and, accordingly, the entire risk arising from or
+ #  otherwise connected with the program is assumed by the user.
+ #
+ #  Therefore, neither ICOT, the copyright holder, or any other
+ #  organization that participated in or was otherwise related to the
+ #  development of the program and their respective officials, directors,
+ #  officers and other employees shall be held liable for any and all
+ #  damages, including, without limitation, general, special, incidental
+ #  and consequential damages, arising out of or otherwise in connection
+ #  with the use or inability to use the program or any product, material
+ #  or result produced or otherwise obtained by using the program,
+ #  regardless of whether they have been advised of, or otherwise had
+ #  knowledge of, the possibility of such damages at any time during the
+ #  project or thereafter.  Each user will be deemed to have agreed to the
+ #  foregoing by his or her commencement of use of the program.  The term
+ #  "use" as used herein includes, but is not limited to, the use,
+ #  modification, copying and distribution of the program and the
+ #  production of secondary products from the program.
+ #
+ #  In the case where the program, whether in its original form or
+ #  modified, was distributed or delivered to or received by a user from
+ #  any person, organization or entity other than ICOT, unless it makes or
+ #  grants independently of ICOT any specific warranty to the user in
+ #  writing, such person, organization or entity, will also be exempted
+ #  from and not be held liable to the user for any such damages as noted
+ #  above as far as the program is concerned.
+ #
+ #  ---------------COPYING.ipadic-----END----------------------------------
+
+3. Lao Word Break Dictionary Data (laodict.txt)
+
+ #  Copyright (c) 2013 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ # Project: http://code.google.com/p/lao-dictionary/
+ # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
+ # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
+ #              (copied below)
+ #
+ #  This file is derived from the above dictionary, with slight
+ #  modifications.
+ #  ----------------------------------------------------------------------
+ #  Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification,
+ #  are permitted provided that the following conditions are met:
+ #
+ #
+ # Redistributions of source code must retain the above copyright notice, this
+ #  list of conditions and the following disclaimer. Redistributions in
+ #  binary form must reproduce the above copyright notice, this list of
+ #  conditions and the following disclaimer in the documentation and/or
+ #  other materials provided with the distribution.
+ #
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+4. Burmese Word Break Dictionary Data (burmesedict.txt)
+
+ #  Copyright (c) 2014 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ #  This list is part of a project hosted at:
+ #    github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ #  --------------------------------------------------------------------------
+ #  Copyright (c) 2013, LeRoy Benjamin Sharon
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions
+ #  are met: Redistributions of source code must retain the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer.  Redistributions in binary form must reproduce the
+ #  above copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #    Neither the name Myanmar Karen Word Lists, nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ #  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ #  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ #  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ #  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ #  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ #  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ #  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ #  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ #  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ #  THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ #  SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+5. Time Zone Database
+
+  ICU uses the public domain data and code derived from Time Zone
+Database for its time zone support. The ownership of the TZ database
+is explained in BCP 175: Procedure for Maintaining the Time Zone
+Database section 7.
+
+ # 7.  Database Ownership
+ #
+ #    The TZ database itself is not an IETF Contribution or an IETF
+ #    document.  Rather it is a pre-existing and regularly updated work
+ #    that is in the public domain, and is intended to remain in the
+ #    public domain.  Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
+ #    not apply to the TZ Database or contributions that individuals make
+ #    to it.  Should any claims be made and substantiated against the TZ
+ #    Database, the organization that is providing the IANA
+ #    Considerations defined in this RFC, under the memorandum of
+ #    understanding with the IETF, currently ICANN, may act in accordance
+ #    with all competent court orders.  No ownership claims will be made
+ #    by ICANN or the IETF Trust on the database or the code.  Any person
+ #    making a contribution to the database or code waives all rights to
+ #    future claims in that contribution or in the TZ Database.
+
+6. Google double-conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of Google Inc. nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.aa b/third_party/icu/data/icu_conversion_data.c.gz.aa
new file mode 100644
index 0000000000000000000000000000000000000000..b68a2c6516f8183e805c509a9139cf63d1ee3fa5
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.aa differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ab b/third_party/icu/data/icu_conversion_data.c.gz.ab
new file mode 100644
index 0000000000000000000000000000000000000000..d60aa92d675c85f95e811221bffc012d65e6c29e
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ab differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ac b/third_party/icu/data/icu_conversion_data.c.gz.ac
new file mode 100644
index 0000000000000000000000000000000000000000..de9b69ff9474e0c9ccc799d40d092d2ab2ad98bb
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ac differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ad b/third_party/icu/data/icu_conversion_data.c.gz.ad
new file mode 100644
index 0000000000000000000000000000000000000000..d5abb06b8ca21e1e6116ef1732c661c815b1489a
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ad differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ae b/third_party/icu/data/icu_conversion_data.c.gz.ae
new file mode 100644
index 0000000000000000000000000000000000000000..0e54fdb9eaffd814477460f71bc194104c1b247d
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ae differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.af b/third_party/icu/data/icu_conversion_data.c.gz.af
new file mode 100644
index 0000000000000000000000000000000000000000..cfbeb165ad3428555276a463a90a1ed2e34740f0
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.af differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ag b/third_party/icu/data/icu_conversion_data.c.gz.ag
new file mode 100644
index 0000000000000000000000000000000000000000..bde20b6da6253d866f87fcadc7e6c3571bd64d44
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ag differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ah b/third_party/icu/data/icu_conversion_data.c.gz.ah
new file mode 100644
index 0000000000000000000000000000000000000000..ae31dffbe2afc8ad59ae1dc323447d8cf9d61032
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ah differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ai b/third_party/icu/data/icu_conversion_data.c.gz.ai
new file mode 100644
index 0000000000000000000000000000000000000000..981b869561a615f21639482929b89d2b2e5ca360
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ai differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.aj b/third_party/icu/data/icu_conversion_data.c.gz.aj
new file mode 100644
index 0000000000000000000000000000000000000000..1ae6bce382a05570b46217e1a031414515439a42
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.aj differ
diff --git a/third_party/icu/udata.patch b/third_party/icu/udata.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d6d59100e48b8346fcaa54f0cbdebdc5e4658f92
--- /dev/null
+++ b/third_party/icu/udata.patch
@@ -0,0 +1,53 @@
+--- /icu4c/source/common/udata.cpp.old	2018-06-19 22:34:56.000000000 -0700
++++ /icu4c/source/common/udata.cpp	2018-10-19 14:26:09.778950855 -0700
+@@ -18,15 +18,15 @@
+ 
+ #include "unicode/utypes.h"  /* U_PLATFORM etc. */
+ 
+-#ifdef __GNUC__
+-/* if gcc
+-#define ATTRIBUTE_WEAK __attribute__ ((weak))
+-might have to #include some other header
+-*/
++#if defined(__GNUC__) || defined(__SUNPRO_CC)
++#  define ATTRIBUTE_WEAK __attribute__ ((weak))
++#else
++#  define ATTRIBUTE_WEAK
+ #endif
+ 
+ #include "unicode/putil.h"
+ #include "unicode/udata.h"
++#include "unicode/umachine.h"
+ #include "unicode/uversion.h"
+ #include "charstr.h"
+ #include "cmemory.h"
+@@ -641,10 +641,11 @@
+  * partial-data-library access functions where each returns a pointer
+  * to its data package, if it is linked in.
+  */
+-/*
+-extern const void *uprv_getICUData_collation(void) ATTRIBUTE_WEAK;
+-extern const void *uprv_getICUData_conversion(void) ATTRIBUTE_WEAK;
+-*/
++
++//extern "C" const void *uprv_getICUData_collation(void);
++U_CDECL_BEGIN
++const void *uprv_getICUData_conversion(void) ATTRIBUTE_WEAK;
++U_CDECL_END
+ 
+ /*----------------------------------------------------------------------*
+  *                                                                      *
+@@ -702,10 +703,11 @@
+         if (uprv_getICUData_collation) {
+             setCommonICUDataPointer(uprv_getICUData_collation(), FALSE, pErrorCode);
+         }
++        */
+         if (uprv_getICUData_conversion) {
+-            setCommonICUDataPointer(uprv_getICUData_conversion(), FALSE, pErrorCode);
++          setCommonICUDataPointer(uprv_getICUData_conversion(), FALSE, pErrorCode);
+         }
+-        */
++
+ #if U_PLATFORM_HAS_WINUWP_API == 0 // Windows UWP Platform does not support dll icu data at this time
+         setCommonICUDataPointer(&U_ICUDATA_ENTRY_POINT, FALSE, pErrorCode);
+         {
diff --git a/third_party/icu/workspace.bzl b/third_party/icu/workspace.bzl
index a4f653e026138d233a9041bea484c809eefa4fdc..f100836b4101efa0a20e09e7d430b0b44953e89a 100644
--- a/third_party/icu/workspace.bzl
+++ b/third_party/icu/workspace.bzl
@@ -2,6 +2,11 @@
 
 load("//third_party:repo.bzl", "third_party_http_archive")
 
+# Sanitize a dependency so that it works correctly from code that includes
+# TensorFlow as a submodule.
+def clean_dep(dep):
+    return str(Label(dep))
+
 def repo():
     third_party_http_archive(
         name = "icu",
@@ -13,4 +18,5 @@ def repo():
         ],
         build_file = "//third_party/icu:BUILD.bazel",
         system_build_file = "//third_party/icu:BUILD.system",
+        patch_file = clean_dep("//third_party/icu:udata.patch"),
     )
diff --git a/third_party/llvm/llvm.bzl b/third_party/llvm/llvm.bzl
index 54ca86f3272cb6c91541e20d9ba5326d2cf726a0..5a977f82c417a9ae3e3022fa43534affe727cae2 100644
--- a/third_party/llvm/llvm.bzl
+++ b/third_party/llvm/llvm.bzl
@@ -250,6 +250,7 @@ linux_cmake_vars = {
 # CMake variables specific to the Darwin (Mac OS X) platform.
 darwin_cmake_vars = {
     "HAVE_MALLOC_MALLOC_H": 1,
+    "HAVE_MALLOC_ZONE_STATISTICS": 1,
 }
 
 # CMake variables specific to the Windows platform.
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index 597ac69e2ffed73210733fab98bed3d1227b0d23..7a8ed3bf43955dfa3a77c7cafa30817b9d176d2d 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -42,8 +42,8 @@ cc_library(
         "src",
         "src/common",
         "src/cpu",
-        "src/cpu/xbyak",
         "src/cpu/gemm",
+        "src/cpu/xbyak",
     ],
     nocopts = "-fno-exceptions",
     visibility = ["//visibility:public"],
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index f57f04c75ed64ff1c5bf9cbb45b6a43a953627e2..7a08f97ef328a7a731d7c76de8bda70c8d004dac 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -7,10 +7,10 @@ exports_files(["LICENSE.txt"])
 
 load(
     "@local_config_nccl//:build_defs.bzl",
-    "device_link",
     "gen_nccl_h",
     "nccl_library",
     "rdc_copts",
+    "rdc_library",
 )
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
@@ -64,13 +64,13 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=0"] + rdc_copts(),
+    linkstatic = True,
     prefix = "sum_",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
@@ -80,13 +80,13 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=1"] + rdc_copts(),
+    linkstatic = True,
     prefix = "_prod",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
@@ -96,13 +96,13 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=2"] + rdc_copts(),
+    linkstatic = True,
     prefix = "min_",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
@@ -112,33 +112,33 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=3"] + rdc_copts(),
+    linkstatic = True,
     prefix = "max_",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
     name = "functions",
     srcs = [
-        ":device_hdrs",
         "src/collectives/device/functions.cu",
+        ":device_hdrs",
     ],
     copts = rdc_copts(),
+    linkstatic = True,
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
-device_link(
+rdc_library(
     name = "device_code",
-    srcs = [
+    deps = [
         ":functions",
         ":max",
         ":min",
@@ -162,18 +162,13 @@ nccl_library(
         "src/nccl.h",
     ],
     hdrs = ["src/nccl.h"],
+    copts = cuda_default_copts(),
     include_prefix = "third_party/nccl",
     strip_include_prefix = "src",
-    copts = cuda_default_copts(),
+    visibility = ["//visibility:public"],
     deps = [
         ":device_code",
-        ":functions",
         ":include_hdrs",
-        ":max",
-        ":min",
-        ":prod",
         ":src_hdrs",
-        ":sum",
     ],
-    visibility = ["//visibility:public"],
 )
diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl
index bd65cb98f7365ed1fdd19358103e0f1f5f3f7590..42de79c411c844d48982c47753337102b915aefd 100644
--- a/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -43,8 +43,7 @@ def _process_srcs_impl(ctx):
             substitutions = {
                 "\"collectives.h": "\"collectives/collectives.h",
                 "\"../collectives.h": "\"collectives/collectives.h",
-                "#if __CUDACC_VER_MAJOR__":
-                    "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
+                "#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
                 # Substitutions are applied in order.
                 "std::nullptr_t": "nullptr_t",
                 "nullptr_t": "std::nullptr_t",
@@ -140,13 +139,16 @@ _gen_link_src = rule(
 )
 """Patches the include directives for the link.stub file."""
 
-def device_link(name, srcs):
-    """Links seperately compiled relocatable device code into a cc_library."""
+def rdc_library(name, deps):
+    """Produces a cc_library from deps containing relocatable device code."""
 
-    # From .a and .pic.a archives, just use the latter.
+    # From .a and .pic.a archives, just use the latter. Otherwise we get
+    # multiply defined symbols.
+    # TODO(csigg): C++ Sandwich once available should allow passing this target
+    # to a cc_library dependency, which would avoid the linking order issue.
     _filter(
-        name = name + "_pic_a",
-        srcs = srcs,
+        name = name + "_deps_a",
+        srcs = deps,
         suffix = ".pic.a",
     )
 
@@ -160,10 +162,8 @@ def device_link(name, srcs):
         cmd = ("$(location %s) " % nvlink +
                select({
                    # NCCL is only supported on Linux.
-                   "@org_tensorflow//tensorflow:linux_x86_64":
-                       "--cpu-arch=X86_64 ",
-                   "@org_tensorflow//tensorflow:linux_ppc64le":
-                       "--cpu-arch=PPC64LE ",
+                   "@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
+                   "@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
                    "//conditions:default": "",
                }) +
                "--arch=%s $(SRCS) " % arch +
@@ -172,7 +172,7 @@ def device_link(name, srcs):
         native.genrule(
             name = "%s_%s" % (name, arch),
             outs = [register_hdr, cubin],
-            srcs = [name + "_pic_a"],
+            srcs = [name + "_deps_a"],
             cmd = cmd,
             tools = [nvlink],
         )
@@ -197,7 +197,7 @@ def device_link(name, srcs):
 
     # Generate the source file #including the headers generated above.
     _gen_link_src(
-        name = name + "_cc",
+        name = name + "_dlink_src",
         # Include just the last one, they are equivalent.
         register_hdr = register_hdr,
         fatbin_hdr = fatbin_hdr,
@@ -207,12 +207,13 @@ def device_link(name, srcs):
 
     # Compile the source file into the cc_library.
     native.cc_library(
-        name = name,
-        srcs = [name + "_cc"],
+        name = name + "_dlink_a",
+        srcs = [
+            name + "_dlink_src",
+        ],
         textual_hdrs = [register_hdr, fatbin_hdr],
         deps = [
             "@local_config_cuda//cuda:cuda_headers",
-            "@local_config_cuda//cuda:cudart_static",
         ],
         defines = [
             # Silence warning about including internal header.
@@ -221,4 +222,31 @@ def device_link(name, srcs):
             "__NV_EXTRA_INITIALIZATION=",
             "__NV_EXTRA_FINALIZATION=",
         ],
+        linkstatic = True,
+    )
+
+    # Repackage deps into a single archive. This avoid unresolved symbols when
+    # the archives happen to be linked in the wrong order. For more details, see
+    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
+    native.genrule(
+        name = name + "_a",
+        srcs = [
+            name + "_deps_a",
+            name + "_dlink_a",
+        ],
+        outs = [name + ".a"],
+        # See https://stackoverflow.com/a/23621751
+        cmd = """
+addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
+printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
+""",
+    )
+
+    native.cc_library(
+        name = name,
+        srcs = [name + "_a"],
+        deps = [
+            "@local_config_cuda//cuda:cudart_static",
+        ],
+        linkstatic = True,
     )
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index 6602a480afbf29ed9777aa0ab3e10b2860b84d66..63e9548c53262461cfc9c3fd160f4f17430319c7 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -34,8 +34,9 @@ cc_library(
         "src/ngraph/runtime/cpu/builder/one_hot.cpp",
         "src/ngraph/runtime/cpu/builder/pad.cpp",
         "src/ngraph/runtime/cpu/builder/product.cpp",
-        "src/ngraph/runtime/cpu/builder/quantize.cpp",
+        "src/ngraph/runtime/cpu/builder/quantization.cpp",
         "src/ngraph/runtime/cpu/builder/quantized_avg_pool.cpp",
+        "src/ngraph/runtime/cpu/builder/quantized_conv.cpp",
         "src/ngraph/runtime/cpu/builder/quantized_max_pool.cpp",
         "src/ngraph/runtime/cpu/builder/reduce_function.cpp",
         "src/ngraph/runtime/cpu/builder/reduce_function_window.cpp",
@@ -61,6 +62,7 @@ cc_library(
         "src/ngraph/runtime/cpu/cpu_tensor_view.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view_wrapper.cpp",
         "src/ngraph/runtime/cpu/cpu_tracing.cpp",
+        "src/ngraph/runtime/cpu/cpu_visualize_tree.cpp",
         "src/ngraph/runtime/cpu/kernel/eigen_thread_pool.cpp",
         "src/ngraph/runtime/cpu/kernel/pad.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_max.cpp",
@@ -76,15 +78,11 @@ cc_library(
         "src/ngraph/runtime/cpu/op/conv_bias.cpp",
         "src/ngraph/runtime/cpu/op/conv_relu.cpp",
         "src/ngraph/runtime/cpu/op/convert_layout.cpp",
-        "src/ngraph/runtime/cpu/op/dequantize.cpp",
         "src/ngraph/runtime/cpu/op/group_conv.cpp",
         "src/ngraph/runtime/cpu/op/loop_kernel.cpp",
         "src/ngraph/runtime/cpu/op/lstm.cpp",
         "src/ngraph/runtime/cpu/op/matmul_bias.cpp",
         "src/ngraph/runtime/cpu/op/max_pool_with_indices.cpp",
-        "src/ngraph/runtime/cpu/op/quantize.cpp",
-        "src/ngraph/runtime/cpu/op/quantized_avg_pool.cpp",
-        "src/ngraph/runtime/cpu/op/quantized_max_pool.cpp",
         "src/ngraph/runtime/cpu/op/rnn.cpp",
         "src/ngraph/runtime/cpu/op/sigmoid_mul.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_assignment.cpp",
@@ -99,21 +97,22 @@ cc_library(
         "src/ngraph/runtime/cpu/pass/cpu_workspace_insertion.cpp",
     ],
     hdrs = glob(["src/ngraph/runtime/cpu/**/*.hpp"]) + glob([]),
-    deps = [
-        ":ngraph_headers",
-        "@eigen_archive//:eigen",
-        "@nlohmann_json_lib",
-        "@tbb",
-        "@mkl_dnn//:mkl_dnn",
-    ],
     copts = [
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.1\\"',
+        '-D NGRAPH_VERSION=\\"0.9.1\\"',
         "-D NGRAPH_DEX_ONLY",
+        '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
+    deps = [
+        ":ngraph_headers",
+        "@eigen_archive//:eigen",
+        "@mkl_dnn",
+        "@nlohmann_json_lib",
+        "@tbb",
+    ],
     alwayslink = 1,
 )
 
@@ -125,6 +124,11 @@ cc_library(
         "src/ngraph/builder/*.cpp",
         "src/ngraph/descriptor/*.cpp",
         "src/ngraph/descriptor/layout/*.cpp",
+        "src/ngraph/op/experimental/quantized_avg_pool.cpp",
+        "src/ngraph/op/experimental/quantized_conv_bias.cpp",
+        "src/ngraph/op/experimental/quantized_conv_relu.cpp",
+        "src/ngraph/op/experimental/quantized_conv.cpp",
+        "src/ngraph/op/experimental/quantized_max_pool.cpp",
         "src/ngraph/op/*.cpp",
         "src/ngraph/op/util/*.cpp",
         "src/ngraph/pattern/*.cpp",
@@ -134,18 +138,19 @@ cc_library(
         "src/ngraph/runtime/*.cpp",
         "src/ngraph/type/*.cpp",
     ]),
-    deps = [
-        ":ngraph_headers",
-        ":ngraph_cpu_backend",
-        "@eigen_archive//:eigen",
-        "@nlohmann_json_lib",
-    ],
     copts = [
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.1\\"',
+        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
+    deps = [
+        ":ngraph_cpu_backend",
+        ":ngraph_headers",
+        "@eigen_archive//:eigen",
+        "@nlohmann_json_lib",
+    ],
     alwayslink = 1,
 )
diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD
index dbedca0a03c68d3099233e45102dd9401ea4359d..db9a66f9b5bcdaa29ec55175f1a8c76ac5f6f22a 100644
--- a/third_party/ngraph/ngraph_tf.BUILD
+++ b/third_party/ngraph/ngraph_tf.BUILD
@@ -10,6 +10,10 @@ load(
 cc_library(
     name = "ngraph_tf",
     srcs = [
+        "logging/ngraph_log.cc",
+        "logging/ngraph_log.h",
+        "logging/tf_graph_writer.cc",
+        "logging/tf_graph_writer.h",
         "src/ngraph_api.cc",
         "src/ngraph_api.h",
         "src/ngraph_assign_clusters.cc",
@@ -41,24 +45,23 @@ cc_library(
         "src/tf_deadness_analysis.h",
         "src/tf_graphcycles.cc",
         "src/tf_graphcycles.h",
-        "logging/ngraph_log.h",
-        "logging/ngraph_log.cc",
-        "logging/tf_graph_writer.h",
-        "logging/tf_graph_writer.cc",
-    ],
-    deps = [
-        "@org_tensorflow//tensorflow/core:protos_all_proto_text",
-        "@org_tensorflow//tensorflow/core:framework_headers_lib",
-        "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
-        "@ngraph//:ngraph_core",
     ],
     copts = [
         "-I external/ngraph_tf/src",
         "-I external/ngraph_tf/logging",
         "-I external/ngraph/src",
     ],
-    alwayslink = 1,
     visibility = ["//visibility:public"],
+    deps = [
+        "@com_google_absl//absl/container:container_memory",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/types:variant",
+        "@ngraph//:ngraph_core",
+        "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
+        "@org_tensorflow//tensorflow/core:framework_headers_lib",
+        "@org_tensorflow//tensorflow/core:protos_all_proto_text",
+    ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -79,6 +82,12 @@ tf_cc_test(
         "test/test_utilities.h",
         "test/tf_exec.cpp",
     ],
+    extra_copts = [
+        "-fexceptions ",
+        "-I external/ngraph_tf/src",
+        "-I external/ngraph_tf/logging",
+        "-I external/ngraph/src",
+    ],
     deps = [
         ":ngraph_tf",
         "@com_google_googletest//:gtest",
@@ -86,10 +95,4 @@ tf_cc_test(
         "@org_tensorflow//tensorflow/cc:client_session",
         "@org_tensorflow//tensorflow/core:tensorflow",
     ],
-    extra_copts = [
-        "-fexceptions ",
-        "-I external/ngraph_tf/src",
-        "-I external/ngraph_tf/logging",
-        "-I external/ngraph/src",
-    ],
 )
diff --git a/third_party/ngraph/tbb.BUILD b/third_party/ngraph/tbb.BUILD
index 04e6544ffb579a94db2ffeed123068a64afbfcb7..c78a2d79ddfff53ddede0a70427dac89d08fbdcc 100644
--- a/third_party/ngraph/tbb.BUILD
+++ b/third_party/ngraph/tbb.BUILD
@@ -14,6 +14,10 @@ genrule(
     srcs = glob(["**"]) + [
         "@local_config_cc//:toolchain",
     ],
+    outs = [
+        "libtbb.a",
+        "libtbbmalloc.a",
+    ],
     cmd = """
 	    set -e
 	    WORK_DIR=$$PWD
@@ -45,19 +49,15 @@ genrule(
         cp build/build_{release,debug}/*.a $$DEST_DIR
 		cd $$WORK_DIR
 	""",
-    outs = [
-        "libtbb.a",
-        "libtbbmalloc.a",
-    ],
 )
 
 cc_library(
     name = "tbb",
+    srcs = ["libtbb.a"],
     hdrs = glob([
         "include/serial/**",
         "include/tbb/**/**",
     ]),
-    srcs = ["libtbb.a"],
     includes = ["include"],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index c26a2897176e57220b42b7d2cc5b61d114ecfc5f..e82948648e42e14e97238726e7db5a932bbea946 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -44,11 +44,11 @@ cc_library(
         "png.h",
         "pngconf.h",
     ],
-    includes = ["."],
     copts = select({
         ":windows": ["-DPNG_INTEL_SSE_OPT=1"],
         "//conditions:default": [],
     }),
+    includes = ["."],
     linkopts = select({
         ":windows": [],
         "//conditions:default": ["-lm"],
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index 07b853ff11cb737f26a9b0ec37aaff6fd7ada203..bad6d20a08c0ee27345bf16a5a4f7c9e4d67a05f 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -84,7 +84,7 @@ def _apply_delete(ctx, paths):
 def _tf_http_archive(ctx):
     if ("mirror.bazel.build" not in ctx.attr.urls[0] and
         (len(ctx.attr.urls) < 2 and
-         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+         ctx.attr.name not in _SINGLE_URL_WHITELIST.to_list())):
         fail("tf_http_archive(urls) must have redundant URLs. The " +
              "mirror.bazel.build URL must be present and it must come first. " +
              "Even if you don't have permission to mirror the file, please " +
@@ -150,7 +150,7 @@ ensure best practices are followed.
 def _third_party_http_archive(ctx):
     if ("mirror.bazel.build" not in ctx.attr.urls[0] and
         (len(ctx.attr.urls) < 2 and
-         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+         ctx.attr.name not in _SINGLE_URL_WHITELIST.to_list())):
         fail("tf_http_archive(urls) must have redundant URLs. The " +
              "mirror.bazel.build URL must be present and it must come first. " +
              "Even if you don't have permission to mirror the file, please " +
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index 6e1416ced16a7dac39c55f300e4cbc9412ecce63..a7b4687c020e3d9176a5e451bdf9e20aec569b5e 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -2,6 +2,8 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
+load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
+
 # Platform for use with remote execution with
 # custom container based off RBE Ubuntu16_04
 # http://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04
@@ -30,6 +32,6 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@sha256:e5099ff15650986e268a43ee99e2d2b7ffe2459b8b6935385078d1d3b2ed4d02"
-        }""",
+            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
+        }""" % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
 )
diff --git a/third_party/toolchains/gpus/cuda/BUILD b/third_party/toolchains/gpus/cuda/BUILD
index f59e025019caffa333a1570b572dd7f0d9913923..f63a0ea81925783085b1b551aab778d41ba1fb2c 100644
--- a/third_party/toolchains/gpus/cuda/BUILD
+++ b/third_party/toolchains/gpus/cuda/BUILD
@@ -1258,7 +1258,7 @@ genrule(
         "cuda/lib/libcupti.so.9.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
    """,
 )
 
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7e3e93d6004894029135f3151a282bcc43b8938f
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -0,0 +1,35 @@
+licenses(["restricted"])
+
+load(":generate.bzl", "tensorflow_rbe_config")
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-nccl2",
+    compiler = "gcc",
+    cuda_version = "9.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-clang-cuda9.0-cudnn7-nccl2",
+    compiler = "clang",
+    cuda_version = "9.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-nccl2",
+    compiler = "gcc",
+    cuda_version = "10.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-clang-cuda10.0-cudnn7-nccl2",
+    compiler = "clang",
+    cuda_version = "10.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..1f9e29d4402dc2a969d01291d7772219415bbf3e
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -0,0 +1,4 @@
+container_digests = {
+    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c26138f4c38c754da2bad44a8a068523abf7fbd71d58a57ce92e5342c5431bf5",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:34c4a55e2376b300cdc2b903775fc32e62352f6e33f927df5653743324378bfc",
+}
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..2fb3a94cdca7430b522939266a4b2b398a65df8d
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -0,0 +1,46 @@
+load(
+    "@bazel_toolchains//rules:docker_config.bzl",
+    "docker_toolchain_autoconfig",
+)
+
+def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler):
+    docker_toolchain_autoconfig(
+        name = name,
+        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version),
+        bazel_version = "0.16.1",
+        config_repos = [
+            "local_config_cuda",
+            "local_config_python",
+            "local_config_nccl",
+        ],
+        env = {
+            "ABI_VERSION": "gcc",
+            "ABI_LIBC_VERSION": "glibc_2.19",
+            "BAZEL_COMPILER": compiler,
+            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
+            "BAZEL_TARGET_LIBC": "glibc_2.19",
+            "BAZEL_TARGET_CPU": "k8",
+            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
+            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
+            "CC": compiler,
+            "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+            "TF_NEED_CUDA": "1",
+            "TF_CUDA_CLANG": "1" if compiler == "clang" else "0",
+            "CLEAR_CACHE": "1",
+            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0",
+            "TF_ENABLE_XLA": "1",
+            "TF_CUDNN_VERSION": cudnn_version,
+            "TF_CUDA_VERSION": cuda_version,
+            "NCCL_INSTALL_PATH": "/usr/lib",
+            "NCCL_HDR_PATH": "/usr/include",
+            "TF_NCCL_VERSION": "2",
+            "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
+        },
+        # TODO(klimek): We should use the sources that we currently work on, not
+        # just the latest snapshot of tensorflow that is checked in.
+        git_repo = "https://github.com/tensorflow/tensorflow",
+        tags = ["manual"],
+        incompatible_changes_off = True,
+    )
+
+tensorflow_rbe_config = _tensorflow_rbe_config
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
new file mode 100755
index 0000000000000000000000000000000000000000..37c5211278abf243ab388d83688e6c8c7888cea3
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+TARGET="$1"
+OUTPUT="$2"
+
+if [[ -z "${TARGET}" || -z "${OUTPUT}" ]]; then
+  echo "Usage:"
+  echo "$0 <target> <output>"
+  exit 1
+fi
+
+TEMPDIR="$(mktemp -d)"
+ROOT="${PWD}"
+PKG="third_party/toolchains/preconfig"
+IFS='-' read -ra PLATFORM <<< "${TARGET}"
+OS="${PLATFORM[0]}"
+PY_VERSION="${PLATFORM[1]}"
+COMPILER="${PLATFORM[2]}"
+CUDA_VERSION="${PLATFORM[3]}"
+CUDNN_VERSION="${PLATFORM[4]}"
+NCCL_VERSION="${PLATFORM[5]}"
+
+if [[ "${COMPILER}" == "gcc" ]]; then
+  COMPILER="gcc-nvcc-${CUDA_VERSION}"
+fi
+
+echo "OS: ${OS}"
+echo "Python: ${PY_VERSION}"
+echo "Compiler: ${COMPILER}"
+echo "CUDA: ${CUDA_VERSION}"
+echo "CUDNN: ${CUDNN_VERSION}"
+echo "NCCL: ${NCCL_VERSION}"
+
+bazel build "${PKG}/generate:${TARGET}"
+cd "${TEMPDIR}"
+tar xvf "${ROOT}/bazel-bin/${PKG}/generate/${TARGET}_outputs.tar"
+
+# Delete all empty files: configurations leave empty files around when they are
+# unnecessary.
+find . -empty -delete
+
+# We build up the following directory structure with preconfigured packages:
+# <OS>/
+#   <CUDA>-<CUDNN>/
+#   <COMPILER>/
+#   <NCCL>/
+#   <PYTHON>/
+
+# Create our toplevel output directory for the OS.
+mkdir "${OS}"
+
+# Python:
+mv local_config_python "${OS}/${PY_VERSION}"
+
+# NCCL:
+mv local_config_nccl "${OS}/${NCCL_VERSION}"
+
+# Compiler:
+mv local_config_cuda/crosstool "${OS}/${COMPILER}"
+
+# CUDA:
+mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
+
+# Cleanup for copybara.
+find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
+find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs -I {} mv {} {}.oss
+
+# Tar it up:
+tar cvf "${OUTPUT}" "${OS}"
+
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f30c2f1ae6318c645e174617a74b8fdadac1598e
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -0,0 +1,25 @@
+load(
+    "@io_bazel_rules_docker//container:container.bzl",
+    "container_pull",
+    container_repositories = "repositories",
+)
+load(":containers.bzl", "container_digests")
+
+def _remote_config_workspace():
+    container_repositories()
+
+    container_pull(
+        name = "cuda9.0-cudnn7-ubuntu14.04",
+        registry = "gcr.io",
+        repository = "asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04",
+        digest = container_digests["cuda9.0-cudnn7-ubuntu14.04"],
+    )
+
+    container_pull(
+        name = "cuda10.0-cudnn7-ubuntu14.04",
+        registry = "gcr.io",
+        repository = "asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04",
+        digest = container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+    )
+
+remote_config_workspace = _remote_config_workspace
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
index 05abcb56d84789844616f1c884021ca9ea9eca10..c6930904b564bf2cce70b484a0e7b0759f13b7c9 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
@@ -1188,7 +1188,7 @@ genrule(
         "cuda/include/vector_types.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp -f "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp -f "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp -f "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
    """,
 )
 
@@ -1198,7 +1198,7 @@ genrule(
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
    """,
 )
 
@@ -1235,7 +1235,7 @@ genrule(
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
    """,
 )
 
@@ -1253,7 +1253,7 @@ genrule(
         "cuda/lib/libcupti.so.9.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
    """,
 )
 
@@ -1263,6 +1263,6 @@ genrule(
         "cuda/include/cudnn.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/include/cudnn.h" "$(@D)/cudnn.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
    """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
index 5c6703aab4fbdaf92c5b63a5c0f2600ad699c0cf..a53c891d8bba1b80a880ddd9c16091db27861a8d 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
@@ -9,15 +9,13 @@ def if_cuda(if_true, if_false = []):
     return select({
         "@local_config_cuda//cuda:using_nvcc": if_true,
         "@local_config_cuda//cuda:using_clang": if_true,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
 
-
 def cuda_default_copts():
     """Default options for all CUDA compilations."""
     return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + [])
 
-
 def cuda_is_configured():
     """Returns true if CUDA was enabled during the configure process."""
     return True
@@ -29,5 +27,5 @@ def if_cuda_is_configured(x):
     --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
     """
     if cuda_is_configured():
-      return x
+        return x
     return []
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..6442e7628a416e3298cfd2579cee275459780145
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
@@ -0,0 +1,87 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..0d89a539b8d70788eb0f6924636824fba778a058
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "x64_windows"
+  toolchain_identifier: "local_windows"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..63893d3722f6b43579758e5f747076b1f1e73ed7
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '9.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat
new file mode 100755
index 0000000000000000000000000000000000000000..e896e654fd7ecd578c80d102895f51ce18bbd4eb
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat
@@ -0,0 +1,20 @@
+:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::     http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+:: =============================================================================
+
+:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
+@echo OFF
+set arg0=%~0
+for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
+"/usr/bin/python3" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..859b3196d5dba9afadeae56f34be04247b00fe09
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+NVCC_VERSION = '9.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
index e021df9e1e3066b597dddc5dc78da3121ddd2430..460c879d32f1381454b6d043bded61e66b02f41d 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
@@ -136,7 +136,7 @@ genrule(
         "python_include/weakrefobject.h",
     ],
     cmd = """
-cp "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+cp -f "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
    """,
 )
 
@@ -171,6 +171,6 @@ genrule(
         "numpy_include/numpy/utils.h",
     ],
     cmd = """
-cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
    """,
 )